diff --git a/Dockerfile b/Dockerfile
index 3dde4cff04d..97b39935090 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,4 +8,4 @@ RUN conda install -c conda-forge google-cloud-sdk && \
     rm -rf /var/lib/apt/lists/*
 
 # Install sky
-RUN pip install --no-cache-dir "skypilot[all]==0.5.0"
+RUN pip install --no-cache-dir "skypilot[all]==0.6.0"
diff --git a/Dockerfile_k8s b/Dockerfile_k8s
index 7b311dde13f..63def8682b2 100644
--- a/Dockerfile_k8s
+++ b/Dockerfile_k8s
@@ -3,9 +3,11 @@ FROM continuumio/miniconda3:23.3.1-0
 # TODO(romilb): Investigate if this image can be consolidated with the skypilot
 # client image (`Dockerfile`)
 
+ARG DEBIAN_FRONTEND=noninteractive
+
 # Initialize conda for root user, install ssh and other local dependencies
 RUN apt update -y && \
-    apt install gcc rsync sudo patch openssh-server pciutils nano fuse socat netcat curl -y && \
+    apt install git gcc rsync sudo patch openssh-server pciutils nano fuse socat netcat curl -y && \
     rm -rf /var/lib/apt/lists/* && \
     apt remove -y python3 && \
     conda init
@@ -25,14 +27,20 @@ RUN useradd -m -s /bin/bash sky && \
 # Switch to sky user
 USER sky
 
+# Set HOME environment variable for sky user
+ENV HOME /home/sky
+
+# Set current working directory
+WORKDIR /home/sky
+
 # Install SkyPilot pip dependencies preemptively to speed up provisioning time
-RUN pip install wheel Click colorama cryptography jinja2 jsonschema && \
-    pip install networkx oauth2client pandas pendulum PrettyTable && \
-    pip install ray[default]==2.9.3 rich tabulate filelock && \
-    pip install packaging 'protobuf<4.0.0' pulp && \
-    pip install pycryptodome==3.12.0 && \
-    pip install docker kubernetes==28.1.0 && \
-    pip install grpcio==1.51.3 python-dotenv==1.0.1
+RUN conda init && \
+    pip install wheel Click colorama cryptography jinja2 jsonschema networkx \
+    oauth2client pandas pendulum PrettyTable rich tabulate filelock packaging \
+    'protobuf<4.0.0' pulp pycryptodome==3.12.0 docker kubernetes==28.1.0 \
+    grpcio==1.51.3 python-dotenv==1.0.1 ray[default]==2.9.3 && \
+    curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \
+    sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
 
 # Add /home/sky/.local/bin/ to PATH
 RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc
@@ -43,7 +51,3 @@ COPY --chown=sky . /skypilot/sky/
 
 # Set PYTHONUNBUFFERED=1 to have Python print to stdout/stderr immediately
 ENV PYTHONUNBUFFERED=1
-
-# Set WORKDIR and initialize conda for sky user
-WORKDIR /home/sky
-RUN conda init
diff --git a/Dockerfile_k8s_gpu b/Dockerfile_k8s_gpu
index f570181d8e7..f9bc7258c61 100644
--- a/Dockerfile_k8s_gpu
+++ b/Dockerfile_k8s_gpu
@@ -1,46 +1,52 @@
-# TODO(romilb) - The base image used here (ray) is very large (11.4GB).
-# as a result, this built image is about 13.5GB. We need to pick a lighter base
-# image.
-FROM rayproject/ray:2.9.3-py310-gpu
+# We use the cuda runtime image instead of devel image to reduce size (1.3GB vs 3.6GB)
+FROM nvidia/cuda:12.1.1-runtime-ubuntu20.04
 
-# Initialize conda for root user, install ssh and other local dependencies
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install ssh and other local dependencies
 # We remove cuda lists to avoid conflicts with the cuda version installed by ray
-RUN sudo rm -rf /etc/apt/sources.list.d/cuda* && \
-    sudo apt update -y && \
-    sudo apt install gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat curl -y && \
-    sudo rm -rf /var/lib/apt/lists/* && \
-    sudo apt remove -y python3 && \
-    conda init
+RUN rm -rf /etc/apt/sources.list.d/cuda* && \
+    apt update -y && \
+    apt install git gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat curl -y && \
+    rm -rf /var/lib/apt/lists/*
+
+# Setup SSH and generate hostkeys
+RUN sudo mkdir -p /var/run/sshd && \
+    sudo sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+    sudo sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
+    cd /etc/ssh/ && \
+    sudo ssh-keygen -A
 
 # Setup new user named sky and add to sudoers. \
-# Also add /opt/conda/bin to sudo path and give sky user access to /home/ray
+# Also add /opt/conda/bin to sudo path and give sky user permission to run sudo without password
 RUN sudo useradd -m -s /bin/bash sky && \
     sudo /bin/bash -c 'echo "sky ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers' && \
-    sudo /bin/bash -c "echo 'Defaults secure_path=\"/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin\"' > /etc/sudoers.d/sky" && \
-    sudo chmod -R a+rwx /home/ray
+    sudo /bin/bash -c "echo 'Defaults secure_path=\"/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin\"' > /etc/sudoers.d/sky"
 
 # Switch to sky user
 USER sky
 
-# Set HOME environment variable for sky user, otherwise Ray base image HOME overrides
+# Set HOME environment variable for sky user
 ENV HOME /home/sky
 
-# Setup SSH and generate hostkeys
-RUN sudo mkdir -p /var/run/sshd && \
-    sudo chmod 0755 /var/run/sshd && \
-    sudo sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
-    sudo sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
-    cd /etc/ssh/ && \
-    ssh-keygen -A
+# Set current working directory
+WORKDIR /home/sky
 
-# Install SkyPilot pip dependencies
-RUN pip install wheel Click colorama cryptography jinja2 jsonschema && \
-    pip install networkx oauth2client pandas pendulum PrettyTable && \
-    pip install rich tabulate filelock && \
-    pip install packaging 'protobuf<4.0.0' pulp && \
-    pip install pycryptodome==3.12.0 && \
-    pip install docker kubernetes==28.1.0 && \
-    pip install grpcio==1.51.3 python-dotenv==1.0.1
+SHELL ["/bin/bash", "-c"]
+
+# Install conda and other dependencies
+# Keep the conda and Ray versions below in sync with the ones in skylet.constants
+RUN curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && \
+    bash Miniconda3-Linux-x86_64.sh -b && \
+    eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true && conda activate base && \
+    grep "# >>> conda initialize >>>" ~/.bashrc || { conda init && source ~/.bashrc; } && \
+    rm Miniconda3-Linux-x86_64.sh && \
+    pip install wheel Click colorama cryptography jinja2 jsonschema networkx \
+    oauth2client pandas pendulum PrettyTable rich tabulate filelock packaging \
+    'protobuf<4.0.0' pulp pycryptodome==3.12.0 docker kubernetes==28.1.0 \
+    grpcio==1.51.3 python-dotenv==1.0.1 ray[default]==2.9.3 && \
+    curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \
+    sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
 
 # Add /home/sky/.local/bin/ to PATH
 RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc
@@ -51,7 +57,3 @@ COPY --chown=sky . /skypilot/sky/
 
 # Set PYTHONUNBUFFERED=1 to have Python print to stdout/stderr immediately
 ENV PYTHONUNBUFFERED=1
-
-# Set WORKDIR and initialize conda for sky user
-WORKDIR /home/sky
-RUN conda init
diff --git a/README.md b/README.md
index 07eff4e0104..a2704df3643 100644
--- a/README.md
+++ b/README.md
@@ -27,12 +27,11 @@
 ----
 
 :fire: *News* :fire:
+- [Jun, 2024] Reproduce **GPT** with [llm.c](https://github.com/karpathy/llm.c/discussions/481) on any cloud: [**guide**](./llm/gpt-2/)
 - [Apr, 2024] Serve and finetune [**Llama 3**](https://skypilot.readthedocs.io/en/latest/gallery/llms/llama-3.html) on any cloud or Kubernetes: [**example**](./llm/llama-3/)
 - [Apr, 2024] Serve [**Qwen-110B**](https://qwenlm.github.io/blog/qwen1.5-110b/) on your infra: [**example**](./llm/qwen/)
 - [Apr, 2024] Using [**Ollama**](https://github.com/ollama/ollama) to deploy quantized LLMs on CPUs and GPUs: [**example**](./llm/ollama/)
-- [Mar, 2024] Serve and deploy [**Databricks DBRX**](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) on your infra: [**example**](./llm/dbrx/)
 - [Feb, 2024] Deploying and scaling [**Gemma**](https://blog.google/technology/developers/gemma-open-models/) with SkyServe: [**example**](./llm/gemma/)
-- [Feb, 2024] Speed up your LLM deployments with [**SGLang**](https://github.com/sgl-project/sglang) for 5x throughput on SkyServe: [**example**](./llm/sglang/)
 - [Feb, 2024] Serving [**Code Llama 70B**](https://ai.meta.com/blog/code-llama-large-language-model-coding/) with vLLM and SkyServe: [**example**](./llm/codellama/)
 - [Dec, 2023] [**Mixtral 8x7B**](https://mistral.ai/news/mixtral-of-experts/), a high quality sparse mixture-of-experts model, was released by Mistral AI! Deploy via SkyPilot on any cloud: [**example**](./llm/mixtral/)
 - [Nov, 2023] Using [**Axolotl**](https://github.com/OpenAccess-AI-Collective/axolotl) to finetune Mistral 7B on the cloud (on-demand and spot): [**example**](./llm/axolotl/)
@@ -43,6 +42,8 @@
 Archived
 
+- [Mar, 2024] Serve and deploy [**Databricks DBRX**](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) on your infra: [**example**](./llm/dbrx/)
+- [Feb, 2024] Speed up your LLM deployments with [**SGLang**](https://github.com/sgl-project/sglang) for 5x throughput on SkyServe: [**example**](./llm/sglang/)
 - [Dec, 2023] Using [**LoRAX**](https://github.com/predibase/lorax) to serve 1000s of finetuned LLMs on a single instance in the cloud: [**example**](./llm/lorax/)
 - [Sep, 2023] [**Mistral 7B**](https://mistral.ai/news/announcing-mistral-7b/), a high-quality open LLM, was released! Deploy via SkyPilot on any cloud: [**Mistral docs**](https://docs.mistral.ai/self-deployment/skypilot)
 - [July, 2023] Self-Hosted **Llama-2 Chatbot** on Any Cloud: [**example**](./llm/llama-2/)
 
@@ -153,8 +154,9 @@ To learn more, see our [Documentation](https://skypilot.readthedocs.io/en/latest
 
 Runnable examples:
 - LLMs on SkyPilot
+  - [GPT-2 via `llm.c`](./llm/gpt-2/)
   - [Llama 3](./llm/llama-3/)
-  - [Qwen](./llm/qwen/)
+  - [Qwen](./llm/qwen/)
   - [Databricks DBRX](./llm/dbrx/)
   - [Gemma](./llm/gemma/)
   - [Mixtral 8x7B](./llm/mixtral/); [Mistral 7B](https://docs.mistral.ai/self-deployment/skypilot/) (from official Mistral team)
@@ -172,7 +174,7 @@ Runnable examples:
   - [LocalGPT](./llm/localgpt)
   - [Falcon](./llm/falcon)
   - Add yours here & see more in [`llm/`](./llm)!
-- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama) and [many more (`examples/`)](./examples).
+- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2) and [many more (`examples/`)](./examples). Follow updates: - [Twitter](https://twitter.com/skypilot_org) @@ -183,6 +185,7 @@ Read the research: - [SkyPilot paper](https://www.usenix.org/system/files/nsdi23-yang-zongheng.pdf) and [talk](https://www.usenix.org/conference/nsdi23/presentation/yang-zongheng) (NSDI 2023) - [Sky Computing whitepaper](https://arxiv.org/abs/2205.07147) - [Sky Computing vision paper](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s02-stoica.pdf) (HotOS 2021) +- [Policy for Managed Spot Jobs](https://www.usenix.org/conference/nsdi24/presentation/wu-zhanghao) (NSDI 2024) ## Support and Questions We are excited to hear your feedback! diff --git a/docs/source/_gallery_original/index.rst b/docs/source/_gallery_original/index.rst index 67f4eef11dc..e8a540c883c 100644 --- a/docs/source/_gallery_original/index.rst +++ b/docs/source/_gallery_original/index.rst @@ -1,3 +1,5 @@ +.. _ai-gallery: + AI Gallery ==================== diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 11affaf4c43..5630793d8ff 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -28,7 +28,6 @@ document.addEventListener('DOMContentLoaded', () => { { selector: '.caption-text', text: 'SkyServe: Model Serving' }, { selector: '.toctree-l1 > a', text: 'Managed Jobs' }, { selector: '.toctree-l1 > a', text: 'Running on Kubernetes' }, - { selector: '.toctree-l1 > a', text: 'DBRX (Databricks)' }, { selector: '.toctree-l1 > a', text: 'Ollama' }, { selector: '.toctree-l1 > a', text: 'Llama-3 (Meta)' }, { selector: '.toctree-l1 > a', text: 'Qwen (Alibaba)' }, diff --git a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst index 5318d76b1a3..df1d2c5e161 100644 --- a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst +++ b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst @@ -9,31 +9,15 @@ for authentication and creating resources on your Kubernetes cluster. 
When running inside your Kubernetes cluster (e.g., as a Spot controller or Serve controller), SkyPilot can operate using either of the following three authentication methods: -1. **Using your local kubeconfig file**: In this case, SkyPilot will - copy your local ``~/.kube/config`` file to the controller pod and use it for - authentication. This is the default method when running inside the cluster, - and no additional configuration is required. - - .. note:: - - If your cluster uses exec based authentication in your ``~/.kube/config`` file - (e.g., GKE uses exec auth by default), SkyPilot may not be able to authenticate using this method. In this case, - consider using the service account methods below. - -2. **Creating a service account**: SkyPilot can automatically create the service +1. **Automatically create a service account**: SkyPilot can automatically create the service account and roles for itself to manage resources in the Kubernetes cluster. - To use this method, set ``remote_identity: SERVICE_ACCOUNT`` to your - Kubernetes configuration in the :ref:`~/.sky/config.yaml ` file: - - .. code-block:: yaml - - kubernetes: - remote_identity: SERVICE_ACCOUNT + This is the default method when running inside the cluster, and no + additional configuration is required. For details on the permissions that are granted to the service account, - refer to the `Permissions required for SkyPilot`_ section below. + refer to the `Minimum Permissions Required for SkyPilot`_ section below. -3. **Using a custom service account**: If you have a custom service account +2. **Using a custom service account**: If you have a custom service account with the `necessary permissions `__, you can configure SkyPilot to use it by adding this to your :ref:`~/.sky/config.yaml ` file: @@ -42,6 +26,22 @@ SkyPilot can operate using either of the following three authentication methods: kubernetes: remote_identity: your-service-account-name +3. **Using your local kubeconfig file**: In this case, SkyPilot will + copy your local ``~/.kube/config`` file to the controller pod and use it for + authentication. To use this method, set ``remote_identity: LOCAL_CREDENTIALS`` to your + Kubernetes configuration in the :ref:`~/.sky/config.yaml ` file: + + .. code-block:: yaml + + kubernetes: + remote_identity: LOCAL_CREDENTIALS + + .. note:: + + If your cluster uses exec based authentication in your ``~/.kube/config`` file + (e.g., GKE uses exec auth by default), SkyPilot may not be able to authenticate using this method. In this case, + consider using the service account methods below. + .. note:: Service account based authentication applies only when the remote SkyPilot @@ -53,8 +53,8 @@ Below are the permissions required by SkyPilot and an example service account YA .. _k8s-permissions: -Permissions required for SkyPilot ---------------------------------- +Minimum Permissions Required for SkyPilot +----------------------------------------- SkyPilot requires permissions equivalent to the following roles to be able to manage the resources in the Kubernetes cluster: @@ -62,12 +62,12 @@ SkyPilot requires permissions equivalent to the following roles to be able to ma # Namespaced role for the service account # Required for creating pods, services and other necessary resources in the namespace. - # Note these permissions only apply in the namespace where SkyPilot is deployed. + # Note these permissions only apply in the namespace where SkyPilot is deployed, and the namespace can be changed below. 
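# Note: a Role on its own grants nothing; it takes effect only once it is
# bound to the kubeconfig user or service account with a RoleBinding in the
# same namespace, as the `sky-sa-rb` RoleBinding in the example later on this
# page demonstrates.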
kind: Role apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: sky-sa-role - namespace: default + name: sky-sa-role # Can be changed if needed + namespace: default # Change to your namespace if using a different one. rules: - apiGroups: ["*"] resources: ["*"] @@ -77,49 +77,104 @@ SkyPilot requires permissions equivalent to the following roles to be able to ma kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: sky-sa-cluster-role - namespace: default - labels: - parent: skypilot + name: sky-sa-cluster-role # Can be changed if needed + namespace: default # Change to your namespace if using a different one. + labels: + parent: skypilot rules: - - apiGroups: [""] - resources: ["nodes"] # Required for getting node resources. - verbs: ["get", "list", "watch"] - - apiGroups: ["rbac.authorization.k8s.io"] - resources: ["clusterroles", "clusterrolebindings"] # Required for launching more SkyPilot clusters from within the pod. - verbs: ["get", "list", "watch"] - - apiGroups: ["node.k8s.io"] - resources: ["runtimeclasses"] # Required for autodetecting the runtime class of the nodes. - verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["nodes"] # Required for getting node resources. + verbs: ["get", "list", "watch"] + - apiGroups: ["node.k8s.io"] + resources: ["runtimeclasses"] # Required for autodetecting the runtime class of the nodes. + verbs: ["get", "list", "watch"] + + +.. tip:: + + If you are using a different namespace than ``default``, make sure to change the namespace in the above manifests. + +These roles must apply to both the user account configured in the kubeconfig file and the service account used by SkyPilot (if configured). + +If your tasks use object store mounting or require access to ingress resources, you will need to grant additional permissions as described below. + +Permissions for Object Store Mounting +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If your tasks use object store mounting (e.g., S3, GCS, etc.), SkyPilot will need to run a DaemonSet to expose the FUSE device as a Kubernetes resource to SkyPilot pods. + +To allow this, you will need to also create a ``skypilot-system`` namespace which will run the DaemonSet and grant the necessary permissions to the service account in that namespace. + + +.. code-block:: yaml + + # Required only if using object store mounting + # Create namespace for SkyPilot system + apiVersion: v1 + kind: Namespace + metadata: + name: skypilot-system # Do not change this + labels: + parent: skypilot --- - # Optional: If using ingresses, role for accessing ingress service IP + # Role for the skypilot-system namespace to create FUSE device manager and + # any other system components required by SkyPilot. + # This role must be bound in the skypilot-system namespace to the service account used for SkyPilot. + kind: Role + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: skypilot-system-service-account-role # Can be changed if needed + namespace: skypilot-system # Do not change this namespace + labels: + parent: skypilot + rules: + - apiGroups: ["*"] + resources: ["*"] + verbs: ["*"] + + +Permissions for using Ingress +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If your tasks use :ref:`Ingress ` for exposing ports, you will need to grant the necessary permissions to the service account in the ``ingress-nginx`` namespace. + +.. 
code-block:: yaml + + # Required only if using ingresses + # Role for accessing ingress service IP apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - namespace: ingress-nginx - name: sky-sa-role-ingress-nginx + namespace: ingress-nginx # Do not change this + name: sky-sa-role-ingress-nginx # Can be changed if needed rules: - - apiGroups: [""] - resources: ["services"] - verbs: ["list", "get"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "get"] -These roles must apply to both the user account configured in the kubeconfig file and the service account used by SkyPilot (if configured). .. _k8s-sa-example: Example using Custom Service Account ------------------------------------ -To create a service account that has the necessary permissions for SkyPilot, you can use the following YAML: +To create a service account that has all necessary permissions for SkyPilot (including for accessing object stores), you can use the following YAML. + +.. tip:: + + In this example, the service account is named ``sky-sa`` and is created in the ``default`` namespace. + Change the namespace and service account name as needed. + .. code-block:: yaml + :linenos: # create-sky-sa.yaml kind: ServiceAccount apiVersion: v1 metadata: - name: sky-sa - namespace: default + name: sky-sa # Change to your service account name + namespace: default # Change to your namespace if using a different one. labels: parent: skypilot --- @@ -127,8 +182,8 @@ To create a service account that has the necessary permissions for SkyPilot, you kind: Role apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: sky-sa-role - namespace: default + name: sky-sa-role # Can be changed if needed + namespace: default # Change to your namespace if using a different one. labels: parent: skypilot rules: @@ -140,85 +195,126 @@ To create a service account that has the necessary permissions for SkyPilot, you kind: RoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: sky-sa-rb - namespace: default + name: sky-sa-rb # Can be changed if needed + namespace: default # Change to your namespace if using a different one. labels: parent: skypilot subjects: - - kind: ServiceAccount - name: sky-sa + - kind: ServiceAccount + name: sky-sa # Change to your service account name roleRef: - kind: Role - name: sky-sa-role - apiGroup: rbac.authorization.k8s.io + kind: Role + name: sky-sa-role # Use the same name as the role at line 14 + apiGroup: rbac.authorization.k8s.io --- - # Role for accessing ingress resources + # ClusterRole for the service account + kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: sky-sa-cluster-role # Can be changed if needed + namespace: default # Change to your namespace if using a different one. + labels: + parent: skypilot + rules: + - apiGroups: [""] + resources: ["nodes"] # Required for getting node resources. + verbs: ["get", "list", "watch"] + - apiGroups: ["node.k8s.io"] + resources: ["runtimeclasses"] # Required for autodetecting the runtime class of the nodes. + verbs: ["get", "list", "watch"] + - apiGroups: ["networking.k8s.io"] # Required for exposing services through ingresses + resources: ["ingressclasses"] + verbs: ["get", "list", "watch"] + --- + # ClusterRoleBinding for the service account + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + name: sky-sa-cluster-role-binding # Can be changed if needed + namespace: default # Change to your namespace if using a different one. 
+ labels: + parent: skypilot + subjects: + - kind: ServiceAccount + name: sky-sa # Change to your service account name + namespace: default # Change to your namespace if using a different one. + roleRef: + kind: ClusterRole + name: sky-sa-cluster-role # Use the same name as the cluster role at line 43 + apiGroup: rbac.authorization.k8s.io + --- + # Optional: If using object store mounting, create the skypilot-system namespace + apiVersion: v1 + kind: Namespace + metadata: + name: skypilot-system # Do not change this + labels: + parent: skypilot + --- + # Optional: If using object store mounting, create role in the skypilot-system + # namespace to create FUSE device manager. kind: Role + apiVersion: rbac.authorization.k8s.io/v1 metadata: - namespace: ingress-nginx - name: sky-sa-role-ingress-nginx + name: skypilot-system-service-account-role # Can be changed if needed + namespace: skypilot-system # Do not change this namespace + labels: + parent: skypilot rules: - - apiGroups: [""] - resources: ["services"] - verbs: ["list", "get", "watch"] - - apiGroups: ["rbac.authorization.k8s.io"] - resources: ["roles", "rolebindings"] - verbs: ["list", "get", "watch"] + - apiGroups: ["*"] + resources: ["*"] + verbs: ["*"] --- - # RoleBinding for accessing ingress resources + # Optional: If using object store mounting, create rolebinding in the skypilot-system + # namespace to create FUSE device manager. apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: sky-sa-rolebinding-ingress-nginx - namespace: ingress-nginx + name: sky-sa-skypilot-system-role-binding + namespace: skypilot-system # Do not change this namespace + labels: + parent: skypilot subjects: - - kind: ServiceAccount - name: sky-sa - namespace: default + - kind: ServiceAccount + name: sky-sa # Change to your service account name + namespace: default # Change this to the namespace where the service account is created roleRef: kind: Role - name: sky-sa-role-ingress-nginx + name: skypilot-system-service-account-role # Use the same name as the role at line 88 apiGroup: rbac.authorization.k8s.io --- - # ClusterRole for the service account - kind: ClusterRole + # Optional: Role for accessing ingress resources apiVersion: rbac.authorization.k8s.io/v1 + kind: Role metadata: - name: sky-sa-cluster-role - namespace: default + name: sky-sa-role-ingress-nginx # Can be changed if needed + namespace: ingress-nginx # Do not change this namespace labels: parent: skypilot rules: - - apiGroups: [""] - resources: ["nodes"] # Required for getting node resources. - verbs: ["get", "list", "watch"] - - apiGroups: ["rbac.authorization.k8s.io"] - resources: ["clusterroles", "clusterrolebindings"] # Required for launching more SkyPilot clusters from within the pod. - verbs: ["get", "list", "watch"] - - apiGroups: ["node.k8s.io"] - resources: ["runtimeclasses"] # Required for autodetecting the runtime class of the nodes. - verbs: ["get", "list", "watch"] - - apiGroups: ["networking.k8s.io"] # Required for exposing services. 
- resources: ["ingressclasses"] - verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "get", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "rolebindings"] + verbs: ["list", "get", "watch"] --- - # ClusterRoleBinding for the service account + # Optional: RoleBinding for accessing ingress resources apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRoleBinding + kind: RoleBinding metadata: - name: sky-sa-cluster-role-binding - namespace: default + name: sky-sa-rolebinding-ingress-nginx # Can be changed if needed + namespace: ingress-nginx # Do not change this namespace labels: - parent: skypilot + parent: skypilot subjects: - - kind: ServiceAccount - name: sky-sa - namespace: default + - kind: ServiceAccount + name: sky-sa # Change to your service account name + namespace: default # Change this to the namespace where the service account is created roleRef: - kind: ClusterRole - name: sky-sa-cluster-role - apiGroup: rbac.authorization.k8s.io + kind: Role + name: sky-sa-role-ingress-nginx # Use the same name as the role at line 119 + apiGroup: rbac.authorization.k8s.io Create the service account using the following command: @@ -226,9 +322,12 @@ Create the service account using the following command: $ kubectl apply -f create-sky-sa.yaml -After creating the service account, configure SkyPilot to use it through ``~/.sky/config.yaml``: +After creating the service account, the cluster admin may distribute kubeconfigs with the ``sky-sa`` service account to users who need to access the cluster. + +Users should also configure SkyPilot to use the ``sky-sa`` service account through ``~/.sky/config.yaml``: .. code-block:: yaml + # ~/.sky/config.yaml kubernetes: remote_identity: sky-sa # Or your service account name diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index 676c8be6c7c..5a648dbcda4 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -69,6 +69,7 @@ Runnable examples: * **LLMs on SkyPilot** + * `GPT-2 via llm.c `_ * `Llama 3 `_ * `Qwen `_ * `Databricks DBRX `_ @@ -89,7 +90,7 @@ Runnable examples: * `Falcon `_ * Add yours here & see more in `llm/ `_! -* Framework examples: `PyTorch DDP `_, `DeepSpeed `_, `JAX/Flax on TPU `_, `Stable Diffusion `_, `Detectron2 `_, `Distributed `_ `TensorFlow `_, `NeMo `_, `programmatic grid search `_, `Docker `_, `Cog `_, `Unsloth `_, `Ollama `_ and `many more `_. +* Framework examples: `PyTorch DDP `_, `DeepSpeed `_, `JAX/Flax on TPU `_, `Stable Diffusion `_, `Detectron2 `_, `Distributed `_ `TensorFlow `_, `NeMo `_, `programmatic grid search `_, `Docker `_, `Cog `_, `Unsloth `_, `Ollama `_, `llm.c `__ and `many more `_. Follow updates: @@ -125,7 +126,7 @@ Contents ../reference/job-queue ../examples/auto-failover ../reference/kubernetes/index - ../running-jobs/index + ../running-jobs/distributed-jobs .. toctree:: :maxdepth: 1 @@ -154,12 +155,14 @@ Contents :maxdepth: 1 :caption: User Guides + ../running-jobs/environment-variables ../examples/docker-containers ../examples/ports ../reference/tpu ../reference/logging ../reference/faq + .. 
toctree::
    :maxdepth: 1
    :caption: Developer Guides
diff --git a/docs/source/examples/docker-containers.rst b/docs/source/examples/docker-containers.rst
index 9fe835d6b9a..8bc7ae16837 100644
--- a/docs/source/examples/docker-containers.rst
+++ b/docs/source/examples/docker-containers.rst
@@ -8,6 +8,10 @@ SkyPilot can run a container either as a task, or as the runtime environment of
 * If the container image is invocable / has an entrypoint: run it :ref:`as a task `.
 * If the container image is to be used as a runtime environment (e.g., ``ubuntu``, ``nvcr.io/nvidia/pytorch:23.10-py3``, etc.) and if you have extra commands to run inside the container: run it :ref:`as a runtime environment `.
 
+.. note::
+
+    Running Docker containers is `not supported on RunPod `_. To use RunPod, use ``setup`` and ``run`` to configure your environment instead. See this `GitHub issue `_ for details.
+
 .. _docker-containers-as-tasks:
 
 Running Containers as Tasks
diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst
index e5b318d4f87..d7770f079ec 100644
--- a/docs/source/getting-started/installation.rst
+++ b/docs/source/getting-started/installation.rst
@@ -311,25 +311,26 @@ Fluidstack
 
 Cudo Compute
 ~~~~~~~~~~~~~~~~~~
-`Cudo Compute `__ GPU cloud provides low cost GPUs powered with green energy.
-1. Create a billing account by following `this guide `__.
-2. Create a project ``__.
-3. Create an API Key by following `this guide `__.
-3. Download and install the `cudoctl `__ command line tool
-3. Run :code:`cudoctl init`:
+`Cudo Compute `__ provides low cost GPUs powered by green energy.
 
-.. code-block:: shell
+1. Create a `billing account `__.
+2. Create a `project `__.
+3. Create an `API Key `__.
+4. Download and install the `cudoctl `__ command line tool.
+5. Run :code:`cudoctl init`:
+
+   .. code-block:: shell
 
-   cudoctl init
-   ✔ api key: my-api-key
-   ✔ project: my-project
-   ✔ billing account: my-billing-account
-   ✔ context: default
-   config file saved ~/.config/cudo/cudo.yml
+      cudoctl init
+      ✔ api key: my-api-key
+      ✔ project: my-project
+      ✔ billing account: my-billing-account
+      ✔ context: default
+      config file saved ~/.config/cudo/cudo.yml
 
-   pip install "cudo-compute>=0.1.10"
+      pip install "cudo-compute>=0.1.10"
 
-If you want to want to use skypilot with a different Cudo Compute account or project, just run :code:`cudoctl init`: again.
+If you want to use SkyPilot with a different Cudo Compute account or project, run :code:`cudoctl init` again.
diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst
index bb281087736..bfc6fd17e05 100644
--- a/docs/source/getting-started/quickstart.rst
+++ b/docs/source/getting-started/quickstart.rst
@@ -72,6 +72,8 @@ To launch a cluster and run a task, use :code:`sky launch`:
    You can use the ``-c`` flag to give the cluster an easy-to-remember name.
    If not specified, a name is autogenerated.
 
+   If the cluster name is an existing cluster shown in ``sky status``, the cluster will be reused.
+
    The ``sky launch`` command performs much heavy-lifting:
 
    - selects an appropriate cloud and VM based on the specified resource constraints;
@@ -208,7 +210,7 @@ Managed spot jobs run on much cheaper spot instances, with automatic preemption
 
 .. code-block:: console
 
-   $ sky spot launch hello_sky.yaml
+   $ sky jobs launch --use-spot hello_sky.yaml
 
 Next steps
 -----------
diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst
index dce0ce1f643..ea744f925f1 100644
--- a/docs/source/reference/config.rst
+++ b/docs/source/reference/config.rst
@@ -40,6 +40,49 @@ Available fields and semantics:
     - gcp
     - kubernetes
 
+  docker:
+    # Additional Docker run options (optional).
+    #
+    # When image_id: docker: is used in a task YAML, additional
+    # run options for starting the Docker container can be specified here.
+    # These options will be passed directly as command line args to `docker run`,
+    # see: https://docs.docker.com/reference/cli/docker/container/run/
+    #
+    # The following run options are applied by default and cannot be overridden:
+    #   --net=host
+    #   --cap-add=SYS_ADMIN
+    #   --device=/dev/fuse
+    #   --security-opt=apparmor:unconfined
+    #   --runtime=nvidia  # Applied if nvidia GPUs are detected on the host
+    #
+    # This field can be useful for mounting volumes and other advanced Docker
+    # configurations. You can specify a list of arguments or a string, where the
+    # former will be combined into a single string with spaces. The following
+    # example allows running Docker inside Docker and increases the size of
+    # /dev/shm:
+    #   sky launch --cloud aws --image-id docker:continuumio/miniconda3 "apt update; apt install -y docker.io; docker run hello-world"
+    run_options:
+      - -v /var/run/docker.sock:/var/run/docker.sock
+      - --shm-size=2g
+
+  nvidia_gpus:
+    # Disable ECC for NVIDIA GPUs (optional).
+    #
+    # Set to true to disable ECC for NVIDIA GPUs during provisioning. This is
+    # useful to improve the GPU performance in some cases (up to 30%
+    # improvement). This will only be applied if a cluster is requested with
+    # NVIDIA GPUs. This is best-effort -- not guaranteed to work on all clouds,
+    # e.g., RunPod and Kubernetes do not allow rebooting the node, though
+    # RunPod has ECC disabled by default.
+    #
+    # Note: this setting will cause a reboot during the first provisioning of
+    # the cluster, which may take a few minutes.
+    #
+    # Reference: https://portal.nutanix.com/page/documents/kbs/details?targetId=kA00e000000LKjOCAW
+    #
+    # Default: false.
+    disable_ecc: false
+
   # Advanced AWS configurations (optional).
   # Apply to all new instances but not existing ones.
   aws:
@@ -247,6 +290,30 @@ Available fields and semantics:
       - projects/my-project/reservations/my-reservation2
+
+    # Managed instance group / DWS (optional).
+    #
+    # SkyPilot supports launching instances in a managed instance group (MIG),
+    # which schedules the GPU instance creation through DWS, offering better
+    # availability. This feature is only applied when a resource request
+    # contains GPU instances.
+    managed_instance_group:
+      # Duration for a created instance to be kept alive (in seconds, required).
+      #
+      # This is required for DWS to work properly. After the
+      # specified duration, the instance will be terminated.
+      run_duration: 3600
+      # Timeout for provisioning an instance by DWS (in seconds, optional).
+      #
+      # This timeout determines how long SkyPilot will wait for a managed
+      # instance group to create the requested resources before giving up,
+      # deleting the MIG and failing over to other locations. Larger timeouts
+      # may increase the chance of getting a resource, but will block failover
+      # to other zones/regions/clouds.
+      #
+      # Default: 900
+      provision_timeout: 900
+
+
     # Identity to use for all GCP instances (optional).
# # LOCAL_CREDENTIALS: The user's local credential files will be uploaded to @@ -438,4 +505,3 @@ Available fields and semantics: us-ashburn-1: vcn_subnet: ocid1.subnet.oc1.iad.aaaaaaaafbj7i3aqc4ofjaapa5edakde6g4ea2yaslcsay32cthp7qo55pxa - diff --git a/docs/source/reference/kubernetes/index.rst b/docs/source/reference/kubernetes/index.rst index 1edfde01240..bde97615e80 100644 --- a/docs/source/reference/kubernetes/index.rst +++ b/docs/source/reference/kubernetes/index.rst @@ -3,247 +3,107 @@ Running on Kubernetes ============================= -.. note:: - Kubernetes support is under active development. `Please share your feedback `_ - or `directly reach out to the development team `_ - for feature requests and more. - SkyPilot tasks can be run on your private on-prem or cloud Kubernetes clusters. The Kubernetes cluster gets added to the list of "clouds" in SkyPilot and SkyPilot tasks can be submitted to your Kubernetes cluster just like any other cloud provider. -**Benefits of using SkyPilot to run jobs on your Kubernetes cluster:** - -* Get SkyPilot features (setup management, job execution, queuing, logging, SSH access) on your Kubernetes resources -* Replace complex Kubernetes manifests with simple SkyPilot tasks -* Seamlessly "burst" jobs to the cloud if your Kubernetes cluster is congested -* Retain observability and control over your cluster with your existing Kubernetes tools - -**Supported Kubernetes deployments:** - -* Hosted Kubernetes services (EKS, GKE) -* On-prem clusters (Kubeadm, Rancher) -* Local development clusters (KinD, minikube) - - -Kubernetes Cluster Requirements +Why use SkyPilot on Kubernetes? ------------------------------- -To connect and use a Kubernetes cluster, SkyPilot needs: - -* An existing Kubernetes cluster running Kubernetes v1.20 or later. -* A `Kubeconfig `_ file containing access credentials and namespace to be used. - -In a typical workflow: - -1. A cluster administrator sets up a Kubernetes cluster. Detailed admin guides for - different deployment environments (Amazon EKS, Google GKE, On-Prem and local debugging) are included in the :ref:`Kubernetes cluster setup guide `. - -2. Users who want to run SkyPilot tasks on this cluster are issued Kubeconfig - files containing their credentials (`kube-context `_). - SkyPilot reads this Kubeconfig file to communicate with the cluster. - -Submitting SkyPilot tasks to Kubernetes Clusters ------------------------------------------------- -.. _kubernetes-instructions: - -Once your cluster administrator has :ref:`setup a Kubernetes cluster ` and provided you with a kubeconfig file: - -0. Make sure `kubectl `_, ``socat`` and ``nc`` (netcat) are installed on your local machine. - - .. code-block:: console - - $ # MacOS - $ brew install kubectl socat netcat - - $ # Linux (may have socat already installed) - $ sudo apt-get install kubectl socat netcat - - -1. Place your kubeconfig file at ``~/.kube/config``. - - .. code-block:: console - - $ mkdir -p ~/.kube - $ cp /path/to/kubeconfig ~/.kube/config - - You can verify your credentials are setup correctly by running :code:`kubectl get pods`. - -2. Run :code:`sky check` and verify that Kubernetes is enabled in SkyPilot. - - .. code-block:: console - - $ sky check - - Checking credentials to enable clouds for SkyPilot. - ... - Kubernetes: enabled - ... - - - .. note:: - :code:`sky check` will also check if GPU support is available on your cluster. If GPU support is not available, it - will show the reason. 
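Once ``sky check`` reports GPU support, a task can request GPUs through the regular ``resources`` section. The following is a minimal sketch (the file name and accelerator type are illustrative placeholders; use a GPU type that ``sky show-gpus --cloud kubernetes`` actually lists for your cluster):

.. code-block:: yaml

   # gpu-check.yaml -- illustrative example
   resources:
     cloud: kubernetes
     accelerators: T4:1  # Placeholder; pick a GPU type available on your nodes

   run: |
     nvidia-smi

Launching it with ``sky launch gpu-check.yaml`` should print the GPU visible inside the pod.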
- To setup GPU support on the cluster, refer to the :ref:`Kubernetes cluster setup guide `. - -4. You can now run any SkyPilot task on your Kubernetes cluster. - - .. code-block:: console +.. tab-set:: - $ sky launch --cpus 2+ task.yaml - == Optimizer == - Target: minimizing cost - Estimated cost: $0.0 / hour + .. tab-item:: For AI Developers + :sync: why-ai-devs-tab - Considered resources (1 node): - --------------------------------------------------------------------------------------------------- - CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN - --------------------------------------------------------------------------------------------------- - Kubernetes 2CPU--2GB 2 2 - kubernetes 0.00 ✔ - AWS m6i.large 2 8 - us-east-1 0.10 - Azure Standard_D2s_v5 2 8 - eastus 0.10 - GCP n2-standard-2 2 8 - us-central1 0.10 - IBM bx2-8x32 8 32 - us-east 0.38 - Lambda gpu_1x_a10 30 200 A10:1 us-east-1 0.60 - ---------------------------------------------------------------------------------------------------. + .. grid:: 2 + :gutter: 3 + .. grid-item-card:: ✅ Ease of use + :text-align: center -.. note:: - SkyPilot will use the cluster and namespace set in the ``current-context`` in the - kubeconfig file. To manage your ``current-context``: + .. + TODO(romilb): We should have a comparison of a popular Kubernetes manifest vs a SkyPilot YAML in terms of LoC in a mini blog and link it here. - .. code-block:: console + No complex kubernetes manifests - write a simple SkyPilot YAML and run with one command ``sky launch``. - $ # See current context - $ kubectl config current-context + .. grid-item-card:: 📋 Interactive development on Kubernetes + :text-align: center - $ # Switch current-context - $ kubectl config use-context mycontext + :ref:`SSH access to pods `, :ref:`VSCode integration `, :ref:`job management `, :ref:`autodown idle pods ` and more. - $ # Set a specific namespace to be used in the current-context - $ kubectl config set-context --current --namespace=mynamespace + .. grid-item-card:: ☁️ Burst to the cloud + :text-align: center + Kubernetes cluster is full? SkyPilot :ref:`seamlessly gets resources on the cloud ` to get your job running sooner. -Using Custom Images -------------------- -By default, we use and maintain a SkyPilot container image that has conda and a few other basic tools installed. + .. grid-item-card:: 🖼 Run popular models on Kubernetes + :text-align: center -To use your own image, add :code:`image_id: docker:` to the :code:`resources` section of your task YAML. + Train and serve `Llama-3 `_, `Mixtral `_, and more on your Kubernetes with ready-to-use recipes from the :ref:`AI gallery `. -.. code-block:: yaml - resources: - image_id: docker:myrepo/myimage:latest - ... + .. tab-item:: For Infrastructure Admins + :sync: why-admins-tab -Your image must satisfy the following requirements: + .. grid:: 2 + :gutter: 3 -* Image must be **debian-based** and must have the apt package manager installed. -* The default user in the image must have root privileges or passwordless sudo access. + .. grid-item-card:: ☁️ Unified platform for all Infrastructure + :text-align: center -.. note:: + Scale beyond your Kubernetes cluster to capacity on :ref:`across clouds and regions ` without manual intervention. - If your cluster runs on non-x86_64 architecture (e.g., Apple Silicon), your image must be built natively for that architecture. Otherwise, your job may get stuck at :code:`Start streaming logs ...`. See `GitHub issue `_ for more. + .. 
grid-item-card:: 🚯️ Minimize resource wastage + :text-align: center -Using Images from Private Repositories -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To use images from private repositories (e.g., Private DockerHub, Amazon ECR, Google Container Registry), create a `secret `_ in your Kubernetes cluster and edit your :code:`~/.sky/config.yaml` to specify the secret like so: + SkyPilot can run with your custom pod scheduler and automatically terminate idle pods to free up resources for other users. -.. code-block:: yaml + .. grid-item-card:: 👀 Observability + :text-align: center - kubernetes: - pod_config: - spec: - imagePullSecrets: - - name: your-secret-here + Works with your existing observability and monitoring tools, such as the :ref:`Kubernetes Dashboard `. -.. tip:: + .. grid-item-card:: 🍽️ Self-serve infra for your teams + :text-align: center - If you use Amazon ECR, your secret credentials may expire every 12 hours. Consider using `k8s-ecr-login-renew `_ to automatically refresh your secrets. + Reduce operational overhead by letting your teams provision their own resources, while you retain control over the Kubernetes cluster. -Opening Ports -------------- +Table of Contents +----------------- -Opening ports on SkyPilot clusters running on Kubernetes is supported through two modes: +.. grid:: 1 1 3 3 + :gutter: 3 -1. `LoadBalancer services `_ (default) -2. `Nginx IngressController `_ + .. grid-item-card:: 👋 Get Started + :link: kubernetes-getting-started + :link-type: ref + :text-align: center -One of these modes must be supported and configured on your cluster. Refer to the :ref:`setting up ports on Kubernetes guide ` on how to do this. + Already have a kubeconfig? Launch your first SkyPilot task on Kubernetes - it's as simple as ``sky launch``. -.. tip:: + .. grid-item-card:: ⚙️ Cluster Configuration + :link: kubernetes-setup + :link-type: ref + :text-align: center - On Google GKE, Amazon EKS or other cloud-hosted Kubernetes services, the default LoadBalancer services mode is supported out of the box and no additional configuration is needed. + Are you a cluster admin? Find cluster deployment guides and setup instructions here. -Once your cluster is configured, launch a task which exposes services on a port by adding :code:`ports` to the :code:`resources` section of your task YAML. + .. grid-item-card:: 🔍️ Troubleshooting + :link: kubernetes-troubleshooting + :link-type: ref + :text-align: center -.. code-block:: yaml + Running into problems with SkyPilot on your Kubernetes cluster? Find common issues and solutions here. - # task.yaml - resources: - ports: 8888 - run: | - python -m http.server 8888 - -After launching the cluster with :code:`sky launch -c myclus task.yaml`, you can get the URL to access the port using :code:`sky status --endpoints myclus`. - -.. code-block:: bash - - # List all ports exposed by the cluster - $ sky status --endpoints myclus - 8888: 34.173.13.241:8888 - - # curl a specific port's endpoint - $ curl $(sky status --endpoint 8888 myclus) - ... - -.. tip:: - - To learn more about opening ports in SkyPilot tasks, see :ref:`Opening Ports `. - -FAQs ----- - -* **Are autoscaling Kubernetes clusters supported?** - - To run on an autoscaling cluster, you may need to adjust the resource provisioning timeout (:code:`Kubernetes.TIMEOUT` in `clouds/kubernetes.py`) to a large value to give enough time for the cluster to autoscale. We are working on a better interface to adjust this timeout - stay tuned! - -* **Can SkyPilot provision a Kubernetes cluster for me? 
Will SkyPilot add more nodes to my Kubernetes clusters?** - - The goal of Kubernetes support is to run SkyPilot tasks on an existing Kubernetes cluster. It does not provision any new Kubernetes clusters or add new nodes to an existing Kubernetes cluster. - -* **I have multiple users in my organization who share the same Kubernetes cluster. How do I provide isolation for their SkyPilot workloads?** - - For isolation, you can create separate Kubernetes namespaces and set them in the kubeconfig distributed to users. SkyPilot will use the namespace set in the kubeconfig for running all tasks. - -* **How can I specify custom configuration for the pods created by SkyPilot?** - - You can override the pod configuration used by SkyPilot by setting the :code:`pod_config` key in :code:`~/.sky/config.yaml`. - The value of :code:`pod_config` should be a dictionary that follows the `Kubernetes Pod API `_. - - For example, to set custom environment variables and attach a volume on your pods, you can add the following to your :code:`~/.sky/config.yaml` file: - - .. code-block:: yaml +.. toctree:: + :hidden: - kubernetes: - pod_config: - spec: - containers: - - env: - - name: MY_ENV_VAR - value: MY_ENV_VALUE - volumeMounts: # Custom volume mounts for the pod - - mountPath: /foo - name: example-volume - volumes: - - name: example-volume - hostPath: - path: /tmp - type: Directory + Getting Started + kubernetes-setup + kubernetes-troubleshooting - For more details refer to :ref:`config-yaml`. Features and Roadmap -------------------- @@ -256,11 +116,4 @@ Kubernetes support is under active development. Some features are in progress an * Multi-node tasks - ✅ Available * Custom images - ✅ Available * Opening ports and exposing services - ✅ Available -* Multiple Kubernetes Clusters - 🚧 In progress - - -.. toctree:: - :hidden: - - kubernetes-setup - kubernetes-troubleshooting +* Multiple Kubernetes Clusters - 🚧 In progress \ No newline at end of file diff --git a/docs/source/reference/kubernetes/kubernetes-deployment.rst b/docs/source/reference/kubernetes/kubernetes-deployment.rst new file mode 100644 index 00000000000..eb5bb31d78d --- /dev/null +++ b/docs/source/reference/kubernetes/kubernetes-deployment.rst @@ -0,0 +1,270 @@ +.. _kubernetes-deployment: + +Deployment Guides +----------------- +Below we include minimal guides to set up a new Kubernetes cluster in different environments, including hosted services on the cloud. + +.. grid:: 2 + :gutter: 3 + + .. grid-item-card:: Local Development Cluster + :link: kubernetes-setup-kind + :link-type: ref + :text-align: center + + Run a local Kubernetes cluster on your laptop with ``sky local up``. + + .. grid-item-card:: On-prem Clusters (RKE2, K3s, etc.) + :link: kubernetes-setup-onprem + :link-type: ref + :text-align: center + + For on-prem deployments with kubeadm, RKE2, K3s or other distributions. + + .. grid-item-card:: Google Cloud - GKE + :link: kubernetes-setup-gke + :link-type: ref + :text-align: center + + Google's hosted Kubernetes service. + + .. grid-item-card:: Amazon - EKS + :link: kubernetes-setup-eks + :link-type: ref + :text-align: center + + Amazon's hosted Kubernetes service. + +.. _kubernetes-setup-kind: + + +Deploying locally on your laptop +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To try out SkyPilot on Kubernetes on your laptop or run SkyPilot +tasks locally without requiring any cloud access, we provide the +:code:`sky local up` CLI to create a 1-node Kubernetes cluster locally. 
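For example, once ``sky local up`` finishes, a small CPU-only task is enough to smoke-test the local cluster; the file and cluster names below are hypothetical:

.. code-block:: yaml

   # hello-local.yaml -- hypothetical smoke test
   resources:
     cloud: kubernetes
     cpus: 1+

   run: |
     echo "Hello from a local Kubernetes cluster"

Running ``sky launch -c local-test hello-local.yaml`` should schedule the pod on the cluster created by ``sky local up``, which is described next.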
+ +Under the hood, :code:`sky local up` uses `kind `_, +a tool for creating a Kubernetes cluster on your local machine. +It runs a Kubernetes cluster inside a container, so no setup is required. + +1. Install `Docker `_ and `kind `_. +2. Run :code:`sky local up` to launch a Kubernetes cluster and automatically configure your kubeconfig file: + + .. code-block:: console + + $ sky local up + +3. Run :code:`sky check` and verify that Kubernetes is enabled in SkyPilot. You can now run SkyPilot tasks on this locally hosted Kubernetes cluster using :code:`sky launch`. +4. After you are done using the cluster, you can remove it with :code:`sky local down`. This will destroy the local kubernetes cluster and switch your kubeconfig back to it's original context: + + .. code-block:: console + + $ sky local down + +.. note:: + We recommend allocating at least 4 or more CPUs to your docker runtime to + ensure kind has enough resources. See instructions to increase CPU allocation + `here `_. + +.. note:: + kind does not support multiple nodes and GPUs. + It is not recommended for use in a production environment. + If you want to run a private on-prem cluster, see the section on :ref:`on-prem deployment ` for more. + + +.. _kubernetes-setup-gke: + +Deploying on Google Cloud GKE +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. Create a GKE standard cluster with at least 1 node. We recommend creating nodes with at least 4 vCPUs. + + .. raw:: HTML + +
+ + Example: create a GKE cluster with 2 nodes, each having 16 CPUs. + + .. code-block:: bash + + PROJECT_ID=$(gcloud config get-value project) + CLUSTER_NAME=testcluster + gcloud beta container --project "${PROJECT_ID}" clusters create "${CLUSTER_NAME}" --zone "us-central1-c" --no-enable-basic-auth --cluster-version "1.29.4-gke.1043002" --release-channel "regular" --machine-type "e2-standard-16" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/default" --subnetwork "projects/${PROJECT_ID}/regions/us-central1/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "110" --security-posture=standard --workload-vulnerability-scanning=disabled --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-managed-prometheus --enable-shielded-nodes --node-locations "us-central1-c" + + .. raw:: html + +
+ + +2. Get the kubeconfig for your cluster. The following command will automatically update ``~/.kube/config`` with new kubecontext for the GKE cluster: + + .. code-block:: console + + $ gcloud container clusters get-credentials --region + + # Example: + # gcloud container clusters get-credentials testcluster --region us-central1-c + +3. [If using GPUs] If your GKE nodes have GPUs, you may need to to + `manually install `_ + nvidia drivers. You can do so by deploying the daemonset + depending on the GPU and OS on your nodes: + + .. code-block:: console + + # For Container Optimized OS (COS) based nodes with GPUs other than Nvidia L4 (e.g., V100, A100, ...): + $ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml + + # For Container Optimized OS (COS) based nodes with L4 GPUs: + $ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml + + # For Ubuntu based nodes with GPUs other than Nvidia L4 (e.g., V100, A100, ...): + $ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded.yaml + + # For Ubuntu based nodes with L4 GPUs: + $ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded-R525.yaml + + To verify if GPU drivers are set up, run ``kubectl describe nodes`` and verify that ``nvidia.com/gpu`` is listed under the ``Capacity`` section. + +4. Verify your kubernetes cluster is correctly set up for SkyPilot by running :code:`sky check`: + + .. code-block:: console + + $ sky check + +5. [If using GPUs] Check available GPUs in the kubernetes cluster with :code:`sky show-gpus --cloud kubernetes` + + .. code-block:: console + + $ sky show-gpus --cloud kubernetes + GPU QTY_PER_NODE TOTAL_GPUS TOTAL_FREE_GPUS + L4 1, 2, 3, 4 8 6 + A100 1, 2 4 2 + + +.. note:: + GKE autopilot clusters are currently not supported. Only GKE standard clusters are supported. + + +.. _kubernetes-setup-eks: + +Deploying on Amazon EKS +^^^^^^^^^^^^^^^^^^^^^^^ + +1. Create a EKS cluster with at least 1 node. We recommend creating nodes with at least 4 vCPUs. + +2. Get the kubeconfig for your cluster. The following command will automatically update ``~/.kube/config`` with new kubecontext for the EKS cluster: + + .. code-block:: console + + $ aws eks update-kubeconfig --name --region + + # Example: + # aws eks update-kubeconfig --name testcluster --region us-west-2 + +3. [If using GPUs] EKS clusters already come with Nvidia drivers set up. However, you will need to label the nodes with the GPU type. Use the SkyPilot node labelling tool to do so: + + .. code-block:: console + + python -m sky.utils.kubernetes.gpu_labeler + + + This will create a job on each node to read the GPU type from `nvidia-smi` and assign a ``skypilot.co/accelerator`` label to the node. You can check the status of these jobs by running: + + .. code-block:: console + + kubectl get jobs -n kube-system + +4. Verify your kubernetes cluster is correctly set up for SkyPilot by running :code:`sky check`: + + .. code-block:: console + + $ sky check + +5. [If using GPUs] Check available GPUs in the kubernetes cluster with :code:`sky show-gpus --cloud kubernetes` + + .. 
code-block:: console
+
+     $ sky show-gpus --cloud kubernetes
+     GPU   QTY_PER_NODE  TOTAL_GPUS  TOTAL_FREE_GPUS
+     A100  1, 2          4           2
+
+.. _kubernetes-setup-onprem:
+
+Deploying on on-prem clusters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can also deploy Kubernetes on your on-prem clusters using off-the-shelf tools,
+such as `kubeadm `_,
+`k3s `_ or
+`Rancher `_.
+Please follow their respective guides to deploy your Kubernetes cluster.
+
+
+.. _kubernetes-setup-onprem-distro-specific:
+
+Notes for specific Kubernetes distributions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some Kubernetes distributions require additional steps to set up GPU support.
+
+Rancher Kubernetes Engine 2 (RKE2)
+**********************************
+
+Nvidia GPU operator installation on RKE2 through helm requires extra flags to set ``nvidia`` as the default runtime for containerd.
+
+.. code-block:: console
+
+   $ helm install gpu-operator -n gpu-operator --create-namespace \
+     nvidia/gpu-operator $HELM_OPTIONS \
+     --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
+     --set 'toolkit.env[0].value=/var/lib/rancher/rke2/agent/etc/containerd/config.toml.tmpl' \
+     --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
+     --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
+     --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
+     --set 'toolkit.env[2].value=nvidia' \
+     --set 'toolkit.env[3].name=CONTAINERD_SET_AS_DEFAULT' \
+     --set-string 'toolkit.env[3].value=true'
+
+Refer to instructions on `Nvidia GPU Operator installation with Helm on RKE2 `_ for details.
+
+K3s
+***
+
+Installing the Nvidia GPU operator on K3s is similar to the `RKE2 instructions from Nvidia `_, but requires changing
+the ``CONTAINERD_CONFIG`` variable to ``/var/lib/rancher/k3s/agent/etc/containerd/config.toml``. Here is an example command to install the Nvidia GPU operator on K3s:
+
+.. code-block:: console
+
+   $ helm install gpu-operator -n gpu-operator --create-namespace \
+     nvidia/gpu-operator $HELM_OPTIONS \
+     --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
+     --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
+     --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
+     --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
+     --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
+     --set 'toolkit.env[2].value=nvidia'
+
+Check the status of the GPU operator installation by running ``kubectl get pods -n gpu-operator``. It takes a few minutes to install and some CrashLoopBackOff errors are expected during the installation process.
+
+.. tip::
+
+   If your gpu-operator installation stays stuck in CrashLoopBackOff, you may need to create a symlink to the ``ldconfig`` binary to work around a `known issue `_ with nvidia-docker runtime. Run the following command on your nodes:
+
+   .. code-block:: console
+
+      $ ln -s /sbin/ldconfig /sbin/ldconfig.real
+
+After the GPU operator is installed, create the nvidia RuntimeClass required by K3s. This runtime class will automatically be used by SkyPilot to schedule GPU pods:
+
+.. code-block:: console
+
+   $ kubectl apply -f - <<EOF
+   apiVersion: node.k8s.io/v1
+   kind: RuntimeClass
+   metadata:
+     name: nvidia
+   handler: nvidia
+   EOF
+
+To connect to a Kubernetes cluster, SkyPilot needs:
+
+* An existing Kubernetes cluster running Kubernetes v1.20 or later.
+* A `Kubeconfig `_ file containing access credentials and namespace to be used.
+
+**Supported Kubernetes deployments:**
+
+* Hosted Kubernetes services (EKS, GKE)
+* On-prem clusters (Kubeadm, Rancher, K3s)
+* Local development clusters (KinD, minikube)
+
+In a typical workflow:
+
+1. A cluster administrator sets up a Kubernetes cluster. 
Refer to admin guides for
+   :ref:`Kubernetes cluster setup ` for different deployment environments (Amazon EKS, Google GKE, On-Prem and local debugging).
+
+2. Users who want to run SkyPilot tasks on this cluster are issued Kubeconfig
+   files containing their credentials (`kube-context `_).
+   SkyPilot reads this Kubeconfig file to communicate with the cluster.
+
+Launching your first task
+-------------------------
+.. _kubernetes-instructions:
+
+Once your cluster administrator has :ref:`set up a Kubernetes cluster ` and provided you with a kubeconfig file:
+
+0. Make sure `kubectl `_, ``socat`` and ``nc`` (netcat) are installed on your local machine.
+
+   .. code-block:: console
+
+     $ # MacOS
+     $ brew install kubectl socat netcat
+
+     $ # Linux (may have socat already installed)
+     $ sudo apt-get install kubectl socat netcat
+
+
+1. Place your kubeconfig file at ``~/.kube/config``.
+
+   .. code-block:: console
+
+     $ mkdir -p ~/.kube
+     $ cp /path/to/kubeconfig ~/.kube/config
+
+   You can verify your credentials are set up correctly by running :code:`kubectl get pods`.
+
+   .. note::
+
+     If your cluster administrator has also provided you with a specific service account to use, set it in your ``~/.sky/config.yaml`` file:
+
+     .. code-block:: yaml
+
+       kubernetes:
+         remote_identity: your-service-account-name
+
+
+2. Run :code:`sky check` and verify that Kubernetes is enabled in SkyPilot.
+
+   .. code-block:: console
+
+     $ sky check
+
+     Checking credentials to enable clouds for SkyPilot.
+     ...
+     Kubernetes: enabled
+     ...
+
+
+   .. note::
+     :code:`sky check` will also check if GPU support is available on your cluster. If GPU support is not available, it
+     will show the reason.
+     To set up GPU support on the cluster, refer to the :ref:`Kubernetes cluster setup guide `.
+
+.. _kubernetes-optimizer-table:
+
+3. You can now run any SkyPilot task on your Kubernetes cluster.
+
+   .. code-block:: console
+
+     $ sky launch --cpus 2+ task.yaml
+     == Optimizer ==
+     Target: minimizing cost
+     Estimated cost: $0.0 / hour
+
+     Considered resources (1 node):
+     ---------------------------------------------------------------------------------------------------
+      CLOUD        INSTANCE          vCPUs   Mem(GB)   ACCELERATORS   REGION/ZONE   COST ($)   CHOSEN
+     ---------------------------------------------------------------------------------------------------
+      Kubernetes   2CPU--2GB         2       2         -              kubernetes    0.00          ✔
+      AWS          m6i.large         2       8         -              us-east-1     0.10
+      Azure        Standard_D2s_v5   2       8         -              eastus        0.10
+      GCP          n2-standard-2     2       8         -              us-central1   0.10
+      IBM          bx2-8x32          8       32        -              us-east       0.38
+      Lambda       gpu_1x_a10        30      200       A10:1          us-east-1     0.60
+     ---------------------------------------------------------------------------------------------------
+
+
+.. note::
+  SkyPilot will use the cluster and namespace set in the ``current-context`` in the
+  kubeconfig file. To manage your ``current-context``:
+
+  .. code-block:: console
+
+    $ # See current context
+    $ kubectl config current-context
+
+    $ # Switch current-context
+    $ kubectl config use-context mycontext
+
+    $ # Set a specific namespace to be used in the current-context
+    $ kubectl config set-context --current --namespace=mynamespace
+
+
+Using Custom Images
+-------------------
+By default, we use and maintain a SkyPilot container image that has conda and a few other basic tools installed.
+
+To use your own image, add :code:`image_id: docker:<image_id>` to the :code:`resources` section of your task YAML.
+
+.. code-block:: yaml
+
+  resources:
+    image_id: docker:myrepo/myimage:latest
+  ...
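+
+The image can also be set per launch from the CLI with the ``--image-id`` flag; a minimal sketch (the image name is illustrative, reusing the one from the YAML above):
+
+.. code-block:: console
+
+   $ # Equivalent CLI override of the task YAML's image_id field
+   $ sky launch --cloud kubernetes --image-id docker:myrepo/myimage:latest task.yaml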
+ +Your image must satisfy the following requirements: + +* Image must be **debian-based** and must have the apt package manager installed. +* The default user in the image must have root privileges or passwordless sudo access. + +.. note:: + + If your cluster runs on non-x86_64 architecture (e.g., Apple Silicon), your image must be built natively for that architecture. Otherwise, your job may get stuck at :code:`Start streaming logs ...`. See `GitHub issue `_ for more. + +Using Images from Private Repositories +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +To use images from private repositories (e.g., Private DockerHub, Amazon ECR, Google Container Registry), create a `secret `_ in your Kubernetes cluster and edit your :code:`~/.sky/config.yaml` to specify the secret like so: + +.. code-block:: yaml + + kubernetes: + pod_config: + spec: + imagePullSecrets: + - name: your-secret-here + +.. tip:: + + If you use Amazon ECR, your secret credentials may expire every 12 hours. Consider using `k8s-ecr-login-renew `_ to automatically refresh your secrets. + + +Opening Ports +------------- + +Opening ports on SkyPilot clusters running on Kubernetes is supported through two modes: + +1. `LoadBalancer services `_ (default) +2. `Nginx IngressController `_ + +One of these modes must be supported and configured on your cluster. Refer to the :ref:`setting up ports on Kubernetes guide ` on how to do this. + +.. tip:: + + On Google GKE, Amazon EKS or other cloud-hosted Kubernetes services, the default LoadBalancer services mode is supported out of the box and no additional configuration is needed. + +Once your cluster is configured, launch a task which exposes services on a port by adding :code:`ports` to the :code:`resources` section of your task YAML. + +.. code-block:: yaml + + # task.yaml + resources: + ports: 8888 + + run: | + python -m http.server 8888 + +After launching the cluster with :code:`sky launch -c myclus task.yaml`, you can get the URL to access the port using :code:`sky status --endpoints myclus`. + +.. code-block:: bash + + # List all ports exposed by the cluster + $ sky status --endpoints myclus + 8888: 34.173.13.241:8888 + + # curl a specific port's endpoint + $ curl $(sky status --endpoint 8888 myclus) + ... + +.. tip:: + + To learn more about opening ports in SkyPilot tasks, see :ref:`Opening Ports `. + +FAQs +---- + +* **Are autoscaling Kubernetes clusters supported?** + + To run on an autoscaling cluster, you may need to adjust the resource provisioning timeout (:code:`Kubernetes.TIMEOUT` in `clouds/kubernetes.py`) to a large value to give enough time for the cluster to autoscale. We are working on a better interface to adjust this timeout - stay tuned! + +* **Can SkyPilot provision a Kubernetes cluster for me? Will SkyPilot add more nodes to my Kubernetes clusters?** + + The goal of Kubernetes support is to run SkyPilot tasks on an existing Kubernetes cluster. It does not provision any new Kubernetes clusters or add new nodes to an existing Kubernetes cluster. + +* **I have multiple users in my organization who share the same Kubernetes cluster. How do I provide isolation for their SkyPilot workloads?** + + For isolation, you can create separate Kubernetes namespaces and set them in the kubeconfig distributed to users. SkyPilot will use the namespace set in the kubeconfig for running all tasks. 
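+
+  For example, a minimal sketch of per-user isolation (the namespace name is illustrative):
+
+  .. code-block:: console
+
+    $ # Create a dedicated namespace for a user
+    $ kubectl create namespace alice
+    $ # Point the namespace field of the user's kubeconfig context at it
+    $ kubectl config set-context --current --namespace=alice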
+
+* **How do I view the pods created by SkyPilot on my Kubernetes cluster?**
+
+  You can use your existing observability tools to filter resources with the label :code:`parent=skypilot` (:code:`kubectl get pods -l 'parent=skypilot'`). As an example, follow the instructions :ref:`here ` to deploy the Kubernetes Dashboard on your cluster.
+
+* **How can I specify custom configuration for the pods created by SkyPilot?**
+
+  You can override the pod configuration used by SkyPilot by setting the :code:`pod_config` key in :code:`~/.sky/config.yaml`.
+  The value of :code:`pod_config` should be a dictionary that follows the `Kubernetes Pod API `_.
+
+  For example, to set custom environment variables and attach a volume on your pods, you can add the following to your :code:`~/.sky/config.yaml` file:
+
+  .. code-block:: yaml
+
+    kubernetes:
+      pod_config:
+        spec:
+          containers:
+            - env:
+                - name: MY_ENV_VAR
+                  value: MY_ENV_VALUE
+              volumeMounts:  # Custom volume mounts for the pod
+                - mountPath: /foo
+                  name: example-volume
+          volumes:
+            - name: example-volume
+              hostPath:
+                path: /tmp
+                type: Directory
+
+  For more details refer to :ref:`config-yaml`.
diff --git a/docs/source/reference/kubernetes/kubernetes-ports.rst b/docs/source/reference/kubernetes/kubernetes-ports.rst
new file mode 100644
index 00000000000..0f538363131
--- /dev/null
+++ b/docs/source/reference/kubernetes/kubernetes-ports.rst
@@ -0,0 +1,119 @@
+.. _kubernetes-ports:
+
+Exposing Services on Kubernetes
+-------------------------------
+
+.. note::
+    This is a guide on how to configure an existing Kubernetes cluster (along with the caveats involved) to successfully expose ports and services externally through SkyPilot.
+
+    If you are a SkyPilot user and your cluster has already been set up to expose ports,
+    :ref:`Opening Ports ` explains how to expose services in your task through SkyPilot.
+
+SkyServe and SkyPilot clusters can :ref:`open ports ` to expose services. For SkyPilot
+clusters running on Kubernetes, we support two modes to expose ports:
+
+* :ref:`LoadBalancer Service ` (default)
+* :ref:`Nginx Ingress `
+
+
+By default, SkyPilot creates a `LoadBalancer Service `__ on your Kubernetes cluster to expose the port.
+
+If your cluster does not support LoadBalancer services, SkyPilot can also use `an existing Nginx IngressController `_ to create an `Ingress `_ to expose your service.
+
+.. _kubernetes-loadbalancer:
+
+LoadBalancer Service
+^^^^^^^^^^^^^^^^^^^^
+
+This mode exposes ports through a Kubernetes `LoadBalancer Service `__. This is the default mode used by SkyPilot.
+
+To use this mode, you must have a Kubernetes cluster that supports LoadBalancer Services:
+
+* On Google GKE, Amazon EKS or other cloud-hosted Kubernetes services, this mode is supported out of the box and no additional configuration is needed.
+* On bare metal and self-managed Kubernetes clusters, `MetalLB `_ can be used to support LoadBalancer Services.
+
+When using this mode, SkyPilot will create a single LoadBalancer Service for all ports that you expose on a cluster.
+Each port can be accessed using the LoadBalancer's external IP address and the port number. Use :code:`sky status --endpoints ` to view the external endpoints for all ports.
+
+In cloud-based Kubernetes clusters, this will automatically create an external Load Balancer.
+GKE creates a `Pass-through Load Balancer `__
+and AWS creates a `Network Load Balancer `__.
+These load balancers will be automatically terminated when the cluster is deleted.
+
+.. 
note::
+    LoadBalancer services are not supported on kind clusters created using :code:`sky local up`.
+
+.. note::
+    The default LoadBalancer implementation in EKS selects a random port from the list of opened ports for the
+    `LoadBalancer's health check `_. This can cause issues if the selected port does not have a service running behind it.
+
+
+    For example, if a SkyPilot task exposes 5 ports but only 2 of them have services running behind them, EKS may select a port that does not have a service running behind it and the LoadBalancer will not pass the health check. As a result, the service will not be assigned an external IP address.
+
+    To work around this issue, make sure all your ports have services running behind them.
+
+
+.. _kubernetes-ingress:
+
+Nginx Ingress
+^^^^^^^^^^^^^
+
+This mode exposes ports by creating a Kubernetes `Ingress `_ backed by an existing `Nginx Ingress Controller `_.
+
+To use this mode:
+
+1. Install the Nginx Ingress Controller on your Kubernetes cluster. Refer to the `documentation `_ for installation instructions specific to your environment.
+2. Verify that the ``ingress-nginx-controller`` service has a valid external IP:
+
+.. code-block:: bash
+
+    $ kubectl get service ingress-nginx-controller -n ingress-nginx
+
+    # Example output:
+    # NAME                       TYPE           CLUSTER-IP    EXTERNAL-IP     PORT(S)
+    # ingress-nginx-controller   LoadBalancer   10.24.4.254   35.202.58.117   80:31253/TCP,443:32699/TCP
+
+
+.. note::
+    If the ``EXTERNAL-IP`` field is ``<none>``, you can manually
+    specify the Ingress IP or hostname through the ``skypilot.co/external-ip``
+    annotation on the ``ingress-nginx-controller`` service. In this case,
+    having a valid ``EXTERNAL-IP`` field is not required.
+
+    For example, if your ``ingress-nginx-controller`` service is ``NodePort``:
+
+    .. code-block:: bash
+
+        # Add skypilot.co/external-ip annotation to the nginx ingress service.
+        # Replace <ip> in the following command with the IP you select.
+        # Can be any node's IP if using NodePort service type.
+        $ kubectl annotate service ingress-nginx-controller skypilot.co/external-ip=<ip> -n ingress-nginx
+
+    If the ``EXTERNAL-IP`` field is ``<none>`` and the ``skypilot.co/external-ip`` annotation does not exist,
+    SkyPilot will use ``localhost`` as the external IP for the Ingress,
+    and the endpoint may not be accessible from outside the cluster.
+
+
+3. Update the :ref:`SkyPilot config ` at :code:`~/.sky/config.yaml` to use the ingress mode.
+
+.. code-block:: yaml
+
+    kubernetes:
+      ports: ingress
+
+.. tip::
+
+    For RKE2 and K3s, the pre-installed Nginx ingress is not correctly configured by default. Follow the `bare-metal installation instructions `_ to set up the Nginx ingress controller correctly.
+
+When using this mode, SkyPilot creates an ingress resource and a ClusterIP service for each port opened. The port can be accessed externally by using the Ingress URL plus a path prefix of the form :code:`/skypilot/{pod_name}/{port}`.
+
+Use :code:`sky status --endpoints ` to view the full endpoint URLs for all ports.
+
+.. code-block::
+
+    $ sky status --endpoints mycluster
+    8888: http://34.173.152.251/skypilot/test-2ea4/8888
+
+.. note::
+
+    When exposing a port under a sub-path such as an ingress, services expecting root path access (e.g., Jupyter notebooks) may face issues. To resolve this, configure the service to operate under a different base URL. For Jupyter, use the `--NotebookApp.base_url `_ flag during launch. Alternatively, consider using :ref:`LoadBalancer ` mode.
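+
+For example, a minimal sketch of launching Jupyter under the assigned sub-path (the prefix shown is illustrative; use the one reported by ``sky status --endpoints``):
+
+.. code-block:: bash
+
+    # Serve Jupyter under the ingress path prefix assigned to the port
+    jupyter lab --NotebookApp.base_url=/skypilot/test-2ea4/8888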
diff --git a/docs/source/reference/kubernetes/kubernetes-setup.rst b/docs/source/reference/kubernetes/kubernetes-setup.rst index 3ed1b8c89f0..7bf04f3a7a9 100644 --- a/docs/source/reference/kubernetes/kubernetes-setup.rst +++ b/docs/source/reference/kubernetes/kubernetes-setup.rst @@ -12,196 +12,156 @@ Kubernetes Cluster Setup and shared a kubeconfig file with you, :ref:`Submitting tasks to Kubernetes ` explains how to submit tasks to your cluster. +.. grid:: 1 1 3 3 + :gutter: 2 -SkyPilot's Kubernetes support is designed to work with most Kubernetes distributions and deployment environments. + .. grid-item-card:: ⚙️ Setup Kubernetes Cluster + :link: kubernetes-setup-intro + :link-type: ref + :text-align: center -To connect to a Kubernetes cluster, SkyPilot needs: + Configure your Kubernetes cluster to run SkyPilot. -* An existing Kubernetes cluster running Kubernetes v1.20 or later. -* A `Kubeconfig `_ file containing access credentials and namespace to be used. + .. grid-item-card:: ✅️ Verify Setup + :link: kubernetes-setup-verify + :link-type: ref + :text-align: center + Ensure your cluster is set up correctly for SkyPilot. -Deployment Guides ------------------ -Below we show minimal examples to set up a new Kubernetes cluster in different environments, including hosted services on the cloud, and generating kubeconfig files which can be :ref:`used by SkyPilot `. -.. - TODO(romilb) - Add a table of contents/grid cards for each deployment environment. + .. grid-item-card:: 👀️ Observability + :link: kubernetes-observability + :link-type: ref + :text-align: center -* :ref:`Deploying locally on your laptop ` -* :ref:`Deploying on Google Cloud GKE ` -* :ref:`Deploying on Amazon EKS ` -* :ref:`Deploying on on-prem clusters ` + Use your existing Kubernetes tooling to monitor SkyPilot resources. -.. _kubernetes-setup-kind: -Deploying locally on your laptop -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. _kubernetes-setup-intro: -To try out SkyPilot on Kubernetes on your laptop or run SkyPilot -tasks locally without requiring any cloud access, we provide the -:code:`sky local up` CLI to create a 1-node Kubernetes cluster locally. +Setting up Kubernetes cluster for SkyPilot +------------------------------------------ -Under the hood, :code:`sky local up` uses `kind `_, -a tool for creating a Kubernetes cluster on your local machine. -It runs a Kubernetes cluster inside a container, so no setup is required. +To prepare a Kubernetes cluster to run SkyPilot, the cluster administrator must: -1. Install `Docker `_ and `kind `_. -2. Run :code:`sky local up` to launch a Kubernetes cluster and automatically configure your kubeconfig file: +1. :ref:`Deploy a cluster ` running Kubernetes v1.20 or later. +2. Set up :ref:`GPU support `. +3. [Optional] :ref:`Set up ports ` for exposing services. +4. [Optional] :ref:`Set up permissions `: create a namespace for your users and/or create a service account with minimal permissions for SkyPilot. - .. code-block:: console +After these steps, the administrator can share the kubeconfig file with users, who can then submit tasks to the cluster using SkyPilot. - $ sky local up +.. _kubernetes-setup-deploy: -3. Run :code:`sky check` and verify that Kubernetes is enabled in SkyPilot. You can now run SkyPilot tasks on this locally hosted Kubernetes cluster using :code:`sky launch`. -4. After you are done using the cluster, you can remove it with :code:`sky local down`. 
This will terminate the KinD container and switch your kubeconfig back to it's original context: +Step 1 - Deploy a Kubernetes Cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - .. code-block:: console - - $ sky local down - -.. note:: - We recommend allocating at least 4 or more CPUs to your docker runtime to - ensure kind has enough resources. See instructions - `here `_. - -.. note:: - kind does not support multiple nodes and GPUs. - It is not recommended for use in a production environment. - If you want to run a private on-prem cluster, see the section on :ref:`on-prem deployment ` for more. - - -.. _kubernetes-setup-gke: - -Deploying on Google Cloud GKE -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -1. Create a GKE standard cluster with at least 1 node. We recommend creating nodes with at least 4 vCPUs. -2. Get the kubeconfig for your cluster. The following command will automatically update ``~/.kube/config`` with new kubecontext for the GKE cluster: - - .. code-block:: console - - $ gcloud container clusters get-credentials --region - - # Example: - # gcloud container clusters get-credentials testcluster --region us-central1-c - -3. [If using GPUs] If your GKE nodes have GPUs, you may need to to - `manually install `_ - nvidia drivers. You can do so by deploying the daemonset - depending on the GPU and OS on your nodes: - - .. code-block:: console - - # For Container Optimized OS (COS) based nodes with GPUs other than Nvidia L4 (e.g., V100, A100, ...): - $ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml - - # For Container Optimized OS (COS) based nodes with L4 GPUs: - $ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml - - # For Ubuntu based nodes with GPUs other than Nvidia L4 (e.g., V100, A100, ...): - $ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded.yaml - - # For Ubuntu based nodes with L4 GPUs: - $ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded-R525.yaml - - To verify if GPU drivers are set up, run ``kubectl describe nodes`` and verify that ``nvidia.com/gpu`` is listed under the ``Capacity`` section. - -4. Verify your kubeconfig (and GPU support, if available) is correctly set up by running :code:`sky check`: - - .. code-block:: console +.. tip:: - $ sky check + If you already have a Kubernetes cluster, skip this step. -.. note:: - GKE autopilot clusters are currently not supported. Only GKE standard clusters are supported. +Below we link to minimal guides to set up a new Kubernetes cluster in different environments, including hosted services on the cloud. +.. grid:: 2 + :gutter: 3 -.. _kubernetes-setup-eks: + .. grid-item-card:: Local Development Cluster + :link: kubernetes-setup-kind + :link-type: ref + :text-align: center -Deploying on Amazon EKS -^^^^^^^^^^^^^^^^^^^^^^^ + Run a local Kubernetes cluster on your laptop with ``sky local up``. -1. Create a EKS cluster with at least 1 node. We recommend creating nodes with at least 4 vCPUs. + .. grid-item-card:: On-prem Clusters (RKE2, K3s, etc.) + :link: kubernetes-setup-onprem + :link-type: ref + :text-align: center -2. Get the kubeconfig for your cluster. 
The following command will automatically update ``~/.kube/config`` with new kubecontext for the EKS cluster: + For on-prem deployments with kubeadm, RKE2, K3s or other distributions. - .. code-block:: console + .. grid-item-card:: Google Cloud - GKE + :link: kubernetes-setup-gke + :link-type: ref + :text-align: center - $ aws eks update-kubeconfig --name --region + Google's hosted Kubernetes service. - # Example: - # aws eks update-kubeconfig --name testcluster --region us-west-2 + .. grid-item-card:: Amazon - EKS + :link: kubernetes-setup-eks + :link-type: ref + :text-align: center -3. [If using GPUs] EKS clusters already come with Nvidia drivers set up. However, you will need to label the nodes with the GPU type. Use the SkyPilot node labelling tool to do so: + Amazon's hosted Kubernetes service. - .. code-block:: console - python -m sky.utils.kubernetes.gpu_labeler +.. _kubernetes-setup-gpusupport: +Step 2 - Set up GPU support +^^^^^^^^^^^^^^^^^^^^^^^^^^^ - This will create a job on each node to read the GPU type from `nvidia-smi` and assign a ``skypilot.co/accelerator`` label to the node. You can check the status of these jobs by running: +To utilize GPUs on Kubernetes, your cluster must: - .. code-block:: console +1. Have the ``nvidia.com/gpu`` **resource** available on all GPU nodes and have ``nvidia`` as the default runtime for your container engine. - kubectl get jobs -n kube-system + * If you are following :ref:`our deployment guides ` or using GKE or EKS, this would already be set up. Else, install the `Nvidia GPU Operator `_. -4. Verify your kubeconfig (and GPU support, if available) is correctly set up by running :code:`sky check`: +2. Have a **label on each node specifying the GPU type**. See :ref:`Setting up GPU labels ` for more details. - .. code-block:: console - $ sky check +.. tip:: + To verify the `Nvidia GPU Operator `_ is installed after step 1 and the ``nvidia`` runtime is set as default, run: + .. code-block:: console -.. _kubernetes-setup-onprem: + $ kubectl apply -f https://raw.githubusercontent.com/skypilot-org/skypilot/master/tests/kubernetes/gpu_test_pod.yaml + $ watch kubectl get pods + # If the pod status changes to completed after a few minutes, Nvidia GPU driver is set up correctly. Move on to setting up GPU labels. -Deploying on on-prem clusters -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. note:: -You can also deploy Kubernetes on your on-prem clusters using off-the-shelf tools, -such as `kubeadm `_, -`k3s `_ or -`Rancher `_. -Please follow their respective guides to deploy your Kubernetes cluster. + Refer to :ref:`Notes for specific Kubernetes distributions ` for additional instructions on setting up GPU support on specific Kubernetes distributions, such as RKE2 and K3s. -.. _kubernetes-setup-gpusupport: -Setting up GPU support -~~~~~~~~~~~~~~~~~~~~~~ -If your Kubernetes cluster has Nvidia GPUs, ensure that: +.. _kubernetes-gpu-labels: -1. The Nvidia GPU operator is installed (i.e., ``nvidia.com/gpu`` resource is available on each node) and ``nvidia`` is set as the default runtime for your container engine. See `Nvidia's installation guide `_ for more details. -2. Each node in your cluster is labelled with the GPU type. This labelling can be done using `SkyPilot's GPU labelling script `_ or by manually adding a label of the format ``skypilot.co/accelerator: ``, where the ```` is the lowercase name of the GPU. For example, a node with V100 GPUs must have a label :code:`skypilot.co/accelerator: v100`. +Setting up GPU labels +~~~~~~~~~~~~~~~~~~~~~ .. 
tip:: - You can check if GPU operator is installed and the ``nvidia`` runtime is set as default by running: - - .. code-block:: console - $ kubectl apply -f https://raw.githubusercontent.com/skypilot-org/skypilot/master/tests/kubernetes/gpu_test_pod.yaml - $ watch kubectl get pods - # If the pod status changes to completed after a few minutes, your Kubernetes environment is set up correctly. + If your cluster has the Nvidia GPU Operator installed or you are using GKE or Karpenter, your cluster already has the necessary GPU labels. You can skip this section. -.. note:: +To use GPUs with SkyPilot, cluster nodes must be labelled with the GPU type. This informs SkyPilot which GPU types are available on the cluster. - Refer to :ref:`Notes for specific Kubernetes distributions ` for additional instructions on setting up GPU support on specific Kubernetes distributions, such as RKE2 and K3s. +Currently supported labels are: +* ``nvidia.com/gpu.product``: automatically created by Nvidia GPU Operator. +* ``cloud.google.com/gke-accelerator``: used by GKE clusters. +* ``karpenter.k8s.aws/instance-gpu-name``: used by Karpenter. +* ``skypilot.co/accelerator``: custom label used by SkyPilot if none of the above are present. -.. note:: +Any one of these labels is sufficient for SkyPilot to detect GPUs on the cluster. - GPU labels are case-sensitive. Ensure that the GPU name is lowercase if you are using the ``skypilot.co/accelerator`` label. +.. tip:: -.. note:: + To check if your nodes contain the necessary labels, run: - GPU labelling is not required on GKE clusters - SkyPilot will automatically use GKE provided labels. However, you will still need to install `drivers `_. + .. code-block:: bash -.. _automatic-gpu-labelling: + output=$(kubectl get nodes --show-labels | awk -F'[, ]' '{for (i=1; i<=NF; i++) if ($i ~ /nvidia.com\/gpu.product=|cloud.google.com\/gke-accelerator=|karpenter.k8s.aws\/instance-gpu-name=|skypilot.co\/accelerator=/) print $i}') + if [ -z "$output" ]; then + echo "No valid GPU labels found." + else + echo "GPU Labels found:" + echo "$output" + fi -Automatic GPU labelling -~~~~~~~~~~~~~~~~~~~~~~~ +Automatically Labelling Nodes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -We provide a convenience script that automatically detects GPU types and labels each node. You can run it with: +If none of the above labels are present on your cluster, we provide a convenience script that automatically detects GPU types and labels each node with the ``skypilot.co/accelerator`` label. You can run it with: .. code-block:: console @@ -217,229 +177,93 @@ We provide a convenience script that automatically detects GPU types and labels If the GPU labelling process fails, you can run ``python -m sky.utils.kubernetes.gpu_labeler --cleanup`` to clean up the failed jobs. -Once the cluster is deployed and you have placed your kubeconfig at ``~/.kube/config``, verify your setup by running :code:`sky check`: - -.. code-block:: console - - $ sky check - -This should show ``Kubernetes: Enabled`` without any warnings. - -You can also check the GPUs available on your nodes by running: - -.. code-block:: console - - $ sky show-gpus --cloud kubernetes - -.. tip:: - - If automatic GPU labelling fails, you can manually label your nodes with the GPU type. Use the following command to label your nodes: - - .. code-block:: console - - $ kubectl label nodes skypilot.co/accelerator= +Manually Labelling Nodes +~~~~~~~~~~~~~~~~~~~~~~~~ -.. _kubernetes-setup-onprem-distro-specific: +You can also manually label nodes, if required. 
Labels must be of the format ``skypilot.co/accelerator: <gpu_name>`` where ``<gpu_name>`` is the lowercase name of the GPU.
-Notes for specific Kubernetes distributions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+For example, a node with V100 GPUs must have a label :code:`skypilot.co/accelerator: v100`.
-Rancher Kubernetes Engine 2 (RKE2)
-**********************************
+Use the following command to label a node:
-Nvidia GPU operator installation on RKE2 through helm requires extra flags to set ``nvidia`` as the default runtime for containerd.
-
-.. code-block:: console
+.. code-block:: bash
-   $ helm install gpu-operator -n gpu-operator --create-namespace \
-     nvidia/gpu-operator $HELM_OPTIONS \
-     --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
-     --set 'toolkit.env[0].value=/var/lib/rancher/rke2/agent/etc/containerd/config.toml.tmpl' \
-     --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
-     --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
-     --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
-     --set 'toolkit.env[2].value=nvidia' \
-     --set 'toolkit.env[3].name=CONTAINERD_SET_AS_DEFAULT' \
-     --set-string 'toolkit.env[3].value=true'
+   kubectl label nodes <node-name> skypilot.co/accelerator=<gpu_name>
-
-Refer to instructions on `Nvidia GPU Operator installation with Helm on RKE2 `_ for details.
-
-K3s
-***
+.. note::
-Installing Nvidia GPU operator on K3s is similar to `RKE2 instructions from Nvidia `_, but requires changing
-the ``CONTAINERD_CONFIG`` variable to ``/var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl``. Here is an example command to install the Nvidia GPU operator on K3s:
+   GPU labels are case-sensitive. Ensure that the GPU name is lowercase if you are using the ``skypilot.co/accelerator`` label.
-
-.. code-block:: console
-   $ helm install gpu-operator -n gpu-operator --create-namespace \
-     nvidia/gpu-operator $HELM_OPTIONS \
-     --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
-     --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
-     --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
-     --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
-     --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
-     --set 'toolkit.env[2].value=nvidia'
+.. _kubernetes-setup-ports:
-
-Check the status of the GPU operator installation by running ``kubectl get pods -n gpu-operator``. It takes a few minutes to install and some CrashLoopBackOff errors are expected during the installation process.
+[Optional] Step 3 - Set up for Exposing Services
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. tip::
-   If your gpu-operator installation stays stuck in CrashLoopBackOff, you may need to create a symlink to the ``ldconfig`` binary to work around a `known issue `_ with nvidia-docker runtime. Run the following command on your nodes:
-
-   .. code-block:: console
-
-      $ ln -s /sbin/ldconfig /sbin/ldconfig.real
-
-After the GPU operator is installed, create the nvidia RuntimeClass required by K3s. This runtime class will automatically be used by SkyPilot to schedule GPU pods:
-
-.. code-block:: console
-
-   $ kubectl apply -f - <<EOF
-   apiVersion: node.k8s.io/v1
-   kind: RuntimeClass
-   metadata:
-     name: nvidia
-   handler: nvidia
-   EOF
-
-.. _kubernetes-ports:
-
-Setting up Ports on Kubernetes
--------------------------------
-
-
-.. note::
-   This is a guide on how to configure an existing Kubernetes cluster (along with the caveats involved) to successfully expose ports and services externally through SkyPilot.
-
-   If you are a SkyPilot user and your cluster has already been set up to expose ports,
-   :ref:`Opening Ports ` explains how to expose services in your task through SkyPilot. 
+
+   If you are using GKE or EKS or do not plan to expose ports publicly on Kubernetes (such as ``sky launch --ports``, SkyServe), no additional setup is required. On GKE and EKS, SkyPilot will create a LoadBalancer service automatically.
-SkyPilot clusters can :ref:`open ports ` to expose services. For SkyPilot
-clusters running on Kubernetes, we support either of two modes to expose ports:
+Running SkyServe or tasks that expose ports requires additional setup to make the services on those ports reachable.
+SkyPilot supports two modes to expose ports:

* :ref:`LoadBalancer Service ` (default)
* :ref:`Nginx Ingress `

+Refer to :ref:`Exposing Services on Kubernetes ` for more details.

-By default, SkyPilot creates a `LoadBalancer Service `__ on your Kubernetes cluster to expose the port.
-
-If your cluster does not support LoadBalancer services, SkyPilot can also use `an existing Nginx IngressController `_ to create an `Ingress `_ to expose your service.
-
-.. _kubernetes-loadbalancer:
-
-LoadBalancer Service
-^^^^^^^^^^^^^^^^^^^^
-
-This mode exposes ports through a Kubernetes `LoadBalancer Service `__. This is the default mode used by SkyPilot.
-
-
-To use this mode, you must have a Kubernetes cluster that supports LoadBalancer Services:
-
-* On Google GKE, Amazon EKS or other cloud-hosted Kubernetes services, this mode is supported out of the box and no additional configuration is needed.
-* On bare metal and self-managed Kubernetes clusters, `MetalLB `_ can be used to support LoadBalancer Services.
-
-When using this mode, SkyPilot will create a single LoadBalancer Service for all ports that you expose on a cluster.
-Each port can be accessed using the LoadBalancer's external IP address and the port number. Use :code:`sky status --endpoints ` to view the external endpoints for all ports.
-
-.. note::
-   In cloud based Kubernetes clusters, this will automatically create an external Load Balancer. GKE creates a (`pass-through load balancer `__)
-   and AWS creates a `Network Load Balancer `__). These load balancers will be automatically terminated when the cluster is deleted.
-
-.. note::
-   The default LoadBalancer implementation in EKS selects a random port from the list of opened ports for the
-   `LoadBalancer's health check `_. This can cause issues if the selected port does not have a service running behind it.
-
-
-   For example, if a SkyPilot task exposes 5 ports but only 2 of them have services running behind them, EKS may select a port that does not have a service running behind it and the LoadBalancer will not pass the healthcheck. As a result, the service will not be assigned an external IP address.
-
-   To work around this issue, make sure all your ports have services running behind them.
-
-.. note::
-   LoadBalancer services are not supported on kind clusters created using :code:`sky local up`.
+.. _kubernetes-setup-serviceaccount:

-.. _kubernetes-ingress:
+[Optional] Step 4 - Namespace and Service Account Setup
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Nginx Ingress
-^^^^^^^^^^^^^
.. tip::
+
+   This step is optional and required only in specific environments. By default, SkyPilot runs in the namespace configured in the current `kube-context `_ and creates a service account named ``skypilot-service-account`` to run tasks.
+   **This step is not required if you use these defaults.**

-This mode exposes ports by creating a Kubernetes `Ingress `_ backed by an existing `Nginx Ingress Controller `_. 
+If your cluster requires isolating SkyPilot tasks to a specific namespace and restricting the permissions granted to users, +you can create a new namespace and service account for SkyPilot to use. -To use this mode: +The minimal permissions required for the service account can be found on the :ref:`Minimal Kubernetes Permissions ` page. -1. Install the Nginx Ingress Controller on your Kubernetes cluster. Refer to the `documentation `_ for installation instructions specific to your environment. -2. Verify that the ``ingress-nginx-controller`` service has a valid external IP: +To simplify the setup, we provide a `script `_ that creates a namespace and service account with the necessary permissions for a given service account name and namespace. .. code-block:: bash - $ kubectl get service ingress-nginx-controller -n ingress-nginx - - # Example output: - # NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) - # ingress-nginx-controller LoadBalancer 10.24.4.254 35.202.58.117 80:31253/TCP,443:32699/TCP - -.. note:: - If the ``EXTERNAL-IP`` field is ````, you may manually assign it an External IP. - This can be done by patching the service with an IP that can be accessed from outside the cluster. - If the service type is ``NodePort``, you can set the ``EXTERNAL-IP`` to any node's IP address: - - .. code-block:: bash - - # Patch the nginx ingress service with an external IP. Can be any node's IP if using NodePort service. - # Replace in the following command with the IP you select. - $ kubectl patch svc ingress-nginx-controller -n ingress-nginx -p '{"spec": {"externalIPs": [""]}}' - - If the ``EXTERNAL-IP`` field is left as ````, SkyPilot will use ``localhost`` as the external IP for the Ingress, - and the endpoint may not be accessible from outside the cluster. - -.. note:: - If you cannot update the ``EXTERNAL-IP`` field of the service, you can also - specify the Ingress IP or hostname through the ``skypilot.co/external-ip`` - annotation on the ``ingress-nginx-controller`` service. In this case, - having a valid ``EXTERNAL-IP`` field is not required. - - For example, if your ``ingress-nginx-controller`` service is ``NodePort``: - - .. code-block:: bash - - # Add skypilot.co/external-ip annotation to the nginx ingress service. - # Replace in the following command with the IP you select. - # Can be any node's IP if using NodePort service type. - $ kubectl annotate service ingress-nginx-controller skypilot.co/external-ip= -n ingress-nginx - + # Download the script + wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/sky/utils/kubernetes/generate_kubeconfig.sh + chmod +x generate_kubeconfig.sh -3. Update the :ref:`SkyPilot config ` at :code:`~/.sky/config` to use the ingress mode. + # Execute the script to generate a kubeconfig file with the service account and namespace + # Replace my-sa and my-namespace with your desired service account name and namespace + # The script will create the namespace if it does not exist and create a service account with the necessary permissions. + SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh -.. code-block:: yaml +You may distribute the generated kubeconfig file to users who can then use it to submit tasks to the cluster. - kubernetes: - ports: ingress +.. _kubernetes-setup-verify: -.. tip:: - - For RKE2 and K3s, the pre-installed Nginx ingress is not correctly configured by default. Follow the `bare-metal installation instructions `_ to set up the Nginx ingress controller correctly. 
-
-When using this mode, SkyPilot creates an ingress resource and a ClusterIP service for each port opened. The port can be accessed externally by using the Ingress URL plus a path prefix of the form :code:`/skypilot/{pod_name}/{port}`.
-
-Use :code:`sky status --endpoints ` to view the full endpoint URLs for all ports.
+Verifying Setup
+---------------

-.. code-block::
+Once the cluster is deployed and you have placed your kubeconfig at ``~/.kube/config``, verify your setup by running :code:`sky check`:

-    $ sky status --endpoints mycluster
-    8888: http://34.173.152.251/skypilot/test-2ea4/8888
+.. code-block:: bash

-.. note::
+   sky check kubernetes

-    When exposing a port under a sub-path such as an ingress, services expecting root path access, (e.g., Jupyter notebooks) may face issues. To resolve this, configure the service to operate under a different base URL. For Jupyter, use `--NotebookApp.base_url `_ flag during launch. Alternatively, consider using :ref:`LoadBalancer ` mode.
+This should show ``Kubernetes: Enabled`` without any warnings.

+You can also check the GPUs available on your nodes by running:

-.. note::
+.. code-block:: console

-    Currently, SkyPilot does not support opening ports on a Kubernetes cluster using the `Gateway API `_.
-    If you are interested in this feature, please `reach out `_.
+   $ sky show-gpus --cloud kubernetes
+   GPU   QTY_PER_NODE  TOTAL_GPUS  TOTAL_FREE_GPUS
+   L4    1, 2, 3, 4    8           6
+   H100  1, 2          4           2

.. _kubernetes-observability:
@@ -483,7 +307,15 @@ Note that this dashboard can only be accessed from the machine where the ``kubec
 `Kubernetes documentation `_ for more information on how to set up access control for the dashboard.

+
Troubleshooting Kubernetes Setup
--------------------------------

If you encounter issues while setting up your Kubernetes cluster, please refer to the :ref:`troubleshooting guide ` to diagnose and fix issues.
+
+
+.. toctree::
+   :hidden:
+
+   kubernetes-deployment
+   Exposing Services <kubernetes-ports>
diff --git a/docs/source/reference/kubernetes/kubernetes-troubleshooting.rst b/docs/source/reference/kubernetes/kubernetes-troubleshooting.rst
index bb0befc602a..258c3e9eb55 100644
--- a/docs/source/reference/kubernetes/kubernetes-troubleshooting.rst
+++ b/docs/source/reference/kubernetes/kubernetes-troubleshooting.rst
@@ -68,7 +68,19 @@
Run :code:`sky check` to verify that SkyPilot can access your cluster.

If you see an error, ensure that your kubeconfig file at :code:`~/.kube/config` is correctly set up.

-Step A3 - Can you launch a SkyPilot task?
+Step A3 - Do your nodes have enough disk space?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If your nodes are out of disk space, pulling the SkyPilot images may fail with a :code:`rpc error: code = Canceled desc = failed to pull and unpack image: context canceled` error in the terminal during provisioning.
+Make sure your nodes are not under disk pressure by checking :code:`Conditions` in :code:`kubectl describe nodes`, or by running:
+
+.. code-block:: bash
+
+   $ kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{range .status.conditions[?(@.type=="DiskPressure")]}{.type}={.status}{"\n"}{end}{"\n"}{end}'
+   # Should not show DiskPressure=True for any node
+
+
+Step A4 - Can you launch a SkyPilot task?
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Next, try running a simple hello world task to verify that SkyPilot can launch tasks on your cluster. 
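+For example, a minimal hello-world launch (the cluster name is illustrative):
+
+.. code-block:: console
+
+   $ sky launch -y -c k8s-test --cloud kubernetes -- echo "hello world"
+   $ sky down -y k8s-test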
diff --git a/docs/source/running-jobs/distributed-jobs.rst b/docs/source/running-jobs/distributed-jobs.rst
index fb20b7ca988..9eb590c10bc 100644
--- a/docs/source/running-jobs/distributed-jobs.rst
+++ b/docs/source/running-jobs/distributed-jobs.rst
@@ -1,15 +1,15 @@
 .. _dist-jobs:

-Distributed Jobs on Many VMs
+Distributed Jobs on Many Nodes
 ================================================

 SkyPilot supports multi-node cluster
-provisioning and distributed execution on many VMs.
+provisioning and distributed execution on many nodes.

 For example, here is a simple PyTorch Distributed training example:

 .. code-block:: yaml
-  :emphasize-lines: 6-6,21-22,24-25
+  :emphasize-lines: 6-6,21-21,23-26

   name: resnet-distributed-app

@@ -31,14 +31,13 @@ For example, here is a simple PyTorch Distributed training example:
   run: |
     cd pytorch-distributed-resnet

-    num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l`
-    master_addr=`echo "$SKYPILOT_NODE_IPS" | head -n1`
-    python3 -m torch.distributed.launch \
-    --nproc_per_node=${SKYPILOT_NUM_GPUS_PER_NODE} \
-    --node_rank=${SKYPILOT_NODE_RANK} \
-    --nnodes=$num_nodes \
-    --master_addr=$master_addr \
-    --master_port=8008 \
+    MASTER_ADDR=`echo "$SKYPILOT_NODE_IPS" | head -n1`
+    torchrun \
+    --nnodes=$SKYPILOT_NUM_NODES \
+    --master_addr=$MASTER_ADDR \
+    --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
+    --node_rank=$SKYPILOT_NODE_RANK \
+    --master_port=12375 \
     resnet_ddp.py --num_epochs 20

 In the above,
@@ -66,16 +65,11 @@ SkyPilot exposes these environment variables that can be accessed in a task's ``
   the node executing the task.
 - :code:`SKYPILOT_NODE_IPS`: a string of IP addresses of the nodes reserved to execute
   the task, where each line contains one IP address.
-
-  - You can retrieve the number of nodes by :code:`echo "$SKYPILOT_NODE_IPS" | wc -l`
-    and the IP address of the third node by :code:`echo "$SKYPILOT_NODE_IPS" | sed -n
-    3p`.
-
-  - To manipulate these IP addresses, you can also store them to a file in the
-    :code:`run` command with :code:`echo $SKYPILOT_NODE_IPS >> ~/sky_node_ips`.
+- :code:`SKYPILOT_NUM_NODES`: number of nodes reserved for the task, which can be specified by ``num_nodes: <n>``. Same value as :code:`echo "$SKYPILOT_NODE_IPS" | wc -l`.
 - :code:`SKYPILOT_NUM_GPUS_PER_NODE`: number of GPUs reserved on each node to execute the
   task; the same as the count in ``accelerators: <name>:<count>`` (rounded up if a fraction).
+See :ref:`sky-env-vars` for more details.

 Launching a multi-node task (new cluster)
 -------------------------------------------------
@@ -106,7 +100,7 @@ The following happens in sequence:
    and step 4).

 Executing a task on the head node only
------------------------------------------
+--------------------------------------
 To execute a task on the head node only (a common
 scenario for tools like ``mpirun``), use the ``SKYPILOT_NODE_RANK``
 environment variable as follows:
@@ -141,7 +135,7 @@
This allows you directly to SSH into the worker nodes, if required.
Executing a Distributed Ray Program
------------------------------------
-To execute a distributed Ray program on many VMs, you can download the `training script `_ and launch the `task yaml `_:
+To execute a distributed Ray program on many nodes, you can download the `training script `_ and launch the `task yaml `_:

.. 
code-block:: console @@ -171,19 +165,17 @@ To execute a distributed Ray program on many VMs, you can download the `training run: | sudo chmod 777 -R /var/tmp - head_ip=`echo "$SKYPILOT_NODE_IPS" | head -n1` - num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l` + HEAD_IP=`echo "$SKYPILOT_NODE_IPS" | head -n1` if [ "$SKYPILOT_NODE_RANK" == "0" ]; then ps aux | grep ray | grep 6379 &> /dev/null || ray start --head --disable-usage-stats --port 6379 sleep 5 - python train.py --num-workers $num_nodes + python train.py --num-workers $SKYPILOT_NUM_NODES else sleep 5 - ps aux | grep ray | grep 6379 &> /dev/null || ray start --address $head_ip:6379 --disable-usage-stats + ps aux | grep ray | grep 6379 &> /dev/null || ray start --address $HEAD_IP:6379 --disable-usage-stats fi .. warning:: - **Avoid Installing Ray in Base Environment**: Before proceeding with the execution of a distributed Ray program, it is crucial to ensure that Ray is **not** installed in the *base* environment. Installing a different version of Ray in the base environment can lead to abnormal cluster status. - It is highly recommended to **create a dedicated virtual environment** (as above) for Ray and its dependencies, and avoid calling `ray stop` as that will also cause issue with the cluster. + When using Ray, avoid calling ``ray stop`` as that will also cause the SkyPilot runtime to be stopped. diff --git a/docs/source/running-jobs/environment-variables.rst b/docs/source/running-jobs/environment-variables.rst index 2f3427c1bf5..f7138af95fa 100644 --- a/docs/source/running-jobs/environment-variables.rst +++ b/docs/source/running-jobs/environment-variables.rst @@ -1,23 +1,38 @@ .. _env-vars: -Using Environment Variables +Secrets and Environment Variables ================================================ +Environment variables are a powerful way to pass configuration and secrets to your tasks. There are two types of environment variables in SkyPilot: + +- :ref:`User-specified environment variables `: Passed by users to tasks, useful for secrets and configurations. +- :ref:`SkyPilot environment variables `: Predefined by SkyPilot with information about the current cluster and task. + +.. _user-specified-env-vars: + User-specified environment variables ------------------------------------------------------------------ +User-specified environment variables are useful for passing secrets and any arguments or configurations needed for your tasks. They are made available in ``file_mounts``, ``setup``, and ``run``. + You can specify environment variables to be made available to a task in two ways: -- The ``envs`` field (dict) in a :ref:`task YAML ` -- The ``--env`` flag in the ``sky launch/exec`` :ref:`CLI ` (takes precedence over the above) +- ``envs`` field (dict) in a :ref:`task YAML `: + + .. code-block:: yaml + + envs: + MYVAR: val + +- ``--env`` flag in ``sky launch/exec`` :ref:`CLI ` (takes precedence over the above) .. tip:: - If an environment variable is required to be specified with `--env` during - ``sky launch/exec``, you can set it to ``null`` in task YAML to raise an - error when it is forgotten to be specified. For example, the ``WANDB_API_KEY`` - and ``HF_TOKEN`` in the following task YAML: + To mark an environment variable as required and make SkyPilot forcefully check + its existence (errors out if not specified), set it to an empty string or + ``null`` in the task YAML. For example, ``WANDB_API_KEY`` and ``HF_TOKEN`` in + the following task YAML are marked as required: .. 
code-block:: yaml @@ -28,6 +43,26 @@ You can specify environment variables to be made available to a task in two ways The ``file_mounts``, ``setup``, and ``run`` sections of a task YAML can access the variables via the ``${MYVAR}`` syntax. +.. _passing-secrets: + +Passing secrets +~~~~~~~~~~~~~~~ + +We recommend passing secrets to any node(s) executing your task by first making +it available in your current shell, then using ``--env SECRET`` to pass it to SkyPilot: + +.. code-block:: console + + $ sky launch -c mycluster --env HF_TOKEN --env WANDB_API_KEY task.yaml + $ sky exec mycluster --env WANDB_API_KEY task.yaml + +.. tip:: + + You do not need to pass the value directly such as ``--env + WANDB_API_KEY=1234``. When the value is not specified (e.g., ``--env WANDB_API_KEY``), + SkyPilot reads it from local environment variables. + + Using in ``file_mounts`` ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -77,40 +112,29 @@ For example, this is useful for passing secrets (see below) or passing configura See complete examples at `llm/vllm/serve.yaml `_ and `llm/vicuna/train.yaml `_. -.. _passing-secrets: - -Passing secrets -~~~~~~~~~~~~~~~~~~~~~~~~ - -We recommend passing secrets to any node(s) executing your task by first making -it available in your current shell, then using ``--env`` to pass it to SkyPilot: - -.. code-block:: console - - $ sky launch -c mycluster --env WANDB_API_KEY task.yaml - $ sky exec mycluster --env WANDB_API_KEY task.yaml - -.. tip:: - - In other words, you do not need to pass the value directly such as ``--env - WANDB_API_KEY=1234``. +.. _sky-env-vars: +SkyPilot environment variables +------------------------------------------------------------------ +SkyPilot exports several predefined environment variables made available during a task's execution. These variables contain information about the current cluster or task, which can be useful for distributed frameworks such as +torch.distributed, OpenMPI, etc. See examples in :ref:`dist-jobs` and :ref:`managed-jobs`. +The values of these variables are filled in by SkyPilot at task execution time. +You can access these variables in the following ways: -SkyPilot environment variables ------------------------------------------------------------------- +* In the task YAML's ``setup``/``run`` commands (a Bash script), access them using the ``${MYVAR}`` syntax; +* In the program(s) launched in ``setup``/``run``, access them using the language's standard method (e.g., ``os.environ`` for Python). -SkyPilot exports these environment variables for a task's execution. ``setup`` -and ``run`` stages have different environment variables available. +The ``setup`` and ``run`` stages can access different sets of SkyPilot environment variables: Environment variables for ``setup`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. list-table:: - :widths: 20 60 10 + :widths: 20 40 10 :header-rows: 1 * - Name @@ -121,7 +145,17 @@ Environment variables for ``setup`` - 0 * - ``SKYPILOT_SETUP_NODE_IPS`` - A string of IP addresses of the nodes in the cluster with the same order as the node ranks, where each line contains one IP address. - - 1.2.3.4 + + Note that this is not necessarily the same as the nodes in ``run`` stage: the ``setup`` stage runs on all nodes of the cluster, while the ``run`` stage can run on a subset of nodes. + - + .. code-block:: text + + 1.2.3.4 + 3.4.5.6 + + * - ``SKYPILOT_NUM_NODES`` + - Number of nodes in the cluster. Same value as ``$(echo "$SKYPILOT_NODE_IPS" | wc -l)``. + - 2 * - ``SKYPILOT_TASK_ID`` - A unique ID assigned to each task. 
@@ -133,7 +167,15 @@ Environment variables for ``setup``
        For managed spot jobs: sky-managed-2023-07-06-21-18-31-563597_my-job-name_1-0
    * - ``SKYPILOT_CLUSTER_INFO``
-     - A JSON string containing information about the cluster. To access the information, you could parse the JSON string in bash ``echo $SKYPILOT_CLUSTER_INFO | jq .cloud`` or in Python ``json.loads(os.environ['SKYPILOT_CLUSTER_INFO'])['cloud']``.
+     - A JSON string containing information about the cluster. To access the information, you could parse the JSON string in bash ``echo $SKYPILOT_CLUSTER_INFO | jq .cloud`` or in Python:
+
+       .. code-block:: python
+
+          import json
+          import os
+
+          json.loads(
+              os.environ['SKYPILOT_CLUSTER_INFO']
+          )['cloud']
+
      - {"cluster_name": "my-cluster-name", "cloud": "GCP", "region": "us-central1", "zone": "us-central1-a"}
    * - ``SKYPILOT_SERVE_REPLICA_ID``
      - The ID of a replica within the service (starting from 1). Available only for a :ref:`service `'s replica task.
@@ -147,7 +189,7 @@ Environment variables for ``run``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. list-table::
-   :widths: 20 60 10
+   :widths: 20 40 10
    :header-rows: 1

    * - Name
@@ -158,7 +200,14 @@
      - 0
    * - ``SKYPILOT_NODE_IPS``
      - A string of IP addresses of the nodes reserved to execute the task, where each line contains one IP address. Read more :ref:`here `.
-     - 1.2.3.4
+     -
+       .. code-block:: text
+
+          1.2.3.4
+
+   * - ``SKYPILOT_NUM_NODES``
+     - Number of nodes assigned to execute the current task. Same value as ``$(echo "$SKYPILOT_NODE_IPS" | wc -l)``. Read more :ref:`here `.
+     - 1
    * - ``SKYPILOT_NUM_GPUS_PER_NODE``
      - Number of GPUs reserved on each node to execute the task; the same as the count in
        ``accelerators: <name>:<count>`` (rounded up if a fraction). Read
@@ -175,16 +224,15 @@
        For managed spot jobs: sky-managed-2023-07-06-21-18-31-563597_my-job-name_1-0
    * - ``SKYPILOT_CLUSTER_INFO``
-     - A JSON string containing information about the cluster. To access the information, you could parse the JSON string in bash ``echo $SKYPILOT_CLUSTER_INFO | jq .cloud`` or in Python ``json.loads(os.environ['SKYPILOT_CLUSTER_INFO'])['cloud']``.
+     - A JSON string containing information about the cluster. To access the information, you could parse the JSON string in bash ``echo $SKYPILOT_CLUSTER_INFO | jq .cloud`` or in Python:
+
+       .. code-block:: python
+
+          import json
+          import os
+
+          json.loads(
+              os.environ['SKYPILOT_CLUSTER_INFO']
+          )['cloud']
      - {"cluster_name": "my-cluster-name", "cloud": "GCP", "region": "us-central1", "zone": "us-central1-a"}
    * - ``SKYPILOT_SERVE_REPLICA_ID``
      - The ID of a replica within the service (starting from 1). Available only for a :ref:`service `'s replica task.
      - 1
-
-The values of these variables are filled in by SkyPilot at task execution time.
-
-You can access these variables in the following ways:
-
-* In the task YAML's ``setup``/``run`` commands (a Bash script), access them using the ``${MYVAR}`` syntax;
-* In the program(s) launched in ``setup``/``run``, access them using the
-  language's standard method (e.g., ``os.environ`` for Python).
diff --git a/docs/source/running-jobs/index.rst b/docs/source/running-jobs/index.rst
deleted file mode 100644
index 04c921d1022..00000000000
--- a/docs/source/running-jobs/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-More User Guides
-================================================
-
-.. 
 toctree:: - - distributed-jobs - environment-variables diff --git a/docs/source/serving/auth.rst b/docs/source/serving/auth.rst new file mode 100644 index 00000000000..91e02a64b07 --- /dev/null +++ b/docs/source/serving/auth.rst @@ -0,0 +1,123 @@ +.. _serve-auth: + +Authorization +============= + +SkyServe provides robust authorization capabilities at the replica level, allowing you to control access to service endpoints with API keys. + +Setup API Keys +-------------- + +SkyServe relies on the service running on the underlying replicas, e.g., the inference engine, to enforce authorization. We take the vLLM inference engine as an example; it supports static API key authorization via its :code:`--api-key` argument. + +We define a SkyServe service spec for serving a Llama-3 chatbot with vLLM and an API key. In the example YAML below, we define the authorization token as an environment variable, :code:`AUTH_TOKEN`, and pass it both to the service section, so the :code:`readiness_probe` can authenticate with the replicas, and to the vLLM entrypoint, so the server on each replica starts with the API key. + +.. code-block:: yaml :emphasize-lines: 5,10-11,28 + + # auth.yaml + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. + AUTH_TOKEN: # TODO: Fill with your own auth token (a random string), or use --env to pass. + + service: + readiness_probe: + path: /v1/models + headers: + Authorization: Bearer $AUTH_TOKEN + replicas: 2 + + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} + ports: 8000 + + setup: | + pip install vllm==0.4.0.post1 flash-attn==2.5.7 gradio openai + # python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" + + run: | + export PATH=$PATH:/sbin + python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_NAME --trust-remote-code \ + --gpu-memory-utilization 0.95 \ + --host 0.0.0.0 --port 8000 \ + --api-key $AUTH_TOKEN + +To deploy the service, run the following command: + +.. code-block:: bash + + HF_TOKEN=xxx AUTH_TOKEN=yyy sky serve up auth.yaml -n auth --env HF_TOKEN --env AUTH_TOKEN + +To send a request to the service endpoint, a service client needs to include the static API key in the request header: + +.. code-block:: bash :emphasize-lines: 5 + + $ ENDPOINT=$(sky serve status --endpoint auth) + $ AUTH_TOKEN=yyy + $ curl http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $AUTH_TOKEN" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ], + "stop_token_ids": [128009, 128001] + }' | jq + +.. raw:: html + +
+ + Example output + + +.. code-block:: console + + { + "id": "cmpl-cad2c1a2a6ee44feabed0b28be294d6f", + "object": "chat.completion", + "created": 1716819147, + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "I'm so glad you asked! I'm LLaMA, an AI assistant developed by Meta AI that can understand and respond to human input in a conversational manner. I'm here to help you with any questions, tasks, or topics you'd like to discuss. I can provide information on a wide range of subjects, from science and history to entertainment and culture. I can also assist with language-related tasks such as language translation, text summarization, and even writing and proofreading. My goal is to provide accurate and helpful responses to your inquiries, while also being friendly and engaging. So, what's on your mind? How can I assist you today?" + }, + "logprobs": null, + "finish_reason": "stop", + "stop_reason": 128009 + } + ], + "usage": { + "prompt_tokens": 26, + "total_tokens": 160, + "completion_tokens": 134 + } + } + +.. raw:: html + +
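 + +Besides ``curl``, any OpenAI-compatible client works with the authorized +endpoint. Below is a minimal Python sketch, assuming the ``openai`` SDK (v1+) +is installed and ``ENDPOINT``/``AUTH_TOKEN`` are set as above: + +.. code-block:: python + + import os + + import openai + + # The api_key must match the --api-key passed to vLLM in the service YAML. + client = openai.OpenAI( + base_url=f"http://{os.environ['ENDPOINT']}/v1", + api_key=os.environ['AUTH_TOKEN'], + ) + response = client.chat.completions.create( + model='meta-llama/Meta-Llama-3-8B-Instruct', + messages=[{'role': 'user', 'content': 'Who are you?'}], + ) + print(response.choices[0].message.content) 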
 + +A service client without an API key will not be able to access the service and will get a :code:`401 Unauthorized` error: + +.. code-block:: bash + + $ curl http://$ENDPOINT/v1/models + {"error": "Unauthorized"} + + $ curl http://$ENDPOINT/v1/models -H "Authorization: Bearer random-string" + {"error": "Unauthorized"} diff --git a/docs/source/serving/service-yaml-spec.rst b/docs/source/serving/service-yaml-spec.rst index a5e23f101d2..4d3ffc06d48 100644 --- a/docs/source/serving/service-yaml-spec.rst +++ b/docs/source/serving/service-yaml-spec.rst @@ -27,6 +27,13 @@ Available fields: # highly related to your service, so it is recommended to set this value # based on your service's startup time. initial_delay_seconds: 1200 + # The timeout in seconds for a readiness probe request (optional). + # Defaults to 15 seconds. If the readiness probe takes longer than this + # time to respond, the probe is considered failed. This is + # useful when your service is slow to respond to readiness probe + # requests. Note that setting the timeout too high will delay the + # detection of a real failure of your service replica. + timeout_seconds: 15 # Simplified version of readiness probe that only contains the readiness # probe path. If you want to use GET method for readiness probe and the diff --git a/docs/source/serving/sky-serve.rst b/docs/source/serving/sky-serve.rst index 3ccbed140c0..c00fa427bd6 100644 --- a/docs/source/serving/sky-serve.rst +++ b/docs/source/serving/sky-serve.rst @@ -444,6 +444,11 @@ Autoscaling See :ref:`Autoscaling ` for more information. +Authorization +------------- + +See :ref:`Authorization <serve-auth>` for more information. + SkyServe Architecture --------------------- diff --git a/docs/source/serving/spot-policy.rst b/docs/source/serving/spot-policy.rst new file mode 100644 index 00000000000..ff23b328705 --- /dev/null +++ b/docs/source/serving/spot-policy.rst @@ -0,0 +1,160 @@ +.. _spot_policy: + +Using Spot Instances for Serving +================================ + +SkyServe supports serving models on a mixture of spot and on-demand replicas with two options: :code:`base_ondemand_fallback_replicas` and :code:`dynamic_ondemand_fallback`. Currently, SkyServe relies on the client side to retry requests in the event of spot instance preemptions. + + +Base on-demand Fallback ----------------------- + +:code:`base_ondemand_fallback_replicas` sets the number of on-demand replicas to keep running at all times, ensuring that some serving capacity is always available even when no spot replicas can be provisioned. :code:`use_spot` should be set to :code:`true` to enable spot replicas. + +.. code-block:: yaml + + service: + readiness_probe: /health + replica_policy: + min_replicas: 2 + max_replicas: 3 + target_qps_per_replica: 1 + # Ensures that one of the replicas is run on on-demand instances + base_ondemand_fallback_replicas: 1 + + resources: + ports: 8081 + cpus: 2+ + use_spot: true + + workdir: examples/serve/http_server + + run: python3 server.py + + +.. tip:: + + Kubernetes instances are considered on-demand instances. You can use the :code:`base_ondemand_fallback_replicas` option to have some replicas run on Kubernetes, while others run on cloud spot instances. + + +Dynamic on-demand Fallback -------------------------- + +SkyServe supports dynamically falling back to on-demand replicas when spot replicas are not available. +This is enabled by setting :code:`dynamic_ondemand_fallback` to :code:`true`. 
 +This is useful for maintaining the required replica capacity in the case of spot instance interruptions. +When spot replicas are available, SkyServe will automatically switch back to using spot replicas to maximize cost savings. + +.. code-block:: yaml + + service: + readiness_probe: /health + replica_policy: + min_replicas: 2 + max_replicas: 3 + target_qps_per_replica: 1 + # Allows replicas to be run on on-demand instances if spot instances are not available + dynamic_ondemand_fallback: true + + resources: + ports: 8081 + cpus: 2+ + use_spot: true + + workdir: examples/serve/http_server + + run: python3 server.py + + +.. tip:: + + SkyServe supports specifying both :code:`base_ondemand_fallback_replicas` and :code:`dynamic_ondemand_fallback`. Specifying both sets a base number of on-demand replicas and dynamically falls back to additional on-demand replicas when spot replicas are not available. + +Example ------- + +The following example demonstrates how to use spot replicas with SkyServe with dynamic fallback. The example is a simple HTTP server that listens on port 8081 with :code:`dynamic_ondemand_fallback: true`. To run: + +.. code-block:: console + + $ sky serve up examples/serve/spot_policy/dynamic_on_demand_fallback.yaml -n http-server + +When the service is up, we can check the status of the service and the replicas using the following command. Initially, we will see: + +.. code-block:: console + + $ sky serve status http-server + + Services + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + http-server 1 1m 17s NO_REPLICA 0/4 54.227.229.217:30001 + + Service Replicas + SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION + http-server 1 1 - 1 min ago 1x GCP([Spot]vCPU=2) PROVISIONING us-east1 + http-server 2 1 - 1 min ago 1x GCP([Spot]vCPU=2) PROVISIONING us-central1 + http-server 3 1 - 1 min ago 1x GCP(vCPU=2) PROVISIONING us-east1 + http-server 4 1 - 1 min ago 1x GCP(vCPU=2) PROVISIONING us-central1 + +When not enough spot replicas are available, SkyServe will provision the number of on-demand replicas needed to meet the target number of replicas. For example, when the target number is 2 and only 1 spot replica is ready, SkyServe will provision 1 on-demand replica to meet the target. + +.. code-block:: console + + $ sky serve status http-server + + Services + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + http-server 1 1m 17s READY 2/4 54.227.229.217:30001 + + Service Replicas + SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION + http-server 1 1 http://34.23.22.160:8081 3 mins ago 1x GCP([Spot]vCPU=2) READY us-east1 + http-server 2 1 http://34.68.226.193:8081 3 mins ago 1x GCP([Spot]vCPU=2) READY us-central1 + http-server 3 1 - 3 mins ago 1x GCP(vCPU=2) SHUTTING_DOWN us-east1 + http-server 4 1 - 3 mins ago 1x GCP(vCPU=2) SHUTTING_DOWN us-central1 + +When the spot replicas are ready, SkyServe will automatically scale down on-demand replicas to maximize cost savings. + +.. code-block:: console + + $ sky serve status http-server + + Services + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + http-server 1 3m 59s READY 2/2 54.227.229.217:30001 + + Service Replicas + SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION + http-server 1 1 http://34.23.22.160:8081 4 mins ago 1x GCP([Spot]vCPU=2) READY us-east1 + http-server 2 1 http://34.68.226.193:8081 4 mins ago 1x GCP([Spot]vCPU=2) READY us-central1 
 +In the event of spot instance interruptions (e.g., replica 1 above), SkyServe will automatically fall back to on-demand replicas (e.g., launch one on-demand replica) to meet the required capacity, and will keep trying to provision one spot replica until spot capacity is available again. Note that SkyServe tries different regions and clouds to maximize the chance of successfully provisioning spot instances. + +.. code-block:: console + + $ sky serve status http-server + + Services + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + http-server 1 7m 2s READY 1/3 54.227.229.217:30001 + + Service Replicas + SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION + http-server 2 1 http://34.68.226.193:8081 7 mins ago 1x GCP([Spot]vCPU=2) READY us-central1 + http-server 5 1 - 13 secs ago 1x GCP([Spot]vCPU=2) PROVISIONING us-central1 + http-server 6 1 - 13 secs ago 1x GCP(vCPU=2) PROVISIONING us-central1 + +Eventually, when spot capacity is available again, SkyServe will automatically scale down the on-demand replicas. + +.. code-block:: console + + $ sky serve status http-server + + Services + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + http-server 1 10m 5s READY 2/3 54.227.229.217:30001 + + Service Replicas + SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION + http-server 2 1 http://34.68.226.193:8081 10 mins ago 1x GCP([Spot]vCPU=2) READY us-central1 + http-server 5 1 http://34.121.49.94:8081 1 min ago 1x GCP([Spot]vCPU=2) READY us-central1 \ No newline at end of file diff --git a/docs/source/serving/user-guides.rst b/docs/source/serving/user-guides.rst index e6e63fd40a5..8b9cba92b45 100644 --- a/docs/source/serving/user-guides.rst +++ b/docs/source/serving/user-guides.rst @@ -5,3 +5,5 @@ Serving User Guides autoscaling update + auth + spot-policy diff --git a/examples/managed_job.yaml b/examples/managed_job.yaml index 4bfcb63f40a..30ad2db287a 100644 --- a/examples/managed_job.yaml +++ b/examples/managed_job.yaml @@ -6,6 +6,7 @@ setup: | run: | conda env list + echo "start counting" python -u - << EOF import time import tqdm diff --git a/examples/managed_job_with_storage.yaml b/examples/managed_job_with_storage.yaml index ecefccd8b3d..61244c16ba0 100644 --- a/examples/managed_job_with_storage.yaml +++ b/examples/managed_job_with_storage.yaml @@ -15,11 +15,17 @@ workdir: ./examples file_mounts: ~/bucket_workdir: - # Change this to the your own globally unique bucket name. + # Change this to your own globally unique bucket name. name: sky-workdir-zhwu source: ./examples persistent: false mode: COPY + + /output_path: + # Change this to your own globally unique bucket name. + name: sky-output-bucket + mode: MOUNT + /imagenet-image: source: s3://sky-imagenet-data @@ -55,3 +61,6 @@ run: | cat ~/tmpfile cat ~/a/b/c/tmpfile + + # Write to a file in the mounted bucket + echo "hello world!" > /output_path/output.txt diff --git a/examples/nccl_test.yaml b/examples/nccl_test.yaml new file mode 100644 index 00000000000..046e72cc00f --- /dev/null +++ b/examples/nccl_test.yaml @@ -0,0 +1,42 @@ +# This measures NCCL all reduce performance with Torch. 
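 +# It launches torch.distributed on every node, sizing the job from the +# SKYPILOT_NODE_IPS and SKYPILOT_NUM_GPUS_PER_NODE environment variables +# that SkyPilot sets (see the run section below). 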
+ +# Usage: +# $ sky launch -c nccl --use-spot nccl_test.yaml + +# Example output +# (head, rank=0, pid=17654) [nccl-ebd1-head-8x3wqw6d-compute:0]:1 +# (head, rank=0, pid=17654) [nccl-ebd1-head-8x3wqw6d-compute:0]:2 +# (head, rank=0, pid=17654) [nccl-ebd1-head-8x3wqw6d-compute:0]:3 +# (head, rank=0, pid=17654) [nccl-ebd1-head-8x3wqw6d-compute:0]:4 +# (head, rank=0, pid=17654) [nccl-ebd1-head-8x3wqw6d-compute:0]:5 +# (head, rank=0, pid=17654) [nccl-ebd1-head-8x3wqw6d-compute:0]:The average bandwidth of all_reduce with a 4.0GB payload (5 trials, 16 ranks): +# (head, rank=0, pid=17654) [nccl-ebd1-head-8x3wqw6d-compute:0]: algbw: 2.053 GBps (16.4 Gbps) +# (head, rank=0, pid=17654) [nccl-ebd1-head-8x3wqw6d-compute:0]: busbw: 3.850 GBps (30.8 Gbps) +# (head, rank=0, pid=17654) [nccl-ebd1-head-8x3wqw6d-compute:0]: + +name: torch-nccl-allreduce + +num_nodes: 2 + +resources: + accelerators: A100:8 + use_spot: True + +setup: | + pip install torch + git clone https://github.com/stas00/ml-engineering.git + +run: | + cd ml-engineering/network/benchmarks + NNODES=`echo "$SKYPILOT_NODE_IPS" | wc -l` + MASTER_ADDR=`echo "$SKYPILOT_NODE_IPS" | head -n1` + python -u -m torch.distributed.run \ + --nproc_per_node $SKYPILOT_NUM_GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:8888 \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --role `hostname -s`: \ + --tee 3 \ + all_reduce_bench.py + \ No newline at end of file diff --git a/examples/stable_diffusion/README.md b/examples/stable_diffusion/README.md index 9bec3354848..2a4383f1347 100644 --- a/examples/stable_diffusion/README.md +++ b/examples/stable_diffusion/README.md @@ -8,7 +8,7 @@ 4. Run `ssh -L 7860:localhost:7860 stable-diffusion` -5. Open [`http://localhost:7860/`](http://localhost:7860/) in browser. +5. Open [`http://localhost:7860/`](http://localhost:7860/) in browser. If the page doesn't load, try again in a few minutes to allow the container to start. 6. Type in text prompt and click "Generate". diff --git a/examples/stable_diffusion/stable_diffusion_docker.yaml b/examples/stable_diffusion/stable_diffusion_docker.yaml index 47499fa2ea4..1a830241f14 100644 --- a/examples/stable_diffusion/stable_diffusion_docker.yaml +++ b/examples/stable_diffusion/stable_diffusion_docker.yaml @@ -7,8 +7,6 @@ file_mounts: /stable_diffusion: . setup: | - sudo curl "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose - sudo chmod +x /usr/local/bin/docker-compose cd stable-diffusion-webui-docker sudo rm -r stable-diffusion-webui-docker git clone https://github.com/AbdBarho/stable-diffusion-webui-docker.git @@ -17,9 +15,16 @@ setup: | wget https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt -P models mv models/sd-v1-4.ckpt models/model.ckpt docker pull berkeleyskypilot/stable-diffusion - rm docker-compose.yml - cp /stable_diffusion/docker-compose.yml . 
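 + # docker-compose is no longer needed here; the container is started + # directly with `docker run` in the run section below. 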
 run: | cd stable-diffusion-webui-docker - docker-compose up + docker run -d \ + --name model \ + --restart on-failure \ + -p 7860:7860 \ + -v $(pwd)/cache:/cache \ + -v $(pwd)/output:/output \ + -v $(pwd)/models:/models \ + -e CLI_ARGS='--extra-models-cpu --optimized-turbo' \ + --gpus all \ + berkeleyskypilot/stable-diffusion diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md new file mode 100644 index 00000000000..bc9893fec5b --- /dev/null +++ b/llm/gpt-2/README.md @@ -0,0 +1,119 @@ +# Run GPT-2 in llm.c on any cloud with SkyPilot + +This is a reproducible package of llm.c's GPT-2 (124M) training by @karpathy (https://github.com/karpathy/llm.c/discussions/481). +With SkyPilot, you can run GPT-2 (124M) training on any cloud. SkyPilot looks for the cheapest resources available on the clouds enabled for a user, and launches and manages the whole data processing and training pipeline, bringing the total cost close to the ~\$20 target that @karpathy mentioned in the discussion. + +## Prerequisites + +1. Install [SkyPilot](https://github.com/skypilot-org/skypilot): +```bash +pip install "skypilot-nightly[aws,gcp,azure,kubernetes,lambda,fluidstack]" # Choose the clouds you want to enable +``` +2. Enable clouds for SkyPilot: +```bash +sky check +``` +Please check the instructions for enabling clouds at [SkyPilot doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). + +3. Download the YAML for starting the training: +```bash +wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/llm/gpt-2/gpt2.yaml +``` + +## Run GPT-2 training + +Run the following command to start GPT-2 (124M) training on a GPU VM with 8 A100 GPUs: + +```bash +sky launch -c gpt2 gpt2.yaml +``` + +![GPT-2 training with 8 A100 GPUs](https://imgur.com/v8SGpsF.png) + +Or, you can train the model with a single A100 by adding `--gpus A100`: +```bash +sky launch -c gpt2 gpt2.yaml --gpus A100 +``` + +![GPT-2 training with a single A100](https://imgur.com/hN65g4r.png) + + +It is also possible to speed up training by using 8 H100s (2.3x more tokens/s than 8x A100s): +```bash +sky launch -c gpt2 gpt2.yaml --gpus H100:8 +``` + +![GPT-2 training with 8 H100](https://imgur.com/STbi80b.png) + +### Download logs and visualizations + +After the training is finished, you can download the logs and visualizations with the following command: +```bash +scp -r gpt2:~/llm.c/log124M . +``` +We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). (Note: we cut off training after 10K steps, which already achieves a validation loss similar to the OpenAI GPT-2 checkpoint.) + +<div align="center"> 
+ +
 + +> Yes! We are able to reproduce the training of GPT-2 (124M) on any cloud with SkyPilot. + + + +## Advanced: Run GPT-2 training in two stages + +The data processing for GPT-2 training is CPU-bound, while the training is GPU-bound, so running the data processing on a GPU VM is not cost-effective. With SkyPilot, you can easily +separate the data processing and training into two stages and execute them sequentially yourself, or let SkyPilot manage the dependencies between the two stages. + +With this split, data processing runs on cheaper CPU VMs (e.g., ~\$0.4/hour), while training runs on more expensive GPU VMs (e.g., ~\$1.3-\$3.6/hour for a single A100 GPU, or \$10.3-\$32.8/hour for 8 A100 GPUs). + +We can run the data processing on a CPU VM and store the processed data in a cloud bucket. Then, we can run the training on a GPU VM with the processed data. + +```bash +wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/llm/gpt-2/gpt2-data.yaml +wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/llm/gpt-2/gpt2-train.yaml +``` + +### Run two stages manually +#### Data processing + +Run the following command to process the training data on a CPU VM and store it in a cloud bucket for future use (replace `your-bucket-name` with your bucket name): + +```bash +sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name +``` + + +#### Training + +After the data is processed, you can then train the model on a GPU VM with 8 A100 GPUs (replace `your-bucket-name` with your bucket name): + +```bash +sky launch -c gpt2-train --detach-setup gpt2-train.yaml --env BUCKET_NAME=your-bucket-name +``` + +Or, you can train the model with a single A100 by adding `--gpus A100`: +```bash +sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpus A100 --env BUCKET_NAME=your-bucket-name +``` + + +### Run in a Pipeline + +We can also combine the two steps into a single SkyPilot job and let SkyPilot handle the dependencies between them. Here is an example of how to do this (replace `your-bucket-name` with your bucket name): +```bash +sky jobs launch -n gpt2 gpt2-pipeline.yaml --env BUCKET_NAME=your-bucket-name +``` + +> Note: the pipeline yaml can be retrieved with the following command: +```bash +cat gpt2-data.yaml > gpt2-pipeline.yaml; echo "---" >> gpt2-pipeline.yaml; cat gpt2-train.yaml >> gpt2-pipeline.yaml +``` + +SkyPilot will first download and process the dataset on a CPU VM and store the +processed data in a cloud bucket. Then, it will launch a GPT-2 training job on a +GPU VM that trains GPT-2 (124M) on the processed data. + + + diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml new file mode 100644 index 00000000000..fc7bb02bf95 --- /dev/null +++ b/llm/gpt-2/gpt2-data.yaml @@ -0,0 +1,34 @@ +name: gpt2-data + +envs: + BUCKET_NAME: # TODO: Fill in your bucket name + BUCKET_STORE: s3 # Can be s3, gcs, or r2. 
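 + # The bucket name must be globally unique within the chosen store; the + # training stage mounts the same bucket (mode: COPY) to read the + # processed data. 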
 + +resources: + cpus: 8+ + +file_mounts: + /cache: + name: $BUCKET_NAME + store: $BUCKET_STORE + mode: MOUNT + +setup: | + pip install tqdm tiktoken requests datasets + # Pin llm.c to a known revision; `git clone <url>@<sha>` is not valid git + # syntax, so clone and then check out the commit. + git clone https://github.com/karpathy/llm.c.git || true + (cd llm.c && git checkout ed37d9261ba13ef212c01e2de8b309cbb46a2aa7) + + # Pin the dataset revision, as the latest fineweb + # dataset removed the samples, causing error: + # Please pass `features` or at least one example when writing data + sed -i 's/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train")/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train", revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94")/' llm.c/dev/data/fineweb.py + + +run: | + cd llm.c + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) + # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb + python dev/data/fineweb.py --version 10B + + rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ + rsync -Pavz dev/data/fineweb10B /cache/ diff --git a/llm/gpt-2/gpt2-pipeline.yaml b/llm/gpt-2/gpt2-pipeline.yaml new file mode 100644 index 00000000000..e5ea05f7948 --- /dev/null +++ b/llm/gpt-2/gpt2-pipeline.yaml @@ -0,0 +1,129 @@ +name: gpt2-data + +envs: + BUCKET_NAME: # TODO: Fill in your bucket name + BUCKET_STORE: s3 # Can be s3, gcs, or r2. + +resources: + cpus: 8+ + +file_mounts: + /cache: + name: $BUCKET_NAME + store: $BUCKET_STORE + mode: MOUNT + +setup: | + pip install tqdm tiktoken requests datasets + # Pin llm.c to a known revision; `git clone <url>@<sha>` is not valid git + # syntax, so clone and then check out the commit. + git clone https://github.com/karpathy/llm.c.git || true + (cd llm.c && git checkout ed37d9261ba13ef212c01e2de8b309cbb46a2aa7) + + # Pin the dataset revision, as the latest fineweb + # dataset removed the samples, causing error: + # Please pass `features` or at least one example when writing data + sed -i 's/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train")/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train", revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94")/' llm.c/dev/data/fineweb.py + + +run: | + cd llm.c + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) + # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb + python dev/data/fineweb.py --version 10B + + rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ + rsync -Pavz dev/data/fineweb10B /cache/ +--- +name: gpt2-train + +envs: + BUCKET_NAME: # TODO: Fill in your bucket name + BUCKET_STORE: s3 # Can be s3, gcs, or r2. + +resources: + accelerators: A100:8 + # Use a docker image to get a recent g++ for compiling llm.c. + image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + any_of: + # Avoid using a docker image on Lambda, since docker is not supported + # there yet; the base image works. 
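 + # (`any_of` lets SkyPilot pick whichever of the listed clouds can + # provide the requested resources; an entry may override fields such + # as image_id for that cloud.) 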
+ - cloud: lambda + image_id: null + - cloud: aws + - cloud: gcp + - cloud: azure + - cloud: fluidstack + - cloud: kubernetes + +file_mounts: + ~/.cache/huggingface: + name: $BUCKET_NAME + store: $BUCKET_STORE + mode: COPY + +setup: | + cd ~ + + # install cudnn so we can use FlashAttention and run fast (optional) + # https://developer.nvidia.com/cudnn-downloads + # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 + if [ -f ./CUDNN_INSTALLED ]; then + echo "cudnn already installed" + else + system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') + # Get version and remove the dot + version=$(lsb_release -sr | tr -d .) + export system_version="${system}${version}" + wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb + sudo dpkg -i cudnn-installer.deb + sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ + # Remove problematic kubernetes.list source + sudo apt-get update --allow-releaseinfo-change || true + + sudo apt-get -y install cudnn-cuda-12 + + touch ./CUDNN_INSTALLED + fi + + # "install" cudnn-frontend to ~/ + sudo apt -y install git + git clone https://github.com/NVIDIA/cudnn-frontend.git || true + + # install MPI (optional, if you intend to use multiple GPUs) + # SkyPilot do not install MPI as that requires NCCL which needs to be manually + # installed. + sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + # install nccl + pip install nvidia-nccl-cu12 + export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include + + git clone https://github.com/karpathy/llm.c.git || true + cd llm.c + ln -s ~/.cache/huggingface/fineweb10B dev/data/ + # compile llm.c (mixed precision, with cuDNN flash-attention) + # first compilation is ~1 minute, mostly due to cuDNN + make train_gpt2cu USE_CUDNN=1 + + +run: | + cd ~/llm.c + # train on multiple GPUs + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -o log124M \ + -e "d12" \ + -b 64 -t 1024 \ + -d 524288 \ + -r 1 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + -n 5000 \ + -v 250 -s 20000 \ + -h 1 + + # Upload the log and model to the bucket + rsync -Pavz log124M ~/.cache/huggingface diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml new file mode 100644 index 00000000000..3a4e8c28d14 --- /dev/null +++ b/llm/gpt-2/gpt2-train.yaml @@ -0,0 +1,94 @@ +name: gpt2-train + +envs: + BUCKET_NAME: # TODO: Fill in your bucket name + BUCKET_STORE: s3 # Can be s3, gcs, or r2. + +resources: + accelerators: A100:8 + # Use docker image for latest version g++ to enable the compilation of llm.c. + image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + any_of: + # Avoid using docker image for lambda due to the docker is not supported on + # Lambda yet, but the base image works. 
+ - cloud: lambda + image_id: null + - cloud: aws + - cloud: gcp + - cloud: azure + - cloud: fluidstack + - cloud: kubernetes + +file_mounts: + ~/.cache/huggingface: + name: $BUCKET_NAME + store: $BUCKET_STORE + mode: COPY + +setup: | + cd ~ + + # install cudnn so we can use FlashAttention and run fast (optional) + # https://developer.nvidia.com/cudnn-downloads + # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 + if [ -f ./CUDNN_INSTALLED ]; then + echo "cudnn already installed" + else + system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') + # Get version and remove the dot + version=$(lsb_release -sr | tr -d .) + export system_version="${system}${version}" + wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb + sudo dpkg -i cudnn-installer.deb + sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ + # Remove problematic kubernetes.list source + sudo apt-get update --allow-releaseinfo-change || true + + sudo apt-get -y install cudnn-cuda-12 + + touch ./CUDNN_INSTALLED + fi + + # "install" cudnn-frontend to ~/ + sudo apt -y install git + git clone https://github.com/NVIDIA/cudnn-frontend.git || true + + # install MPI (optional, if you intend to use multiple GPUs) + # SkyPilot do not install MPI as that requires NCCL which needs to be manually + # installed. + sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + # install nccl + pip install nvidia-nccl-cu12 + export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include + + git clone https://github.com/karpathy/llm.c.git || true + cd llm.c + ln -s ~/.cache/huggingface/fineweb10B dev/data/ + # compile llm.c (mixed precision, with cuDNN flash-attention) + # first compilation is ~1 minute, mostly due to cuDNN + make train_gpt2cu USE_CUDNN=1 + + +run: | + cd ~/llm.c + # train on multiple GPUs + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -o log124M \ + -e "d12" \ + -b 64 -t 1024 \ + -d 524288 \ + -r 1 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + -n 5000 \ + -v 250 -s 20000 \ + -h 1 + + # Upload the log and model to the bucket + rsync -Pavz log124M ~/.cache/huggingface diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml new file mode 100644 index 00000000000..8e203772128 --- /dev/null +++ b/llm/gpt-2/gpt2.yaml @@ -0,0 +1,95 @@ +name: train + +resources: + accelerators: A100:8 + # Use docker image for latest version g++ to enable the compilation of llm.c. + image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + any_of: + # Avoid using docker image for lambda due to the docker is not supported on + # Lambda yet, but the base image works. 
+ - cloud: lambda + image_id: null + - cloud: aws + - cloud: gcp + - cloud: azure + - cloud: fluidstack + - cloud: kubernetes + + +setup: | + cd ~ + pip install tqdm tiktoken requests datasets + + # Training dependencies + # install cudnn so we can use FlashAttention and run fast (optional) + # https://developer.nvidia.com/cudnn-downloads + # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 + if [ -f ./CUDNN_INSTALLED ]; then + echo "cudnn already installed" + else + system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') + # Get version and remove the dot + version=$(lsb_release -sr | tr -d .) + export system_version="${system}${version}" + wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb + sudo dpkg -i cudnn-installer.deb + sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ + # Remove problematic kubernetes.list source + sudo apt-get update --allow-releaseinfo-change || true + + sudo apt-get -y install cudnn-cuda-12 + + touch ./CUDNN_INSTALLED + fi + + # "install" cudnn-frontend to ~/ + sudo apt -y install git + git clone https://github.com/NVIDIA/cudnn-frontend.git || true + + # install MPI (optional, if you intend to use multiple GPUs) + # SkyPilot do not install MPI as that requires NCCL which needs to be manually + # installed. + sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + # install nccl + pip install nvidia-nccl-cu12 + export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include + + git clone https://github.com/karpathy/llm.c.git || true + cd llm.c + + # add revision to fix the dataset version, as the latest fineweb + # dataset removed the samples, causing error: + # Please pass `features` or at least one example when writing data + sed -i 's/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train")/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train", revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94")/' dev/data/fineweb.py + + # compile llm.c (mixed precision, with cuDNN flash-attention) + # first compilation is ~1 minute, mostly due to cuDNN + make train_gpt2cu USE_CUDNN=1 + + +run: | + cd ~/llm.c + # Processing data + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) + # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb + python dev/data/fineweb.py --version 10B + + # Start training on multiple GPUs + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -o log124M \ + -e "d12" \ + -b 64 -t 1024 \ + -d 524288 \ + -r 1 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + -n 5000 \ + -v 250 -s 20000 \ + -h 1 diff --git a/llm/llama-3/README.md b/llm/llama-3/README.md index decff6054bf..d0c28dc93c6 100644 --- a/llm/llama-3/README.md +++ b/llm/llama-3/README.md @@ -74,10 +74,10 @@ setup: | conda activate vllm fi - pip install vllm==0.4.0.post1 + pip install vllm==0.4.2 # Install Gradio for web UI. 
 pip install gradio openai - pip install flash-attn==2.5.7 + pip install flash-attn==2.5.9.post1 run: | @@ -226,7 +226,7 @@ ENDPOINT=$(sky status --endpoint 8081 llama3) curl http://$ENDPOINT/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", "messages": [ { "role": "system", diff --git a/llm/llama-3/llama3.yaml b/llm/llama-3/llama3.yaml index 1e9b236efd4..ff9dc4967ac 100644 --- a/llm/llama-3/llama3.yaml +++ b/llm/llama-3/llama3.yaml @@ -89,10 +89,10 @@ setup: | conda activate vllm fi - pip install vllm==0.4.0.post1 + pip install vllm==0.4.2 # Install Gradio for web UI. pip install gradio openai - pip install flash-attn==2.5.7 + pip install flash-attn==2.5.9.post1 run: | diff --git a/llm/qwen/README.md b/llm/qwen/README.md index 113bbd9e740..6a76af71287 100644 --- a/llm/qwen/README.md +++ b/llm/qwen/README.md @@ -1,10 +1,12 @@ -# Serving Qwen1.5 on Your Own Cloud +# Serving Qwen2 on Your Own Cloud -[Qwen1.5](https://github.com/QwenLM/Qwen1.5) is one of the top open LLMs. -As of Feb 2024, Qwen1.5-72B-Chat is ranked higher than Mixtral-8x7b-Instruct-v0.1 on the LMSYS Chatbot Arena Leaderboard. +[Qwen2](https://github.com/QwenLM/Qwen2) is one of the top open LLMs. +As of Jun 2024, Qwen1.5-110B-Chat is ranked higher than GPT-4-0613 on the [LMSYS Chatbot Arena Leaderboard](https://chat.lmsys.org/?leaderboard). 📰 **Update (26 April 2024) -** SkyPilot now also supports the [**Qwen1.5-110B**](https://qwenlm.github.io/blog/qwen1.5-110b/) model! It performs competitively with Llama-3-70B across a [series of evaluations](https://qwenlm.github.io/blog/qwen1.5-110b/#model-quality). Use [serve-110b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/qwen/serve-110b.yaml) to serve the 110B model. +📰 **Update (6 Jun 2024) -** SkyPilot now also supports the [**Qwen2**](https://qwenlm.github.io/blog/qwen2/) model! It further improves upon the already competitive Qwen1.5. + 

qwen

@@ -99,7 +101,7 @@ ENDPOINT=$(sky serve status --endpoint qwen) curl http://$ENDPOINT/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "Qwen/Qwen1.5-72B-Chat", + "model": "Qwen/Qwen2-72B-Instruct", "messages": [ { "role": "system", @@ -121,7 +123,7 @@ It is also possible to access the Qwen service with a GUI using [vLLM](https://g 1. Start the chat web UI (change the `--env` flag to the model you are running): ```bash -sky launch -c qwen-gui ./gui.yaml --env MODEL_NAME='Qwen/Qwen1.5-72B-Chat' --env ENDPOINT=$(sky serve status --endpoint qwen) +sky launch -c qwen-gui ./gui.yaml --env MODEL_NAME='Qwen/Qwen2-72B-Instruct' --env ENDPOINT=$(sky serve status --endpoint qwen) ``` 2. Then, we can access the GUI at the returned gradio link: diff --git a/llm/qwen/serve-110b.yaml b/llm/qwen/serve-110b.yaml index 857f37370b4..1e98bd254e9 100644 --- a/llm/qwen/serve-110b.yaml +++ b/llm/qwen/serve-110b.yaml @@ -29,8 +29,8 @@ setup: | conda create -n qwen python=3.10 -y conda activate qwen fi - pip install -U vllm==0.4.1 - pip install -U transformers==4.38.0 + pip install vllm==0.4.2 + pip install flash-attn==2.5.9.post1 run: | conda activate qwen diff --git a/llm/qwen/serve-72b.yaml b/llm/qwen/serve-72b.yaml index 86248011bbf..34e3e348f2f 100644 --- a/llm/qwen/serve-72b.yaml +++ b/llm/qwen/serve-72b.yaml @@ -1,5 +1,5 @@ envs: - MODEL_NAME: Qwen/Qwen1.5-72B-Chat + MODEL_NAME: Qwen/Qwen2-72B-Instruct service: # Specifying the path to the endpoint to check the readiness of the replicas. @@ -29,8 +29,8 @@ setup: | conda create -n qwen python=3.10 -y conda activate qwen fi - pip install -U vllm==0.3.2 - pip install -U transformers==4.38.0 + pip install vllm==0.4.2 + pip install flash-attn==2.5.9.post1 run: | conda activate qwen diff --git a/llm/qwen/serve-7b.yaml b/llm/qwen/serve-7b.yaml index a1ec7ee3f2b..f33adcdd2cd 100644 --- a/llm/qwen/serve-7b.yaml +++ b/llm/qwen/serve-7b.yaml @@ -1,5 +1,5 @@ envs: - MODEL_NAME: Qwen/Qwen1.5-7B-Chat + MODEL_NAME: Qwen/Qwen2-7B-Instruct service: # Specifying the path to the endpoint to check the readiness of the replicas. @@ -27,8 +27,8 @@ setup: | conda create -n qwen python=3.10 -y conda activate qwen fi - pip install -U vllm==0.3.2 - pip install -U transformers==4.38.0 + pip install vllm==0.4.2 + pip install flash-attn==2.5.9.post1 run: | conda activate qwen diff --git a/llm/vllm/README.md b/llm/vllm/README.md index 61932cd8571..e3a2befbecc 100644 --- a/llm/vllm/README.md +++ b/llm/vllm/README.md @@ -33,6 +33,8 @@ sky launch -c vllm-llama2 serve-openai-api.yaml --env HF_TOKEN=YOUR_HUGGING_FACE ```bash sky launch -c vllm-llama2 serve-openai-api.yaml --gpus V100:1 --env HF_TOKEN=YOUR_HUGGING_FACE_API_TOKEN ``` +**Tip**: You can also use the vLLM docker container for faster setup. Refer to [serve-openai-api-docker.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/vllm/serve-openai-api-docker.yaml) for more. + 2. Check the IP for the cluster with: ``` IP=$(sky status --ip vllm-llama2) diff --git a/llm/vllm/serve-openai-api-docker.yaml b/llm/vllm/serve-openai-api-docker.yaml new file mode 100644 index 00000000000..0a980092e99 --- /dev/null +++ b/llm/vllm/serve-openai-api-docker.yaml @@ -0,0 +1,20 @@ +envs: + MODEL_NAME: meta-llama/Llama-2-7b-chat-hf + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. 
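 + # Note: the vllm/vllm-openai image ships its own Python environment, so + # the setup/run sections below run `conda deactivate` before using it. 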
+ +resources: + image_id: docker:vllm/vllm-openai:latest + accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1} + ports: + - 8000 + +setup: | + conda deactivate + python3 -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" + +run: | + conda deactivate + echo 'Starting vllm openai api server...' + python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_NAME --tokenizer hf-internal-testing/llama-tokenizer \ + --host 0.0.0.0 diff --git a/sky/adaptors/azure.py b/sky/adaptors/azure.py index 44618a8f64f..6bd57bc6bec 100644 --- a/sky/adaptors/azure.py +++ b/sky/adaptors/azure.py @@ -3,8 +3,10 @@ # pylint: disable=import-outside-toplevel import functools import threading +import time from sky.adaptors import common +from sky.utils import common_utils azure = common.LazyImport( 'azure', @@ -13,13 +15,30 @@ _LAZY_MODULES = (azure,) _session_creation_lock = threading.RLock() +_MAX_RETRY_FOR_GET_SUBSCRIPTION_ID = 5 @common.load_lazy_modules(modules=_LAZY_MODULES) +@functools.lru_cache() def get_subscription_id() -> str: """Get the default subscription id.""" from azure.common import credentials - return credentials.get_cli_profile().get_subscription_id() + retry = 0 + backoff = common_utils.Backoff(initial_backoff=0.5, max_backoff_factor=4) + while True: + try: + return credentials.get_cli_profile().get_subscription_id() + except Exception as e: + if ('Please run \'az login\' to setup account.' in str(e) and + retry < _MAX_RETRY_FOR_GET_SUBSCRIPTION_ID): + # When there are multiple processes trying to get the + # subscription id, it may fail with the above error message. + # Retry will fix the issue. + retry += 1 + + time.sleep(backoff.current_backoff()) + continue + raise @common.load_lazy_modules(modules=_LAZY_MODULES) @@ -36,8 +55,8 @@ def exceptions(): return azure_exceptions -@functools.lru_cache() @common.load_lazy_modules(modules=_LAZY_MODULES) +@functools.lru_cache() def get_client(name: str, subscription_id: str): # Sky only supports Azure CLI credential for now. # Increase the timeout to fix the Azure get-access-token timeout issue. diff --git a/sky/adaptors/gcp.py b/sky/adaptors/gcp.py index 6465709d42c..9f63bec87ee 100644 --- a/sky/adaptors/gcp.py +++ b/sky/adaptors/gcp.py @@ -21,8 +21,9 @@ def build(service_name: str, version: str, *args, **kwargs): service_name: GCP service name (e.g., 'compute', 'storagetransfer'). version: Service version (e.g., 'v1'). """ - from googleapiclient import discovery - return discovery.build(service_name, version, *args, **kwargs) + + return googleapiclient.discovery.build(service_name, version, *args, + **kwargs) @common.load_lazy_modules(_LAZY_MODULES) diff --git a/sky/adaptors/kubernetes.py b/sky/adaptors/kubernetes.py index 7cdb3ff3059..7f52a099f56 100644 --- a/sky/adaptors/kubernetes.py +++ b/sky/adaptors/kubernetes.py @@ -2,9 +2,11 @@ # pylint: disable=import-outside-toplevel +import logging import os from sky.adaptors import common +from sky.sky_logging import set_logging_level from sky.utils import env_options from sky.utils import ux_utils @@ -28,6 +30,33 @@ API_TIMEOUT = 5 +def _decorate_methods(obj, decorator): + for attr_name in dir(obj): + attr = getattr(obj, attr_name) + if callable(attr) and not attr_name.startswith('__'): + setattr(obj, attr_name, decorator(attr)) + return obj + + +def _api_logging_decorator(logger: str, level: int): + """Decorator to set logging level for API calls. + + This is used to suppress the verbose logging from urllib3 when calls to the + Kubernetes API timeout. 
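 + + Args: + logger: Name of the logger to override (e.g., 'urllib3'). + level: Logging level to set on that logger while the wrapped API + object is in use. 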
+ """ + + def decorated_api(api): + + def wrapped(*args, **kwargs): + obj = api(*args, **kwargs) + _decorate_methods(obj, set_logging_level(logger, level)) + return obj + + return wrapped + + return decorated_api + + def _load_config(): global _configured if _configured: @@ -65,15 +94,16 @@ def _load_config(): _configured = True +@_api_logging_decorator('urllib3', logging.ERROR) def core_api(): global _core_api if _core_api is None: _load_config() _core_api = kubernetes.client.CoreV1Api() - return _core_api +@_api_logging_decorator('urllib3', logging.ERROR) def auth_api(): global _auth_api if _auth_api is None: @@ -83,6 +113,7 @@ def auth_api(): return _auth_api +@_api_logging_decorator('urllib3', logging.ERROR) def networking_api(): global _networking_api if _networking_api is None: @@ -92,6 +123,7 @@ def networking_api(): return _networking_api +@_api_logging_decorator('urllib3', logging.ERROR) def custom_objects_api(): global _custom_objects_api if _custom_objects_api is None: @@ -101,6 +133,7 @@ def custom_objects_api(): return _custom_objects_api +@_api_logging_decorator('urllib3', logging.ERROR) def node_api(): global _node_api if _node_api is None: @@ -110,6 +143,7 @@ def node_api(): return _node_api +@_api_logging_decorator('urllib3', logging.ERROR) def apps_api(): global _apps_api if _apps_api is None: @@ -119,6 +153,7 @@ def apps_api(): return _apps_api +@_api_logging_decorator('urllib3', logging.ERROR) def api_client(): global _api_client if _api_client is None: diff --git a/sky/authentication.py b/sky/authentication.py index 966dad670c5..c61e0ce36c8 100644 --- a/sky/authentication.py +++ b/sky/authentication.py @@ -439,29 +439,38 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]: f'Key {secret_name} does not exist in the cluster, creating it...') kubernetes.core_api().create_namespaced_secret(namespace, secret) - ssh_jump_name = clouds.Kubernetes.SKY_SSH_JUMP_NAME + private_key_path, _ = get_or_generate_keys() if network_mode == nodeport_mode: + ssh_jump_name = clouds.Kubernetes.SKY_SSH_JUMP_NAME service_type = kubernetes_enums.KubernetesServiceType.NODEPORT + # Setup service for SSH jump pod. We create the SSH jump service here + # because we need to know the service IP address and port to set the + # ssh_proxy_command in the autoscaler config. + kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace, + service_type) + ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command( + ssh_jump_name, + nodeport_mode, + private_key_path=private_key_path, + namespace=namespace) elif network_mode == port_forward_mode: + # Using `kubectl port-forward` creates a direct tunnel to the pod and + # does not require a ssh jump pod. kubernetes_utils.check_port_forward_mode_dependencies() - # Using `kubectl port-forward` creates a direct tunnel to jump pod and - # does not require opening any ports on Kubernetes nodes. As a result, - # the service can be a simple ClusterIP service which we access with - # `kubectl port-forward`. - service_type = kubernetes_enums.KubernetesServiceType.CLUSTERIP + # TODO(romilb): This can be further optimized. Instead of using the + # head node as a jump pod for worker nodes, we can also directly + # set the ssh_target to the worker node. However, that requires + # changes in the downstream code to return a mapping of node IPs to + # pod names (to be used as ssh_target) and updating the upstream + # SSHConfigHelper to use a different ProxyCommand for each pod. 
 + # This optimization can reduce SSH time from ~0.35s to ~0.25s, tested + # on GKE. + ssh_target = config['cluster_name'] + '-head' + ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command( + ssh_target, port_forward_mode, private_key_path=private_key_path) else: # This should never happen because we check for this in from_str above. raise ValueError(f'Unsupported networking mode: {network_mode_str}') - # Setup service for SSH jump pod. We create the SSH jump service here - # because we need to know the service IP address and port to set the - # ssh_proxy_command in the autoscaler config. - kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace, service_type) - - ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command( - PRIVATE_SSH_KEY_PATH, ssh_jump_name, network_mode, namespace, - clouds.Kubernetes.PORT_FORWARD_PROXY_CMD_PATH, - clouds.Kubernetes.PORT_FORWARD_PROXY_CMD_TEMPLATE) - config['auth']['ssh_proxy_command'] = ssh_proxy_cmd return config diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index b1598c7c039..a1c86fdb624 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -146,6 +146,7 @@ # Clouds with new provisioner has docker_login_config in the # docker field, instead of the provider field. ('docker', 'docker_login_config'), + ('docker', 'run_options'), # Other clouds ('provider', 'docker_login_config'), ('provider', 'firewall_rule'), @@ -873,8 +874,23 @@ def write_cluster_config( f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\'' ) + # Docker run options + docker_run_options = skypilot_config.get_nested(('docker', 'run_options'), + []) + if isinstance(docker_run_options, str): + docker_run_options = [docker_run_options] + if docker_run_options and isinstance(to_provision.cloud, clouds.Kubernetes): + logger.warning(f'{colorama.Style.DIM}Docker run options are specified, ' 'but ignored for Kubernetes: ' f'{" ".join(docker_run_options)}' f'{colorama.Style.RESET_ALL}') + # Use a tmp file path to avoid incomplete YAML file being re-used in the # future. + initial_setup_commands = [] + if (skypilot_config.get_nested(('nvidia_gpus', 'disable_ecc'), False) and + to_provision.accelerators is not None): + initial_setup_commands.append(constants.DISABLE_GPU_ECC_COMMAND) tmp_yaml_path = yaml_path + '.tmp' common_utils.fill_template( cluster_config_template, @@ -906,6 +922,8 @@ def write_cluster_config( # currently only used by GCP. 'specific_reservations': specific_reservations, + # Initial setup commands. + 'initial_setup_commands': initial_setup_commands, # Conda setup 'conda_installation_commands': constants.CONDA_INSTALLATION_COMMANDS, @@ -917,6 +935,9 @@ def write_cluster_config( wheel_hash).replace('{cloud}', str(cloud).lower())), + # Docker + 'docker_run_options': docker_run_options, + # Port of Ray (GCS server). # Ray's default port 6379 is conflicted with Redis. 'ray_port': constants.SKY_REMOTE_RAY_PORT, @@ -979,7 +1000,11 @@ def write_cluster_config( with open(tmp_yaml_path, 'w', encoding='utf-8') as f: f.write(restored_yaml_content) - config_dict['cluster_name_on_cloud'] = cluster_name_on_cloud + # Read the cluster name from the tmp yaml file, to take the backward + # compatibility restoration above into account. + # TODO: remove this after 2 minor releases, 0.8.0. 
+ yaml_config = common_utils.read_yaml(tmp_yaml_path) + config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name'] # Optimization: copy the contents of source files in file_mounts to a # special dir, and upload that as the only file_mount instead. Delay @@ -1226,6 +1251,12 @@ def ssh_credential_from_yaml( ssh_private_key = auth_section.get('ssh_private_key') ssh_control_name = config.get('cluster_name', '__default__') ssh_proxy_command = auth_section.get('ssh_proxy_command') + + # Update the ssh_user placeholder in proxy command, if required + if (ssh_proxy_command is not None and + constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command): + ssh_proxy_command = ssh_proxy_command.replace( + constants.SKY_SSH_USER_PLACEHOLDER, ssh_user) credentials = { 'ssh_user': ssh_user, 'ssh_private_key': ssh_private_key, @@ -2650,27 +2681,6 @@ def stop_handler(signum, frame): raise KeyboardInterrupt(exceptions.SIGTSTP_CODE) -def run_command_and_handle_ssh_failure(runner: command_runner.SSHCommandRunner, - command: str, - failure_message: str) -> str: - """Runs command remotely and returns output with proper error handling.""" - rc, stdout, stderr = runner.run(command, - require_outputs=True, - stream_logs=False) - if rc == 255: - # SSH failed - raise RuntimeError( - f'SSH with user {runner.ssh_user} and key {runner.ssh_private_key} ' - f'to {runner.ip} failed. This is most likely due to incorrect ' - 'credentials or incorrect permissions for the key file. Check ' - 'your credentials and try again.') - subprocess_utils.handle_returncode(rc, - command, - failure_message, - stderr=stderr) - return stdout - - def check_rsync_installed() -> None: """Checks if rsync is installed. diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index dcf20796568..e65d49c6276 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -151,6 +151,18 @@ _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024 +def _is_command_length_over_limit(command: str) -> bool: + """Check if the length of the command exceeds the limit. + + We calculate the length of the command after quoting the command twice as + when it is executed by the CommandRunner, the command will be quoted twice + to ensure the correctness, which will add significant length to the command. + """ + + quoted_length = len(shlex.quote(shlex.quote(command))) + return quoted_length > _MAX_INLINE_SCRIPT_LENGTH + + def _get_cluster_config_template(cloud): cloud_to_template = { clouds.AWS: 'aws-ray.yml.j2', @@ -269,8 +281,9 @@ def add_prologue(self, job_id: int) -> None: SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r} kwargs = dict() - # Only set the `_temp_dir` to SkyPilot's ray cluster directory when the directory - # exists for backward compatibility for the VM launched before #1790. + # Only set the `_temp_dir` to SkyPilot's ray cluster directory when + # the directory exists for backward compatibility for the VM + # launched before #1790. if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}): kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r} ray.init( @@ -308,8 +321,9 @@ def get_or_fail(futures, pg) -> List[int]: ready, unready = ray.wait(unready) idx = futures.index(ready[0]) returncodes[idx] = ray.get(ready[0]) - # Remove the placement group after all tasks are done, so that the - # next job can be scheduled on the released resources immediately. 
+ # Remove the placement group after all tasks are done, so that + # the next job can be scheduled on the released resources + # immediately. ray_util.remove_placement_group(pg) sys.stdout.flush() return returncodes @@ -348,9 +362,9 @@ def add_gang_scheduling_placement_group_and_setup( num_nodes: int, resources_dict: Dict[str, float], stable_cluster_internal_ips: List[str], + env_vars: Dict[str, str], setup_cmd: Optional[str] = None, setup_log_path: Optional[str] = None, - env_vars: Optional[Dict[str, str]] = None, ) -> None: """Create the gang scheduling placement group for a Task. @@ -410,6 +424,8 @@ def add_gang_scheduling_placement_group_and_setup( job_id = self.job_id if setup_cmd is not None: + setup_envs = env_vars.copy() + setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes) self._code += [ textwrap.dedent(f"""\ setup_cmd = {setup_cmd!r} @@ -439,7 +455,7 @@ def add_gang_scheduling_placement_group_and_setup( .remote( setup_cmd, os.path.expanduser({setup_log_path!r}), - env_vars={env_vars!r}, + env_vars={setup_envs!r}, stream_logs=True, with_ray=True, ) for i in range(total_num_nodes)] @@ -550,11 +566,13 @@ def add_ray_task(self, f'placement_group_bundle_index={gang_scheduling_id})') sky_env_vars_dict_str = [ - textwrap.dedent("""\ - sky_env_vars_dict = {} - sky_env_vars_dict['SKYPILOT_NODE_IPS'] = job_ip_list_str - # Environment starting with `SKY_` is deprecated. + textwrap.dedent(f"""\ + sky_env_vars_dict = {{}} + sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str + # Backward compatibility: Environment starting with `SKY_` is + # deprecated. Remove it in v0.9.0. sky_env_vars_dict['SKY_NODE_IPS'] = job_ip_list_str + sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list) """) ] @@ -575,8 +593,9 @@ def add_ray_task(self, if script is not None: - sky_env_vars_dict['SKYPILOT_NUM_GPUS_PER_NODE'] = {int(math.ceil(num_gpus))!r} - # Environment starting with `SKY_` is deprecated. + sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r} + # Backward compatibility: Environment starting with `SKY_` is + # deprecated. Remove it in v0.9.0. sky_env_vars_dict['SKY_NUM_GPUS_PER_NODE'] = {int(math.ceil(num_gpus))!r} ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}] @@ -593,12 +612,14 @@ def add_ray_task(self, node_name = f'worker{{idx_in_cluster}}' name_str = f'{{node_name}}, rank={{rank}},' log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log')) - sky_env_vars_dict['SKYPILOT_NODE_RANK'] = rank - # Environment starting with `SKY_` is deprecated. + sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank + # Backward compatibility: Environment starting with `SKY_` is + # deprecated. Remove it in v0.9.0. sky_env_vars_dict['SKY_NODE_RANK'] = rank sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id} - # Environment starting with `SKY_` is deprecated. + # Backward compatibility: Environment starting with `SKY_` is + # deprecated. Remove it in v0.9.0. sky_env_vars_dict['SKY_INTERNAL_JOB_ID'] = {self.job_id} futures.append(run_bash_command_with_log \\ @@ -2000,8 +2021,16 @@ def provision_with_retries( failover_history: List[Exception] = list() style = colorama.Style + fore = colorama.Fore # Retrying launchable resources. while True: + if (isinstance(to_provision.cloud, clouds.Azure) and + to_provision.accelerators is not None and + 'A10' in to_provision.accelerators): + logger.warning(f'{style.BRIGHT}{fore.YELLOW}Trying to launch ' + 'an A10 cluster on Azure. 
This may take ~20 '
                               'minutes due to driver installation.'
                               f'{style.RESET_ALL}')
            try:
                # Recheck cluster name as the 'except:' block below may
                # change the cloud assignment.
@@ -3045,7 +3074,10 @@ def _update_after_cluster_provisioned(
        )
        usage_lib.messages.usage.update_final_cluster_status(
            status_lib.ClusterStatus.UP)
-        auth_config = common_utils.read_yaml(handle.cluster_yaml)['auth']
+        auth_config = backend_utils.ssh_credential_from_yaml(
+            handle.cluster_yaml,
+            ssh_user=handle.ssh_user,
+            docker_user=handle.docker_user)
        backend_utils.SSHConfigHelper.add_cluster(handle.cluster_name, ip_list,
                                                  auth_config, ssh_port_list,
@@ -3151,8 +3183,7 @@ def _setup_node(node_id: int) -> None:
            setup_script = log_lib.make_task_bash_script(setup,
                                                         env_vars=setup_envs)
            encoded_script = shlex.quote(setup_script)
-            if (detach_setup or
-                    len(encoded_script) > _MAX_INLINE_SCRIPT_LENGTH):
+            if detach_setup or _is_command_length_over_limit(encoded_script):
                with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f:
                    f.write(setup_script)
                    f.flush()
@@ -3176,11 +3207,11 @@ def _setup_node(node_id: int) -> None:
                process_stream=False,
                # We do not source bashrc for setup, since bashrc is sourced
                # in the script already.
-                # Skip two lines due to the /bin/bash -i and source ~/.bashrc
-                # in the setup_cmd.
+                # Skip an empty line and two lines due to the /bin/bash -i and
+                # source ~/.bashrc in the setup_cmd.
                # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long
                # bash: no job control in this shell
-                skip_lines=2,
+                skip_lines=3,
            )

            def error_message() -> str:
@@ -3263,7 +3294,7 @@ def _exec_code_on_head(
            code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
            job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])

-        if len(job_submit_cmd) > _MAX_INLINE_SCRIPT_LENGTH:
+        if _is_command_length_over_limit(job_submit_cmd):
            runners = handle.get_command_runners()
            head_runner = runners[0]
            with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
@@ -3645,7 +3676,10 @@ def _rsync_down(args) -> None:
            try:
                os.makedirs(local_log_dir, exist_ok=True)
                runner.rsync(
-                    source=f'{remote_log_dir}/*',
+                    # Require a `/` at the end to make sure the parent dir
+                    # is not created locally. We do not add an additional '*' as
+                    # kubernetes's rsync does not work with an ending '*'.
+                    source=f'{remote_log_dir}/',
                    target=local_log_dir,
                    up=False,
                    stream_logs=False,
@@ -3654,7 +3688,7 @@ def _rsync_down(args) -> None:
                if e.returncode == exceptions.RSYNC_FILE_NOT_FOUND_CODE:
                    # Raised by rsync_down. Remote log dir may not exist, since
                    # the job can be run on some part of the nodes.
-                    logger.debug(f'{runner.ip} does not have the tasks/*.')
+                    logger.debug(f'{runner.node_id} does not have the tasks/*.')
                else:
                    raise
@@ -3669,6 +3703,14 @@ def tail_logs(self,
                  job_id: Optional[int],
                  managed_job_id: Optional[int] = None,
                  follow: bool = True) -> int:
+        """Tail the logs of a job.
+
+        Args:
+            handle: The handle to the cluster.
+            job_id: The job ID to tail the logs of.
+            managed_job_id: The managed job ID, for display purposes only.
+            follow: Whether to follow the logs.
+ """ code = job_lib.JobLibCodeGen.tail_logs(job_id, managed_job_id=managed_job_id, follow=follow) @@ -3703,15 +3745,12 @@ def tail_managed_job_logs(self, handle: CloudVmRayResourceHandle, job_id: Optional[int] = None, job_name: Optional[str] = None, + controller: bool = False, follow: bool = True) -> None: # if job_name is not None, job_id should be None assert job_name is None or job_id is None, (job_name, job_id) - if job_name is not None: - code = managed_jobs.ManagedJobCodeGen.stream_logs_by_name( - job_name, follow) - else: - code = managed_jobs.ManagedJobCodeGen.stream_logs_by_id( - job_id, follow) + code = managed_jobs.ManagedJobCodeGen.stream_logs( + job_name, job_id, follow, controller) # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly # kill the process, so we need to handle it manually here. @@ -3858,22 +3897,8 @@ def teardown_no_lock(self, self.post_teardown_cleanup(handle, terminate, purge) return - if terminate and isinstance(cloud, clouds.Azure): - # Here we handle termination of Azure by ourselves instead of Ray - # autoscaler. - resource_group = config['provider']['resource_group'] - terminate_cmd = f'az group delete -y --name {resource_group}' - with rich_utils.safe_status(f'[bold cyan]Terminating ' - f'[green]{cluster_name}'): - returncode, stdout, stderr = log_lib.run_with_log( - terminate_cmd, - log_abs_path, - shell=True, - stream_logs=False, - require_outputs=True) - - elif (isinstance(cloud, clouds.IBM) and terminate and - prev_cluster_status == status_lib.ClusterStatus.STOPPED): + if (isinstance(cloud, clouds.IBM) and terminate and + prev_cluster_status == status_lib.ClusterStatus.STOPPED): # pylint: disable= W0622 W0703 C0415 from sky.adaptors import ibm from sky.skylet.providers.ibm.vpc_provider import IBMVPCProvider @@ -3991,14 +4016,8 @@ def teardown_no_lock(self, # never launched and the errors are related to pre-launch # configurations (such as VPC not found). So it's safe & good UX # to not print a failure message. - # - # '(ResourceGroupNotFound)': this indicates the resource group on - # Azure is not found. That means the cluster is already deleted - # on the cloud. So it's safe & good UX to not print a failure - # message. elif ('TPU must be specified.' not in stderr and - 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr and - '(ResourceGroupNotFound)' not in stderr): + 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr): raise RuntimeError( _TEARDOWN_FAILURE_MESSAGE.format( extra_reason='', @@ -4761,9 +4780,9 @@ def _execute_task_one_node(self, handle: CloudVmRayResourceHandle, 1, resources_dict, stable_cluster_internal_ips=internal_ips, + env_vars=task_env_vars, setup_cmd=self._setup_cmd, setup_log_path=os.path.join(log_dir, 'setup.log'), - env_vars=task_env_vars, ) if callable(task.run): @@ -4810,9 +4829,10 @@ def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, num_actual_nodes, resources_dict, stable_cluster_internal_ips=internal_ips, + env_vars=task_env_vars, setup_cmd=self._setup_cmd, setup_log_path=os.path.join(log_dir, 'setup.log'), - env_vars=task_env_vars) + ) if callable(task.run): run_fn_code = textwrap.dedent(inspect.getsource(task.run)) diff --git a/sky/cli.py b/sky/cli.py index 9a45a35ae55..db5291d949c 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -772,6 +772,8 @@ def _make_task_or_dag_from_entrypoint_with_overrides( else: task = sky.Task(name='sky-cmd', run=entrypoint) task.set_resources({sky.Resources()}) + # env update has been done for DAG in load_chain_dag_from_yaml for YAML. 
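(Aside: the `_is_command_length_over_limit` helper introduced earlier in this diff measures the command after quoting it twice, mirroring the two layers of quoting the CommandRunner applies before execution. A minimal, self-contained Python sketch of why that matters; the lengths shown hold for this example string only:)

    import shlex

    cmd = "echo 'hello world' && echo \"done\""
    once = shlex.quote(cmd)
    twice = shlex.quote(once)
    # Each pass wraps the string in quotes and escapes embedded quotes,
    # so the payload grows with every layer: 33 -> 43 -> 77 chars here.
    print(len(cmd), len(once), len(twice))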
+    task.update_envs(env)

    # Override.
    if workdir is not None:
@@ -787,7 +789,6 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
        task.num_nodes = num_nodes
    if name is not None:
        task.name = name
-    task.update_envs(env)
    return task

@@ -2966,6 +2967,15 @@ def show_gpus(
    To show all regions for a specified accelerator, use
    ``sky show-gpus --all-regions``.

+    If ``--region`` or ``--all-regions`` is not specified, the price displayed
+    for each instance type is the lowest across all regions for both on-demand
+    and spot instances. There may be multiple regions with the same lowest
+    price.
+
+    If ``--cloud kubernetes`` is specified, it will show the maximum quantities
+    of the GPU available on a single node and the real-time availability of
+    the GPU across all nodes in the Kubernetes cluster.
+
    Definitions of certain fields:

    * ``DEVICE_MEM``: Memory of a single device; does not depend on the device
@@ -2973,10 +2983,15 @@ def show_gpus(

    * ``HOST_MEM``: Memory of the host instance (VM).

-    If ``--region`` or ``--all-regions`` is not specified, the price displayed
-    for each instance type is the lowest across all regions for both on-demand
-    and spot instances. There may be multiple regions with the same lowest
-    price.
+    * ``QTY_PER_NODE`` (Kubernetes only): GPU quantities that can be requested
+    on a single node.
+
+    * ``TOTAL_GPUS`` (Kubernetes only): Total number of GPUs available in the
+    Kubernetes cluster.
+
+    * ``TOTAL_FREE_GPUS`` (Kubernetes only): Number of currently free GPUs
+    in the Kubernetes cluster. This is fetched in real-time and may change
+    when other users are using the cluster.
    """
    # validation for the --region flag
    if region is not None and cloud is None:
@@ -2999,9 +3014,64 @@ def show_gpus(
    if show_all and accelerator_str is not None:
        raise click.UsageError('--all is only allowed without a GPU name.')

+    # Kubernetes specific bools
+    cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes)
+    kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
+    kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
+        sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())
+
+    if cloud_is_kubernetes and region is not None:
+        raise click.UsageError(
+            'The --region flag cannot be set with --cloud kubernetes.')
+
    def _list_to_str(lst):
        return ', '.join([str(e) for e in lst])

+    def _get_kubernetes_realtime_gpu_table(
+            name_filter: Optional[str] = None,
+            quantity_filter: Optional[int] = None):
+        if quantity_filter:
+            qty_header = 'QTY_FILTER'
+            free_header = 'FILTERED_FREE_GPUS'
+        else:
+            qty_header = 'QTY_PER_NODE'
+            free_header = 'TOTAL_FREE_GPUS'
+        realtime_gpu_table = log_utils.create_table(
+            ['GPU', qty_header, 'TOTAL_GPUS', free_header])
+        counts, capacity, available = service_catalog.list_accelerator_realtime(
+            gpus_only=True,
+            clouds='kubernetes',
+            name_filter=name_filter,
+            region_filter=region,
+            quantity_filter=quantity_filter,
+            case_sensitive=False)
+        assert (set(counts.keys()) == set(capacity.keys()) == set(
+            available.keys())), (f'Keys of counts ({list(counts.keys())}), '
+                                 f'capacity ({list(capacity.keys())}), '
+                                 f'and available ({list(available.keys())}) '
+                                 'must be the same.')
+        if len(counts) == 0:
+            err_msg = 'No GPUs found in Kubernetes cluster. 
' + debug_msg = 'To further debug, run: sky check ' + if name_filter is not None: + gpu_info_msg = f' {name_filter!r}' + if quantity_filter is not None: + gpu_info_msg += (' with requested quantity' + f' {quantity_filter}') + err_msg = (f'Resources{gpu_info_msg} not found ' + 'in Kubernetes cluster. ') + debug_msg = ('To show available accelerators on kubernetes,' + ' run: sky show-gpus --cloud kubernetes ') + full_err_msg = (err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE + + debug_msg) + raise ValueError(full_err_msg) + for gpu, _ in sorted(counts.items()): + realtime_gpu_table.add_row([ + gpu, + _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu] + ]) + return realtime_gpu_table + def _output(): gpu_table = log_utils.create_table( ['COMMON_GPU', 'AVAILABLE_QUANTITIES']) @@ -3012,35 +3082,69 @@ def _output(): name, quantity = None, None - # Kubernetes specific bools - cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes) - kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type( - ) is not None + # Optimization - do not poll for Kubernetes API for fetching + # common GPUs because that will be fetched later for the table after + # common GPUs. + clouds_to_list = cloud + if cloud is None: + clouds_to_list = [ + c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes' + ] + k8s_messages = '' if accelerator_str is None: + # Collect k8s related messages in k8s_messages and print them at end + print_section_titles = False + # If cloud is kubernetes, we want to show real-time capacity + if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes): + try: + # If --cloud kubernetes is not specified, we want to catch + # the case where no GPUs are available on the cluster and + # print the warning at the end. + k8s_realtime_table = _get_kubernetes_realtime_gpu_table() + except ValueError as e: + if not cloud_is_kubernetes: + # Make it a note if cloud is not kubernetes + k8s_messages += 'Note: ' + k8s_messages += str(e) + else: + print_section_titles = True + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') + yield from k8s_realtime_table.get_string() + if kubernetes_autoscaling: + k8s_messages += ( + '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE) + if cloud_is_kubernetes: + # Do not show clouds if --cloud kubernetes is specified + if not kubernetes_is_enabled: + yield ('Kubernetes is not enabled. To fix, run: ' + 'sky check kubernetes ') + yield k8s_messages + return + + # For show_all, show the k8s message at the start since output is + # long and the user may not scroll to the end. + if show_all and k8s_messages: + yield k8s_messages + yield '\n\n' + result = service_catalog.list_accelerator_counts( gpus_only=True, - clouds=cloud, + clouds=clouds_to_list, region_filter=region, ) - if len(result) == 0 and cloud_is_kubernetes: - yield kubernetes_utils.NO_GPU_ERROR_MESSAGE - if kubernetes_autoscaling: - yield '\n' - yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE - return + if print_section_titles: + # If section titles were printed above, print again here + yield '\n\n' + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Cloud GPUs{colorama.Style.RESET_ALL}\n') # "Common" GPUs - # If cloud is kubernetes, we want to show all GPUs here, even if - # they are not listed as common in SkyPilot. 
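To make the Kubernetes realtime path above concrete: `_get_kubernetes_realtime_gpu_table` consumes three dictionaries keyed by accelerator name. A small sketch with made-up numbers (the real values come from `service_catalog.list_accelerator_realtime`):

    # Hypothetical values, for illustration only:
    counts = {'A100': [1, 2, 4, 8], 'T4': [1, 2]}  # requestable quantities
    capacity = {'A100': 8, 'T4': 2}                # total GPUs in the cluster
    available = {'A100': 5, 'T4': 2}               # GPUs free right now
    # The table builder asserts all three describe the same accelerators:
    assert set(counts) == set(capacity) == set(available)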
- if cloud_is_kubernetes: - for gpu, _ in sorted(result.items()): + for gpu in service_catalog.get_common_gpus(): + if gpu in result: gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))]) - else: - for gpu in service_catalog.get_common_gpus(): - if gpu in result: - gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))]) yield from gpu_table.get_string() # Google TPUs @@ -3058,16 +3162,12 @@ def _output(): other_table.add_row([gpu, _list_to_str(qty)]) yield from other_table.get_string() yield '\n\n' - if (cloud_is_kubernetes or - cloud is None) and kubernetes_autoscaling: - yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE - yield '\n\n' else: yield ('\n\nHint: use -a/--all to see all accelerators ' '(including non-common ones) and pricing.') - if (cloud_is_kubernetes or - cloud is None) and kubernetes_autoscaling: - yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + if k8s_messages: + yield '\n' + yield k8s_messages return else: # Parse accelerator string @@ -3091,12 +3191,40 @@ def _output(): else: name, quantity = accelerator_str, None + print_section_titles = False + if (kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and + not show_all): + # Print section title if not showing all and instead a specific + # accelerator is requested + print_section_titles = True + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') + try: + k8s_realtime_table = _get_kubernetes_realtime_gpu_table( + name_filter=name, quantity_filter=quantity) + yield from k8s_realtime_table.get_string() + except ValueError as e: + # In the case of a specific accelerator, show the error message + # immediately (e.g., "Resources H100 not found ...") + yield str(e) + if kubernetes_autoscaling: + k8s_messages += ('\n' + + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE) + yield k8s_messages + if cloud_is_kubernetes: + # Do not show clouds if --cloud kubernetes is specified + if not kubernetes_is_enabled: + yield ('Kubernetes is not enabled. To fix, run: ' + 'sky check kubernetes ') + return + + # For clouds other than Kubernetes, get the accelerator details # Case-sensitive result = service_catalog.list_accelerators(gpus_only=True, name_filter=name, quantity_filter=quantity, region_filter=region, - clouds=cloud, + clouds=clouds_to_list, case_sensitive=False, all_regions=all_regions) # Import here to save module load speed. @@ -3128,16 +3256,17 @@ def _output(): new_result[gpu] = sorted_dataclasses result = new_result - if len(result) == 0: - if cloud == 'kubernetes': - yield kubernetes_utils.NO_GPU_ERROR_MESSAGE - return + if print_section_titles and not show_all: + yield '\n\n' + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Cloud GPUs{colorama.Style.RESET_ALL}\n') + if len(result) == 0: quantity_str = (f' with requested quantity {quantity}' if quantity else '') - yield f'Resources \'{name}\'{quantity_str} not found. ' - yield 'Try \'sky show-gpus --all\' ' - yield 'to show available accelerators.' + cloud_str = f' on {cloud_obj}.' if cloud else ' in cloud catalogs.' 
+ yield f'Resources \'{name}\'{quantity_str} not found{cloud_str} ' + yield 'To show available accelerators, run: sky show-gpus --all' return for i, (gpu, items) in enumerate(result.items()): @@ -3632,13 +3761,10 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool, controller: bool): """Tail the log of a managed job.""" try: - if controller: - core.tail_logs( - controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name, - job_id=job_id, - follow=follow) - else: - managed_jobs.tail_logs(name=name, job_id=job_id, follow=follow) + managed_jobs.tail_logs(name=name, + job_id=job_id, + follow=follow, + controller=controller) except exceptions.ClusterNotUpError: with ux_utils.print_exception_no_traceback(): raise diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 4df1cd4a4bf..916a1c01c7d 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -7,17 +7,15 @@ import subprocess import textwrap import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional, Tuple import colorama from sky import clouds from sky import exceptions from sky import sky_logging -from sky import status_lib from sky.adaptors import azure from sky.clouds import service_catalog -from sky.skylet import log_lib from sky.utils import common_utils from sky.utils import resources_utils from sky.utils import ux_utils @@ -69,7 +67,8 @@ class Azure(clouds.Cloud): _INDENT_PREFIX = ' ' * 4 - PROVISIONER_VERSION = clouds.ProvisionerVersion.RAY_AUTOSCALER + PROVISIONER_VERSION = clouds.ProvisionerVersion.RAY_PROVISIONER_SKYPILOT_TERMINATOR + STATUS_VERSION = clouds.StatusVersion.SKYPILOT @classmethod def _unsupported_features_for_resources( @@ -270,13 +269,12 @@ def get_vcpus_mem_from_instance_type( def get_zone_shell_cmd(cls) -> Optional[str]: return None - def make_deploy_resources_variables( - self, - resources: 'resources.Resources', - cluster_name_on_cloud: str, - region: 'clouds.Region', - zones: Optional[List['clouds.Zone']], - dryrun: bool = False) -> Dict[str, Optional[str]]: + def make_deploy_resources_variables(self, + resources: 'resources.Resources', + cluster_name_on_cloud: str, + region: 'clouds.Region', + zones: Optional[List['clouds.Zone']], + dryrun: bool = False) -> Dict[str, Any]: assert zones is None, ('Azure does not support zones', zones) region_name = region.name @@ -316,6 +314,10 @@ def make_deploy_resources_variables( 'image_version': version, } + # Setup the A10 nvidia driver. + need_nvidia_driver_extension = (acc_dict is not None and + 'A10' in acc_dict) + # Setup commands to eliminate the banner and restart sshd. # This script will modify /etc/ssh/sshd_config and add a bash script # into .bashrc. The bash script will restart sshd if it has not been @@ -368,6 +370,7 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]: # Azure does not support specific zones. 
'zones': None, **image_config, + 'need_nvidia_driver_extension': need_nvidia_driver_extension, 'disk_tier': Azure._get_disk_type(_failover_disk_tier()), 'cloud_init_setup_commands': cloud_init_setup_commands, 'azure_subscription_id': self.get_project_id(dryrun), @@ -613,90 +616,3 @@ def _get_disk_type(cls, resources_utils.DiskTier.LOW: 'Standard_LRS', } return tier2name[tier] - - @classmethod - def query_status(cls, name: str, tag_filters: Dict[str, str], - region: Optional[str], zone: Optional[str], - **kwargs) -> List[status_lib.ClusterStatus]: - del zone # unused - status_map = { - 'VM starting': status_lib.ClusterStatus.INIT, - 'VM running': status_lib.ClusterStatus.UP, - # 'VM stopped' in Azure means Stopped (Allocated), which still bills - # for the VM. - 'VM stopping': status_lib.ClusterStatus.INIT, - 'VM stopped': status_lib.ClusterStatus.INIT, - # 'VM deallocated' in Azure means Stopped (Deallocated), which does not - # bill for the VM. - 'VM deallocating': status_lib.ClusterStatus.STOPPED, - 'VM deallocated': status_lib.ClusterStatus.STOPPED, - } - tag_filter_str = ' '.join( - f'tags.\\"{k}\\"==\'{v}\'' for k, v in tag_filters.items()) - - query_node_id = (f'az vm list --query "[?{tag_filter_str}].id" -o json') - returncode, stdout, stderr = log_lib.run_with_log(query_node_id, - '/dev/null', - require_outputs=True, - shell=True) - logger.debug(f'{query_node_id} returned {returncode}.\n' - '**** STDOUT ****\n' - f'{stdout}\n' - '**** STDERR ****\n' - f'{stderr}') - if returncode == 0: - if not stdout.strip(): - return [] - node_ids = json.loads(stdout.strip()) - if not node_ids: - return [] - state_str = '[].powerState' - if len(node_ids) == 1: - state_str = 'powerState' - node_ids_str = '\t'.join(node_ids) - query_cmd = ( - f'az vm show -d --ids {node_ids_str} --query "{state_str}" -o json' - ) - returncode, stdout, stderr = log_lib.run_with_log( - query_cmd, '/dev/null', require_outputs=True, shell=True) - logger.debug(f'{query_cmd} returned {returncode}.\n' - '**** STDOUT ****\n' - f'{stdout}\n' - '**** STDERR ****\n' - f'{stderr}') - - # NOTE: Azure cli should be handled carefully. The query command above - # takes about 1 second to run. - # An alternative is the following command, but it will take more than - # 20 seconds to run. - # query_cmd = ( - # f'az vm list --show-details --query "[' - # f'?tags.\\"ray-cluster-name\\" == \'{handle.cluster_name}\' ' - # '&& tags.\\"ray-node-type\\" == \'head\'].powerState" -o tsv' - # ) - - if returncode != 0: - with ux_utils.print_exception_no_traceback(): - raise exceptions.ClusterStatusFetchingError( - f'Failed to query Azure cluster {name!r} status: ' - f'{stdout + stderr}') - - assert stdout.strip(), f'No status returned for {name!r}' - - original_statuses_list = json.loads(stdout.strip()) - if not original_statuses_list: - # No nodes found. The original_statuses_list will be empty string. - # Return empty list. 
-                return []
-        if not isinstance(original_statuses_list, list):
-            original_statuses_list = [original_statuses_list]
-        statuses = []
-        for s in original_statuses_list:
-            if s not in status_map:
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.ClusterStatusFetchingError(
-                        f'Failed to parse status from Azure response: {stdout}')
-            node_status = status_map[s]
-            if node_status is not None:
-                statuses.append(node_status)
-        return statuses
diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py
index 1a32bb0bd2c..3ad66306517 100644
--- a/sky/clouds/cudo.py
+++ b/sky/clouds/cudo.py
@@ -66,6 +66,10 @@ class Cudo(clouds.Cloud):
        clouds.CloudImplementationFeatures.DOCKER_IMAGE:
            ('Docker image is currently not supported on Cudo. You can try '
             'running docker command inside the `run` section in task.yaml.'),
+        clouds.CloudImplementationFeatures.HOST_CONTROLLERS: (
+            'Cudo Compute cannot host a controller as it does not support '
+            'autostopping, which would leave the controller running '
+            'indefinitely.'),
    }
    _MAX_CLUSTER_NAME_LEN_LIMIT = 60
diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py
index 93260533f27..94add7fce7d 100644
--- a/sky/clouds/gcp.py
+++ b/sky/clouds/gcp.py
@@ -14,6 +14,7 @@
 from sky import clouds
 from sky import exceptions
 from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import gcp
 from sky.clouds import service_catalog
 from sky.clouds.utils import gcp_utils
@@ -179,20 +180,31 @@ class GCP(clouds.Cloud):
    def _unsupported_features_for_resources(
            cls, resources: 'resources.Resources'
    ) -> Dict[clouds.CloudImplementationFeatures, str]:
+        unsupported = {}
        if gcp_utils.is_tpu_vm_pod(resources):
-            return {
+            unsupported = {
                clouds.CloudImplementationFeatures.STOP: (
-                    'TPU VM pods cannot be stopped. Please refer to: https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_your_resources'
+                    'TPU VM pods cannot be stopped. Please refer to: '
+                    'https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_your_resources'
                )
            }
        if gcp_utils.is_tpu(resources) and not gcp_utils.is_tpu_vm(resources):
            # TPU node does not support multi-node.
-            return {
-                clouds.CloudImplementationFeatures.MULTI_NODE:
-                    ('TPU node does not support multi-node. Please set '
-                     'num_nodes to 1.')
-            }
-        return {}
+            unsupported[clouds.CloudImplementationFeatures.MULTI_NODE] = (
+                'TPU node does not support multi-node. Please set '
+                'num_nodes to 1.')
+        # TODO(zhwu): We probably need to store the MIG requirement in resources
+        # because `skypilot_config` may change for an existing cluster.
+        # Clusters created with MIG (only GPU clusters) cannot be stopped.
+        if (skypilot_config.get_nested(
+            ('gcp', 'managed_instance_group'), None) is not None and
+                resources.accelerators):
+            unsupported[clouds.CloudImplementationFeatures.STOP] = (
+                'Managed Instance Group (MIG) does not support stopping yet.')
+            unsupported[clouds.CloudImplementationFeatures.SPOT_INSTANCE] = (
+                'Managed Instance Group with DWS does not support '
+                'spot instances.')
+        return unsupported

    @classmethod
    def max_cluster_name_length(cls) -> Optional[int]:
@@ -493,6 +505,16 @@ def make_deploy_resources_variables(
            resources_vars['tpu_node_name'] = tpu_node_name

+        managed_instance_group_config = skypilot_config.get_nested(
+            ('gcp', 'managed_instance_group'), None)
+        use_mig = managed_instance_group_config is not None
+        resources_vars['gcp_use_managed_instance_group'] = use_mig
+        # Convert boolean to 0 or 1 in string, as GCP does not support boolean
+        # values in labels for TPU VM APIs.
+ resources_vars['gcp_use_managed_instance_group_value'] = str( + int(use_mig)) + if use_mig: + resources_vars.update(managed_instance_group_config) return resources_vars def _get_feasible_launchable_resources( @@ -736,13 +758,13 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: # pylint: disable=import-outside-toplevel,unused-import import google.auth - import googleapiclient.discovery # This takes user's credential info from "~/.config/gcloud/application_default_credentials.json". # pylint: disable=line-too-long credentials, project = google.auth.default() - crm = googleapiclient.discovery.build('cloudresourcemanager', - 'v1', - credentials=credentials) + crm = gcp.build('cloudresourcemanager', + 'v1', + credentials=credentials, + cache_discovery=False) gcp_minimal_permissions = gcp_utils.get_minimal_permissions() permissions = {'permissions': gcp_minimal_permissions} request = crm.projects().testIamPermissions(resource=project, diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index c0b25232f84..1e307f475c8 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -38,9 +38,6 @@ class Kubernetes(clouds.Cloud): SKY_SSH_KEY_SECRET_NAME = 'sky-ssh-keys' SKY_SSH_JUMP_NAME = 'sky-ssh-jump-pod' - PORT_FORWARD_PROXY_CMD_TEMPLATE = \ - 'kubernetes-port-forward-proxy-command.sh.j2' - PORT_FORWARD_PROXY_CMD_PATH = '~/.sky/port-forward-proxy-cmd.sh' # Timeout for resource provisioning. This timeout determines how long to # wait for pod to be in pending status before giving up. # Larger timeout may be required for autoscaling clusters, since autoscaler @@ -51,6 +48,13 @@ class Kubernetes(clouds.Cloud): timeout = skypilot_config.get_nested(['kubernetes', 'provision_timeout'], 10) + # Limit the length of the cluster name to avoid exceeding the limit of 63 + # characters for Kubernetes resources. We limit to 42 characters (63-21) to + # allow additional characters for creating ingress services to expose ports. + # These services are named as {cluster_name_on_cloud}--skypilot-svc--{port}, + # where the suffix is 21 characters long. + _MAX_CLUSTER_NAME_LEN_LIMIT = 42 + _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = True _DEFAULT_NUM_VCPUS = 2 @@ -92,7 +96,8 @@ def ssh_key_secret_field_name(self): def _unsupported_features_for_resources( cls, resources: 'resources_lib.Resources' ) -> Dict[clouds.CloudImplementationFeatures, str]: - unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES + unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES.copy() + # Features to be disabled for exec auth is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth() if is_exec_auth: assert isinstance(message, str), message @@ -102,8 +107,17 @@ def _unsupported_features_for_resources( # Pod does not have permissions to terminate itself with exec auth. 
unsupported_features[ clouds.CloudImplementationFeatures.AUTO_TERMINATE] = message + # Allow spot instances if supported by the cluster + spot_label_key, _ = kubernetes_utils.get_spot_label() + if spot_label_key is not None: + unsupported_features.pop( + clouds.CloudImplementationFeatures.SPOT_INSTANCE, None) return unsupported_features + @classmethod + def max_cluster_name_length(cls) -> Optional[int]: + return cls._MAX_CLUSTER_NAME_LEN_LIMIT + @classmethod def regions(cls) -> List[clouds.Region]: return cls._regions @@ -290,6 +304,11 @@ def make_deploy_resources_variables( fuse_device_required = bool(resources.requires_fuse) + # Configure spot labels, if requested and supported + spot_label_key, spot_label_value = None, None + if resources.use_spot: + spot_label_key, spot_label_value = kubernetes_utils.get_spot_label() + deploy_vars = { 'instance_type': resources.instance_type, 'custom_resources': custom_resources, @@ -301,6 +320,7 @@ def make_deploy_resources_variables( 'k8s_namespace': kubernetes_utils.get_current_kube_config_context_namespace(), 'k8s_port_mode': port_mode.value, + 'k8s_networking_mode': network_utils.get_networking_mode().value, 'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME, 'k8s_acc_label_key': k8s_acc_label_key, 'k8s_acc_label_value': k8s_acc_label_value, @@ -311,6 +331,8 @@ def make_deploy_resources_variables( 'k8s_fuse_device_required': fuse_device_required, # Namespace to run the FUSE device manager in 'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE, + 'k8s_spot_label_key': spot_label_key, + 'k8s_spot_label_value': spot_label_value, 'image_id': image_id, } diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index d380cce6757..acc6fa0aa8b 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -35,7 +35,7 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs): for cloud in clouds: try: cloud_module = importlib.import_module( - f'sky.clouds.service_catalog.{cloud}_catalog') + f'sky.clouds.service_catalog.{cloud.lower()}_catalog') except ModuleNotFoundError: raise ValueError( 'Cannot find module "sky.clouds.service_catalog' @@ -117,6 +117,46 @@ def list_accelerator_counts( return ret +def list_accelerator_realtime( + gpus_only: bool = True, + name_filter: Optional[str] = None, + region_filter: Optional[str] = None, + quantity_filter: Optional[int] = None, + clouds: CloudFilter = None, + case_sensitive: bool = True, +) -> Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, int]]: + """List all accelerators offered by Sky with their realtime availability. + + Realtime availability is the total number of accelerators in the cluster + and number of accelerators available at the time of the call. + + Used for fixed size cluster settings, such as Kubernetes. + + Returns: + A tuple of three dictionaries mapping canonical accelerator names to: + - A list of available counts. (e.g., [1, 2, 4]) + - Total number of accelerators in the cluster (capacity). + - Number of accelerators available at the time of call (availability). 
+    """
+    qtys_map, total_accelerators_capacity, total_accelerators_available = (
+        _map_clouds_catalog(clouds,
+                            'list_accelerators_realtime',
+                            gpus_only,
+                            name_filter,
+                            region_filter,
+                            quantity_filter,
+                            case_sensitive=case_sensitive,
+                            all_regions=False,
+                            require_price=False))
+    accelerator_counts: Dict[str, List[int]] = collections.defaultdict(list)
+    for gpu, items in qtys_map.items():
+        for item in items:
+            accelerator_counts[gpu].append(item.accelerator_count)
+        accelerator_counts[gpu] = sorted(accelerator_counts[gpu])
+    return (accelerator_counts, total_accelerators_capacity,
+            total_accelerators_available)
+
+
 def instance_type_exists(instance_type: str,
                         clouds: CloudFilter = None) -> bool:
    """Check the existence of a instance type."""
diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py
index cc5e4597748..9a7b2a90bee 100644
--- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py
+++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py
@@ -93,6 +93,15 @@ def get_regions() -> List[str]:
 # We have to manually remove it.
 DEPRECATED_FAMILIES = ['standardNVSv2Family']

+# Some A10 instance types only contain a fraction of a GPU. We temporarily
+# filter them out here to avoid using them as a whole A10 GPU.
+# TODO(zhwu,tian): support fractional GPUs, which can be done on
+# kubernetes as well.
+# Ref: https://learn.microsoft.com/en-us/azure/virtual-machines/nva10v5-series
+FILTERED_A10_INSTANCE_TYPES = [
+    f'Standard_NV{vcpu}ads_A10_v5' for vcpu in [6, 12, 18]
+]
+
 USEFUL_COLUMNS = [
    'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
    'GpuInfo', 'Price', 'SpotPrice', 'Region', 'Generation'
@@ -286,6 +295,10 @@ def get_additional_columns(row):
    after_drop_len = len(df_ret)
    print(f'Dropped {before_drop_len - after_drop_len} duplicated rows')

+    # Filter out instance types that only contain a fraction of a GPU.
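For reference, the comprehension above expands to exactly three SKUs, each of which (per the linked Azure page) carries only a fraction of an A10; the hunk that follows drops them from the generated catalog:

    FILTERED_A10_INSTANCE_TYPES = [
        f'Standard_NV{vcpu}ads_A10_v5' for vcpu in [6, 12, 18]
    ]
    assert FILTERED_A10_INSTANCE_TYPES == [
        'Standard_NV6ads_A10_v5',
        'Standard_NV12ads_A10_v5',
        'Standard_NV18ads_A10_v5',
    ]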
+ df_ret = df_ret.loc[~df_ret['InstanceType'].isin(FILTERED_A10_INSTANCE_TYPES + )] + # Filter out deprecated families df_ret = df_ret.loc[~df_ret['family'].isin(DEPRECATED_FAMILIES)] df_ret = df_ret[USEFUL_COLUMNS] diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py b/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py index b15570ddcbc..617751d865a 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py @@ -9,98 +9,9 @@ import cudo_compute -VMS_CSV = 'cudo/vms.csv' +import sky.provision.cudo.cudo_utils as utils -cudo_gpu_model = { - 'NVIDIA V100': 'V100', - 'NVIDIA A40': 'A40', - 'RTX 3080': 'RTX3080', - 'RTX A4000': 'RTXA4000', - 'RTX A4500': 'RTXA4500', - 'RTX A5000': 'RTXA5000', - 'RTX A6000': 'RTXA6000', -} - -cudo_gpu_mem = { - 'RTX3080': 12, - 'A40': 48, - 'RTXA4000': 16, - 'RTXA4500': 20, - 'RTXA5000': 24, - 'RTXA6000': 48, - 'V100': 16, -} - -machine_specs = [ - # Low - { - 'vcpu': 2, - 'mem': 4, - 'gpu': 1, - }, - { - 'vcpu': 4, - 'mem': 8, - 'gpu': 1, - }, - { - 'vcpu': 8, - 'mem': 16, - 'gpu': 2, - }, - { - 'vcpu': 16, - 'mem': 32, - 'gpu': 2, - }, - { - 'vcpu': 32, - 'mem': 64, - 'gpu': 4, - }, - { - 'vcpu': 64, - 'mem': 128, - 'gpu': 8, - }, - # Mid - { - 'vcpu': 96, - 'mem': 192, - 'gpu': 8 - }, - { - 'vcpu': 48, - 'mem': 96, - 'gpu': 4 - }, - { - 'vcpu': 24, - 'mem': 48, - 'gpu': 2 - }, - { - 'vcpu': 12, - 'mem': 24, - 'gpu': 1 - }, - # Hi - { - 'vcpu': 96, - 'mem': 192, - 'gpu': 4 - }, - { - 'vcpu': 48, - 'mem': 96, - 'gpu': 2 - }, - { - 'vcpu': 24, - 'mem': 48, - 'gpu': 1 - }, -] +VMS_CSV = 'cudo/vms.csv' def cudo_api(): @@ -110,28 +21,8 @@ def cudo_api(): return cudo_compute.VirtualMachinesApi(client) -def cudo_gpu_to_skypilot_gpu(model): - if model in cudo_gpu_model: - return cudo_gpu_model[model] - else: - return model - - -def skypilot_gpu_to_cudo_gpu(model): - for key, value in cudo_gpu_model.items(): - if value == model: - return key - return model - - -def gpu_exists(model): - if model in cudo_gpu_model: - return True - return False - - def get_gpu_info(count, model): - mem = cudo_gpu_mem[model] + mem = utils.cudo_gpu_mem[model] # pylint: disable=line-too-long # {'Name': 'A4000', 'Manufacturer': 'NVIDIA', 'Count': 1.0, 'MemoryInfo': {'SizeInMiB': 16384}}], 'TotalGpuMemoryInMiB': 16384}" info = { @@ -168,16 +59,16 @@ def machine_types(gpu_model, mem_gib, vcpu_count, gpu_count): def update_prices(): rows = [] - for spec in machine_specs: + for spec in utils.machine_specs: mts = machine_types('', spec['mem'], spec['vcpu'], spec['gpu']) for hc in mts['host_configs']: - if not gpu_exists(hc['gpu_model']): + if not utils.gpu_exists(hc['gpu_model']): continue - accelerator_name = cudo_gpu_to_skypilot_gpu(hc['gpu_model']) + accelerator_name = utils.cudo_gpu_to_skypilot_gpu(hc['gpu_model']) row = { 'instance_type': get_instance_type(hc['machine_type'], - spec['gpu'], spec['vcpu'], - spec['mem']), + spec['vcpu'], spec['mem'], + spec['gpu']), 'accelerator_name': accelerator_name, 'accelerator_count': str(spec['gpu']) + '.0', 'vcpus': str(spec['vcpu']), diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index bd44847016e..a64aa8f72e9 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -3,6 +3,7 @@ Kubernetes does not require a catalog of instances, but we need an image catalog mapping SkyPilot image tags to corresponding container image tags. 
""" +import re import typing from typing import Dict, List, Optional, Set, Tuple @@ -46,38 +47,107 @@ def list_accelerators( case_sensitive: bool = True, all_regions: bool = False, require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]: + # TODO(romilb): We should consider putting a lru_cache() with TTL to + # avoid multiple calls to kubernetes API in a short period of time (e.g., + # from the optimizer). + return list_accelerators_realtime(gpus_only, name_filter, region_filter, + quantity_filter, case_sensitive, + all_regions, require_price)[0] + + +def list_accelerators_realtime( + gpus_only: bool, + name_filter: Optional[str], + region_filter: Optional[str], + quantity_filter: Optional[int], + case_sensitive: bool = True, + all_regions: bool = False, + require_price: bool = True +) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str, + int]]: del all_regions, require_price # Unused. k8s_cloud = Kubernetes() if not any( map(k8s_cloud.is_same_cloud, sky_check.get_cached_enabled_clouds_or_refresh()) ) or not kubernetes_utils.check_credentials()[0]: - return {} + return {}, {}, {} has_gpu = kubernetes_utils.detect_gpu_resource() if not has_gpu: - return {} + return {}, {}, {} label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter() if not label_formatter: - return {} + return {}, {}, {} - accelerators: Set[Tuple[str, int]] = set() + accelerators_qtys: Set[Tuple[str, int]] = set() key = label_formatter.get_label_key() nodes = kubernetes_utils.get_kubernetes_nodes() + # Get the pods to get the real-time GPU usage + pods = kubernetes_utils.get_kubernetes_pods() + # Total number of GPUs in the cluster + total_accelerators_capacity: Dict[str, int] = {} + # Total number of GPUs currently available in the cluster + total_accelerators_available: Dict[str, int] = {} + min_quantity_filter = quantity_filter if quantity_filter else 1 + for node in nodes: if key in node.metadata.labels: + allocated_qty = 0 accelerator_name = label_formatter.get_accelerator_from_label_value( node.metadata.labels.get(key)) + + # Check if name_filter regex matches the accelerator_name + regex_flags = 0 if case_sensitive else re.IGNORECASE + if name_filter and not re.match( + name_filter, accelerator_name, flags=regex_flags): + continue + accelerator_count = int( node.status.allocatable.get('nvidia.com/gpu', 0)) + # Generate the GPU quantities for the accelerators if accelerator_name and accelerator_count > 0: for count in range(1, accelerator_count + 1): - accelerators.add((accelerator_name, count)) + accelerators_qtys.add((accelerator_name, count)) + + for pod in pods: + # Get all the pods running on the node + if (pod.spec.node_name == node.metadata.name and + pod.status.phase in ['Running', 'Pending']): + # Iterate over all the containers in the pod and sum the + # GPU requests + for container in pod.spec.containers: + if container.resources.requests: + allocated_qty += int( + container.resources.requests.get( + 'nvidia.com/gpu', 0)) + + accelerators_available = accelerator_count - allocated_qty + + if accelerator_count >= min_quantity_filter: + quantized_count = (min_quantity_filter * + (accelerator_count // min_quantity_filter)) + if accelerator_name not in total_accelerators_capacity: + total_accelerators_capacity[ + accelerator_name] = quantized_count + else: + total_accelerators_capacity[ + accelerator_name] += quantized_count + + if accelerator_name not in total_accelerators_available: + total_accelerators_available[accelerator_name] = 0 + if 
accelerators_available >= min_quantity_filter: + quantized_availability = min_quantity_filter * ( + accelerators_available // min_quantity_filter) + total_accelerators_available[ + accelerator_name] += quantized_availability result = [] - for accelerator_name, accelerator_count in accelerators: + + # Generate dataframe for common.list_accelerators_impl + for accelerator_name, accelerator_count in accelerators_qtys: result.append( common.InstanceTypeInfo(cloud='Kubernetes', instance_type=None, @@ -98,9 +168,13 @@ def list_accelerators( ]) df['GpuInfo'] = True - return common.list_accelerators_impl('Kubernetes', df, gpus_only, - name_filter, region_filter, - quantity_filter, case_sensitive) + # Use common.list_accelerators_impl to get InstanceTypeInfo objects used + # by sky show-gpus when cloud is not specified. + qtys_map = common.list_accelerators_impl('Kubernetes', df, gpus_only, + name_filter, region_filter, + quantity_filter, case_sensitive) + + return qtys_map, total_accelerators_capacity, total_accelerators_available def validate_region_zone( diff --git a/sky/core.py b/sky/core.py index b1006fe19ab..6b18fd2c190 100644 --- a/sky/core.py +++ b/sky/core.py @@ -19,6 +19,7 @@ from sky.skylet import job_lib from sky.usage import usage_lib from sky.utils import controller_utils +from sky.utils import rich_utils from sky.utils import subprocess_utils if typing.TYPE_CHECKING: @@ -126,7 +127,9 @@ def endpoints(cluster: str, RuntimeError: if the cluster has no ports to be exposed or no endpoints are exposed yet. """ - return backend_utils.get_endpoints(cluster=cluster, port=port) + with rich_utils.safe_status('[bold cyan]Fetching endpoints for cluster ' + f'{cluster}...[/]'): + return backend_utils.get_endpoints(cluster=cluster, port=port) @usage_lib.entrypoint diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index 50e0641be7b..26806ff991a 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -14,7 +14,7 @@ _TYPE_CACHE_TTL = '5s' _RENAME_DIR_LIMIT = 10000 # https://github.com/GoogleCloudPlatform/gcsfuse/releases -GCSFUSE_VERSION = '1.3.0' +GCSFUSE_VERSION = '2.2.0' # https://github.com/rclone/rclone/releases RCLONE_VERSION = '1.67.0' # Creates a fusermount3 soft link on older (<22) Ubuntu systems to utilize diff --git a/sky/jobs/core.py b/sky/jobs/core.py index 7f9e0d757ea..561d47f4b25 100644 --- a/sky/jobs/core.py +++ b/sky/jobs/core.py @@ -278,7 +278,8 @@ def cancel(name: Optional[str] = None, @usage_lib.entrypoint -def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool) -> None: +def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool, + controller: bool) -> None: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Tail logs of managed jobs. 
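Stepping back to the Kubernetes catalog hunk above: capacity and availability are rounded down to a multiple of the requested quantity, so a node only counts toward requests it can satisfy in full. A minimal sketch with made-up numbers:

    min_quantity_filter = 2  # the user asked for GPUs in multiples of 2
    node_capacity = 5        # GPUs allocatable on one node
    node_free = 3            # GPUs not requested by running/pending pods

    quantized_capacity = min_quantity_filter * (node_capacity // min_quantity_filter)
    quantized_free = min_quantity_filter * (node_free // min_quantity_filter)
    assert (quantized_capacity, quantized_free) == (4, 2)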
@@ -300,11 +301,12 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool) -> None:
        raise ValueError('Cannot specify both name and job_id.')

    backend = backend_utils.get_backend_from_handle(handle)
    assert isinstance(backend, backends.CloudVmRayBackend), backend
-    # Stream the realtime logs
+
    backend.tail_managed_job_logs(handle,
                                  job_id=job_id,
                                  job_name=name,
-                                  follow=follow)
+                                  follow=follow,
+                                  controller=controller)


 spot_launch = common_utils.deprecated_function(
diff --git a/sky/jobs/utils.py b/sky/jobs/utils.py
index 8220dd01621..aadf5a64684 100644
--- a/sky/jobs/utils.py
+++ b/sky/jobs/utils.py
@@ -6,6 +6,7 @@
 """
 import collections
 import enum
+import inspect
 import os
 import pathlib
 import shlex
@@ -28,7 +29,7 @@
 from sky.jobs import state as managed_job_state
 from sky.skylet import constants
 from sky.skylet import job_lib
-from sky.skylet.log_lib import run_bash_command_with_log
+from sky.skylet import log_lib
 from sky.utils import common_utils
 from sky.utils import log_utils
 from sky.utils import rich_utils
@@ -184,7 +185,7 @@ def callback_func(status: str):
        log_path = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                'managed_job_event',
                                f'jobs-callback-{job_id}-{task_id}.log')
-        result = run_bash_command_with_log(
+        result = log_lib.run_bash_command_with_log(
            bash_command=event_callback,
            log_path=log_path,
            env_vars=dict(
@@ -448,18 +449,55 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
    return ''


-def stream_logs_by_name(job_name: str, follow: bool = True) -> str:
-    """Stream logs by name."""
-    job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name)
-    if len(job_ids) == 0:
-        return (f'{colorama.Fore.RED}No job found with name {job_name!r}.'
-                f'{colorama.Style.RESET_ALL}')
-    if len(job_ids) > 1:
-        return (f'{colorama.Fore.RED}Multiple running jobs found '
-                f'with name {job_name!r}.\n'
-                f'Job IDs: {job_ids}{colorama.Style.RESET_ALL}')
-    stream_logs_by_id(job_ids[0], follow)
-    return ''
+def stream_logs(job_id: Optional[int],
+                job_name: Optional[str],
+                controller: bool = False,
+                follow: bool = True) -> str:
+    """Stream logs by job id or job name."""
+    if job_id is None and job_name is None:
+        job_id = managed_job_state.get_latest_job_id()
+        if job_id is None:
+            return 'No managed job found.'
+    if controller:
+        if job_id is None:
+            assert job_name is not None
+            managed_jobs = managed_job_state.get_managed_jobs()
+            # We manually filter the jobs by name, instead of using
+            # get_nonterminal_job_ids_by_name, as with `controller=True`, we
+            # should be able to show the logs for jobs in terminal states.
+            managed_jobs = list(
+                filter(lambda job: job['job_name'] == job_name, managed_jobs))
+            if len(managed_jobs) == 0:
+                return f'No managed job found with name {job_name!r}.'
+            if len(managed_jobs) > 1:
+                job_ids_str = ', '.join(job['job_id'] for job in managed_jobs)
+                raise ValueError(
+                    f'Multiple managed jobs found with name {job_name!r} (Job '
+                    f'IDs: {job_ids_str}). Please specify the job_id instead.')
+            job_id = managed_jobs[0]['job_id']
+        assert job_id is not None, (job_id, job_name)
+        # TODO: keep the following code in sync with
+        # job_lib.JobLibCodeGen.tail_logs; we do not directly call that
+        # function as the following code needs to be run on the current
+        # machine, instead of running remotely.
+        run_timestamp = job_lib.get_run_timestamp(job_id)
+        if run_timestamp is None:
+            return f'No managed job controller log found with job_id {job_id}.'
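A usage sketch for the consolidated entry point defined above (the import path is the module this hunk edits; the job name is illustrative):

    from sky.jobs import utils as managed_job_utils

    # Tail a managed job's own logs by name:
    print(managed_job_utils.stream_logs(job_id=None, job_name='my-job'))
    # Tail the controller-side logs for the same job; unlike the job logs,
    # this also works for jobs already in terminal states:
    print(managed_job_utils.stream_logs(job_id=None, job_name='my-job',
                                        controller=True))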
+ log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp) + log_lib.tail_logs(job_id=job_id, log_dir=log_dir, follow=follow) + return '' + + if job_id is None: + assert job_name is not None + job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name) + if len(job_ids) == 0: + return f'No running managed job found with name {job_name!r}.' + if len(job_ids) > 1: + raise ValueError( + f'Multiple running jobs found with name {job_name!r}.') + job_id = job_ids[0] + + return stream_logs_by_id(job_id, follow) def dump_managed_job_queue() -> str: @@ -713,13 +751,19 @@ class ManagedJobCodeGen: >> codegen = ManagedJobCodeGen.show_jobs(...) """ + # TODO: the try..except.. block is for backward compatibility. Remove it in + # v0.8.0. _PREFIX = textwrap.dedent("""\ managed_job_version = 0 try: - from sky.jobs import constants, state, utils - managed_job_version = constants.MANAGED_JOBS_VERSION + from sky.jobs import utils + from sky.jobs import constants as managed_job_constants + from sky.jobs import state as managed_job_state + + managed_job_version = managed_job_constants.MANAGED_JOBS_VERSION except ImportError: - from sky.spot import spot_state as state, spot_utils as utils + from sky.spot import spot_state as managed_job_state + from sky.spot import spot_utils as utils """) @classmethod @@ -750,20 +794,32 @@ def cancel_job_by_name(cls, job_name: str) -> str: return cls._build(code) @classmethod - def stream_logs_by_name(cls, job_name: str, follow: bool = True) -> str: - code = textwrap.dedent(f"""\ - msg = utils.stream_logs_by_name({job_name!r}, follow={follow}) - print(msg, flush=True) + def stream_logs(cls, + job_name: Optional[str], + job_id: Optional[int], + follow: bool = True, + controller: bool = False) -> str: + # We inspect the source code of the function here for backward + # compatibility. + # TODO: change to utils.stream_logs(job_id, job_name, follow) in v0.8.0. + # Import libraries required by `stream_logs`. The try...except... block + # should be removed in v0.8.0. + code = textwrap.dedent("""\ + import os + + from sky.skylet import job_lib, log_lib + from sky.skylet import constants + try: + from sky.jobs.utils import stream_logs_by_id + except ImportError: + from sky.spot.spot_utils import stream_logs_by_id + from typing import Optional """) - return cls._build(code) + code += inspect.getsource(stream_logs) + code += textwrap.dedent(f"""\ - @classmethod - def stream_logs_by_id(cls, - job_id: Optional[int], - follow: bool = True) -> str: - code = textwrap.dedent(f"""\ - job_id = {job_id} if {job_id} is not None else state.get_latest_job_id() - msg = utils.stream_logs_by_id(job_id, follow={follow}) + msg = stream_logs({job_id!r}, {job_name!r}, + follow={follow}, controller={controller}) print(msg, flush=True) """) return cls._build(code) @@ -773,13 +829,13 @@ def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag') -> str: dag_name = managed_job_dag.name # Add the managed job to queue table. 
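The `inspect.getsource` trick used by `ManagedJobCodeGen.stream_logs` above deserves a standalone illustration: rather than importing a helper that may not exist in an older remote installation, the caller ships the helper's current source text and calls it inside the generated script. A minimal sketch with a hypothetical `greet` helper (run it as a script so `inspect` can locate the source):

    import inspect
    import textwrap

    def greet(name: str) -> str:
        return f'hello, {name}'

    # Embed the function's source plus a call site into generated code.
    code = inspect.getsource(greet) + textwrap.dedent("""\
        msg = greet('sky')
        print(msg, flush=True)
        """)
    exec(code)  # stand-in for shipping `code` to a remote interpreter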
        code = textwrap.dedent(f"""\
-            state.set_job_name({job_id}, {dag_name!r})
+            managed_job_state.set_job_name({job_id}, {dag_name!r})
            """)
        for task_id, task in enumerate(managed_job_dag.tasks):
            resources_str = backend_utils.get_task_resources_str(
                task, is_managed_job=True)
            code += textwrap.dedent(f"""\
-                state.set_pending({job_id}, {task_id},
+                managed_job_state.set_pending({job_id}, {task_id},
                                  {task.name!r}, {resources_str!r})
                """)
        return cls._build(code)
diff --git a/sky/provision/__init__.py b/sky/provision/__init__.py
index 8371fb8ad83..0fe4ab614ce 100644
--- a/sky/provision/__init__.py
+++ b/sky/provision/__init__.py
@@ -155,6 +155,10 @@ def query_ports(
    return the endpoint without querying the cloud provider. If head_ip is not
    provided, the cloud provider will be queried to get the endpoint info.

+    The underlying implementation is responsible for retries and timeouts,
+    e.g., kubernetes will wait for the service that exposes the ports to be
+    ready before returning the endpoint info.
+
    Returns a dict with port as the key and a list of common.Endpoint.
    """
    del provider_name, provider_config, cluster_name_on_cloud  # unused
diff --git a/sky/provision/aws/config.py b/sky/provision/aws/config.py
index 834967a7b15..c83732d60c4 100644
--- a/sky/provision/aws/config.py
+++ b/sky/provision/aws/config.py
@@ -191,6 +191,29 @@ def _get_role(role_name: str):
            for policy_arn in attach_policy_arns:
                role.attach_policy(PolicyArn=policy_arn)

+        # SkyPilot: 'PassRole' is required by the controllers (jobs and
+        # services) created with `aws.remote_identity: SERVICE_ACCOUNT` to
+        # create instances with the IAM role.
+        skypilot_pass_role_policy_doc = {
+            'Statement': [
+                {
+                    'Effect': 'Allow',
+                    'Action': [
+                        'iam:GetRole',
+                        'iam:PassRole',
+                    ],
+                    'Resource': role.arn,
+                },
+                {
+                    'Effect': 'Allow',
+                    'Action': 'iam:GetInstanceProfile',
+                    'Resource': profile.arn,
+                },
+            ]
+        }
+        role.Policy('SkyPilotPassRolePolicy').put(
+            PolicyDocument=json.dumps(skypilot_pass_role_policy_doc))
+
        profile.add_role(RoleName=role.name)
        time.sleep(15)  # wait for propagation
    return {'Arn': profile.arn}
diff --git a/sky/provision/aws/instance.py b/sky/provision/aws/instance.py
index e279b30c74b..f3b727d7c21 100644
--- a/sky/provision/aws/instance.py
+++ b/sky/provision/aws/instance.py
@@ -717,16 +717,31 @@ def open_ports(
    existing_ports: Set[int] = set()
    for existing_rule in sg.ip_permissions:
-        # Skip any non-tcp rules.
-        if existing_rule['IpProtocol'] != 'tcp':
+        # Skip any non-tcp rules or if all traffic (-1) is specified.
+        if existing_rule['IpProtocol'] not in ['tcp', '-1']:
            continue
        # Skip any rules that don't have a FromPort or ToPort.
-        if 'FromPort' not in existing_rule or 'ToPort' not in existing_rule:
-            continue
-        existing_ports.update(
-            range(existing_rule['FromPort'], existing_rule['ToPort'] + 1))
-    ports_to_open = resources_utils.port_set_to_ranges(
-        resources_utils.port_ranges_to_set(ports) - existing_ports)
+        if 'FromPort' in existing_rule and 'ToPort' in existing_rule:
+            existing_ports.update(
+                range(existing_rule['FromPort'], existing_rule['ToPort'] + 1))
+        elif existing_rule['IpProtocol'] == '-1':
+            # For AWS, IpProtocol = -1 means all traffic
+            for group_pairs in existing_rule['UserIdGroupPairs']:
+                if group_pairs['GroupId'] != sg.id:
+                    # We skip the port opening when the rule allows access from
+                    # other security groups, as that is likely added by a user
+                    # manually and satisfies their requirement.
+ # The security group created by SkyPilot allows all traffic + # from the same security group, which should not be skipped. + existing_ports.add(-1) + break + break + + ports_to_open = [] + # Do not need to open any ports when all traffic is already allowed. + if -1 not in existing_ports: + ports_to_open = resources_utils.port_set_to_ranges( + resources_utils.port_ranges_to_set(ports) - existing_ports) ip_permissions = [] for port in ports_to_open: diff --git a/sky/provision/azure/__init__.py b/sky/provision/azure/__init__.py index b83dbb462d9..2152728ba6e 100644 --- a/sky/provision/azure/__init__.py +++ b/sky/provision/azure/__init__.py @@ -2,3 +2,6 @@ from sky.provision.azure.instance import cleanup_ports from sky.provision.azure.instance import open_ports +from sky.provision.azure.instance import query_instances +from sky.provision.azure.instance import stop_instances +from sky.provision.azure.instance import terminate_instances diff --git a/sky/provision/azure/instance.py b/sky/provision/azure/instance.py index de5c7cbf0e9..19c1ba3f3da 100644 --- a/sky/provision/azure/instance.py +++ b/sky/provision/azure/instance.py @@ -1,11 +1,19 @@ """Azure instance provisioning.""" import logging +from multiprocessing import pool +import typing from typing import Any, Callable, Dict, List, Optional +from sky import exceptions from sky import sky_logging +from sky import status_lib from sky.adaptors import azure +from sky.utils import common_utils from sky.utils import ux_utils +if typing.TYPE_CHECKING: + from azure.mgmt import compute as azure_compute + logger = sky_logging.init_logger(__name__) # Suppress noisy logs from Azure SDK. Reference: @@ -17,6 +25,8 @@ TAG_RAY_CLUSTER_NAME = 'ray-cluster-name' TAG_RAY_NODE_KIND = 'ray-node-type' +_RESOURCE_GROUP_NOT_FOUND_ERROR_MESSAGE = 'ResourceGroupNotFound' + def get_azure_sdk_function(client: Any, function_name: str) -> Callable: """Retrieve a callable function from Azure SDK client object. @@ -93,3 +103,164 @@ def cleanup_ports( # Azure will automatically cleanup network security groups when cleanup # resource group. So we don't need to do anything here. del cluster_name_on_cloud, ports, provider_config # Unused. + + +def stop_instances( + cluster_name_on_cloud: str, + provider_config: Optional[Dict[str, Any]] = None, + worker_only: bool = False, +) -> None: + """See sky/provision/__init__.py""" + assert provider_config is not None, (cluster_name_on_cloud, provider_config) + + subscription_id = provider_config['subscription_id'] + resource_group = provider_config['resource_group'] + compute_client = azure.get_client('compute', subscription_id) + tag_filters = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} + if worker_only: + tag_filters[TAG_RAY_NODE_KIND] = 'worker' + + nodes = _filter_instances(compute_client, tag_filters, resource_group) + stop_virtual_machine = get_azure_sdk_function( + client=compute_client.virtual_machines, function_name='deallocate') + with pool.ThreadPool() as p: + p.starmap(stop_virtual_machine, + [(resource_group, node.name) for node in nodes]) + + +def terminate_instances( + cluster_name_on_cloud: str, + provider_config: Optional[Dict[str, Any]] = None, + worker_only: bool = False, +) -> None: + """See sky/provision/__init__.py""" + assert provider_config is not None, (cluster_name_on_cloud, provider_config) + # TODO(zhwu): check the following. Also, seems we can directly force + # delete a resource group. 
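On the AWS `open_ports` hunk above: the final set algebra relies on two `resources_utils` helpers whose exact behavior is assumed here (inferred from usage), but the intent is a round trip between port-range strings and concrete port sets:

    # Assumed semantics, for illustration:
    # port_ranges_to_set(['80', '8000-8002']) -> {80, 8000, 8001, 8002}
    # port_set_to_ranges({80, 8001, 8002})    -> ['80', '8001-8002']
    existing_ports = {80, 8001}
    requested = {80, 8000, 8001, 8002}
    # Only ports not already covered by an existing rule get opened:
    assert requested - existing_ports == {8000, 8002}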
+ subscription_id = provider_config['subscription_id'] + resource_group = provider_config['resource_group'] + if worker_only: + compute_client = azure.get_client('compute', subscription_id) + delete_virtual_machine = get_azure_sdk_function( + client=compute_client.virtual_machines, function_name='delete') + filters = { + TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud, + TAG_RAY_NODE_KIND: 'worker' + } + nodes = _filter_instances(compute_client, filters, resource_group) + with pool.ThreadPool() as p: + p.starmap(delete_virtual_machine, + [(resource_group, node.name) for node in nodes]) + return + + assert provider_config is not None, cluster_name_on_cloud + + resource_group_client = azure.get_client('resource', subscription_id) + delete_resource_group = get_azure_sdk_function( + client=resource_group_client.resource_groups, function_name='delete') + + delete_resource_group(resource_group, force_deletion_types=None) + + +def _get_vm_status(compute_client: 'azure_compute.ComputeManagementClient', + vm_name: str, resource_group: str) -> str: + instance = compute_client.virtual_machines.instance_view( + resource_group_name=resource_group, vm_name=vm_name).as_dict() + for status in instance['statuses']: + code_state = status['code'].split('/') + # It is possible that sometimes the 'code' is empty string, and we + # should skip them. + if len(code_state) != 2: + continue + code, state = code_state + # skip provisioning status + if code == 'PowerState': + return state + raise ValueError(f'Failed to get power state for VM {vm_name}: {instance}') + + +def _filter_instances( + compute_client: 'azure_compute.ComputeManagementClient', + filters: Dict[str, str], + resource_group: str) -> List['azure_compute.models.VirtualMachine']: + + def match_tags(vm): + for k, v in filters.items(): + if vm.tags.get(k) != v: + return False + return True + + try: + list_virtual_machines = get_azure_sdk_function( + client=compute_client.virtual_machines, function_name='list') + vms = list_virtual_machines(resource_group_name=resource_group) + nodes = list(filter(match_tags, vms)) + except azure.exceptions().ResourceNotFoundError as e: + if _RESOURCE_GROUP_NOT_FOUND_ERROR_MESSAGE in str(e): + return [] + raise + return nodes + + +@common_utils.retry +def query_instances( + cluster_name_on_cloud: str, + provider_config: Optional[Dict[str, Any]] = None, + non_terminated_only: bool = True, +) -> Dict[str, Optional[status_lib.ClusterStatus]]: + """See sky/provision/__init__.py""" + assert provider_config is not None, cluster_name_on_cloud + status_map = { + 'starting': status_lib.ClusterStatus.INIT, + 'running': status_lib.ClusterStatus.UP, + # 'stopped' in Azure means Stopped (Allocated), which still bills + # for the VM. + 'stopping': status_lib.ClusterStatus.INIT, + 'stopped': status_lib.ClusterStatus.INIT, + # 'VM deallocated' in Azure means Stopped (Deallocated), which does not + # bill for the VM. + 'deallocating': status_lib.ClusterStatus.STOPPED, + 'deallocated': status_lib.ClusterStatus.STOPPED, + } + provisioning_state_map = { + 'Creating': status_lib.ClusterStatus.INIT, + 'Updating': status_lib.ClusterStatus.INIT, + 'Failed': status_lib.ClusterStatus.INIT, + 'Migrating': status_lib.ClusterStatus.INIT, + 'Deleting': None, + # Succeeded in provisioning state means the VM is provisioned but not + # necessarily running. We exclude Succeeded state here, and the caller + # should determine the status of the VM based on the power state. 
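To see why `_get_vm_status` above splits on '/': Azure's instance view reports status codes of the form 'kind/state', and only the PowerState entry matters for cluster status (the commented-out 'Succeeded' entry just below records the exclusion this comment describes). A small sketch:

    # Typical instance-view status codes, as parsed by _get_vm_status:
    for status_code in ['ProvisioningState/succeeded', 'PowerState/running']:
        code_state = status_code.split('/')
        if len(code_state) != 2:
            continue  # the code can occasionally be an empty string
        code, state = code_state
        if code == 'PowerState':
            print(state)  # -> 'running', mapped to ClusterStatus.UP above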
+        # 'Succeeded': status_lib.ClusterStatus.UP,
+    }
+
+    subscription_id = provider_config['subscription_id']
+    resource_group = provider_config['resource_group']
+    compute_client = azure.get_client('compute', subscription_id)
+    filters = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
+    nodes = _filter_instances(compute_client, filters, resource_group)
+    statuses = {}
+
+    def _fetch_and_map_status(
+            compute_client: 'azure_compute.ComputeManagementClient',
+            node: 'azure_compute.models.VirtualMachine',
+            resource_group: str) -> None:
+        if node.provisioning_state in provisioning_state_map:
+            status = provisioning_state_map[node.provisioning_state]
+        else:
+            original_status = _get_vm_status(compute_client, node.name,
+                                             resource_group)
+            if original_status not in status_map:
+                with ux_utils.print_exception_no_traceback():
+                    raise exceptions.ClusterStatusFetchingError(
+                        'Failed to parse status from Azure response: '
+                        f'{original_status}')
+            status = status_map[original_status]
+        if status is None and non_terminated_only:
+            return
+        statuses[node.name] = status
+
+    with pool.ThreadPool() as p:
+        p.starmap(_fetch_and_map_status,
+                  [(compute_client, node, resource_group) for node in nodes])
+
+    return statuses
diff --git a/sky/provision/common.py b/sky/provision/common.py
index 7c1bcb32652..e5df26a4c09 100644
--- a/sky/provision/common.py
+++ b/sky/provision/common.py
@@ -1,9 +1,11 @@
 """Common data structures for provisioning"""
 import abc
 import dataclasses
+import functools
 import os
 from typing import Any, Dict, List, Optional, Tuple
 
+from sky import sky_logging
 from sky.utils import resources_utils
 
 # NOTE: we can use pydantic instead of dataclasses or namedtuples, because
@@ -14,6 +16,10 @@
 # -------------------- input data model -------------------- #
 InstanceId = str
 
+_START_TITLE = '\n' + '-' * 20 + 'Start: {} ' + '-' * 20
+_END_TITLE = '-' * 20 + 'End: {} ' + '-' * 20 + '\n'
+
+logger = sky_logging.init_logger(__name__)
 
 class ProvisionerError(RuntimeError):
@@ -268,3 +274,16 @@ def query_ports_passthrough(
     for port in ports:
         result[port] = [SocketEndpoint(port=port, host=head_ip)]
     return result
+
+
+def log_function_start_end(func):
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        logger.info(_START_TITLE.format(func.__name__))
+        try:
+            return func(*args, **kwargs)
+        finally:
+            logger.info(_END_TITLE.format(func.__name__))
+
+    return wrapper
diff --git a/sky/provision/cudo/__init__.py b/sky/provision/cudo/__init__.py
index bbdc96413a8..c4587bfdfa7 100644
--- a/sky/provision/cudo/__init__.py
+++ b/sky/provision/cudo/__init__.py
@@ -3,6 +3,7 @@
 from sky.provision.cudo.config import bootstrap_instances
 from sky.provision.cudo.instance import cleanup_ports
 from sky.provision.cudo.instance import get_cluster_info
+from sky.provision.cudo.instance import open_ports
 from sky.provision.cudo.instance import query_instances
 from sky.provision.cudo.instance import run_instances
 from sky.provision.cudo.instance import stop_instances
@@ -11,4 +12,4 @@
 __all__ = ('bootstrap_instances', 'run_instances', 'stop_instances',
            'terminate_instances', 'wait_instances', 'get_cluster_info',
-           'cleanup_ports', 'query_instances')
+           'cleanup_ports', 'query_instances', 'open_ports')
diff --git a/sky/provision/cudo/cudo_utils.py b/sky/provision/cudo/cudo_utils.py
new file mode 100644
index 00000000000..d4ef7f9e415
--- /dev/null
+++ b/sky/provision/cudo/cudo_utils.py
@@ -0,0 +1,112 @@
+"""Cudo catalog helper."""
+
+cudo_gpu_model = {
+    'NVIDIA V100': 'V100',
+    'NVIDIA A40': 'A40',
+    'RTX 3080': 'RTX3080',
+    'RTX A4000':
'RTXA4000', + 'RTX A4500': 'RTXA4500', + 'RTX A5000': 'RTXA5000', + 'RTX A6000': 'RTXA6000', +} + +cudo_gpu_mem = { + 'RTX3080': 12, + 'A40': 48, + 'RTXA4000': 16, + 'RTXA4500': 20, + 'RTXA5000': 24, + 'RTXA6000': 48, + 'V100': 16, +} + +machine_specs = [ + # Low + { + 'vcpu': 2, + 'mem': 4, + 'gpu': 1, + }, + { + 'vcpu': 4, + 'mem': 8, + 'gpu': 1, + }, + { + 'vcpu': 8, + 'mem': 16, + 'gpu': 2, + }, + { + 'vcpu': 16, + 'mem': 32, + 'gpu': 2, + }, + { + 'vcpu': 32, + 'mem': 64, + 'gpu': 4, + }, + { + 'vcpu': 64, + 'mem': 128, + 'gpu': 8, + }, + # Mid + { + 'vcpu': 96, + 'mem': 192, + 'gpu': 8 + }, + { + 'vcpu': 48, + 'mem': 96, + 'gpu': 4 + }, + { + 'vcpu': 24, + 'mem': 48, + 'gpu': 2 + }, + { + 'vcpu': 12, + 'mem': 24, + 'gpu': 1 + }, + # Hi + { + 'vcpu': 96, + 'mem': 192, + 'gpu': 4 + }, + { + 'vcpu': 48, + 'mem': 96, + 'gpu': 2 + }, + { + 'vcpu': 24, + 'mem': 48, + 'gpu': 1 + }, +] + + +def cudo_gpu_to_skypilot_gpu(model): + if model in cudo_gpu_model: + return cudo_gpu_model[model] + else: + return model + + +def skypilot_gpu_to_cudo_gpu(model): + for key, value in cudo_gpu_model.items(): + if value == model: + return key + return model + + +def gpu_exists(model): + if model in cudo_gpu_model: + return True + return False diff --git a/sky/provision/cudo/cudo_wrapper.py b/sky/provision/cudo/cudo_wrapper.py index 691c69bda8c..eac39d9faed 100644 --- a/sky/provision/cudo/cudo_wrapper.py +++ b/sky/provision/cudo/cudo_wrapper.py @@ -4,29 +4,29 @@ from sky import sky_logging from sky.adaptors import cudo +import sky.provision.cudo.cudo_utils as utils logger = sky_logging.init_logger(__name__) def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str, - memory_gib: int, vcpu_count: int, gpu_count: int, gpu_model: str, + memory_gib: int, vcpu_count: int, gpu_count: int, tags: Dict[str, str], disk_size: int): """Launches an instance with the given parameters.""" - disk = cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK', - size_gib=disk_size) - - request = cudo.cudo.CreateVMBody(ssh_key_source='SSH_KEY_SOURCE_NONE', - custom_ssh_keys=[ssh_key], - vm_id=name, - machine_type=machine_type, - data_center_id=data_center_id, - boot_disk_image_id='ubuntu-nvidia-docker', - memory_gib=memory_gib, - vcpus=vcpu_count, - gpus=gpu_count, - gpu_model=gpu_model, - boot_disk=disk, - metadata=tags) + + request = cudo.cudo.CreateVMBody( + ssh_key_source='SSH_KEY_SOURCE_NONE', + custom_ssh_keys=[ssh_key], + vm_id=name, + machine_type=machine_type, + data_center_id=data_center_id, + boot_disk_image_id='ubuntu-2204-nvidia-535-docker-v20240214', + memory_gib=memory_gib, + vcpus=vcpu_count, + gpus=gpu_count, + boot_disk=cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK', + size_gib=disk_size), + metadata=tags) try: api = cudo.cudo.cudo_api.virtual_machines() @@ -121,3 +121,24 @@ def list_instances(): return instances except cudo.cudo.rest.ApiException as e: raise e + + +def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem, + cpus): + try: + gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model) + api = cudo.cudo.cudo_api.virtual_machines() + types = api.list_vm_machine_types(mem, + cpus, + gpu=gpu_count, + gpu_model=gpu_model, + data_center_id=data_center_id) + types_dict = types.to_dict() + hc = types_dict['host_configs'] + total_count = sum(item['count_vm_available'] for item in hc) + if total_count < to_start_count: + raise Exception( + 'Too many VMs requested, try another gpu type or region') + return total_count + except cudo.cudo.rest.ApiException as e: + raise e diff 
--git a/sky/provision/cudo/instance.py b/sky/provision/cudo/instance.py index 39d4bc6b3d1..5f7473a4d93 100644 --- a/sky/provision/cudo/instance.py +++ b/sky/provision/cudo/instance.py @@ -16,7 +16,6 @@ def _filter_instances(cluster_name_on_cloud: str, status_filters: Optional[List[str]]) -> Dict[str, Any]: - instances = cudo_wrapper.list_instances() possible_names = [ f'{cluster_name_on_cloud}-head', f'{cluster_name_on_cloud}-worker' @@ -77,10 +76,19 @@ def run_instances(region: str, cluster_name_on_cloud: str, created_instance_ids = [] public_key = config.node_config['AuthorizedKey'] - + instance_type = config.node_config['InstanceType'] + spec = cudo_machine_type.get_spec_from_instance(instance_type, region) + gpu_count = int(float(spec['gpu_count'])) + vcpu_count = int(spec['vcpu_count']) + memory_gib = int(spec['mem_gb']) + gpu_model = spec['gpu_model'] + try: + cudo_wrapper.vm_available(to_start_count, gpu_count, gpu_model, region, + memory_gib, vcpu_count) + except Exception as e: + logger.warning(f'run_instances: {e}') + raise for _ in range(to_start_count): - instance_type = config.node_config['InstanceType'] - spec = cudo_machine_type.get_spec_from_instance(instance_type, region) node_type = 'head' if head_instance_id is None else 'worker' try: @@ -89,10 +97,9 @@ def run_instances(region: str, cluster_name_on_cloud: str, ssh_key=public_key, data_center_id=region, machine_type=spec['machine_type'], - memory_gib=int(spec['mem_gb']), - vcpu_count=int(spec['vcpu_count']), - gpu_count=int(float(spec['gpu_count'])), - gpu_model=spec['gpu_model'], + memory_gib=memory_gib, + vcpu_count=vcpu_count, + gpu_count=gpu_count, tags={}, disk_size=config.node_config['DiskSize']) except Exception as e: # pylint: disable=broad-except @@ -150,11 +157,10 @@ def terminate_instances( del provider_config instances = _filter_instances(cluster_name_on_cloud, None) for inst_id, inst in instances.items(): - logger.info(f'Terminating instance {inst_id}.' - f'{inst}') if worker_only and inst['name'].endswith('-head'): continue - logger.info(f'Removing {inst_id}: {inst}') + logger.debug(f'Terminating Cudo instance {inst_id}.' + f'{inst}') cudo_wrapper.remove(inst_id) @@ -213,6 +219,16 @@ def query_instances( return statuses +def open_ports( + cluster_name_on_cloud: str, + ports: List[str], + provider_config: Optional[Dict[str, Any]] = None, +) -> None: + del cluster_name_on_cloud, ports, provider_config + # Cudo has all ports open by default. Nothing to do here. 
+ return + + def cleanup_ports( cluster_name_on_cloud: str, ports: List[str], diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index b9ed689fdaf..9fbc19c2959 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -139,7 +139,8 @@ def __init__(self, docker_config: Dict[str, Any], def _run(self, cmd, run_env='host', - wait_for_docker_daemon: bool = False) -> str: + wait_for_docker_daemon: bool = False, + separate_stderr: bool = False) -> str: if run_env == 'docker': cmd = self._docker_expand_user(cmd, any_char=True) @@ -155,10 +156,12 @@ def _run(self, cnt = 0 retry = 3 while True: - rc, stdout, stderr = self.runner.run(cmd, - require_outputs=True, - stream_logs=False, - log_path=self.log_path) + rc, stdout, stderr = self.runner.run( + cmd, + require_outputs=True, + stream_logs=False, + separate_stderr=separate_stderr, + log_path=self.log_path) if (not wait_for_docker_daemon or DOCKER_PERMISSION_DENIED_STR not in stdout + stderr): break @@ -173,8 +176,10 @@ def _run(self, subprocess_utils.handle_returncode( rc, cmd, - error_msg='Failed to run docker setup commands', - stderr=stdout + stderr) + error_msg='Failed to run docker setup commands.', + stderr=stdout + stderr, + # Print out the error message if the command failed. + stream_logs=True) return stdout.strip() def initialize(self) -> str: @@ -340,9 +345,14 @@ def _docker_expand_user(self, string, any_char=False): user_pos = string.find('~') if user_pos > -1: if self.home_dir is None: - self.home_dir = (self._run( - f'{self.docker_cmd} exec {self.container_name} ' - 'printenv HOME',)) + cmd = (f'{self.docker_cmd} exec {self.container_name} ' + 'printenv HOME') + self.home_dir = self._run(cmd, separate_stderr=True) + # Check for unexpected newline in home directory, which can be + # a common issue when the output is mixed with stderr. 
+ assert '\n' not in self.home_dir, ( + 'Unexpected newline in home directory ' + f'({{self.home_dir}}) retrieved with {cmd}') if any_char: return string.replace('~/', self.home_dir + '/') diff --git a/sky/provision/fluidstack/instance.py b/sky/provision/fluidstack/instance.py index b37519a8458..e870ff15e0c 100644 --- a/sky/provision/fluidstack/instance.py +++ b/sky/provision/fluidstack/instance.py @@ -26,7 +26,7 @@ def get_internal_ip(node_info: Dict[str, Any]) -> None: node_info['internal_ip'] = node_info['ip_address'] runner = command_runner.SSHCommandRunner( - node_info['ip_address'], + (node_info['ip_address'], 22), ssh_user=node_info['capabilities']['default_user_name'], ssh_private_key=auth.PRIVATE_SSH_KEY_PATH) result = runner.run(_GET_INTERNAL_IP_CMD, diff --git a/sky/provision/gcp/constants.py b/sky/provision/gcp/constants.py index 7ed8d3da6e0..8f9341bd342 100644 --- a/sky/provision/gcp/constants.py +++ b/sky/provision/gcp/constants.py @@ -214,3 +214,15 @@ MAX_POLLS = 60 // POLL_INTERVAL # Stopping instances can take several minutes, so we increase the timeout MAX_POLLS_STOP = MAX_POLLS * 8 + +TAG_SKYPILOT_HEAD_NODE = 'skypilot-head-node' +# Tag uniquely identifying all nodes of a cluster +TAG_RAY_CLUSTER_NAME = 'ray-cluster-name' +TAG_RAY_NODE_KIND = 'ray-node-type' +TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name' + +# MIG constants +MANAGED_INSTANCE_GROUP_CONFIG = 'managed-instance-group' +DEFAULT_MANAGED_INSTANCE_GROUP_PROVISION_TIMEOUT = 900 # 15 minutes +MIG_NAME_PREFIX = 'sky-mig-' +INSTANCE_TEMPLATE_NAME_PREFIX = 'sky-it-' diff --git a/sky/provision/gcp/instance.py b/sky/provision/gcp/instance.py index a4996fc4d4b..62f234725dd 100644 --- a/sky/provision/gcp/instance.py +++ b/sky/provision/gcp/instance.py @@ -16,11 +16,6 @@ logger = sky_logging.init_logger(__name__) -TAG_SKYPILOT_HEAD_NODE = 'skypilot-head-node' -# Tag uniquely identifying all nodes of a cluster -TAG_RAY_CLUSTER_NAME = 'ray-cluster-name' -TAG_RAY_NODE_KIND = 'ray-node-type' - _INSTANCE_RESOURCE_NOT_FOUND_PATTERN = re.compile( r'The resource \'projects/.*/zones/.*/instances/.*\' was not found') @@ -66,7 +61,7 @@ def query_instances( assert provider_config is not None, (cluster_name_on_cloud, provider_config) zone = provider_config['availability_zone'] project_id = provider_config['project_id'] - label_filters = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} + label_filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} handler: Type[ instance_utils.GCPInstance] = instance_utils.GCPComputeInstance @@ -124,15 +119,15 @@ def _wait_for_operations( logger.debug( f'wait_for_compute_{op_type}_operation: ' f'Waiting for operation {operation["name"]} to finish...') - handler.wait_for_operation(operation, project_id, zone) + handler.wait_for_operation(operation, project_id, zone=zone) def _get_head_instance_id(instances: List) -> Optional[str]: head_instance_id = None for inst in instances: labels = inst.get('labels', {}) - if (labels.get(TAG_RAY_NODE_KIND) == 'head' or - labels.get(TAG_SKYPILOT_HEAD_NODE) == '1'): + if (labels.get(constants.TAG_RAY_NODE_KIND) == 'head' or + labels.get(constants.TAG_SKYPILOT_HEAD_NODE) == '1'): head_instance_id = inst['name'] break return head_instance_id @@ -158,12 +153,14 @@ def _run_instances(region: str, cluster_name_on_cloud: str, resource: Type[instance_utils.GCPInstance] if node_type == instance_utils.GCPNodeType.COMPUTE: resource = instance_utils.GCPComputeInstance + elif node_type == instance_utils.GCPNodeType.MIG: + resource = 
instance_utils.GCPManagedInstanceGroup elif node_type == instance_utils.GCPNodeType.TPU: resource = instance_utils.GCPTPUVMInstance else: raise ValueError(f'Unknown node type {node_type}') - filter_labels = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} + filter_labels = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} # wait until all stopping instances are stopped/terminated while True: @@ -264,12 +261,16 @@ def get_order_key(node): if config.resume_stopped_nodes and to_start_count > 0 and stopped_instances: resumed_instance_ids = [n['name'] for n in stopped_instances] if resumed_instance_ids: - for instance_id in resumed_instance_ids: - resource.start_instance(instance_id, project_id, - availability_zone) - resource.set_labels(project_id, availability_zone, instance_id, - labels) - to_start_count -= len(resumed_instance_ids) + resumed_instance_ids = resource.start_instances( + cluster_name_on_cloud, project_id, availability_zone, + resumed_instance_ids, labels) + # In MIG case, the resumed_instance_ids will include the previously + # PENDING and RUNNING instances. To avoid double counting, we need to + # remove them from the resumed_instance_ids. + ready_instances = set(resumed_instance_ids) + ready_instances |= set([n['name'] for n in running_instances]) + ready_instances |= set([n['name'] for n in pending_instances]) + to_start_count = config.count - len(ready_instances) if head_instance_id is None: head_instance_id = resource.create_node_tag( @@ -281,9 +282,14 @@ def get_order_key(node): if to_start_count > 0: errors, created_instance_ids = resource.create_instances( - cluster_name_on_cloud, project_id, availability_zone, - config.node_config, labels, to_start_count, - head_instance_id is None) + cluster_name_on_cloud, + project_id, + availability_zone, + config.node_config, + labels, + to_start_count, + total_count=config.count, + include_head_node=head_instance_id is None) if errors: error = common.ProvisionerError('Failed to launch instances.') error.errors = errors @@ -387,7 +393,7 @@ def get_cluster_info( assert provider_config is not None, cluster_name_on_cloud zone = provider_config['availability_zone'] project_id = provider_config['project_id'] - label_filters = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} + label_filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} handlers: List[Type[instance_utils.GCPInstance]] = [ instance_utils.GCPComputeInstance @@ -415,7 +421,7 @@ def get_cluster_info( project_id, zone, { - **label_filters, TAG_RAY_NODE_KIND: 'head' + **label_filters, constants.TAG_RAY_NODE_KIND: 'head' }, lambda h: [h.RUNNING_STATE], ) @@ -441,14 +447,14 @@ def stop_instances( assert provider_config is not None, cluster_name_on_cloud zone = provider_config['availability_zone'] project_id = provider_config['project_id'] - label_filters = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} + label_filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} tpu_node = provider_config.get('tpu_node') if tpu_node is not None: instance_utils.delete_tpu_node(project_id, zone, tpu_node) if worker_only: - label_filters[TAG_RAY_NODE_KIND] = 'worker' + label_filters[constants.TAG_RAY_NODE_KIND] = 'worker' handlers: List[Type[instance_utils.GCPInstance]] = [ instance_utils.GCPComputeInstance @@ -510,9 +516,16 @@ def terminate_instances( if tpu_node is not None: instance_utils.delete_tpu_node(project_id, zone, tpu_node) - label_filters = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} + use_mig = provider_config.get('use_managed_instance_group', False) + if 
use_mig: + # Deleting the MIG will also delete the instances. + instance_utils.GCPManagedInstanceGroup.delete_mig( + project_id, zone, cluster_name_on_cloud) + return + + label_filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} if worker_only: - label_filters[TAG_RAY_NODE_KIND] = 'worker' + label_filters[constants.TAG_RAY_NODE_KIND] = 'worker' handlers: List[Type[instance_utils.GCPInstance]] = [ instance_utils.GCPComputeInstance @@ -555,7 +568,7 @@ def open_ports( project_id = provider_config['project_id'] firewall_rule_name = provider_config['firewall_rule'] - label_filters = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} + label_filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud} handlers: List[Type[instance_utils.GCPInstance]] = [ instance_utils.GCPComputeInstance, instance_utils.GCPTPUVMInstance, diff --git a/sky/provision/gcp/instance_utils.py b/sky/provision/gcp/instance_utils.py index dde0918274d..e1e72a25d6c 100644 --- a/sky/provision/gcp/instance_utils.py +++ b/sky/provision/gcp/instance_utils.py @@ -14,12 +14,10 @@ from sky.clouds import gcp as gcp_cloud from sky.provision import common from sky.provision.gcp import constants +from sky.provision.gcp import mig_utils from sky.utils import common_utils from sky.utils import ux_utils -# Tag uniquely identifying all nodes of a cluster -TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name' -TAG_RAY_CLUSTER_NAME = 'ray-cluster-name' # Tag for the name of the node INSTANCE_NAME_MAX_LEN = 64 INSTANCE_NAME_UUID_LEN = 8 @@ -100,19 +98,20 @@ def _generate_node_name(cluster_name: str, node_suffix: str, return node_name -def _log_errors(errors: List[Dict[str, str]], e: Any, - zone: Optional[str]) -> None: - """Format errors into a string.""" +def _format_and_log_message_from_errors(errors: List[Dict[str, str]], e: Any, + zone: Optional[str]) -> str: + """Format errors into a string and log it to the console.""" if errors: plural = 's' if len(errors) > 1 else '' codes = ', '.join(repr(e.get('code', 'N/A')) for e in errors) messages = '; '.join( repr(e.get('message', 'N/A').strip('.')) for e in errors) zone_str = f' in {zone}' if zone else '' - logger.warning(f'Got return code{plural} {codes}' - f'{zone_str}: {messages}') + msg = f'Got return code{plural} {codes}{zone_str}: {messages}' else: - logger.warning(f'create_instances: Failed with reason: {e}') + msg = f'create_instances: Failed with reason: {e}' + logger.warning(msg) + return msg def selflink_to_name(selflink: str) -> str: @@ -133,6 +132,8 @@ def instance_to_handler(instance: str): return GCPComputeInstance elif instance_type == 'tpu': return GCPTPUVMInstance + elif instance.startswith(constants.MIG_NAME_PREFIX): + return GCPManagedInstanceGroup else: raise ValueError(f'Unknown instance type: {instance_type}') @@ -176,8 +177,11 @@ def terminate( raise NotImplementedError @classmethod - def wait_for_operation(cls, operation: dict, project_id: str, - zone: Optional[str]) -> None: + def wait_for_operation(cls, + operation: dict, + project_id: str, + region: Optional[str] = None, + zone: Optional[str] = None) -> None: raise NotImplementedError @classmethod @@ -239,6 +243,7 @@ def create_instances( node_config: dict, labels: dict, count: int, + total_count: int, include_head_node: bool, ) -> Tuple[Optional[List], List[str]]: """Creates multiple instances and returns result. 
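A note on the `run_instances` change in sky/provision/gcp/instance.py above: `to_start_count` is now recomputed from the union of resumed, running, and pending instances because, in the MIG case, the batch `start_instances` call (added just below) reports instances that were already PENDING or RUNNING. A minimal sketch of that guard, using made-up instance names rather than SkyPilot's actual data structures:

    # Sketch of the double-counting guard; names and counts are illustrative.
    resumed = {'node-1', 'node-2'}   # reported by resource.start_instances()
    running = {'node-1'}             # was already RUNNING before the resume
    pending = {'node-3'}             # was already PENDING from a prior resize
    requested_count = 4              # config.count

    ready = resumed | running | pending           # de-duplicated live nodes
    to_start_count = requested_count - len(ready)
    # Plain subtraction (4 - 2 - 1 - 1 = 0) would count 'node-1' twice;
    # the set union correctly leaves one node still to create.
    assert to_start_count == 1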
@@ -247,6 +252,21 @@ def create_instances( """ raise NotImplementedError + @classmethod + def start_instances(cls, cluster_name: str, project_id: str, zone: str, + instances: List[str], labels: Dict[str, + str]) -> List[str]: + """Start multiple instances. + + Returns: + List of instance names that are started. + """ + del cluster_name # Unused + for instance_id in instances: + cls.start_instance(instance_id, project_id, zone) + cls.set_labels(project_id, zone, instance_id, labels) + return instances + @classmethod def start_instance(cls, node_id: str, project_id: str, zone: str) -> None: """Start a stopped instance.""" @@ -400,11 +420,18 @@ def filter( return instances @classmethod - def wait_for_operation(cls, operation: dict, project_id: str, - zone: Optional[str]) -> None: + def wait_for_operation(cls, + operation: dict, + project_id: str, + region: Optional[str] = None, + zone: Optional[str] = None, + timeout: int = GCP_TIMEOUT) -> None: if zone is not None: kwargs = {'zone': zone} operation_caller = cls.load_resource().zoneOperations() + elif region is not None: + kwargs = {'region': region} + operation_caller = cls.load_resource().regionOperations() else: kwargs = {} operation_caller = cls.load_resource().globalOperations() @@ -423,13 +450,13 @@ def call_operation(fn, timeout: int): return request.execute(num_retries=GCP_MAX_RETRIES) wait_start = time.time() - while time.time() - wait_start < GCP_TIMEOUT: + while time.time() - wait_start < timeout: # Retry the wait() call until it succeeds or times out. # This is because the wait() call is only best effort, and does not # guarantee that the operation is done when it returns. # Reference: https://cloud.google.com/workflows/docs/reference/googleapis/compute/v1/zoneOperations/wait # pylint: disable=line-too-long - timeout = max(GCP_TIMEOUT - (time.time() - wait_start), 1) - result = call_operation(operation_caller.wait, timeout) + remaining_timeout = max(timeout - (time.time() - wait_start), 1) + result = call_operation(operation_caller.wait, remaining_timeout) if result['status'] == 'DONE': # NOTE: Error example: # { @@ -441,8 +468,10 @@ def call_operation(fn, timeout: int): logger.debug( 'wait_operations: Failed to create instances. Reason: ' f'{errors}') - _log_errors(errors, result, zone) + msg = _format_and_log_message_from_errors( + errors, result, zone) error = common.ProvisionerError('Operation failed') + setattr(error, 'detailed_reason', msg) error.errors = errors raise error return @@ -451,9 +480,10 @@ def call_operation(fn, timeout: int): else: logger.warning('wait_for_operation: Timeout waiting for creation ' 'operation, cancelling the operation ...') - timeout = max(GCP_TIMEOUT - (time.time() - wait_start), 1) + remaining_timeout = max(timeout - (time.time() - wait_start), 1) try: - result = call_operation(operation_caller.delete, timeout) + result = call_operation(operation_caller.delete, + remaining_timeout) except gcp.http_error_exception() as e: logger.debug('wait_for_operation: failed to cancel operation ' f'due to error: {e}') @@ -462,8 +492,10 @@ def call_operation(fn, timeout: int): 'message': f'Timeout waiting for operation {operation["name"]}', 'domain': 'wait_for_operation' }] - _log_errors(errors, None, zone) + msg = _format_and_log_message_from_errors(errors, None, zone) error = common.ProvisionerError('Operation timed out') + # Used for usage collection only, to include in the usage message. 
+ setattr(error, 'detailed_reason', msg) error.errors = errors raise error @@ -606,7 +638,7 @@ def set_labels(cls, project_id: str, availability_zone: str, node_id: str, body=body, ).execute(num_retries=GCP_CREATE_MAX_RETRIES)) - cls.wait_for_operation(operation, project_id, availability_zone) + cls.wait_for_operation(operation, project_id, zone=availability_zone) @classmethod def create_instances( @@ -617,6 +649,7 @@ def create_instances( node_config: dict, labels: dict, count: int, + total_count: int, include_head_node: bool, ) -> Tuple[Optional[List], List[str]]: # NOTE: The syntax for bulkInsert() is different from insert(). @@ -643,8 +676,8 @@ def create_instances( config.update({ 'labels': dict( labels, **{ - TAG_RAY_CLUSTER_NAME: cluster_name, - TAG_SKYPILOT_CLUSTER_NAME: cluster_name + constants.TAG_RAY_CLUSTER_NAME: cluster_name, + constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name }), }) @@ -739,6 +772,19 @@ def _insert(cls, names: List[str], project_id: str, zone: str, logger.debug('"insert" operation requested ...') return operations + @classmethod + def _convert_selflinks_in_config(cls, config: dict) -> None: + """Convert selflinks to names in the config.""" + for disk in config.get('disks', []): + disk_type = disk.get('initializeParams', {}).get('diskType') + if disk_type is not None: + disk['initializeParams']['diskType'] = selflink_to_name( + disk_type) + config['machineType'] = selflink_to_name(config['machineType']) + for accelerator in config.get('guestAccelerators', []): + accelerator['acceleratorType'] = selflink_to_name( + accelerator['acceleratorType']) + @classmethod def _bulk_insert(cls, names: List[str], project_id: str, zone: str, config: dict) -> List[dict]: @@ -752,15 +798,7 @@ def _bulk_insert(cls, names: List[str], project_id: str, zone: str, k: v for d in config['scheduling'] for k, v in d.items() } - for disk in config.get('disks', []): - disk_type = disk.get('initializeParams', {}).get('diskType') - if disk_type is not None: - disk['initializeParams']['diskType'] = selflink_to_name( - disk_type) - config['machineType'] = selflink_to_name(config['machineType']) - for accelerator in config.get('guestAccelerators', []): - accelerator['acceleratorType'] = selflink_to_name( - accelerator['acceleratorType']) + cls._convert_selflinks_in_config(config) body = { 'count': len(names), @@ -819,7 +857,7 @@ def _handle_http_error(e): }) logger.debug( f'create_instances: googleapiclient.errors.HttpError: {e}') - _log_errors(errors, e, zone) + _format_and_log_message_from_errors(errors, e, zone) return errors # Allow Google Compute Engine instance templates. @@ -849,13 +887,13 @@ def _handle_http_error(e): if errors: logger.debug('create_instances: Failed to create instances. 
' f'Reason: {errors}') - _log_errors(errors, operations, zone) + _format_and_log_message_from_errors(errors, operations, zone) return errors logger.debug('Waiting GCP instances to be ready ...') try: for operation in operations: - cls.wait_for_operation(operation, project_id, zone) + cls.wait_for_operation(operation, project_id, zone=zone) except common.ProvisionerError as e: return e.errors except gcp.http_error_exception() as e: @@ -876,7 +914,7 @@ def start_instance(cls, node_id: str, project_id: str, zone: str) -> None: instance=node_id, ).execute()) - cls.wait_for_operation(operation, project_id, zone) + cls.wait_for_operation(operation, project_id, zone=zone) @classmethod def get_instance_info(cls, project_id: str, availability_zone: str, @@ -935,7 +973,219 @@ def resize_disk(cls, project_id: str, availability_zone: str, logger.warning(f'googleapiclient.errors.HttpError: {e.reason}') return - cls.wait_for_operation(operation, project_id, availability_zone) + cls.wait_for_operation(operation, project_id, zone=availability_zone) + + +class GCPManagedInstanceGroup(GCPComputeInstance): + """Handler for GCP Managed Instance Group.""" + + @classmethod + def create_instances( + cls, + cluster_name: str, + project_id: str, + zone: str, + node_config: dict, + labels: dict, + count: int, + total_count: int, + include_head_node: bool, + ) -> Tuple[Optional[List], List[str]]: + logger.debug(f'Creating cluster with MIG: {cluster_name!r}') + config = copy.deepcopy(node_config) + labels = dict(config.get('labels', {}), **labels) + + config.update({ + 'labels': dict( + labels, + **{ + constants.TAG_RAY_CLUSTER_NAME: cluster_name, + # Assume all nodes are workers, we can update the head node + # once the instances are created. + constants.TAG_RAY_NODE_KIND: 'worker', + constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name, + }), + }) + cls._convert_selflinks_in_config(config) + + # Convert label values to string and lowercase per MIG API requirement. + region = zone.rpartition('-')[0] + instance_template_name = mig_utils.get_instance_template_name( + cluster_name) + managed_instance_group_name = mig_utils.get_managed_instance_group_name( + cluster_name) + + instance_template_exists = mig_utils.check_instance_template_exits( + project_id, region, instance_template_name) + mig_exists = mig_utils.check_managed_instance_group_exists( + project_id, zone, managed_instance_group_name) + + label_filters = { + constants.TAG_RAY_CLUSTER_NAME: cluster_name, + } + potential_head_instances = [] + if mig_exists: + instances = cls.filter(project_id, + zone, + label_filters={ + constants.TAG_RAY_NODE_KIND: 'head', + **label_filters, + }, + status_filters=cls.NEED_TO_TERMINATE_STATES) + potential_head_instances = list(instances.keys()) + + config['labels'] = { + k: str(v).lower() for k, v in config['labels'].items() + } + if instance_template_exists: + if mig_exists: + logger.debug( + f'Instance template {instance_template_name} already ' + 'exists. Skip creating it.') + else: + logger.debug( + f'Instance template {instance_template_name!r} ' + 'exists and no instance group is using it. This is a ' + 'leftover of a previous autodown. Delete it and recreate ' + 'it.') + # TODO(zhwu): this is a bit hacky as we cannot delete instance + # template during an autodown, we can only defer the deletion + # to the next launch of a cluster with the same name. We should + # find a better way to handle this. 
+ cls._delete_instance_template(project_id, zone, + instance_template_name) + instance_template_exists = False + + if not instance_template_exists: + operation = mig_utils.create_region_instance_template( + cluster_name, project_id, region, instance_template_name, + config) + cls.wait_for_operation(operation, project_id, region=region) + # create managed instance group + instance_template_url = (f'projects/{project_id}/regions/{region}/' + f'instanceTemplates/{instance_template_name}') + if not mig_exists: + # Create a new MIG with size 0 and resize it later for triggering + # DWS, according to the doc: https://cloud.google.com/compute/docs/instance-groups/create-mig-with-gpu-vms # pylint: disable=line-too-long + operation = mig_utils.create_managed_instance_group( + project_id, + zone, + managed_instance_group_name, + instance_template_url, + size=0) + cls.wait_for_operation(operation, project_id, zone=zone) + + managed_instance_group_config = config[ + constants.MANAGED_INSTANCE_GROUP_CONFIG] + if count > 0: + # Use resize to trigger DWS for creating VMs. + operation = mig_utils.resize_managed_instance_group( + project_id, + zone, + managed_instance_group_name, + count, + run_duration=managed_instance_group_config['run_duration']) + cls.wait_for_operation(operation, project_id, zone=zone) + + # This will block the provisioning until the nodes are ready, which + # makes the failover not effective. We rely on the request timeout set + # by user to trigger failover. + mig_utils.wait_for_managed_group_to_be_stable( + project_id, + zone, + managed_instance_group_name, + timeout=managed_instance_group_config.get( + 'provision_timeout', + constants.DEFAULT_MANAGED_INSTANCE_GROUP_PROVISION_TIMEOUT)) + + pending_running_instance_names = cls._add_labels_and_find_head( + cluster_name, project_id, zone, labels, potential_head_instances) + assert len(pending_running_instance_names) == total_count, ( + pending_running_instance_names, total_count) + cls.create_node_tag( + project_id, + zone, + pending_running_instance_names[0], + is_head=True, + ) + return None, pending_running_instance_names + + @classmethod + def _delete_instance_template(cls, project_id: str, zone: str, + instance_template_name: str) -> None: + logger.debug(f'Deleting instance template {instance_template_name}...') + region = zone.rpartition('-')[0] + try: + operation = cls.load_resource().regionInstanceTemplates().delete( + project=project_id, + region=region, + instanceTemplate=instance_template_name).execute() + cls.wait_for_operation(operation, project_id, region=region) + except gcp.http_error_exception() as e: + if re.search(mig_utils.IT_RESOURCE_NOT_FOUND_PATTERN, + str(e)) is None: + raise + logger.warning( + f'Instance template {instance_template_name!r} does not exist. ' + 'Skip deletion.') + + @classmethod + def delete_mig(cls, project_id: str, zone: str, cluster_name: str) -> None: + mig_name = mig_utils.get_managed_instance_group_name(cluster_name) + # Get all resize request of the MIG and cancel them. + mig_utils.cancel_all_resize_request_for_mig(project_id, zone, mig_name) + logger.debug(f'Deleting MIG {mig_name!r} ...') + try: + operation = cls.load_resource().instanceGroupManagers().delete( + project=project_id, zone=zone, + instanceGroupManager=mig_name).execute() + cls.wait_for_operation(operation, project_id, zone=zone) + except gcp.http_error_exception() as e: + if re.search(mig_utils.MIG_RESOURCE_NOT_FOUND_PATTERN, + str(e)) is None: + raise + logger.warning(f'MIG {mig_name!r} does not exist. 
Skip ' + 'deletion.') + + # In the autostop case, the following deletion of instance template + # will not be executed as the instance that runs the deletion will be + # terminated with the managed instance group. It is ok to leave the + # instance template there as when a user creates a new cluster with the + # same name, the instance template will be updated in our + # create_instances method. + cls._delete_instance_template( + project_id, zone, + mig_utils.get_instance_template_name(cluster_name)) + + @classmethod + def _add_labels_and_find_head( + cls, cluster_name: str, project_id: str, zone: str, + labels: Dict[str, str], + potential_head_instances: List[str]) -> List[str]: + pending_running_instances = cls.filter( + project_id, + zone, + {constants.TAG_RAY_CLUSTER_NAME: cluster_name}, + # Find all provisioning and running instances. + status_filters=cls.NEED_TO_STOP_STATES) + for running_instance_name in pending_running_instances.keys(): + if running_instance_name in potential_head_instances: + head_instance_name = running_instance_name + break + else: + head_instance_name = list(pending_running_instances.keys())[0] + # We need to update the node's label if mig already exists, as the + # config is not updated during the resize operation. + for instance_name in pending_running_instances.keys(): + cls.set_labels(project_id=project_id, + availability_zone=zone, + node_id=instance_name, + labels=labels) + + pending_running_instance_names = list(pending_running_instances.keys()) + pending_running_instance_names.remove(head_instance_name) + # Label for head node type will be set by caller + return [head_instance_name] + pending_running_instance_names class GCPTPUVMInstance(GCPInstance): @@ -959,10 +1209,13 @@ def load_resource(cls): discoveryServiceUrl='https://tpu.googleapis.com/$discovery/rest') @classmethod - def wait_for_operation(cls, operation: dict, project_id: str, - zone: Optional[str]) -> None: + def wait_for_operation(cls, + operation: dict, + project_id: str, + region: Optional[str] = None, + zone: Optional[str] = None) -> None: """Poll for TPU operation until finished.""" - del project_id, zone # unused + del project_id, region, zone # unused @_retry_on_http_exception( f'Failed to wait for operation {operation["name"]}') @@ -1176,6 +1429,7 @@ def create_instances( node_config: dict, labels: dict, count: int, + total_count: int, include_head_node: bool, ) -> Tuple[Optional[List], List[str]]: config = copy.deepcopy(node_config) @@ -1198,8 +1452,8 @@ def create_instances( config.update({ 'labels': dict( labels, **{ - TAG_RAY_CLUSTER_NAME: cluster_name, - TAG_SKYPILOT_CLUSTER_NAME: cluster_name + constants.TAG_RAY_CLUSTER_NAME: cluster_name, + constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name }), }) @@ -1257,7 +1511,7 @@ def create_instances( 'domain': 'create_instances', 'message': error_details, }) - _log_errors(errors, e, zone) + _format_and_log_message_from_errors(errors, e, zone) return errors, names for detail in error_details: # To be consistent with error messages returned by operation @@ -1276,7 +1530,7 @@ def create_instances( 'domain': violation.get('subject'), 'message': violation.get('description'), }) - _log_errors(errors, e, zone) + _format_and_log_message_from_errors(errors, e, zone) return errors, names errors = [] for operation in operations: @@ -1294,7 +1548,7 @@ def create_instances( if errors: logger.debug('create_instances: Failed to create instances. 
' f'Reason: {errors}') - _log_errors(errors, operations, zone) + _format_and_log_message_from_errors(errors, operations, zone) return errors, names logger.debug('Waiting GCP instances to be ready ...') @@ -1336,7 +1590,7 @@ def create_instances( 'message': 'Timeout waiting for creation operation', 'domain': 'create_instances' }] - _log_errors(errors, None, zone) + _format_and_log_message_from_errors(errors, None, zone) return errors, names # NOTE: Error example: @@ -1353,7 +1607,7 @@ def create_instances( logger.debug( 'create_instances: Failed to create instances. Reason: ' f'{errors}') - _log_errors(errors, results, zone) + _format_and_log_message_from_errors(errors, results, zone) return errors, names assert all(success), ( 'Failed to create instances, but there is no error. ' @@ -1406,10 +1660,11 @@ class GCPNodeType(enum.Enum): """Enum for GCP node types (compute & tpu)""" COMPUTE = 'compute' + MIG = 'mig' TPU = 'tpu' -def get_node_type(node: dict) -> GCPNodeType: +def get_node_type(config: Dict[str, Any]) -> GCPNodeType: """Returns node type based on the keys in ``node``. This is a very simple check. If we have a ``machineType`` key, @@ -1419,17 +1674,22 @@ def get_node_type(node: dict) -> GCPNodeType: This works for both node configs and API returned nodes. """ - - if 'machineType' not in node and 'acceleratorType' not in node: + if ('machineType' not in config and 'acceleratorType' not in config): raise ValueError( 'Invalid node. For a Compute instance, "machineType" is ' 'required. ' 'For a TPU instance, "acceleratorType" and no "machineType" ' 'is required. ' - f'Got {list(node)}') + f'Got {list(config)}') - if 'machineType' not in node and 'acceleratorType' in node: + if 'machineType' not in config and 'acceleratorType' in config: return GCPNodeType.TPU + + if (config.get(constants.MANAGED_INSTANCE_GROUP_CONFIG, None) is not None + and config.get('guestAccelerators', None) is not None): + # DWS in MIG only works for machine with GPUs. + return GCPNodeType.MIG + return GCPNodeType.COMPUTE @@ -1475,7 +1735,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str], 'https://console.cloud.google.com/iam-admin/quotas ' 'for more information.' }] - _log_errors(provisioner_err.errors, e, zone) + _format_and_log_message_from_errors(provisioner_err.errors, e, zone) raise provisioner_err from e if 'PERMISSION_DENIED' in stderr: @@ -1484,7 +1744,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str], 'domain': 'tpu', 'message': 'TPUs are not available in this zone.' }] - _log_errors(provisioner_err.errors, e, zone) + _format_and_log_message_from_errors(provisioner_err.errors, e, zone) raise provisioner_err from e if 'no more capacity in the zone' in stderr: @@ -1493,7 +1753,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str], 'domain': 'tpu', 'message': 'No more capacity in this zone.' }] - _log_errors(provisioner_err.errors, e, zone) + _format_and_log_message_from_errors(provisioner_err.errors, e, zone) raise provisioner_err from e if 'CloudTpu received an invalid AcceleratorType' in stderr: @@ -1506,7 +1766,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str], 'message': (f'TPU type {tpu_type} is not available in this ' f'zone {zone}.') }] - _log_errors(provisioner_err.errors, e, zone) + _format_and_log_message_from_errors(provisioner_err.errors, e, zone) raise provisioner_err from e # TODO(zhwu): Add more error code handling, if needed. 
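One motivation for renaming `_log_errors` to `_format_and_log_message_from_errors` throughout this file is that the helper now returns the message it logs, so callers can attach it to the raised `ProvisionerError` as `detailed_reason` for usage collection. A self-contained sketch of that pattern (the error class below is a local stand-in, not the one from sky.provision.common):

    from typing import Dict, List, Optional

    class ProvisionerError(RuntimeError):  # stand-in for the real class
        pass

    def format_and_log(errors: List[Dict[str, str]],
                       zone: Optional[str]) -> str:
        codes = ', '.join(repr(e.get('code', 'N/A')) for e in errors)
        messages = '; '.join(repr(e.get('message', 'N/A')) for e in errors)
        zone_str = f' in {zone}' if zone else ''
        msg = f'Got return code(s) {codes}{zone_str}: {messages}'
        print(msg)  # stands in for logger.warning
        return msg

    errors = [{'code': 'QUOTA_EXCEEDED', 'message': 'Quota exceeded'}]
    error = ProvisionerError('Operation failed')
    # The formatted message rides along on the exception for later reporting.
    setattr(error, 'detailed_reason', format_and_log(errors, 'us-central1-a'))
    error.errors = errors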
@@ -1515,7 +1775,7 @@
         'domain': 'tpu',
         'message': stderr
     }]
-    _log_errors(provisioner_err.errors, e, zone)
+    _format_and_log_message_from_errors(provisioner_err.errors, e, zone)
     raise provisioner_err from e
diff --git a/sky/provision/gcp/mig_utils.py b/sky/provision/gcp/mig_utils.py
new file mode 100644
index 00000000000..9e33f5171e2
--- /dev/null
+++ b/sky/provision/gcp/mig_utils.py
@@ -0,0 +1,209 @@
+"""Managed Instance Group Utils"""
+import re
+import subprocess
+from typing import Any, Dict
+
+from sky import sky_logging
+from sky.adaptors import gcp
+from sky.provision.gcp import constants
+
+logger = sky_logging.init_logger(__name__)
+
+MIG_RESOURCE_NOT_FOUND_PATTERN = re.compile(
+    r'The resource \'projects/.*/zones/.*/instanceGroupManagers/.*\' was not '
+    r'found')
+
+IT_RESOURCE_NOT_FOUND_PATTERN = re.compile(
+    r'The resource \'projects/.*/regions/.*/instanceTemplates/.*\' was not '
+    'found')
+
+
+def get_instance_template_name(cluster_name: str) -> str:
+    return f'{constants.INSTANCE_TEMPLATE_NAME_PREFIX}{cluster_name}'
+
+
+def get_managed_instance_group_name(cluster_name: str) -> str:
+    return f'{constants.MIG_NAME_PREFIX}{cluster_name}'
+
+
+def check_instance_template_exits(project_id: str, region: str,
+                                  template_name: str) -> bool:
+    compute = gcp.build('compute',
+                        'v1',
+                        credentials=None,
+                        cache_discovery=False)
+    try:
+        compute.regionInstanceTemplates().get(
+            project=project_id, region=region,
+            instanceTemplate=template_name).execute()
+    except gcp.http_error_exception() as e:
+        if IT_RESOURCE_NOT_FOUND_PATTERN.search(str(e)) is not None:
+            # Instance template does not exist.
+            return False
+        raise
+    return True
+
+
+def create_region_instance_template(cluster_name_on_cloud: str, project_id: str,
+                                    region: str, template_name: str,
+                                    node_config: Dict[str, Any]) -> dict:
+    """Create a regional instance template."""
+    logger.debug(f'Creating regional instance template {template_name!r}.')
+    compute = gcp.build('compute',
+                        'v1',
+                        credentials=None,
+                        cache_discovery=False)
+    config = node_config.copy()
+    config.pop(constants.MANAGED_INSTANCE_GROUP_CONFIG, None)
+
+    # We have to ignore user defined scheduling for DWS.
+    # TODO: Add a warning log for this behaviour.
+ scheduling = config.get('scheduling', {}) + assert scheduling.get('provisioningModel') != 'SPOT', ( + 'DWS does not support spot VMs.') + + reservations_affinity = config.pop('reservation_affinity', None) + if reservations_affinity is not None: + logger.warning( + f'Ignoring reservations_affinity {reservations_affinity} ' + 'for DWS.') + + # Create the regional instance template request + operation = compute.regionInstanceTemplates().insert( + project=project_id, + region=region, + body={ + 'name': template_name, + 'properties': dict( + description=( + 'SkyPilot instance template for ' + f'{cluster_name_on_cloud!r} to support DWS requests.'), + reservationAffinity=dict( + consumeReservationType='NO_RESERVATION'), + **config, + ) + }).execute() + return operation + + +def create_managed_instance_group(project_id: str, zone: str, group_name: str, + instance_template_url: str, + size: int) -> dict: + logger.debug(f'Creating managed instance group {group_name!r}.') + compute = gcp.build('compute', + 'v1', + credentials=None, + cache_discovery=False) + operation = compute.instanceGroupManagers().insert( + project=project_id, + zone=zone, + body={ + 'name': group_name, + 'instanceTemplate': instance_template_url, + 'target_size': size, + 'instanceLifecyclePolicy': { + 'defaultActionOnFailure': 'DO_NOTHING', + }, + 'updatePolicy': { + 'type': 'OPPORTUNISTIC', + }, + }).execute() + return operation + + +def resize_managed_instance_group(project_id: str, zone: str, group_name: str, + resize_by: int, run_duration: int) -> dict: + logger.debug(f'Resizing managed instance group {group_name!r} by ' + f'{resize_by} with run duration {run_duration}.') + compute = gcp.build('compute', + 'beta', + credentials=None, + cache_discovery=False) + operation = compute.instanceGroupManagerResizeRequests().insert( + project=project_id, + zone=zone, + instanceGroupManager=group_name, + body={ + 'name': group_name, + 'resizeBy': resize_by, + 'requestedRunDuration': { + 'seconds': run_duration, + } + }).execute() + return operation + + +def cancel_all_resize_request_for_mig(project_id: str, zone: str, + group_name: str) -> None: + logger.debug(f'Cancelling all resize requests for MIG {group_name!r}.') + try: + compute = gcp.build('compute', + 'beta', + credentials=None, + cache_discovery=False) + operation = compute.instanceGroupManagerResizeRequests().list( + project=project_id, + zone=zone, + instanceGroupManager=group_name, + filter='state eq ACCEPTED').execute() + for request in operation.get('items', []): + try: + compute.instanceGroupManagerResizeRequests().cancel( + project=project_id, + zone=zone, + instanceGroupManager=group_name, + resizeRequest=request['name']).execute() + except gcp.http_error_exception() as e: + logger.warning('Failed to cancel resize request ' + f'{request["id"]!r}: {e}') + except gcp.http_error_exception() as e: + if re.search(MIG_RESOURCE_NOT_FOUND_PATTERN, str(e)) is None: + raise + logger.warning(f'MIG {group_name!r} does not exist. 
Skip ' + 'resize request cancellation.') + logger.debug(f'Error: {e}') + + +def check_managed_instance_group_exists(project_id: str, zone: str, + group_name: str) -> bool: + compute = gcp.build('compute', + 'v1', + credentials=None, + cache_discovery=False) + try: + compute.instanceGroupManagers().get( + project=project_id, zone=zone, + instanceGroupManager=group_name).execute() + except gcp.http_error_exception() as e: + if MIG_RESOURCE_NOT_FOUND_PATTERN.search(str(e)) is not None: + return False + raise + return True + + +def wait_for_managed_group_to_be_stable(project_id: str, zone: str, + group_name: str, timeout: int) -> None: + """Wait until the managed instance group is stable.""" + logger.debug(f'Waiting for MIG {group_name} to be stable with timeout ' + f'{timeout}.') + try: + cmd = ('gcloud compute instance-groups managed wait-until ' + f'{group_name} ' + '--stable ' + f'--zone={zone} ' + f'--project={project_id} ' + f'--timeout={timeout}') + logger.info( + f'Waiting for MIG {group_name} to be stable with command:\n{cmd}') + proc = subprocess.run( + f'yes | {cmd}', + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + check=True, + ) + stdout = proc.stdout.decode('ascii') + logger.info(stdout) + except subprocess.CalledProcessError as e: + stderr = e.stderr.decode('ascii') + logger.info(stderr) diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index c81ecd78db4..2d9ead3dc01 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -6,8 +6,9 @@ import os import resource import time -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple +from sky import exceptions from sky import provision from sky import sky_logging from sky.provision import common @@ -22,8 +23,6 @@ from sky.utils import ux_utils logger = sky_logging.init_logger(__name__) -_START_TITLE = '\n' + '-' * 20 + 'Start: {} ' + '-' * 20 -_END_TITLE = '-' * 20 + 'End: {} ' + '-' * 20 + '\n' _MAX_RETRY = 6 @@ -68,42 +67,34 @@ 'sky.skylet.attempt_skylet;') -def _auto_retry(func): +def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True): """Decorator that retries the function if it fails. This decorator is mostly for SSH disconnection issues, which might happen during the setup of instances. 
""" - @functools.wraps(func) - def retry(*args, **kwargs): - backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=5) - for retry_cnt in range(_MAX_RETRY): - try: - return func(*args, **kwargs) - except Exception as e: # pylint: disable=broad-except - if retry_cnt >= _MAX_RETRY - 1: - raise e - sleep = backoff.current_backoff() - logger.info( - f'{func.__name__}: Retrying in {sleep:.1f} seconds, ' - f'due to {e}') - time.sleep(sleep) + def decorator(func): - return retry + @functools.wraps(func) + def retry(*args, **kwargs): + backoff = common_utils.Backoff(initial_backoff=1, + max_backoff_factor=5) + for retry_cnt in range(_MAX_RETRY): + try: + return func(*args, **kwargs) + except Exception as e: # pylint: disable=broad-except + if not should_retry(e) or retry_cnt >= _MAX_RETRY - 1: + raise + sleep = backoff.current_backoff() + logger.info( + f'{func.__name__}: Retrying in {sleep:.1f} seconds, ' + f'due to {e}') + time.sleep(sleep) + return retry -def _log_start_end(func): - - @functools.wraps(func) - def wrapper(*args, **kwargs): - logger.info(_START_TITLE.format(func.__name__)) - try: - return func(*args, **kwargs) - finally: - logger.info(_END_TITLE.format(func.__name__)) - - return wrapper + return decorator def _hint_worker_log_path(cluster_name: str, cluster_info: common.ClusterInfo, @@ -147,7 +138,7 @@ def _parallel_ssh_with_cache(func, return [future.result() for future in results] -@_log_start_end +@common.log_function_start_end def initialize_docker(cluster_name: str, docker_config: Dict[str, Any], cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any]) -> Optional[str]: @@ -156,7 +147,8 @@ def initialize_docker(cluster_name: str, docker_config: Dict[str, Any], return None _hint_worker_log_path(cluster_name, cluster_info, 'initialize_docker') - @_auto_retry + @_auto_retry(should_retry=lambda e: isinstance(e, exceptions.CommandError) + and e.returncode == 255) def _initialize_docker(runner: command_runner.CommandRunner, log_path: str): docker_user = docker_utils.DockerInitializer(docker_config, runner, log_path).initialize() @@ -177,7 +169,7 @@ def _initialize_docker(runner: command_runner.CommandRunner, log_path: str): return docker_users[0] -@_log_start_end +@common.log_function_start_end def setup_runtime_on_cluster(cluster_name: str, setup_commands: List[str], cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any]) -> None: @@ -193,7 +185,7 @@ def setup_runtime_on_cluster(cluster_name: str, setup_commands: List[str], hasher.update(d) digest = hasher.hexdigest() - @_auto_retry + @_auto_retry() def _setup_node(runner: command_runner.CommandRunner, log_path: str): for cmd in setup_commands: returncode, stdout, stderr = runner.run( @@ -253,8 +245,8 @@ def _ray_gpu_options(custom_resource: str) -> str: return f' --num-gpus={acc_count}' -@_log_start_end -@_auto_retry +@common.log_function_start_end +@_auto_retry() def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str], cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any]) -> None: @@ -313,8 +305,8 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str], f'===== stderr ====={stderr}') -@_log_start_end -@_auto_retry +@common.log_function_start_end +@_auto_retry() def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool, custom_resource: Optional[str], ray_port: int, cluster_info: common.ClusterInfo, @@ -410,8 +402,8 @@ def _setup_ray_worker(runner_and_id: Tuple[command_runner.CommandRunner, f'===== stderr ====={stderr}') 
-@_log_start_end -@_auto_retry +@common.log_function_start_end +@_auto_retry() def start_skylet_on_head_node(cluster_name: str, cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any]) -> None: @@ -437,7 +429,7 @@ def start_skylet_on_head_node(cluster_name: str, f'===== stderr ====={stderr}') -@_auto_retry +@_auto_retry() def _internal_file_mounts(file_mounts: Dict, runner: command_runner.CommandRunner, log_path: str) -> None: @@ -494,7 +486,7 @@ def _max_workers_for_file_mounts(common_file_mounts: Dict[str, str]) -> int: return max_workers -@_log_start_end +@common.log_function_start_end def internal_file_mounts(cluster_name: str, common_file_mounts: Dict[str, str], cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, str]) -> None: diff --git a/sky/provision/kubernetes/__init__.py b/sky/provision/kubernetes/__init__.py index ca3938215c9..c72f0c14054 100644 --- a/sky/provision/kubernetes/__init__.py +++ b/sky/provision/kubernetes/__init__.py @@ -2,6 +2,7 @@ from sky.provision.kubernetes.config import bootstrap_instances from sky.provision.kubernetes.instance import get_cluster_info +from sky.provision.kubernetes.instance import get_command_runners from sky.provision.kubernetes.instance import query_instances from sky.provision.kubernetes.instance import run_instances from sky.provision.kubernetes.instance import stop_instances diff --git a/sky/provision/kubernetes/config.py b/sky/provision/kubernetes/config.py index 65c494fcebf..05fe1df19ec 100644 --- a/sky/provision/kubernetes/config.py +++ b/sky/provision/kubernetes/config.py @@ -9,7 +9,9 @@ from sky.adaptors import kubernetes from sky.provision import common +from sky.provision.kubernetes import network_utils from sky.provision.kubernetes import utils as kubernetes_utils +from sky.utils import kubernetes_enums logger = logging.getLogger(__name__) @@ -25,7 +27,10 @@ def bootstrap_instances( _configure_services(namespace, config.provider_config) - config = _configure_ssh_jump(namespace, config) + networking_mode = network_utils.get_networking_mode( + config.provider_config.get('networking_mode')) + if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT: + config = _configure_ssh_jump(namespace, config) requested_service_account = config.node_config['spec']['serviceAccountName'] if (requested_service_account == @@ -46,6 +51,21 @@ def bootstrap_instances( _configure_autoscaler_cluster_role(namespace, config.provider_config) _configure_autoscaler_cluster_role_binding(namespace, config.provider_config) + # SkyPilot system namespace is required for FUSE mounting. Here we just + # create the namespace and set up the necessary permissions. + # + # We need to setup the namespace outside the + # if config.provider_config.get('fuse_device_required') block below + # because if we put in the if block, the following happens: + # 1. User launches job controller on Kubernetes with SERVICE_ACCOUNT. No + # namespace is created at this point since the controller does not + # require FUSE. + # 2. User submits a job requiring FUSE. + # 3. The namespace is created here, but since the job controller is + # using DEFAULT_SERVICE_ACCOUNT_NAME, it does not have the necessary + # permissions to create a role for itself to create the FUSE manager. + # 4. The job fails to launch. 
+ _configure_skypilot_system_namespace(config.provider_config) if config.provider_config.get('port_mode', 'loadbalancer') == 'ingress': logger.info('Port mode is set to ingress, setting up ingress role ' 'and role binding.') @@ -69,26 +89,8 @@ def bootstrap_instances( elif requested_service_account != 'default': logger.info(f'Using service account {requested_service_account!r}, ' 'skipping role and role binding setup.') - - # SkyPilot system namespace is required for FUSE mounting. Here we just - # create the namespace and set up the necessary permissions. - # - # We need to setup the namespace outside the if block below because if - # we put in the if block, the following happens: - # 1. User launches job controller on Kubernetes with SERVICE_ACCOUNT. No - # namespace is created at this point since the controller does not - # require FUSE. - # 2. User submits a job requiring FUSE. - # 3. The namespace is created here, but since the job controller is using - # SERVICE_ACCOUNT, it does not have the necessary permissions to create - # a role for itself to create the FUSE device manager. - # 4. The job fails to launch. - _configure_skypilot_system_namespace(config.provider_config, - requested_service_account) - if config.provider_config.get('fuse_device_required', False): _configure_fuse_mounting(config.provider_config) - return config @@ -502,8 +504,7 @@ def _configure_ssh_jump(namespace, config: common.ProvisionConfig): def _configure_skypilot_system_namespace( - provider_config: Dict[str, - Any], service_account: Optional[str]) -> None: + provider_config: Dict[str, Any]) -> None: """Creates the namespace for skypilot-system mounting if it does not exist. Also patches the SkyPilot service account to have the necessary permissions @@ -513,34 +514,28 @@ def _configure_skypilot_system_namespace( skypilot_system_namespace = provider_config['skypilot_system_namespace'] kubernetes_utils.create_namespace(skypilot_system_namespace) - # Setup permissions if using the default service account. - # If the user has requested a different service account (via - # remote_identity in ~/.sky/config.yaml), we assume they have already set - # up the necessary roles and role bindings. - if service_account == kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME: - # Note - this must be run only after the service account has been - # created in the cluster (in bootstrap_instances). - # Create the role in the skypilot-system namespace if it does not exist. - _configure_autoscaler_role(skypilot_system_namespace, - provider_config, - role_field='autoscaler_skypilot_system_role') - # We must create a unique role binding per-namespace that SkyPilot is - # running in, so we override the name with a unique name identifying - # the namespace. This is required for multi-tenant setups where - # different SkyPilot instances may be running in different namespaces. - override_name = provider_config[ - 'autoscaler_skypilot_system_role_binding']['metadata'][ - 'name'] + '-' + svc_account_namespace - - # Create the role binding in the skypilot-system namespace, and have - # the subject namespace be the namespace that the SkyPilot service - # account is created in. - _configure_autoscaler_role_binding( - skypilot_system_namespace, - provider_config, - binding_field='autoscaler_skypilot_system_role_binding', - override_name=override_name, - override_subject_namespace=svc_account_namespace) + # Note - this must be run only after the service account has been + # created in the cluster (in bootstrap_instances). 
+ # Create the role in the skypilot-system namespace if it does not exist. + _configure_autoscaler_role(skypilot_system_namespace, + provider_config, + role_field='autoscaler_skypilot_system_role') + # We must create a unique role binding per-namespace that SkyPilot is + # running in, so we override the name with a unique name identifying + # the namespace. This is required for multi-tenant setups where + # different SkyPilot instances may be running in different namespaces. + override_name = provider_config['autoscaler_skypilot_system_role_binding'][ + 'metadata']['name'] + '-' + svc_account_namespace + + # Create the role binding in the skypilot-system namespace, and have + # the subject namespace be the namespace that the SkyPilot service + # account is created in. + _configure_autoscaler_role_binding( + skypilot_system_namespace, + provider_config, + binding_field='autoscaler_skypilot_system_role_binding', + override_name=override_name, + override_subject_namespace=svc_account_namespace) def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None: diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 4f88293525f..052cbe1640f 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -12,7 +12,9 @@ from sky.provision import common from sky.provision import docker_utils from sky.provision.kubernetes import config as config_lib +from sky.provision.kubernetes import network_utils from sky.provision.kubernetes import utils as kubernetes_utils +from sky.utils import command_runner from sky.utils import common_utils from sky.utils import kubernetes_enums from sky.utils import ux_utils @@ -158,6 +160,15 @@ def _raise_pod_scheduling_errors(namespace, new_nodes): raise config_lib.KubernetesError(f'{timeout_err_msg}') +def _raise_command_running_error(message: str, command: str, pod_name: str, + rc: int, stdout: str) -> None: + if rc == 0: + return + raise config_lib.KubernetesError( + f'Failed to {message} for pod {pod_name} with return ' + f'code {rc}: {command!r}\nOutput: {stdout}.') + + def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int): """Wait for all pods to be scheduled. @@ -250,39 +261,6 @@ def _wait_for_pods_to_run(namespace, new_nodes): time.sleep(1) -def _run_command_on_pods(node_name: str, - node_namespace: str, - command: List[str], - stream_logs: bool = False): - """Run command on Kubernetes pods. - - If `stream_logs` is True, we poll for output and error messages while the - command is executing, and the stdout and stderr is written to logger.info. - When called from the provisioner, this logger.info is written to the - provision.log file (see setup_provision_logging()). - """ - cmd_output = kubernetes.stream()( - kubernetes.core_api().connect_get_namespaced_pod_exec, - node_name, - node_namespace, - command=command, - stderr=True, - stdin=False, - stdout=True, - tty=False, - _preload_content=(not stream_logs), - _request_timeout=kubernetes.API_TIMEOUT) - if stream_logs: - while cmd_output.is_open(): - cmd_output.update(timeout=1) - if cmd_output.peek_stdout(): - logger.info(f'{cmd_output.read_stdout().strip()}') - if cmd_output.peek_stderr(): - logger.info(f'{cmd_output.read_stderr().strip()}') - cmd_output.close() - return cmd_output - - def _set_env_vars_in_pods(namespace: str, new_pods: List): """Setting environment variables in pods. 
@@ -299,42 +277,46 @@ def _set_env_vars_in_pods(namespace: str, new_pods: List): /etc/profile.d/, making them available for all users in future shell sessions. """ - set_k8s_env_var_cmd = [ - '/bin/sh', - '-c', - docker_utils.SETUP_ENV_VARS_CMD, - ] + set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD for new_pod in new_pods: - _run_command_on_pods(new_pod.metadata.name, namespace, - set_k8s_env_var_cmd) + runner = command_runner.KubernetesCommandRunner( + (namespace, new_pod.metadata.name)) + rc, stdout, _ = runner.run(set_k8s_env_var_cmd, + require_outputs=True, + stream_logs=False) + _raise_command_running_error('set env vars', set_k8s_env_var_cmd, + new_pod.metadata.name, rc, stdout) def _check_user_privilege(namespace: str, new_nodes: List) -> None: # Checks if the default user has sufficient privilege to set up # the kubernetes instance pod. - check_k8s_user_sudo_cmd = [ - '/bin/sh', - '-c', - ( - 'if [ $(id -u) -eq 0 ]; then' - # If user is root, create an alias for sudo used in skypilot setup - ' echo \'alias sudo=""\' >> ~/.bashrc; ' - 'else ' - ' if command -v sudo >/dev/null 2>&1; then ' - ' timeout 2 sudo -l >/dev/null 2>&1 || ' - f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); ' - ' else ' - f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); ' - ' fi; ' - 'fi') - ] + check_k8s_user_sudo_cmd = ( + 'if [ $(id -u) -eq 0 ]; then' + # If user is root, create an alias for sudo used in skypilot setup + ' echo \'alias sudo=""\' >> ~/.bashrc; echo succeed;' + 'else ' + ' if command -v sudo >/dev/null 2>&1; then ' + ' timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || ' + f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); ' + ' else ' + f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); ' + ' fi; ' + 'fi') for new_node in new_nodes: - privilege_check = _run_command_on_pods(new_node.metadata.name, - namespace, - check_k8s_user_sudo_cmd) - if privilege_check == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE): + runner = command_runner.KubernetesCommandRunner( + (namespace, new_node.metadata.name)) + rc, stdout, stderr = runner.run(check_k8s_user_sudo_cmd, + require_outputs=True, + separate_stderr=True, + stream_logs=False) + _raise_command_running_error('check user privilege', + check_k8s_user_sudo_cmd, + new_node.metadata.name, rc, + stdout + stderr) + if stdout == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE): raise config_lib.KubernetesError( 'Insufficient system privileges detected. ' 'Ensure the default user has root access or ' @@ -345,44 +327,43 @@ def _check_user_privilege(namespace: str, new_nodes: List) -> None: def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None: # Setting up ssh for the pod instance. This is already setup for # the jump pod so it does not need to be run for it. 
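The pod-setup helpers in this file now share one pattern: build a KubernetesCommandRunner keyed by (namespace, pod_name), run the shell command, and turn a non-zero return code into a KubernetesError via _raise_command_running_error above. A minimal sketch of the pattern (the helper name is illustrative):

    from sky.utils import command_runner

    def _run_in_pod_or_raise(namespace: str, pod_name: str, cmd: str) -> str:
        runner = command_runner.KubernetesCommandRunner((namespace, pod_name))
        rc, stdout, _ = runner.run(cmd,
                                   require_outputs=True,
                                   stream_logs=False)
        # Raises config_lib.KubernetesError when rc != 0, mirroring the
        # call sites above.
        _raise_command_running_error('run command', cmd, pod_name, rc, stdout)
        return stdout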
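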
- set_k8s_ssh_cmd = [ - '/bin/sh', - '-c', - ( - 'set -x; ' - 'prefix_cmd() ' - '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; ' - 'export DEBIAN_FRONTEND=noninteractive;' - '$(prefix_cmd) apt-get update;' - '$(prefix_cmd) apt install openssh-server rsync -y; ' - '$(prefix_cmd) mkdir -p /var/run/sshd; ' - '$(prefix_cmd) ' - 'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" ' - '/etc/ssh/sshd_config; ' - '$(prefix_cmd) sed ' - '"s@session\\s*required\\s*pam_loginuid.so@session optional ' - 'pam_loginuid.so@g" -i /etc/pam.d/sshd; ' - 'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; ' - '$(prefix_cmd) mkdir -p ~/.ssh; ' - '$(prefix_cmd) chown -R $(whoami) ~/.ssh;' - '$(prefix_cmd) chmod 700 ~/.ssh; ' - '$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; ' - '$(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > ' - '~/.ssh/authorized_keys; ' - '$(prefix_cmd) service ssh restart; ' - # Eliminate the error - # `mesg: ttyname failed: inappropriate ioctl for device`. - # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long - '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;') - ] + set_k8s_ssh_cmd = ( + 'set -ex; ' + 'prefix_cmd() ' + '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; ' + 'export DEBIAN_FRONTEND=noninteractive;' + '$(prefix_cmd) apt-get update;' + '$(prefix_cmd) apt install openssh-server rsync -y; ' + '$(prefix_cmd) mkdir -p /var/run/sshd; ' + '$(prefix_cmd) ' + 'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" ' + '/etc/ssh/sshd_config; ' + '$(prefix_cmd) sed ' + '"s@session\\s*required\\s*pam_loginuid.so@session optional ' + 'pam_loginuid.so@g" -i /etc/pam.d/sshd; ' + 'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; ' + '$(prefix_cmd) mkdir -p ~/.ssh; ' + '$(prefix_cmd) chown -R $(whoami) ~/.ssh;' + '$(prefix_cmd) chmod 700 ~/.ssh; ' + '$(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > ' + '~/.ssh/authorized_keys; ' + '$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; ' + '$(prefix_cmd) service ssh restart; ' + # Eliminate the error + # `mesg: ttyname failed: inappropriate ioctl for device`. + # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long + '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;') + # TODO(romilb): Parallelize the setup of SSH in pods for multi-node clusters for new_node in new_nodes: pod_name = new_node.metadata.name + runner = command_runner.KubernetesCommandRunner((namespace, pod_name)) logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}') - _run_command_on_pods(new_node.metadata.name, - namespace, - set_k8s_ssh_cmd, - stream_logs=True) + rc, stdout, _ = runner.run(set_k8s_ssh_cmd, + require_outputs=True, + stream_logs=False) + _raise_command_running_error('setup ssh', set_k8s_ssh_cmd, pod_name, rc, + stdout) logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}') @@ -515,14 +496,18 @@ def _create_pods(region: str, cluster_name_on_cloud: str, if head_pod_name is None: head_pod_name = pod.metadata.name - # Adding the jump pod to the new_nodes list as well so it can be - # checked if it's scheduled and running along with other pods. 
- ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump'] - jump_pod = kubernetes.core_api().read_namespaced_pod( - ssh_jump_pod_name, namespace) wait_pods_dict = _filter_pods(namespace, tags, ['Pending']) wait_pods = list(wait_pods_dict.values()) - wait_pods.append(jump_pod) + + networking_mode = network_utils.get_networking_mode( + config.provider_config.get('networking_mode')) + if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT: + # Adding the jump pod to the new_nodes list as well so it can be + # checked if it's scheduled and running along with other pods. + ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump'] + jump_pod = kubernetes.core_api().read_namespaced_pod( + ssh_jump_pod_name, namespace) + wait_pods.append(jump_pod) provision_timeout = provider_config['timeout'] wait_str = ('indefinitely' @@ -709,11 +694,16 @@ def get_cluster_info( assert cpu_request is not None, 'cpu_request should not be None' ssh_user = 'sky' - get_k8s_ssh_user_cmd = ['/bin/sh', '-c', ('echo $(whoami)')] + get_k8s_ssh_user_cmd = 'echo $(whoami)' assert head_pod_name is not None - ssh_user = _run_command_on_pods(head_pod_name, namespace, - get_k8s_ssh_user_cmd) - ssh_user = ssh_user.strip() + runner = command_runner.KubernetesCommandRunner((namespace, head_pod_name)) + rc, stdout, stderr = runner.run(get_k8s_ssh_user_cmd, + require_outputs=True, + separate_stderr=True, + stream_logs=False) + _raise_command_running_error('get ssh user', get_k8s_ssh_user_cmd, + head_pod_name, rc, stdout + stderr) + ssh_user = stdout.strip() logger.debug( f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}') @@ -776,3 +766,21 @@ def query_instances( continue cluster_status[pod.metadata.name] = pod_status return cluster_status + + +def get_command_runners( + cluster_info: common.ClusterInfo, + **credentials: Dict[str, Any], +) -> List[command_runner.CommandRunner]: + """Get a command runner for the given cluster.""" + assert cluster_info.provider_config is not None, cluster_info + instances = cluster_info.instances + namespace = _get_namespace(cluster_info.provider_config) + node_list = [] + if cluster_info.head_instance_id is not None: + node_list = [(namespace, cluster_info.head_instance_id)] + node_list.extend((namespace, pod_name) + for pod_name in instances.keys() + if pod_name != cluster_info.head_instance_id) + return command_runner.KubernetesCommandRunner.make_runner_list( + node_list=node_list, **credentials) diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index 61870cb9119..e4b267e8ab3 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -1,6 +1,7 @@ """Kubernetes network provisioning.""" from typing import Any, Dict, List, Optional +from sky import sky_logging from sky.adaptors import kubernetes from sky.provision import common from sky.provision.kubernetes import network_utils @@ -8,8 +9,10 @@ from sky.utils import kubernetes_enums from sky.utils.resources_utils import port_ranges_to_set -_PATH_PREFIX = '/skypilot/{cluster_name_on_cloud}/{port}' -_LOADBALANCER_SERVICE_NAME = '{cluster_name_on_cloud}-skypilot-loadbalancer' +logger = sky_logging.init_logger(__name__) + +_PATH_PREFIX = '/skypilot/{namespace}/{cluster_name_on_cloud}/{port}' +_LOADBALANCER_SERVICE_NAME = '{cluster_name_on_cloud}--skypilot-lb' def open_ports( @@ -73,13 +76,14 @@ def _open_ports_using_ingress( 'https://github.com/kubernetes/ingress-nginx/blob/main/docs/deploy/index.md.' 
# pylint: disable=line-too-long
        )
-    # Prepare service names, ports, for template rendering
-    service_details = [
-        (f'{cluster_name_on_cloud}-skypilot-service--{port}', port,
-         _PATH_PREFIX.format(cluster_name_on_cloud=cluster_name_on_cloud,
-                             port=port).rstrip('/').lstrip('/'))
-        for port in ports
-    ]
+    # Prepare service names and ports for template rendering
+    service_details = [(f'{cluster_name_on_cloud}--skypilot-svc--{port}', port,
+                        _PATH_PREFIX.format(
+                            cluster_name_on_cloud=cluster_name_on_cloud,
+                            port=port,
+                            namespace=kubernetes_utils.
+                            get_current_kube_config_context_namespace()).rstrip(
+                                '/').lstrip('/')) for port in ports]

     # Generate ingress and services specs
     # We batch ingress rule creation because each rule triggers a hot reload of
@@ -160,7 +164,7 @@ def _cleanup_ports_for_ingress(
 ) -> None:
     # Delete services for each port
     for port in ports:
-        service_name = f'{cluster_name_on_cloud}-skypilot-service--{port}'
+        service_name = f'{cluster_name_on_cloud}--skypilot-svc--{port}'
         network_utils.delete_namespaced_service(
             namespace=provider_config.get('namespace', 'default'),
             service_name=service_name,
@@ -217,12 +221,17 @@ def _query_ports_for_loadbalancer(
     ports: List[int],
     provider_config: Dict[str, Any],
 ) -> Dict[int, List[common.Endpoint]]:
+    logger.debug(f'Getting loadbalancer IP for cluster {cluster_name_on_cloud}')
     result: Dict[int, List[common.Endpoint]] = {}
     service_name = _LOADBALANCER_SERVICE_NAME.format(
         cluster_name_on_cloud=cluster_name_on_cloud)
     external_ip = network_utils.get_loadbalancer_ip(
         namespace=provider_config.get('namespace', 'default'),
-        service_name=service_name)
+        service_name=service_name,
+        # Timeout is set so that we can retry the query when the
+        # cluster is first created and the load balancer is not ready yet.
+        timeout=60,
+    )

     if external_ip is None:
         return {}
@@ -245,7 +254,10 @@ def _query_ports_for_ingress(
     result: Dict[int, List[common.Endpoint]] = {}
     for port in ports:
         path_prefix = _PATH_PREFIX.format(
-            cluster_name_on_cloud=cluster_name_on_cloud, port=port)
+            cluster_name_on_cloud=cluster_name_on_cloud,
+            port=port,
+            namespace=kubernetes_utils.
+ get_current_kube_config_context_namespace()) http_port, https_port = external_ports \ if external_ports is not None else (None, None) diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index 836d75af41f..844f84a04f5 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -1,5 +1,6 @@ """Kubernetes network provisioning utils.""" import os +import time from typing import Dict, List, Optional, Tuple, Union import jinja2 @@ -7,12 +8,15 @@ import sky from sky import exceptions +from sky import sky_logging from sky import skypilot_config from sky.adaptors import kubernetes from sky.provision.kubernetes import utils as kubernetes_utils from sky.utils import kubernetes_enums from sky.utils import ux_utils +logger = sky_logging.init_logger(__name__) + _INGRESS_TEMPLATE_NAME = 'kubernetes-ingress.yml.j2' _LOADBALANCER_TEMPLATE_NAME = 'kubernetes-loadbalancer.yml.j2' @@ -43,6 +47,23 @@ def get_port_mode( return port_mode +def get_networking_mode( + mode_str: Optional[str] = None +) -> kubernetes_enums.KubernetesNetworkingMode: + """Get the networking mode from the provider config.""" + mode_str = mode_str or skypilot_config.get_nested( + ('kubernetes', 'networking_mode'), + kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value) + try: + networking_mode = kubernetes_enums.KubernetesNetworkingMode.from_str( + mode_str) + except ValueError as e: + with ux_utils.print_exception_no_traceback(): + raise ValueError(str(e) + + ' Please check: ~/.sky/config.yaml.') from None + return networking_mode + + def fill_loadbalancer_template(namespace: str, service_name: str, ports: List[int], selector_key: str, selector_value: str) -> Dict: @@ -222,18 +243,29 @@ def get_ingress_external_ip_and_ports( return external_ip, None -def get_loadbalancer_ip(namespace: str, service_name: str) -> Optional[str]: +def get_loadbalancer_ip(namespace: str, + service_name: str, + timeout: int = 0) -> Optional[str]: """Returns the IP address of the load balancer.""" core_api = kubernetes.core_api() - service = core_api.read_namespaced_service( - service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) - if service.status.load_balancer.ingress is None: - return None + ip = None - ip = service.status.load_balancer.ingress[ - 0].ip or service.status.load_balancer.ingress[0].hostname - return ip if ip is not None else None + start_time = time.time() + retry_cnt = 0 + while ip is None and (retry_cnt == 0 or time.time() - start_time < timeout): + service = core_api.read_namespaced_service( + service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) + if service.status.load_balancer.ingress is not None: + ip = (service.status.load_balancer.ingress[0].ip or + service.status.load_balancer.ingress[0].hostname) + if ip is None: + retry_cnt += 1 + if retry_cnt % 5 == 0: + logger.debug('Waiting for load balancer IP to be assigned' + '...') + time.sleep(1) + return ip def get_pod_ip(namespace: str, pod_name: str) -> Optional[str]: diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index d5140d8846b..41b43b82c2c 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -3,6 +3,7 @@ import math import os import re +import shutil import subprocess from typing import Any, Dict, List, Optional, Set, Tuple, Union from urllib.parse import urlparse @@ -16,6 +17,7 @@ from sky import skypilot_config from sky.adaptors import kubernetes from sky.provision.kubernetes import 
network_utils
+from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import env_options
 from sky.utils import kubernetes_enums
@@ -35,10 +37,10 @@
     'T': 2**40,
     'P': 2**50,
 }
-NO_GPU_ERROR_MESSAGE = 'No GPUs found in Kubernetes cluster. \
-If your cluster contains GPUs, make sure nvidia.com/gpu resource is available on the nodes and the node labels for identifying GPUs \
-(e.g., skypilot.co/accelerator) are setup correctly. \
-To further debug, run: sky check.'
+NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure '
+                       'nvidia.com/gpu resource is available on the nodes and '
+                       'the node labels for identifying GPUs '
+                       '(e.g., skypilot.co/accelerator) are set up correctly. ')

 KUBERNETES_AUTOSCALER_NOTE = (
     'Note: Kubernetes cluster autoscaling is enabled. '
@@ -53,6 +55,10 @@

 KIND_CONTEXT_NAME = 'kind-skypilot'  # Context name used by sky local up

+# Port-forward proxy command constants
+PORT_FORWARD_PROXY_CMD_TEMPLATE = 'kubernetes-port-forward-proxy-command.sh'
+PORT_FORWARD_PROXY_CMD_PATH = '~/.sky/kubernetes-port-forward-proxy-command.sh'
+
 logger = sky_logging.init_logger(__name__)

@@ -100,6 +106,9 @@ def get_gke_accelerator_name(accelerator: str) -> str:
     Uses the format - nvidia-tesla-<accelerator>.
     A100-80GB, H100-80GB and L4 are an exception. They use nvidia-<accelerator>.
     """
+    if accelerator == 'H100':
+        # H100 is named H100-80GB in GKE.
+        accelerator = 'H100-80GB'
     if accelerator in ('A100-80GB', 'L4', 'H100-80GB'):
         # A100-80GB, L4 and H100-80GB have a different name pattern.
         return 'nvidia-{}'.format(accelerator.lower())
@@ -183,12 +192,77 @@ def get_accelerator_from_label_value(cls, value: str) -> str:
         if value.startswith('nvidia-tesla-'):
             return value.replace('nvidia-tesla-', '').upper()
         elif value.startswith('nvidia-'):
-            return value.replace('nvidia-', '').upper()
+            acc = value.replace('nvidia-', '').upper()
+            if acc in ['H100-80GB', 'H100-MEGA-80GB']:
+                # H100 is named H100-80GB or H100-MEGA-80GB in GKE,
+                # where the latter has improved bandwidth.
+                # See a3-mega instances on GCP.
+                # TODO: we do not distinguish the two GPUs for simplicity,
+                # but we can evaluate whether we should distinguish
+                # them based on users' requests.
+                return 'H100'
+            return acc
         else:
             raise ValueError(
                 f'Invalid accelerator name in GKE cluster: {value}')


+class GFDLabelFormatter(GPULabelFormatter):
+    """GPU Feature Discovery label formatter
+
+    NVIDIA GPU nodes are labeled by GPU Feature Discovery,
+    e.g. nvidia.com/gpu.product=NVIDIA-H100-80GB-HBM3
+    https://github.com/NVIDIA/gpu-feature-discovery
+
+    GPU Feature Discovery is included as part of the
+    NVIDIA GPU Operator:
+    https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/overview.html
+
+    This LabelFormatter can't be used in autoscaling clusters since accelerators
+    may map to multiple labels, so we're not implementing `get_label_value`.
+    """
+
+    LABEL_KEY = 'nvidia.com/gpu.product'
+
+    @classmethod
+    def get_label_key(cls) -> str:
+        return cls.LABEL_KEY
+
+    @classmethod
+    def get_label_value(cls, accelerator: str) -> str:
+        """An accelerator can map to many Nvidia GFD labels
+        (e.g., A100-80GB-PCIE vs. A100-SXM4-80GB).
+        As a result, we do not support get_label_value for GFDLabelFormatter."""
+        raise NotImplementedError
+
+    @classmethod
+    def get_accelerator_from_label_value(cls, value: str) -> str:
+        """Searches against a canonical list of NVIDIA GPUs and pattern
+        matches the canonical GPU name against the GFD label.
+        """
+        canonical_gpu_names = [
+            'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4',
+            'V100', 'A10', 'P4000', 'P100', 'P40', 'P4', 'L4'
+        ]
+        for canonical_name in canonical_gpu_names:
+            # A100-80G accelerator is A100-SXM-80GB or A100-PCIE-80GB
+            if canonical_name == 'A100-80GB' and re.search(
+                    r'A100.*-80GB', value):
+                return canonical_name
+            elif canonical_name in value:
+                return canonical_name
+
+        # If we didn't find a canonical name:
+        # 1. remove 'NVIDIA-' (e.g., 'NVIDIA-RTX-A6000' -> 'RTX-A6000')
+        # 2. remove 'GEFORCE-' (e.g., 'NVIDIA-GEFORCE-RTX-3070' -> 'RTX-3070')
+        # 3. remove 'RTX-' (e.g. 'RTX-6000' -> 'RTX6000')
+        # Same logic, but uppercased, as the SkyPilot labeler job found in
+        # sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml
+        return value.upper().replace('NVIDIA-',
+                                     '').replace('GEFORCE-',
+                                                 '').replace('RTX-', 'RTX')
+
+
 class KarpenterLabelFormatter(SkyPilotLabelFormatter):
     """Karpenter label formatter
     Karpenter uses the label `karpenter.k8s.aws/instance-gpu-name` to identify
@@ -203,8 +277,8 @@ class KarpenterLabelFormatter(SkyPilotLabelFormatter):
 # it will be used to determine the priority of the label formats when
 # auto-detecting the GPU label type.
 LABEL_FORMATTER_REGISTRY = [
-    SkyPilotLabelFormatter, CoreWeaveLabelFormatter, GKELabelFormatter,
-    KarpenterLabelFormatter
+    SkyPilotLabelFormatter, GKELabelFormatter, KarpenterLabelFormatter,
+    GFDLabelFormatter, CoreWeaveLabelFormatter
 ]

 # Mapping of autoscaler type to label formatter
@@ -280,6 +354,18 @@ def get_kubernetes_nodes() -> List[Any]:
     return nodes


+def get_kubernetes_pods() -> List[Any]:
+    try:
+        ns = get_current_kube_config_context_namespace()
+        pods = kubernetes.core_api().list_namespaced_pod(
+            ns, _request_timeout=kubernetes.API_TIMEOUT).items
+    except kubernetes.max_retry_error():
+        raise exceptions.ResourcesUnavailableError(
+            'Timed out when trying to get pod info from Kubernetes cluster. '
+            'Please check if the cluster is healthy and retry.') from None
+    return pods
+
+
 def check_instance_fits(instance: str) -> Tuple[bool, Optional[str]]:
     """Checks if the instance fits on the Kubernetes cluster.

@@ -435,7 +521,6 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]:
             # conclude that the cluster is set up correctly and return.
             return '', ''
         k8s_acc_label_key = label_formatter.get_label_key()
-        k8s_acc_label_value = label_formatter.get_label_value(acc_type)
         # Search in node_labels to see if any node has the requested
         # GPU type.
         # Note - this only checks if the label is available on a
@@ -445,10 +530,9 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]:
         for node_name, label_list in node_labels.items():
             for label, value in label_list:
                 if (label == k8s_acc_label_key and
-                        value == k8s_acc_label_value):
-                    # If a node is found, we can break out of the loop
-                    # and proceed to deploy.
-                    return k8s_acc_label_key, k8s_acc_label_value
+                        label_formatter.get_accelerator_from_label_value(
+                            value) == acc_type):
+                    return label, value
         # If no node is found with the requested acc_type, raise error
         with ux_utils.print_exception_no_traceback():
             suffix = ''
@@ -680,6 +764,12 @@ def get_current_kube_config_context_namespace() -> str:
         the default namespace.
    """
     k8s = kubernetes.kubernetes
+    # Get namespace if using in-cluster config
+    ns_path = '/var/run/secrets/kubernetes.io/serviceaccount/namespace'
+    if os.path.exists(ns_path):
+        with open(ns_path, encoding='utf-8') as f:
+            return f.read().strip()
+    # If not in-cluster, get the namespace from kubeconfig
     try:
         _, current_context = k8s.config.list_kube_config_contexts()
         if 'namespace' in current_context['context']:
@@ -832,30 +922,38 @@ def __str__(self):
         return self.name


-def construct_ssh_jump_command(private_key_path: str,
-                               ssh_jump_ip: str,
-                               ssh_jump_port: Optional[int] = None,
-                               proxy_cmd_path: Optional[str] = None) -> str:
+def construct_ssh_jump_command(
+        private_key_path: str,
+        ssh_jump_ip: str,
+        ssh_jump_port: Optional[int] = None,
+        ssh_jump_user: str = 'sky',
+        proxy_cmd_path: Optional[str] = None,
+        proxy_cmd_target_pod: Optional[str] = None) -> str:
     ssh_jump_proxy_command = (f'ssh -tt -i {private_key_path} '
                               '-o StrictHostKeyChecking=no '
                               '-o UserKnownHostsFile=/dev/null '
                               f'-o IdentitiesOnly=yes '
-                              f'-W %h:%p sky@{ssh_jump_ip}')
+                              f'-W %h:%p {ssh_jump_user}@{ssh_jump_ip}')
     if ssh_jump_port is not None:
         ssh_jump_proxy_command += f' -p {ssh_jump_port} '
     if proxy_cmd_path is not None:
         proxy_cmd_path = os.path.expanduser(proxy_cmd_path)
         # adding execution permission to the proxy command script
         os.chmod(proxy_cmd_path, os.stat(proxy_cmd_path).st_mode | 0o111)
-        ssh_jump_proxy_command += f' -o ProxyCommand=\'{proxy_cmd_path}\' '
+        ssh_jump_proxy_command += (f' -o ProxyCommand=\'{proxy_cmd_path} '
+                                   f'{proxy_cmd_target_pod}\' ')
     return ssh_jump_proxy_command


 def get_ssh_proxy_command(
-        private_key_path: str, ssh_jump_name: str,
-        network_mode: kubernetes_enums.KubernetesNetworkingMode, namespace: str,
-        port_fwd_proxy_cmd_path: str, port_fwd_proxy_cmd_template: str) -> str:
-    """Generates the SSH proxy command to connect through the SSH jump pod.
+        k8s_ssh_target: str,
+        network_mode: kubernetes_enums.KubernetesNetworkingMode,
+        private_key_path: Optional[str] = None,
+        namespace: Optional[str] = None) -> str:
+    """Generates the SSH proxy command to connect to the pod.
+
+    Uses a jump pod if the network mode is NODEPORT, and direct port-forwarding
+    if the network mode is PORTFORWARD.

     By default, establishing an SSH connection creates a communication
     channel to a remote node by setting up a TCP connection. When a
@@ -871,57 +969,77 @@
     With the NodePort networking mode, a NodePort service is launched. This
     service opens an external port on the node which redirects to the desired
-    port within the pod. When establishing an SSH session in this mode, the
+    port to an SSH jump pod. When establishing an SSH session in this mode, the
     ProxyCommand makes use of this external port to create a communication
     channel directly to port 22, which is the default port ssh server listens
     on, of the jump pod.

     With Port-forward mode, instead of directly exposing an external port,
     'kubectl port-forward' sets up a tunnel between a local port
-    (127.0.0.1:23100) and port 22 of the jump pod. Then we establish a TCP
+    (127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish a TCP
     connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
-    This is setup in the inner ProxyCommand of the nested ProxyCommand, and the
-    rest is the same as NodePort approach, which the outer ProxyCommand
-    establishes a communication channel between 127.0.0.1:23100 and port 22 on
-    the jump pod.
Consequently, any stdin provided on the local machine is
-    forwarded through this tunnel to the application (SSH server) listening in
-    the pod. Similarly, any output from the application in the pod is tunneled
-    back and displayed in the terminal on the local machine.
+    All of this is done in a ProxyCommand script. Any stdin provided on the
+    local machine is forwarded through this tunnel to the application
+    (SSH server) listening in the pod. Similarly, any output from the
+    application in the pod is tunneled back and displayed in the terminal on
+    the local machine.

     Args:
-        private_key_path: str; Path to the private key to use for SSH.
-            This key must be authorized to access the SSH jump pod.
-        ssh_jump_name: str; Name of the SSH jump service to use
+        k8s_ssh_target: str; The Kubernetes object that will be used as the
+            target for SSH. If network_mode is NODEPORT, this is the name of the
+            service. If network_mode is PORTFORWARD, this is the pod name.
         network_mode: KubernetesNetworkingMode; networking mode for ssh
             session. It is either 'NODEPORT' or 'PORTFORWARD'
-        namespace: Kubernetes namespace to use
-        port_fwd_proxy_cmd_path: str; path to the script used as Proxycommand
-            with 'kubectl port-forward'
-        port_fwd_proxy_cmd_template: str; template used to create
-            'kubectl port-forward' Proxycommand
+        private_key_path: str; Path to the private key to use for SSH.
+            This key must be authorized to access the SSH jump pod.
+            Required for NODEPORT networking mode.
+        namespace: Kubernetes namespace to use.
+            Required for NODEPORT networking mode.
     """
     # Fetch IP to connect to for the jump svc
     ssh_jump_ip = get_external_ip(network_mode)
+    assert private_key_path is not None, 'Private key path must be provided'
     if network_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        ssh_jump_port = get_port(ssh_jump_name, namespace)
+        assert namespace is not None, 'Namespace must be provided for NodePort'
+        ssh_jump_port = get_port(k8s_ssh_target, namespace)
         ssh_jump_proxy_command = construct_ssh_jump_command(
             private_key_path, ssh_jump_ip, ssh_jump_port=ssh_jump_port)
-    # Setting kubectl port-forward/socat to establish ssh session using
-    # ClusterIP service to disallow any ports opened
     else:
-        vars_to_fill = {
-            'ssh_jump_name': ssh_jump_name,
-        }
-        common_utils.fill_template(port_fwd_proxy_cmd_template,
-                                   vars_to_fill,
-                                   output_path=port_fwd_proxy_cmd_path)
+        ssh_jump_proxy_command_path = create_proxy_command_script()
         ssh_jump_proxy_command = construct_ssh_jump_command(
             private_key_path,
             ssh_jump_ip,
-            proxy_cmd_path=port_fwd_proxy_cmd_path)
+            ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
+            proxy_cmd_path=ssh_jump_proxy_command_path,
+            proxy_cmd_target_pod=k8s_ssh_target)
     return ssh_jump_proxy_command


+def create_proxy_command_script() -> str:
+    """Creates a ProxyCommand script that uses kubectl port-forward to set up
+    a tunnel between a local port and the SSH server in the pod.
+
+    Returns:
+        str: Path to the ProxyCommand script.
+    """
+    port_fwd_proxy_cmd_path = os.path.expanduser(PORT_FORWARD_PROXY_CMD_PATH)
+    os.makedirs(os.path.dirname(port_fwd_proxy_cmd_path),
+                exist_ok=True,
+                mode=0o700)
+
+    root_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+    template_path = os.path.join(root_dir, 'templates',
+                                 PORT_FORWARD_PROXY_CMD_TEMPLATE)
+    # Copy the template to the proxy command path. We create a copy to allow
+    # different users sharing the same SkyPilot installation to have their own
+    # proxy command scripts.
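For PORTFORWARD mode the pieces above combine into a single ssh ProxyCommand string: the per-user proxy script plus the target pod name, with the ssh user left as a placeholder until provisioning reveals it. A sketch of the resulting string (paths and pod name are hypothetical):

    private_key_path = '~/.ssh/sky-key'
    proxy_script = '~/.sky/kubernetes-port-forward-proxy-command.sh'
    target_pod = 'mycluster-2ea4-head'
    ssh_jump_user = 'skypilot:ssh_user'  # constants.SKY_SSH_USER_PLACEHOLDER
    proxy_command = (f'ssh -tt -i {private_key_path} '
                     '-o StrictHostKeyChecking=no '
                     '-o UserKnownHostsFile=/dev/null '
                     '-o IdentitiesOnly=yes '
                     '-W %h:%p '
                     f'{ssh_jump_user}@127.0.0.1 '
                     f"-o ProxyCommand='{proxy_script} {target_pod}'")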
+ shutil.copy(template_path, port_fwd_proxy_cmd_path) + # Set the permissions to 700 to ensure only the owner can read, write, + # and execute the file. + os.chmod(port_fwd_proxy_cmd_path, 0o700) + return port_fwd_proxy_cmd_path + + def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str, service_type: kubernetes_enums.KubernetesServiceType): """Sets up Kubernetes service resource to access for SSH jump pod. @@ -1411,6 +1529,14 @@ def create_namespace(namespace: str) -> None: namespace: Name of the namespace to create """ kubernetes_client = kubernetes.kubernetes.client + try: + kubernetes.core_api().read_namespace(namespace) + except kubernetes.api_exception() as e: + if e.status != 404: + raise + else: + return + ns_metadata = dict(name=namespace, labels={'parent': 'skypilot'}) merge_custom_metadata(ns_metadata) namespace_obj = kubernetes_client.V1Namespace(metadata=ns_metadata) @@ -1449,6 +1575,44 @@ def get_autoscaler_type( return autoscaler_type +# Mapping of known spot label keys and values for different cluster types +# Add new cluster types here if they support spot instances along with the +# corresponding spot label key and value. +SPOT_LABEL_MAP = { + kubernetes_enums.KubernetesAutoscalerType.GKE.value: + ('cloud.google.com/gke-spot', 'true') +} + + +def get_spot_label() -> Tuple[Optional[str], Optional[str]]: + """Get the spot label key and value for using spot instances, if supported. + + Checks if the underlying cluster supports spot instances by checking nodes + for known spot label keys and values. If found, returns the spot label key + and value. If not, checks if autoscaler is configured and returns + appropriate labels. If neither are found, returns None. + + Returns: + Tuple[str, str]: Tuple containing the spot label key and value. Returns + None if spot instances are not supported. + """ + # Check if the cluster supports spot instances by checking nodes for known + # spot label keys and values + for node in get_kubernetes_nodes(): + for _, (key, value) in SPOT_LABEL_MAP.items(): + if key in node.metadata.labels and node.metadata.labels[ + key] == value: + return key, value + + # Check if autoscaler is configured. Allow spot instances if autoscaler type + # is known to support spot instances. + autoscaler_type = get_autoscaler_type() + if autoscaler_type == kubernetes_enums.KubernetesAutoscalerType.GKE: + return SPOT_LABEL_MAP[autoscaler_type.value] + + return None, None + + def dict_to_k8s_object(object_dict: Dict[str, Any], object_type: 'str') -> Any: """Converts a dictionary to a Kubernetes object. diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 89ca683ada5..7775c3f8a6e 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -39,8 +39,7 @@ # The default timeout in seconds for a readiness probe request. We set the # timeout to 15s since using actual generation in LLM services as readiness # probe is very time-consuming (33B, 70B, ...). -# TODO(tian): Expose this option to users in yaml file. -READINESS_PROBE_TIMEOUT_SECONDS = 15 +DEFAULT_READINESS_PROBE_TIMEOUT_SECONDS = 15 # Autoscaler window size in seconds for query per second. We calculate qps by # divide the number of queries in last window size by this window size. 
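On GKE, spot VMs are identifiable by the node label cloud.google.com/gke-spot=true, which is exactly what the SPOT_LABEL_MAP lookup above keys on. A sketch of how a caller might apply the result of get_spot_label() to a pod spec, mirroring the nodeSelector and tolerations that the kubernetes-ray template adds later in this diff (the dict shapes are illustrative):

    spot_key, spot_value = get_spot_label()
    if spot_key is not None:
        # Pin pods to spot nodes and tolerate the matching taint.
        node_selector = {spot_key: spot_value}
        tolerations = [{
            'key': spot_key,
            'operator': 'Equal',
            'value': spot_value,
            'effect': 'NoSchedule',
        }]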
diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index b4732d36153..b25921f5610 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -488,6 +488,7 @@ def probe( self, readiness_path: str, post_data: Optional[Dict[str, Any]], + timeout: int, headers: Optional[Dict[str, str]], ) -> Tuple['ReplicaInfo', bool, float]: """Probe the readiness of the replica. @@ -512,17 +513,15 @@ def probe( logger.info(f'Probing {replica_identity} with {readiness_path}.') if post_data is not None: msg += 'POST' - response = requests.post( - readiness_path, - headers=headers, - json=post_data, - timeout=serve_constants.READINESS_PROBE_TIMEOUT_SECONDS) + response = requests.post(readiness_path, + json=post_data, + headers=headers, + timeout=timeout) else: msg += 'GET' - response = requests.get( - readiness_path, - headers=headers, - timeout=serve_constants.READINESS_PROBE_TIMEOUT_SECONDS) + response = requests.get(readiness_path, + headers=headers, + timeout=timeout) msg += (f' request to {replica_identity} returned status ' f'code {response.status_code}') if response.status_code == 200: @@ -1043,6 +1042,7 @@ def _probe_all_replicas(self) -> None: ( self._get_readiness_path(info.version), self._get_post_data(info.version), + self._get_readiness_timeout_seconds(info.version), self._get_readiness_headers(info.version), ), ),) @@ -1230,3 +1230,6 @@ def _get_readiness_headers(self, version: int) -> Optional[Dict[str, str]]: def _get_initial_delay_seconds(self, version: int) -> int: return self._get_version_spec(version).initial_delay_seconds + + def _get_readiness_timeout_seconds(self, version: int) -> int: + return self._get_version_spec(version).readiness_timeout_seconds diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index 80217acfff8..3a97a6f8521 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -19,6 +19,7 @@ def __init__( self, readiness_path: str, initial_delay_seconds: int, + readiness_timeout_seconds: int, min_replicas: int, max_replicas: Optional[int] = None, target_qps_per_replica: Optional[float] = None, @@ -78,6 +79,7 @@ def __init__( self._readiness_path: str = readiness_path self._initial_delay_seconds: int = initial_delay_seconds + self._readiness_timeout_seconds: int = readiness_timeout_seconds self._min_replicas: int = min_replicas self._max_replicas: Optional[int] = max_replicas self._target_qps_per_replica: Optional[float] = target_qps_per_replica @@ -113,16 +115,23 @@ def from_yaml_config(config: Dict[str, Any]) -> 'SkyServiceSpec': service_config['readiness_path'] = readiness_section initial_delay_seconds = None post_data = None + readiness_timeout_seconds = None readiness_headers = None else: service_config['readiness_path'] = readiness_section['path'] initial_delay_seconds = readiness_section.get( 'initial_delay_seconds', None) post_data = readiness_section.get('post_data', None) + readiness_timeout_seconds = readiness_section.get( + 'timeout_seconds', None) readiness_headers = readiness_section.get('headers', None) if initial_delay_seconds is None: initial_delay_seconds = constants.DEFAULT_INITIAL_DELAY_SECONDS service_config['initial_delay_seconds'] = initial_delay_seconds + if readiness_timeout_seconds is None: + readiness_timeout_seconds = ( + constants.DEFAULT_READINESS_PROBE_TIMEOUT_SECONDS) + service_config['readiness_timeout_seconds'] = readiness_timeout_seconds if isinstance(post_data, str): try: post_data = json.loads(post_data) @@ -209,6 +218,8 @@ def add_if_not_none(section, key, value, 
no_empty: bool = False):
         add_if_not_none('readiness_probe', 'initial_delay_seconds',
                         self.initial_delay_seconds)
         add_if_not_none('readiness_probe', 'post_data', self.post_data)
+        add_if_not_none('readiness_probe', 'timeout_seconds',
+                        self.readiness_timeout_seconds)
         add_if_not_none('readiness_probe', 'headers', self._readiness_headers)
         add_if_not_none('replica_policy', 'min_replicas', self.min_replicas)
         add_if_not_none('replica_policy', 'max_replicas', self.max_replicas)
@@ -249,7 +260,9 @@ def spot_policy_str(self):
             policy_strs.append('Static spot mixture with '
                                f'{self.base_ondemand_fallback_replicas} '
                                f'base on-demand replica{plural}')
-        return ' '.join(policy_strs) if policy_strs else 'No spot policy'
+        if not policy_strs:
+            return 'No spot fallback policy'
+        return ' '.join(policy_strs)

     def autoscaling_policy_str(self):
         # TODO(MaoZiming): Update policy_str
@@ -268,6 +281,7 @@ def __repr__(self) -> str:
         return textwrap.dedent(f"""\
             Readiness probe method: {self.probe_str()}
             Readiness initial delay seconds: {self.initial_delay_seconds}
+            Readiness probe timeout seconds: {self.readiness_timeout_seconds}
             Replica autoscaling policy: {self.autoscaling_policy_str()}
             Spot Policy: {self.spot_policy_str()}
         """)
@@ -280,6 +294,10 @@ def readiness_path(self) -> str:
     def initial_delay_seconds(self) -> int:
         return self._initial_delay_seconds

+    @property
+    def readiness_timeout_seconds(self) -> int:
+        return self._readiness_timeout_seconds
+
     @property
     def min_replicas(self) -> int:
         return self._min_replicas
diff --git a/sky/sky_logging.py b/sky/sky_logging.py
index dbaf1dd0479..c8a243c72cf 100644
--- a/sky/sky_logging.py
+++ b/sky/sky_logging.py
@@ -95,6 +95,17 @@ def init_logger(name: str):
     return logging.getLogger(name)


+@contextlib.contextmanager
+def set_logging_level(logger: str, level: int):
+    logger = logging.getLogger(logger)
+    original_level = logger.level
+    logger.setLevel(level)
+    try:
+        yield
+    finally:
+        logger.setLevel(original_level)
+
+
 @contextlib.contextmanager
 def silent():
     """Make all sky_logging.print() and logger.{info, warning...} silent.
diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py
index 50c6323b452..c41b3b432d4 100644
--- a/sky/skylet/constants.py
+++ b/sky/skylet/constants.py
@@ -49,6 +49,11 @@
 SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
 SKY_REMOTE_PYTHON_ENV = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
 ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
+# Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH to deactivate the
+# environment. `deactivate` command does not work when conda is used.
+DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
+    'export PATH='
+    f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||")')

 # The name for the environment variable that stores the unique ID of the
 # current task. This will stay the same across multiple recoveries of the
@@ -93,6 +98,26 @@
     DOCKER_SERVER_ENV_VAR,
 }

+# Commands for disabling GPU ECC, which can improve the performance of the GPU
+# for some workloads by 30%. This will only be applied when a user specifies
+# `nvidia_gpus.disable_ecc: true` in ~/.sky/config.yaml.
+# Running this command will reboot the machine, introducing overhead for
+# provisioning the machine.
+# https://portal.nutanix.com/page/documents/kbs/details?targetId=kA00e000000LKjOCAW
+DISABLE_GPU_ECC_COMMAND = (
+    # Check if the GPU ECC is enabled. We use `sudo which` to check nvidia-smi
+    # because in some environments, nvidia-smi is not in path for sudo and we
+    # should skip disabling ECC in this case.
+ 'sudo which nvidia-smi && echo "Checking Nvidia ECC Mode" && ' + 'out=$(nvidia-smi -q | grep "ECC Mode" -A2) && ' + 'echo "$out" && echo "$out" | grep Current | grep Enabled && ' + 'echo "Disabling Nvidia ECC" && ' + # Disable the GPU ECC. + 'sudo nvidia-smi -e 0 && ' + # Reboot the machine to apply the changes. + '{ sudo reboot || echo "Failed to reboot. ECC mode may not be disabled"; } ' + '|| true; ') + # Install conda on the remote cluster if it is not already installed. # We use conda with python 3.10 to be consistent across multiple clouds with # best effort. @@ -118,6 +143,8 @@ f'conda create -y -n {SKY_REMOTE_PYTHON_ENV_NAME} python=3.10 && ' f'conda activate {SKY_REMOTE_PYTHON_ENV_NAME};' # Create a separate conda environment for SkyPilot dependencies. + # We use --system-site-packages to reuse the system site packages to avoid + # the overhead of installing the same packages in the new environment. f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || ' f'{{ {SKY_PYTHON_CMD} -m venv {SKY_REMOTE_PYTHON_ENV} --system-site-packages && ' f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE}; }};' @@ -130,8 +157,15 @@ # backend_utils.write_cluster_config. RAY_SKYPILOT_INSTALLATION_COMMANDS = ( 'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;' + # Disable the pip version check to avoid the warning message, which makes + # the output hard to read. + 'export PIP_DISABLE_PIP_VERSION_CHECK=1;' # Print the PATH in provision.log to help debug PATH issues. 'echo PATH=$PATH; ' + # Install setuptools<=69.5.1 to avoid the issue with the latest setuptools + # causing the error: + # ImportError: cannot import name 'packaging' from 'pkg_resources'" + f'{SKY_PIP_CMD} install "setuptools<70"; ' # Backward compatibility for ray upgrade (#3248): do not upgrade ray if the # ray cluster is already running, to avoid the ray cluster being restarted. # @@ -218,5 +252,15 @@ # services. CONTROLLER_PROCESS_CPU_DEMAND = 0.25 +# SkyPilot environment variables +SKYPILOT_NUM_NODES = 'SKYPILOT_NUM_NODES' +SKYPILOT_NODE_IPS = 'SKYPILOT_NODE_IPS' +SKYPILOT_NUM_GPUS_PER_NODE = 'SKYPILOT_NUM_GPUS_PER_NODE' +SKYPILOT_NODE_RANK = 'SKYPILOT_NODE_RANK' + +# Placeholder for the SSH user in proxy command, replaced when the ssh_user is +# known after provisioning. +SKY_SSH_USER_PLACEHOLDER = 'skypilot:ssh_user' + RCLONE_CONFIG_DIR = '~/.config/rclone' -RCLONE_CONFIG_PATH = f'{RCLONE_CONFIG_DIR}/rclone.conf' \ No newline at end of file +RCLONE_CONFIG_PATH = f'{RCLONE_CONFIG_DIR}/rclone.conf' diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index 44c44afc772..d184abd107e 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -263,6 +263,9 @@ def make_task_bash_script(codegen: str, # set -a is used for exporting all variables functions to the environment # so that bash `user_script` can access `conda activate`. Detail: #436. # Reference: https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html # pylint: disable=line-too-long + # DEACTIVATE_SKY_REMOTE_PYTHON_ENV: Deactivate the SkyPilot runtime env, as + # the ray cluster is started within the runtime env, which may cause the + # user program to run in that env as well. # PYTHONUNBUFFERED is used to disable python output buffering. script = [ textwrap.dedent(f"""\ @@ -271,6 +274,7 @@ def make_task_bash_script(codegen: str, set -a . 
$(conda info --base 2> /dev/null)/etc/profile.d/conda.sh > /dev/null 2>&1 || true
             set +a
+            {constants.DEACTIVATE_SKY_REMOTE_PYTHON_ENV}
             export PYTHONUNBUFFERED=1
             cd {constants.SKY_REMOTE_WORKDIR}"""),
     ]
diff --git a/sky/skylet/providers/azure/config.py b/sky/skylet/providers/azure/config.py
index a19273761ba..4c6322f00e5 100644
--- a/sky/skylet/providers/azure/config.py
+++ b/sky/skylet/providers/azure/config.py
@@ -12,10 +12,14 @@
 from azure.mgmt.resource import ResourceManagementClient
 from azure.mgmt.resource.resources.models import DeploymentMode

+from sky.adaptors import azure
 from sky.utils import common_utils
+from sky.provision import common

 UNIQUE_ID_LEN = 4
 _WAIT_NSG_CREATION_NUM_TIMEOUT_SECONDS = 600
+_WAIT_FOR_RESOURCE_GROUP_DELETION_TIMEOUT_SECONDS = 480  # 8 minutes
+

 logger = logging.getLogger(__name__)

@@ -46,6 +50,7 @@ def bootstrap_azure(config):
     return config


+@common.log_function_start_end
 def _configure_resource_group(config):
     # TODO: look at availability sets
     # https://docs.microsoft.com/en-us/azure/virtual-machines/windows/tutorial-availability-sets
@@ -77,7 +82,31 @@ def _configure_resource_group(config):
     rg_create_or_update = get_azure_sdk_function(
         client=resource_client.resource_groups, function_name="create_or_update"
     )
-    rg_create_or_update(resource_group_name=resource_group, parameters=params)
+    rg_creation_start = time.time()
+    retry = 0
+    while (
+        time.time() - rg_creation_start
+        < _WAIT_FOR_RESOURCE_GROUP_DELETION_TIMEOUT_SECONDS
+    ):
+        try:
+            rg_create_or_update(resource_group_name=resource_group, parameters=params)
+            break
+        except azure.exceptions().ResourceExistsError as e:
+            if "ResourceGroupBeingDeleted" in str(e):
+                if retry % 5 == 0:
+                    # TODO(zhwu): This should be shown in terminal for better
+                    # UX, which will be achieved after we move Azure to use
+                    # SkyPilot provisioner.
+                    logger.warning(
+                        f"Azure resource group {resource_group} of a recently "
+                        f"terminated cluster {config['cluster_name']} is being "
+                        "deleted. It can only be provisioned after it is fully "
+                        "deleted. Waiting..."
+                    )
+                time.sleep(1)
+                retry += 1
+                continue
+            raise

     # load the template file
     current_path = Path(__file__).parent
@@ -120,17 +149,36 @@ def _configure_resource_group(config):
     create_or_update = get_azure_sdk_function(
         client=resource_client.deployments, function_name="create_or_update"
     )
-    # TODO (skypilot): this takes a long time (> 40 seconds) for stopping an
-    # azure VM, and this can be called twice during ray down.
-    outputs = (
-        create_or_update(
-            resource_group_name=resource_group,
-            deployment_name="ray-config",
-            parameters=parameters,
-        )
-        .result()
-        .properties.outputs
+    # Skip creating or updating the deployment if the deployment already exists
+    # and the cluster name is the same.
+    get_deployment = get_azure_sdk_function(
+        client=resource_client.deployments, function_name="get"
     )
+    deployment_exists = False
+    try:
+        deployment = get_deployment(
+            resource_group_name=resource_group, deployment_name="ray-config"
+        )
+        logger.info("Deployment already exists. Skipping deployment creation.")
+
+        outputs = deployment.properties.outputs
+        if outputs is not None:
+            deployment_exists = True
+    except azure.exceptions().ResourceNotFoundError:
+        deployment_exists = False
+
+    if not deployment_exists:
+        # This takes a long time (> 40 seconds), we should be careful calling
+        # this function.
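The loop above exists because Azure deletes resource groups asynchronously, so relaunching a just-terminated cluster can race the deletion. The same wait-and-retry idea, factored into a standalone helper (a sketch under the same assumptions; the helper name is illustrative):

    import time

    from sky.adaptors import azure

    def _create_rg_when_deletion_done(create_fn, resource_group, params,
                                      timeout=480, poll_seconds=1):
        # Retry create_or_update while the old group is still being deleted.
        start = time.time()
        while True:
            try:
                return create_fn(resource_group_name=resource_group,
                                 parameters=params)
            except azure.exceptions().ResourceExistsError as e:
                still_deleting = 'ResourceGroupBeingDeleted' in str(e)
                if not still_deleting or time.time() - start >= timeout:
                    raise
                time.sleep(poll_seconds)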
+        outputs = (
+            create_or_update(
+                resource_group_name=resource_group,
+                deployment_name="ray-config",
+                parameters=parameters,
+            )
+            .result()
+            .properties.outputs
+        )

     # We should wait for the NSG to be created before opening any ports
     # to avoid overriding the newly-added NSG rules.
diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py
index 068930eb390..5f87e57245e 100644
--- a/sky/skylet/providers/azure/node_provider.py
+++ b/sky/skylet/providers/azure/node_provider.py
@@ -11,11 +11,11 @@
 from azure.mgmt.resource import ResourceManagementClient
 from azure.mgmt.resource.resources.models import DeploymentMode

+from sky.adaptors import azure
 from sky.skylet.providers.azure.config import (
     bootstrap_azure,
     get_azure_sdk_function,
 )
-from sky.skylet import autostop_lib
 from sky.skylet.providers.command_runner import SkyDockerCommandRunner
 from sky.provision import docker_utils

@@ -62,23 +62,7 @@ class AzureNodeProvider(NodeProvider):

     def __init__(self, provider_config, cluster_name):
         NodeProvider.__init__(self, provider_config, cluster_name)
-        if not autostop_lib.get_is_autostopping():
-            # TODO(suquark): This is a temporary patch for resource group.
-            # By default, Ray autoscaler assumes the resource group is still
-            # here even after the whole cluster is destroyed. However, now we
-            # deletes the resource group after tearing down the cluster. To
-            # comfort the autoscaler, we need to create/update it here, so the
-            # resource group always exists.
-            #
-            # We should not re-configure the resource group again, when it is
-            # running on the remote VM and the autostopping is in progress,
-            # because the VM is running which guarantees the resource group
-            # exists.
-            from sky.skylet.providers.azure.config import _configure_resource_group
-
-            _configure_resource_group(
-                {"cluster_name": cluster_name, "provider": provider_config}
-            )
+
         subscription_id = provider_config["subscription_id"]
         self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True)
         # Sky only supports Azure CLI credential for now.
@@ -106,9 +90,20 @@ def match_tags(vm):
                     return False
             return True

-        vms = self.compute_client.virtual_machines.list(
-            resource_group_name=self.provider_config["resource_group"]
-        )
+        try:
+            vms = list(
+                self.compute_client.virtual_machines.list(
+                    resource_group_name=self.provider_config["resource_group"]
+                )
+            )
+        except azure.exceptions().ResourceNotFoundError as e:
+            if "Code: ResourceGroupNotFound" in e.exc_msg:
+                logger.debug(
+                    "Resource group not found. VMs should have been terminated."
+                )
+                vms = []
+            else:
+                raise

         nodes = [self._extract_metadata(vm) for vm in filter(match_tags, vms)]
         self.cached_nodes = {node["name"]: node for node in nodes}
@@ -308,6 +303,33 @@ def _create_node(self, node_config, tags, count):
             template_params["nsg"] = self.provider_config["nsg"]
             template_params["subnet"] = self.provider_config["subnet"]

+        if node_config.get("need_nvidia_driver_extension", False):
+            # Configure driver extension for A10 GPUs. A10 GPUs require a
+            # special type of driver, which is available via the Microsoft HPC
+            # extension.
Reference: https://forums.developer.nvidia.com/t/ubuntu-22-04-installation-driver-error-nvidia-a10/285195/2 + for r in template["resources"]: + if r["type"] == "Microsoft.Compute/virtualMachines": + # Add a nested extension resource for A10 GPUs + r["resources"] = [ + { + "type": "extensions", + "apiVersion": "2015-06-15", + "location": "[variables('location')]", + "dependsOn": [ + "[concat('Microsoft.Compute/virtualMachines/', parameters('vmName'), copyIndex())]" + ], + "name": "NvidiaGpuDriverLinux", + "properties": { + "publisher": "Microsoft.HpcCompute", + "type": "NvidiaGpuDriverLinux", + "typeHandlerVersion": "1.9", + "autoUpgradeMinorVersion": True, + "settings": {}, + }, + }, + ] + break + parameters = { "properties": { "mode": DeploymentMode.incremental, diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 66c01f53617..ac84f8a4fd3 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -14,6 +14,9 @@ docker: {%- if custom_resources is not none %} --gpus all {%- endif %} + {%- for run_option in docker_run_options %} + - {{run_option}} + {%- endfor %} {%- if docker_login_config is not none %} docker_login_config: username: |- @@ -153,6 +156,9 @@ setup_commands: # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - mkdir -p ~/.ssh; touch ~/.ssh/config; + {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} {{ conda_installation_commands }} conda config --remove channels "https://aws-ml-conda-ec2.s3.us-west-2.amazonaws.com" || true; {{ ray_skypilot_installation_commands }} diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 803327f1032..e8c388e1879 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -14,6 +14,9 @@ docker: {%- if custom_resources is not none %} --gpus all {%- endif %} + {%- for run_option in docker_run_options %} + - {{run_option}} + {%- endfor %} {%- endif %} provider: @@ -77,6 +80,7 @@ available_node_types: # billingProfile: # maxPrice: -1 {%- endif %} + need_nvidia_driver_extension: {{need_nvidia_driver_extension}} # TODO: attach disk {% if num_nodes > 1 %} ray.worker.default: @@ -105,6 +109,7 @@ available_node_types: # billingProfile: # maxPrice: -1 {%- endif %} + need_nvidia_driver_extension: {{need_nvidia_driver_extension}} {%- endif %} head_node_type: ray.head.default diff --git a/sky/templates/cudo-ray.yml.j2 b/sky/templates/cudo-ray.yml.j2 index f8f5c1cdc59..165e8fde2aa 100644 --- a/sky/templates/cudo-ray.yml.j2 +++ b/sky/templates/cudo-ray.yml.j2 @@ -54,7 +54,10 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. 
# Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - sudo systemctl stop unattended-upgrades || true; + - {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} + sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; diff --git a/sky/templates/fluidstack-ray.yml.j2 b/sky/templates/fluidstack-ray.yml.j2 index a0f952a443f..309a5393828 100644 --- a/sky/templates/fluidstack-ray.yml.j2 +++ b/sky/templates/fluidstack-ray.yml.j2 @@ -55,7 +55,10 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - sudo systemctl stop unattended-upgrades || true; + - {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} + sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 9c2092bdfaf..e01ed351bfa 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -15,6 +15,9 @@ docker: {%- if gpu is not none %} --gpus all {%- endif %} + {%- for run_option in docker_run_options %} + - {{run_option}} + {%- endfor %} {%- if docker_login_config is not none %} docker_login_config: username: |- @@ -62,6 +65,7 @@ provider: # The upper-level SkyPilot code has make sure there will not be resource # leakage. 
disable_launch_config_check: true + use_managed_instance_group: {{ gcp_use_managed_instance_group }} auth: ssh_user: gcpuser @@ -79,6 +83,12 @@ available_node_types: {%- for label_key, label_value in labels.items() %} {{ label_key }}: {{ label_value|tojson }} {%- endfor %} + use-managed-instance-group: {{ gcp_use_managed_instance_group_value|tojson }} + {%- if gcp_use_managed_instance_group %} + managed-instance-group: + run_duration: {{ run_duration }} + provision_timeout: {{ provision_timeout }} + {%- endif %} {%- if specific_reservations %} reservationAffinity: consumeReservationType: SPECIFIC_RESERVATION @@ -175,6 +185,9 @@ setup_commands: # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; }; + {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} {%- if docker_image is none %} sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; diff --git a/sky/templates/jobs-controller.yaml.j2 b/sky/templates/jobs-controller.yaml.j2 index 7d15dc680ac..51083e84a59 100644 --- a/sky/templates/jobs-controller.yaml.j2 +++ b/sky/templates/jobs-controller.yaml.j2 @@ -11,6 +11,9 @@ file_mounts: setup: | {{ sky_activate_python_env }} + # Disable the pip version check to avoid the warning message, which makes the + # output hard to read. + export PIP_DISABLE_PIP_VERSION_CHECK=1 {%- for cmd in cloud_dependencies_installation_commands %} {{cmd}} diff --git a/sky/templates/kubernetes-port-forward-proxy-command.sh.j2 b/sky/templates/kubernetes-port-forward-proxy-command.sh similarity index 83% rename from sky/templates/kubernetes-port-forward-proxy-command.sh.j2 rename to sky/templates/kubernetes-port-forward-proxy-command.sh index 39159eb15b9..d9e409b5545 100644 --- a/sky/templates/kubernetes-port-forward-proxy-command.sh.j2 +++ b/sky/templates/kubernetes-port-forward-proxy-command.sh @@ -1,6 +1,14 @@ #!/usr/bin/env bash set -uo pipefail +# Check if pod name is passed as an argument +if [ $# -eq 0 ]; then + echo "Usage: $0 <pod_name>" >&2 + exit 1 +fi + +POD_NAME="$1" # The first argument is the name of the pod + # Checks if socat is installed if ! command -v socat > /dev/null; then echo "Using 'port-forward' mode to run ssh session on Kubernetes instances requires 'socat' to be installed. Please install 'socat'" >&2 @@ -18,7 +26,7 @@ fi # This is preferred because of socket re-use issues in kubectl port-forward, # see - https://github.com/kubernetes/kubernetes/issues/74551#issuecomment-769185879 KUBECTL_OUTPUT=$(mktemp) -kubectl port-forward svc/{{ ssh_jump_name }} :22 > "${KUBECTL_OUTPUT}" 2>&1 & +kubectl port-forward pod/"${POD_NAME}" :22 > "${KUBECTL_OUTPUT}" 2>&1 & # Capture the PID for the backgrounded kubectl command K8S_PORT_FWD_PID=$! @@ -49,11 +57,6 @@ while ! nc -z 127.0.0.1 "${local_port}"; do sleep 0.1 done -# To avoid errors when many concurrent requests are sent (see https://github.com/skypilot-org/skypilot/issues/2628), -# we add a random delay before establishing the socat connection. -# Empirically, this needs to be at least 1 second. We set this to be random between 1 and 2 seconds.
-sleep $(shuf -i 10-20 -n 1 | awk '{printf "%f", $1/10}') - # Establishes two directional byte streams to handle stdin/stdout between # terminal and the jump pod. # socat process terminates when port-forward terminates. diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 7078a6ca787..bd4bafd43d5 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -24,6 +24,9 @@ provider: # This should be one of KubernetesPortMode port_mode: {{k8s_port_mode}} + # The networking mode used to ssh to pods. One of KubernetesNetworkingMode. + networking_mode: {{k8s_networking_mode}} + # We use internal IPs since we set up a port-forward between the kubernetes # cluster and the local machine, or directly use NodePort to reach the # head node. @@ -261,7 +264,7 @@ available_node_types: skypilot-user: {{ user }} # Custom tags for the pods {%- for label_key, label_value in labels.items() %} - {{ label_key }}: {{ label_value }} + {{ label_key }}: {{ label_value|tojson }} {%- endfor %} {% if k8s_fuse_device_required %} annotations: @@ -276,9 +279,22 @@ available_node_types: restartPolicy: Never # Add node selector if GPUs are requested: - {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %} + {% if (k8s_acc_label_key is not none and k8s_acc_label_value is not none) or (k8s_spot_label_key is not none) %} nodeSelector: + {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %} {{k8s_acc_label_key}}: {{k8s_acc_label_value}} + {% endif %} + {% if k8s_spot_label_key is not none %} + {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}} + {% endif %} + {% endif %} + + {% if k8s_spot_label_key is not none %} + tolerations: + - key: {{k8s_spot_label_key}} + operator: Equal + value: {{k8s_spot_label_value|tojson}} + effect: NoSchedule {% endif %} # This volume allocates shared memory for Ray to use for its plasma @@ -351,6 +367,9 @@ setup_commands: # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - sudo DEBIAN_FRONTEND=noninteractive apt install gcc patch pciutils rsync fuse curl -y; mkdir -p ~/.ssh; touch ~/.ssh/config; + {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} {{ conda_installation_commands }} {{ ray_skypilot_installation_commands }} sudo touch ~/.sudo_as_admin_successful; diff --git a/sky/templates/paperspace-ray.yml.j2 b/sky/templates/paperspace-ray.yml.j2 index ba0886ee679..400714978b9 100644 --- a/sky/templates/paperspace-ray.yml.j2 +++ b/sky/templates/paperspace-ray.yml.j2 @@ -14,6 +14,9 @@ docker: {%- if custom_resources is not none %} --gpus all {%- endif %} + {%- for run_option in docker_run_options %} + - {{run_option}} + {%- endfor %} {%- if docker_login_config is not none %} docker_login_config: username: |- @@ -73,7 +76,10 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. 
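A detail worth noting in the Kubernetes template above is the switch from `{{ label_value }}` to `{{ label_value|tojson }}` for pod labels, and the same filter on the spot toleration value. Bare interpolation lets the YAML parser re-type values such as `true` or `01`, whereas `tojson` always emits a quoted JSON string, which is also valid YAML. A tiny illustration, assuming only that jinja2 is installed:

import jinja2

# Without tojson, YAML would re-type `true` as a boolean and `01` as the
# integer 1; with it, both survive round-tripping as strings.
template = jinja2.Template('{{ key }}: {{ value|tojson }}')
print(template.render(key='some-label', value='true'))  # some-label: "true"
print(template.render(key='some-label', value='01'))    # some-label: "01"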
# Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - sudo systemctl stop unattended-upgrades || true; + - {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} + sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; diff --git a/sky/templates/runpod-ray.yml.j2 b/sky/templates/runpod-ray.yml.j2 index 62206d1a85c..8c063ac4f5d 100644 --- a/sky/templates/runpod-ray.yml.j2 +++ b/sky/templates/runpod-ray.yml.j2 @@ -52,7 +52,10 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - sudo systemctl stop unattended-upgrades || true; + - {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} + sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index a4f1b49e3ed..a20c2d680aa 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -4,6 +4,9 @@ name: {{service_name}} setup: | {{ sky_activate_python_env }} + # Disable the pip version check to avoid the warning message, which makes the + # output hard to read. + export PIP_DISABLE_PIP_VERSION_CHECK=1 # Install all cloud dependencies. # This is for multicloud support. To allow controller launch on all clouds, diff --git a/sky/templates/vsphere-ray.yml.j2 b/sky/templates/vsphere-ray.yml.j2 index 7fc4cd9d01c..81c139d397d 100644 --- a/sky/templates/vsphere-ray.yml.j2 +++ b/sky/templates/vsphere-ray.yml.j2 @@ -51,7 +51,10 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - sudo systemctl stop unattended-upgrades || true; + - {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} + sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; diff --git a/sky/usage/usage_lib.py b/sky/usage/usage_lib.py index 32eb670fa2c..a6c10da5c7a 100644 --- a/sky/usage/usage_lib.py +++ b/sky/usage/usage_lib.py @@ -140,6 +140,7 @@ def __init__(self) -> None: #: Requested number of nodes self.task_num_nodes: Optional[int] = None # update_actual_task # YAMLs converted to JSON. 
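The `{%- for initial_setup_command in initial_setup_commands %}` splice recurs in nearly every provider template above. A simplified stand-in (not the real template) showing how Jinja2 renders that loop into the first entry of `setup_commands`:

import jinja2

snippet = '''setup_commands:
  - mkdir -p ~/.ssh; touch ~/.ssh/config;
    {%- for initial_setup_command in initial_setup_commands %}
    {{ initial_setup_command }}
    {%- endfor %}'''

print(jinja2.Template(snippet).render(initial_setup_commands=[
    'sudo systemctl stop unattended-upgrades || true;',
    'sudo systemctl disable unattended-upgrades || true;',
]))

Because `{%- ... %}` trims the preceding whitespace, the rendered YAML stays a single multi-line shell string per list entry, so the injected commands run before the rest of the node setup.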
+ # TODO: include the skypilot config used in task yaml. self.user_task_yaml: Optional[List[Dict[ str, Any]]] = None # update_user_task_yaml self.actual_task: Optional[List[Dict[str, diff --git a/sky/utils/command_runner.py b/sky/utils/command_runner.py index f43296c2f1e..dce5ee22ba7 100644 --- a/sky/utils/command_runner.py +++ b/sky/utils/command_runner.py @@ -5,7 +5,7 @@ import pathlib import shlex import time -from typing import Any, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union from sky import sky_logging from sky.skylet import constants @@ -19,6 +19,10 @@ # The git exclude file to support. GIT_EXCLUDE = '.git/info/exclude' # Rsync options +# TODO(zhwu): This will print a per-file progress bar (with -P), +# shooting a lot of messages to the output. --info=progress2 is used +# to get a total progress bar, but it requires rsync>=3.1.0 and Mac +# OS has a default rsync==2.6.9 (16 years old). RSYNC_DISPLAY_OPTION = '-Pavz' # Legend # dir-merge: ignore file can appear in any subdir, applies to that @@ -30,6 +34,7 @@ RSYNC_EXCLUDE_OPTION = '--exclude-from={}' _HASH_MAX_LENGTH = 10 +_DEFAULT_CONNECT_TIMEOUT = 30 def _ssh_control_path(ssh_control_filename: Optional[str]) -> Optional[str]: @@ -60,7 +65,7 @@ def ssh_options_list( ) -> List[str]: """Returns a list of sane options for 'ssh'.""" if connect_timeout is None: - connect_timeout = 30 + connect_timeout = _DEFAULT_CONNECT_TIMEOUT # Forked from Ray SSHOptions: # https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/command_runner.py arg_dict = { @@ -75,7 +80,7 @@ def ssh_options_list( # that case. 'UserKnownHostsFile': os.devnull, # Suppresses the warning messages, such as: - # Warning: Permanently added '34.69.216.203' (ED25519) to the list of + # Warning: Permanently added 'xx.xx.xx.xx' (EDxxx) to the list of # known hosts. 'LogLevel': 'ERROR', # Try fewer extraneous key pairs. @@ -170,7 +175,7 @@ def _get_command_to_run( # We need this to correctly run the cmd, and get the output. command = [ - 'bash', + '/bin/bash', '--login', '-c', ] @@ -207,6 +212,92 @@ def _get_command_to_run( command_str = ' '.join(command) return command_str + def _rsync( + self, + source: str, + target: str, + node_destination: str, + up: bool, + rsh_option: str, + # Advanced options. + log_path: str = os.devnull, + stream_logs: bool = True, + max_retry: int = 1, + prefix_command: Optional[str] = None, + get_remote_home_dir: Callable[[], str] = lambda: '~') -> None: + """Builds the rsync command.""" + # Build command. + rsync_command = [] + if prefix_command is not None: + rsync_command.append(prefix_command) + rsync_command += ['rsync', RSYNC_DISPLAY_OPTION] + + # --filter + rsync_command.append(RSYNC_FILTER_OPTION) + + if up: + # Build --exclude-from argument. + # The source is a local path, so we need to resolve it. + resolved_source = pathlib.Path(source).expanduser().resolve() + if (resolved_source / GIT_EXCLUDE).exists(): + # Ensure file exists; otherwise, rsync will error out. + # + # We shlex.quote() because the path may contain spaces: + # 'my dir/.git/info/exclude' + # Without quoting rsync fails. 
+ rsync_command.append( + RSYNC_EXCLUDE_OPTION.format( + shlex.quote(str(resolved_source / GIT_EXCLUDE)))) + + rsync_command.append(f'-e {shlex.quote(rsh_option)}') + + if up: + resolved_target = target + if target.startswith('~'): + remote_home_dir = get_remote_home_dir() + resolved_target = target.replace('~', remote_home_dir) + full_source_str = str(resolved_source) + if resolved_source.is_dir(): + full_source_str = os.path.join(full_source_str, '') + rsync_command.extend([ + f'{full_source_str!r}', + f'{node_destination}:{resolved_target!r}', + ]) + else: + resolved_source = source + if source.startswith('~'): + remote_home_dir = get_remote_home_dir() + resolved_source = source.replace('~', remote_home_dir) + rsync_command.extend([ + f'{node_destination}:{resolved_source!r}', + f'{os.path.expanduser(target)!r}', + ]) + command = ' '.join(rsync_command) + logger.debug(f'Running rsync command: {command}') + + backoff = common_utils.Backoff(initial_backoff=5, max_backoff_factor=5) + assert max_retry > 0, f'max_retry {max_retry} must be positive.' + while max_retry >= 0: + returncode, stdout, stderr = log_lib.run_with_log( + command, + log_path=log_path, + stream_logs=stream_logs, + shell=True, + require_outputs=True) + if returncode == 0: + break + max_retry -= 1 + time.sleep(backoff.current_backoff()) + + direction = 'up' if up else 'down' + error_msg = (f'Failed to rsync {direction}: {source} -> {target}. ' + 'Ensure that the network is stable, then retry.') + subprocess_utils.handle_returncode(returncode, + command, + error_msg, + stderr=stdout + stderr, + stream_logs=stream_logs) + @timeline.event def run( self, @@ -448,13 +539,11 @@ def run( proc = subprocess_utils.run(command, shell=False, check=False) return proc.returncode, '', '' - command_str = self._get_command_to_run( - cmd, - process_stream, - separate_stderr, - # +1 to skip first new line. - skip_lines=skip_lines + 1, - source_bashrc=source_bashrc) + command_str = self._get_command_to_run(cmd, + process_stream, + separate_stderr, + skip_lines=skip_lines, + source_bashrc=source_bashrc) command = base_ssh_command + [shlex.quote(command_str)] log_dir = os.path.expanduser(os.path.dirname(log_path)) @@ -508,30 +597,6 @@ def rsync( Raises: exceptions.CommandError: rsync command failed. """ - # Build command. - # TODO(zhwu): This will print a per-file progress bar (with -P), - # shooting a lot of messages to the output. --info=progress2 is used - # to get a total progress bar, but it requires rsync>=3.1.0 and Mac - # OS has a default rsync==2.6.9 (16 years old). - rsync_command = ['rsync', RSYNC_DISPLAY_OPTION] - - # --filter - rsync_command.append(RSYNC_FILTER_OPTION) - - if up: - # The source is a local path, so we need to resolve it. - # --exclude-from - resolved_source = pathlib.Path(source).expanduser().resolve() - if (resolved_source / GIT_EXCLUDE).exists(): - # Ensure file exists; otherwise, rsync will error out. - # - # We shlex.quote() because the path may contain spaces: - # 'my dir/.git/info/exclude' - # Without quoting rsync fails. 
- rsync_command.append( - RSYNC_EXCLUDE_OPTION.format( - shlex.quote(str(resolved_source / GIT_EXCLUDE)))) - if self._docker_ssh_proxy_command is not None: docker_ssh_proxy_command = self._docker_ssh_proxy_command(['ssh']) else: @@ -544,43 +609,201 @@ def rsync( docker_ssh_proxy_command=docker_ssh_proxy_command, port=self.port, disable_control_master=self.disable_control_master)) - rsync_command.append(f'-e "ssh {ssh_options}"') - # To support spaces in the path, we need to quote source and target. - # rsync doesn't support '~' in a quoted local path, but it is ok to - # have '~' in a quoted remote path. - if up: - full_source_str = str(resolved_source) - if resolved_source.is_dir(): - full_source_str = os.path.join(full_source_str, '') - rsync_command.extend([ - f'{full_source_str!r}', - f'{self.ssh_user}@{self.ip}:{target!r}', - ]) - else: - rsync_command.extend([ - f'{self.ssh_user}@{self.ip}:{source!r}', - f'{os.path.expanduser(target)!r}', - ]) - command = ' '.join(rsync_command) + rsh_option = f'ssh {ssh_options}' + self._rsync(source, + target, + node_destination=f'{self.ssh_user}@{self.ip}', + up=up, + rsh_option=rsh_option, + log_path=log_path, + stream_logs=stream_logs, + max_retry=max_retry) - backoff = common_utils.Backoff(initial_backoff=5, max_backoff_factor=5) - while max_retry >= 0: - returncode, stdout, stderr = log_lib.run_with_log( - command, - log_path=log_path, - stream_logs=stream_logs, - shell=True, - require_outputs=True) - if returncode == 0: - break - max_retry -= 1 - time.sleep(backoff.current_backoff()) - direction = 'up' if up else 'down' - error_msg = (f'Failed to rsync {direction}: {source} -> {target}. ' - 'Ensure that the network is stable, then retry.') - subprocess_utils.handle_returncode(returncode, - command, - error_msg, - stderr=stdout + stderr, - stream_logs=stream_logs) +class KubernetesCommandRunner(CommandRunner): + """Runner for Kubernetes commands.""" + + def __init__( + self, + node: Tuple[str, str], + **kwargs, + ): + """Initialize KubernetesCommandRunner. + + Example Usage: + runner = KubernetesCommandRunner((namespace, pod_name)) + runner.run('ls -l') + runner.rsync(source, target, up=True) + + Args: + node: The namespace and pod_name of the remote machine. + """ + del kwargs + super().__init__(node) + self.namespace, self.pod_name = node + + @timeline.event + def run( + self, + cmd: Union[str, List[str]], + *, + port_forward: Optional[List[int]] = None, + require_outputs: bool = False, + # Advanced options. + log_path: str = os.devnull, + # If False, do not redirect stdout/stderr to optimize performance. + process_stream: bool = True, + stream_logs: bool = True, + ssh_mode: SshMode = SshMode.NON_INTERACTIVE, + separate_stderr: bool = False, + connect_timeout: Optional[int] = None, + source_bashrc: bool = False, + skip_lines: int = 0, + **kwargs) -> Union[int, Tuple[int, str, str]]: + """Uses 'kubectl exec' to run 'cmd' on a pod by its name and namespace. + + Args: + cmd: The command to run. + port_forward: This should be None for k8s. + + Advanced options: + + require_outputs: Whether to return the stdout/stderr of the command. + log_path: Redirect stdout/stderr to the log_path. + stream_logs: Stream logs to the stdout/stderr. + check: Check the success of the command. + ssh_mode: The mode to use for ssh. + See SSHMode for more details. + separate_stderr: Whether to separate stderr from stdout. + connect_timeout: timeout in seconds for the pod connection. + source_bashrc: Whether to source the bashrc before running the + command. 
+ skip_lines: The number of lines to skip at the beginning of the + output. This is used when the output is not processed by + SkyPilot but we still want to get rid of some warning messages, + such as SSH warnings. + + Returns: + returncode + or + A tuple of (returncode, stdout, stderr). + """ + # TODO(zhwu): implement port_forward for k8s. + assert port_forward is None, ('port_forward is not supported for k8s ' + f'for now, but got: {port_forward}') + if connect_timeout is None: + connect_timeout = _DEFAULT_CONNECT_TIMEOUT + kubectl_args = [ + '--pod-running-timeout', f'{connect_timeout}s', '-n', + self.namespace, self.pod_name + ] + if ssh_mode == SshMode.LOGIN: + assert isinstance(cmd, list), 'cmd must be a list for login mode.' + base_cmd = ['kubectl', 'exec', '-it', *kubectl_args, '--'] + command = base_cmd + cmd + proc = subprocess_utils.run(command, shell=False, check=False) + return proc.returncode, '', '' + + kubectl_base_command = ['kubectl', 'exec'] + + if ssh_mode == SshMode.INTERACTIVE: + kubectl_base_command.append('-i') + kubectl_base_command += [*kubectl_args, '--'] + + command_str = self._get_command_to_run(cmd, + process_stream, + separate_stderr, + skip_lines=skip_lines, + source_bashrc=source_bashrc) + command = kubectl_base_command + [ + # It is important to use /bin/bash -c here to make sure we quote the + # command to be run properly. Otherwise, directly appending commands + # after '--' will not work for some commands, such as '&&', '>' etc. + '/bin/bash', + '-c', + shlex.quote(command_str) + ] + + log_dir = os.path.expanduser(os.path.dirname(log_path)) + os.makedirs(log_dir, exist_ok=True) + + executable = None + if not process_stream: + if stream_logs: + command += [ + f'| tee {log_path}', + # This also requires the executor to be '/bin/bash' instead + # of the default '/bin/sh'. + '; exit ${PIPESTATUS[0]}' + ] + else: + command += [f'> {log_path}'] + executable = '/bin/bash' + return log_lib.run_with_log(' '.join(command), + log_path, + require_outputs=require_outputs, + stream_logs=stream_logs, + process_stream=process_stream, + shell=True, + executable=executable, + **kwargs) + + @timeline.event + def rsync( + self, + source: str, + target: str, + *, + up: bool, + # Advanced options. + log_path: str = os.devnull, + stream_logs: bool = True, + max_retry: int = 1, + ) -> None: + """Uses 'rsync' to sync 'source' to 'target'. + + Args: + source: The source path. + target: The target path. + up: The direction of the sync, True for local to cluster, False + for cluster to local. + log_path: Redirect stdout/stderr to the log_path. + stream_logs: Stream logs to the stdout/stderr. + max_retry: The maximum number of retries for the rsync command. + This value should be non-negative. + + Raises: + exceptions.CommandError: rsync command failed. + """ + + def get_remote_home_dir() -> str: + # Use `echo ~` to get the remote home directory, instead of pwd or + # echo $HOME, because pwd can be `/` when the remote user is root + # and $HOME is not always set. + rc, remote_home_dir, stderr = self.run('echo ~', + require_outputs=True, + separate_stderr=True, + stream_logs=False) + if rc != 0: + raise ValueError('Failed to get remote home directory: ' + f'{remote_home_dir + stderr}') + remote_home_dir = remote_home_dir.strip() + return remote_home_dir + + # Build command. 
+ helper_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), + 'kubernetes', 'rsync_helper.sh') + self._rsync( + source, + target, + node_destination=f'{self.pod_name}@{self.namespace}', + up=up, + rsh_option=helper_path, + log_path=log_path, + stream_logs=stream_logs, + max_retry=max_retry, + prefix_command=f'chmod +x {helper_path} && ', + # rsync with `kubectl` as the rsh command will cause ~/xx to be parsed + # as /~/xx, so we need to replace ~ with the remote home directory. We + # only need to do this when ~ is at the beginning of the path. + get_remote_home_dir=get_remote_home_dir) diff --git a/sky/utils/command_runner.pyi b/sky/utils/command_runner.pyi index 9fbad243775..077447e1d5c 100644 --- a/sky/utils/command_runner.pyi +++ b/sky/utils/command_runner.pyi @@ -101,7 +101,8 @@ class CommandRunner: *, up: bool, log_path: str = ..., - stream_logs: bool = ...) -> None: + stream_logs: bool = ..., + max_retry: int = ...) -> None: ... @classmethod @@ -191,5 +192,76 @@ class SSHCommandRunner(CommandRunner): *, up: bool, log_path: str = ..., - stream_logs: bool = ...) -> None: + stream_logs: bool = ..., + max_retry: int = ...) -> None: + ... + + +class KubernetesCommandRunner(CommandRunner): + + def __init__( + self, + node: Tuple[str, str], + ) -> None: + ... + + @typing.overload + def run(self, + cmd: Union[str, List[str]], + *, + port_forward: Optional[List[int]] = ..., + require_outputs: Literal[False] = ..., + log_path: str = ..., + process_stream: bool = ..., + stream_logs: bool = ..., + ssh_mode: SshMode = ..., + separate_stderr: bool = ..., + connect_timeout: Optional[int] = ..., + source_bashrc: bool = ..., + skip_lines: int = ..., + **kwargs) -> int: + ... + + @typing.overload + def run(self, + cmd: Union[str, List[str]], + *, + port_forward: Optional[List[int]] = ..., + require_outputs: Literal[True], + log_path: str = ..., + process_stream: bool = ..., + stream_logs: bool = ..., + ssh_mode: SshMode = ..., + separate_stderr: bool = ..., + connect_timeout: Optional[int] = ..., + source_bashrc: bool = ..., + skip_lines: int = ..., + **kwargs) -> Tuple[int, str, str]: + ... + + @typing.overload + def run(self, + cmd: Union[str, List[str]], + *, + port_forward: Optional[List[int]] = ..., + require_outputs: bool = ..., + log_path: str = ..., + process_stream: bool = ..., + stream_logs: bool = ..., + ssh_mode: SshMode = ..., + separate_stderr: bool = ..., + connect_timeout: Optional[int] = ..., + source_bashrc: bool = ..., + skip_lines: int = ..., + **kwargs) -> Union[Tuple[int, str, str], int]: + ... + + def rsync(self, + source: str, + target: str, + *, + up: bool, + log_path: str = ..., + stream_logs: bool = ..., + max_retry: int = ...) -> None: ...
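The `_rsync` helper factored out above retries the transfer with exponential backoff, and the Kubernetes runner swaps the rsh transport from `ssh` to the `rsync_helper.sh` wrapper while resolving a leading `~` via `echo ~` on the pod (since `pwd` can be `/` and `$HOME` may be unset for root). A condensed sketch of the retry loop; `run_once` is a hypothetical stand-in for `log_lib.run_with_log`, and the constants only approximate the `Backoff` class:

import random
import subprocess
import time

def run_once(command: str) -> int:
    # Stand-in for log_lib.run_with_log: run the command, return its exit code.
    return subprocess.run(command, shell=True).returncode

def rsync_with_retry(command: str, max_retry: int = 1) -> int:
    assert max_retry > 0, f'max_retry {max_retry} must be positive.'
    backoff = 5.0  # initial_backoff=5, as in _rsync above
    returncode = run_once(command)
    while returncode != 0 and max_retry > 0:
        max_retry -= 1
        time.sleep(backoff * (1 + random.random() * 0.4))  # jittered wait
        backoff = min(backoff * 1.6, 5.0 * 5)  # MULTIPLIER and max factor
        returncode = run_once(command)
    return returncode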
diff --git a/sky/utils/common_utils.py b/sky/utils/common_utils.py index 103c834000c..a9227fb4c20 100644 --- a/sky/utils/common_utils.py +++ b/sky/utils/common_utils.py @@ -233,7 +233,7 @@ class Backoff: MULTIPLIER = 1.6 JITTER = 0.4 - def __init__(self, initial_backoff: int = 5, max_backoff_factor: int = 5): + def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5): self._initial = True self._backoff = 0.0 self._initial_backoff = initial_backoff diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 5609db4a04a..477ebe8d1ba 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -191,7 +191,7 @@ def _get_cloud_dependencies_installation_commands( prefix_str = 'Check & install cloud dependencies on controller: ' # This is to make sure the shorter checking message does not have junk # characters from the previous message. - empty_str = ' ' * 5 + empty_str = ' ' * 10 aws_dependencies_installation = ( 'pip list | grep boto3 > /dev/null 2>&1 || pip install ' 'botocore>=1.29.10 boto3>=1.26.1; ' @@ -207,7 +207,7 @@ def _get_cloud_dependencies_installation_commands( # fluidstack and paperspace continue if isinstance(cloud, clouds.AWS): - commands.append(f'echo -n "{prefix_str}AWS{empty_str}" && ' + + commands.append(f'echo -en "\\r{prefix_str}AWS{empty_str}" && ' + aws_dependencies_installation) elif isinstance(cloud, clouds.Azure): commands.append( @@ -247,6 +247,13 @@ def _get_cloud_dependencies_installation_commands( '/bin/linux/amd64/kubectl" && ' 'sudo install -o root -g root -m 0755 ' 'kubectl /usr/local/bin/kubectl))') + elif isinstance(cloud, clouds.Cudo): + commands.append( + f'echo -en "\\r{prefix_str}Cudo{empty_str}" && ' + 'pip list | grep cudo-compute > /dev/null 2>&1 || ' + 'pip install "cudo-compute>=0.1.10" > /dev/null 2>&1 && ' + 'wget https://download.cudo.org/compute/cudoctl-0.3.2-amd64.deb -O ~/cudoctl.deb > /dev/null 2>&1 && ' # pylint: disable=line-too-long + 'sudo dpkg -i ~/cudoctl.deb > /dev/null 2>&1') if controller == Controllers.JOBS_CONTROLLER: if isinstance(cloud, clouds.IBM): commands.append( @@ -263,12 +270,6 @@ def _get_cloud_dependencies_installation_commands( f'echo -en "\\r{prefix_str}RunPod{empty_str}" && ' 'pip list | grep runpod > /dev/null 2>&1 || ' 'pip install "runpod>=1.5.1" > /dev/null 2>&1') - elif isinstance(cloud, clouds.Cudo): - # cudo doesn't support open port - commands.append( - f'echo -en "\\r{prefix_str}Cudo{empty_str}" && ' - 'pip list | grep cudo-compute > /dev/null 2>&1 || ' - 'pip install "cudo-compute>=0.1.8" > /dev/null 2>&1') if (cloudflare.NAME in storage_lib.get_cached_enabled_storage_clouds_or_refresh()): commands.append(f'echo -en "\\r{prefix_str}Cloudflare{empty_str}" && ' + @@ -742,3 +743,25 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', store_prefix = store_type.store_prefix() storage_obj.source = f'{store_prefix}{storage_obj.name}' storage_obj.force_delete = True + + # Step 7: Convert all `MOUNT` mode storages which don't specify a source + # to specifying a source. If the source is specified with a local path, + # it was handled in step 6. 
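The `echo -en "\r..."` rewrite above is the classic single-line progress idiom: the carriage return moves the cursor back to column 0 so the next message overwrites the previous one, and the padding (widened from 5 to 10 spaces) blanks out leftover characters when a shorter cloud name replaces a longer one. A minimal Python rendition of the same idea:

import sys
import time

prefix = 'Check & install cloud dependencies on controller: '
padding = ' ' * 10  # clears the tail of a longer previous name
for cloud in ('Kubernetes', 'AWS', 'Cudo'):
    sys.stdout.write(f'\r{prefix}{cloud}{padding}')
    sys.stdout.flush()
    time.sleep(0.5)
sys.stdout.write('\n')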
+ updated_mount_storages = {} + for storage_path, storage_obj in task.storage_mounts.items(): + if (storage_obj.mode == storage_lib.StorageMode.MOUNT and + not storage_obj.source): + # Construct source URL with first store type and storage name + # E.g., s3://my-storage-name + source = list( + storage_obj.stores.keys())[0].store_prefix() + storage_obj.name + new_storage = storage_lib.Storage.from_yaml_config({ + 'source': source, + 'persistent': storage_obj.persistent, + 'mode': storage_lib.StorageMode.MOUNT.value, + # We enable force delete to allow the controller to delete + # the object store in case persistent is set to False. + '_force_delete': True + }) + updated_mount_storages[storage_path] = new_storage + task.update_storage_mounts(updated_mount_storages) diff --git a/sky/utils/dag_utils.py b/sky/utils/dag_utils.py index ef80bfd2a17..7a4fe90e7fb 100644 --- a/sky/utils/dag_utils.py +++ b/sky/utils/dag_utils.py @@ -70,9 +70,9 @@ def load_chain_dag_from_yaml( Has special handling for an initial section in YAML that contains only the 'name' field, which is the DAG name. - 'env_overrides' is in effect only when there's exactly one task. It is a - list of (key, value) pairs that will be used to update the task's 'envs' - section. + 'env_overrides' is a list of (key, value) pairs that will be used to update + the task's 'envs' section. If it is a chain dag, the envs will be updated + for all tasks in the chain. Returns: A chain Dag with 1 or more tasks (an empty entrypoint would create a @@ -90,12 +90,6 @@ def load_chain_dag_from_yaml( # YAML has only `name: xxx`. Still instantiate a task. configs = [{'name': dag_name}] - if len(configs) > 1: - # TODO(zongheng): in a chain DAG of N tasks, cli.py currently makes the - # decision to not apply overrides. Here we maintain this behavior. We - # can listen to user feedback to change this. - env_overrides = None - current_task = None with dag_lib.Dag() as dag: for task_config in configs: diff --git a/sky/utils/kubernetes/create_cluster.sh b/sky/utils/kubernetes/create_cluster.sh index 62fb700edf3..52bbd1804e8 100755 --- a/sky/utils/kubernetes/create_cluster.sh +++ b/sky/utils/kubernetes/create_cluster.sh @@ -101,32 +101,6 @@ kind create cluster --config /tmp/skypilot-kind.yaml --name skypilot echo "Kind cluster created." -# Function to wait for SkyPilot GPU labeling jobs to complete -wait_for_gpu_labeling_jobs() { - echo "Starting wait for SkyPilot GPU labeling jobs to complete..." - - SECONDS=0 - TIMEOUT=600 # 10 minutes in seconds - - while true; do - TOTAL_JOBS=$(kubectl get jobs -n kube-system -l job=sky-gpu-labeler --no-headers | wc -l) - COMPLETED_JOBS=$(kubectl get jobs -n kube-system -l job=sky-gpu-labeler --no-headers | grep "1/1" | wc -l) - - if [[ $COMPLETED_JOBS -eq $TOTAL_JOBS ]]; then - echo "All SkyPilot GPU labeling jobs completed ($TOTAL_JOBS)." - break - elif [ $SECONDS -ge $TIMEOUT ]; then - echo "Timeout reached while waiting for GPU labeling jobs." - exit 1 - else - echo "Waiting for GPU labeling jobs to complete... ($COMPLETED_JOBS/$TOTAL_JOBS completed)" - echo "To check status, see GPU labeling pods:" - echo "kubectl get jobs -n kube-system -l job=sky-gpu-labeler" - sleep 5 - fi - done -} - # Function to wait for GPU operator to be correctly installed wait_for_gpu_operator_installation() { echo "Starting wait for GPU operator installation..." @@ -150,22 +124,6 @@ wait_for_gpu_operator_installation() { done } -wait_for_skypilot_gpu_image_pull() { - echo "Pulling SkyPilot GPU image..." 
- docker pull ${IMAGE_GPU} - echo "Loading SkyPilot GPU image into kind cluster..." - kind load docker-image --name skypilot ${IMAGE_GPU} - echo "SkyPilot GPU image loaded into kind cluster." -} - -wait_for_skypilot_cpu_image_pull() { - echo "Pulling SkyPilot CPU image..." - docker pull ${IMAGE} - echo "Loading SkyPilot CPU image into kind cluster..." - kind load docker-image --name skypilot ${IMAGE} - echo "SkyPilot CPU image loaded into kind cluster." -} - wait_for_nginx_ingress_controller_install() { echo "Starting installation of Nginx Ingress Controller..." @@ -206,21 +164,8 @@ if $ENABLE_GPUS; then nvidia/gpu-operator --set driver.enabled=false # Wait for GPU operator installation to succeed wait_for_gpu_operator_installation - - # Load the SkyPilot GPU image into the cluster for faster labelling - wait_for_skypilot_gpu_image_pull - - # Label nodes with GPUs - echo "Labelling nodes with GPUs..." - python -m sky.utils.kubernetes.gpu_labeler - - # Wait for all the GPU labeling jobs to complete - wait_for_gpu_labeling_jobs fi -# Load local skypilot image on to the cluster for faster startup -wait_for_skypilot_cpu_image_pull - # Install the Nginx Ingress Controller wait_for_nginx_ingress_controller_install diff --git a/sky/utils/kubernetes/generate_kubeconfig.sh b/sky/utils/kubernetes/generate_kubeconfig.sh new file mode 100755 index 00000000000..04ea567d3f2 --- /dev/null +++ b/sky/utils/kubernetes/generate_kubeconfig.sh @@ -0,0 +1,289 @@ +#!/bin/bash +# This script creates a new k8s Service Account and generates a kubeconfig with +# its credentials. This Service Account has the minimal permissions necessary for +# SkyPilot. The kubeconfig is written in the current directory. +# +# Before running this script, you must configure your local kubectl to point to +# the right k8s cluster and have admin-level access. +# +# By default, this script will create a service account "sky-sa" in "default" +# namespace. If you want to use a different namespace or service account name: +# +# * Specify SKYPILOT_NAMESPACE env var to override the default namespace where the service account is created. +# * Specify SKYPILOT_SA_NAME env var to override the default service account name. +# * Specify SKIP_SA_CREATION=1 to skip creating the service account and use an existing one +# +# Usage: +# # Create "sky-sa" service account with minimal permissions in "default" namespace and generate kubeconfig +# $ ./generate_kubeconfig.sh +# +# # Create "my-sa" service account with minimal permissions in "my-namespace" namespace and generate kubeconfig +# $ SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh +# +# # Use an existing service account "my-sa" in "my-namespace" namespace and generate kubeconfig +# $ SKIP_SA_CREATION=1 SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh + +set -eu -o pipefail + +# Allow passing in common name and username in environment. If not provided, +# use default. +SKYPILOT_SA=${SKYPILOT_SA_NAME:-sky-sa} +NAMESPACE=${SKYPILOT_NAMESPACE:-default} + +echo "Service account: ${SKYPILOT_SA}" +echo "Namespace: ${NAMESPACE}" + +# Set OS specific values. +if [[ "$OSTYPE" == "linux-gnu" ]]; then + BASE64_DECODE_FLAG="-d" +elif [[ "$OSTYPE" == "darwin"* ]]; then + BASE64_DECODE_FLAG="-D" +elif [[ "$OSTYPE" == "linux-musl" ]]; then + BASE64_DECODE_FLAG="-d" +else + echo "Unknown OS ${OSTYPE}" + exit 1 +fi + +# If the user has set SKIP_SA_CREATION=1, skip creating the service account. 
+if [ -z ${SKIP_SA_CREATION+x} ]; then + echo "Creating the Kubernetes Service Account with minimal RBAC permissions." + kubectl apply -f - <<EOF [the service-account/RBAC manifest, the kubeconfig-generation heredocs, and the closing lines of generate_kubeconfig.sh are elided here and not recoverable from this excerpt] diff --git a/tests/test_smoke.py b/tests/test_smoke.py --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ def run_one_test(test: Test) -> Tuple[int, str, str]: suffix='.log', delete=False) test.echo(f'Test started. Log: less {log_file.name}') + env_dict = os.environ.copy() + if test.env: + env_dict.update(test.env) for command in test.commands: log_file.write(f'+ {command}\n') log_file.flush() @@ -165,6 +171,7 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: stderr=subprocess.STDOUT, shell=True, executable='/bin/bash', + env=env_dict, ) try: proc.wait(timeout=test.timeout) @@ -304,20 +311,15 @@ def test_example_app(): # ---------- A minimal task ---------- def test_minimal(generic_cloud: str): name = _get_cluster_name() - validate_output = _VALIDATE_LAUNCH_OUTPUT - # Kubernetes will output a SSH Warning for proxy jump, which will cause - # the output validation fail. We skip the check for kubernetes for now. - if generic_cloud.lower() == 'kubernetes': - validate_output = 'true' test = Test( 'minimal', [ - f's=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {validate_output}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', # Output validation done. f'sky logs {name} 1 --status', f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent. # Test launch output again on existing cluster - f's=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {validate_output}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 2 --status', f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. # Check the logs downloading @@ -361,6 +363,9 @@ def test_aws_region(): f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-east-2\'', f'sky logs {name} 2 --status', # Ensure the job succeeded. + # A user program should not access SkyPilot runtime env python by default. + f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} || exit 1\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. ], f'sky down -y {name}', ) @@ -381,6 +386,9 @@ def test_gcp_region_and_service_account(): f'sky status --all | grep {name} | grep us-central1', # Ensure the region is correct. f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-central1\'', f'sky logs {name} 3 --status', # Ensure the job succeeded. + # A user program should not access SkyPilot runtime env python by default. + f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} || exit 1\'', + f'sky logs {name} 4 --status', # Ensure the job succeeded. ], f'sky down -y {name}', ) @@ -418,6 +426,9 @@ def test_azure_region(): f'sky logs {name} 2 --status', # Ensure the job succeeded. f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .zone | grep null\'', f'sky logs {name} 3 --status', # Ensure the job succeeded. + # A user program should not access SkyPilot runtime env python by default. + f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} || exit 1\'', + f'sky logs {name} 4 --status', # Ensure the job succeeded.
], f'sky down -y {name}', ) @@ -755,6 +766,36 @@ def test_clone_disk_gcp(): run_one_test(test) +@pytest.mark.gcp +def test_gcp_mig(): + name = _get_cluster_name() + region = 'us-central1' + test = Test( + 'gcp_mig', + [ + f'sky launch -y -c {name} --gpus t4 --num-nodes 2 --image-id skypilot:gpu-debian-10 --cloud gcp --region {region} tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 2 --status', + f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + # Check MIG exists. + f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', + f'sky autostop -i 0 --down -y {name}', + 'sleep 120', + f'sky status -r {name}; sky status {name} | grep "{name} not found"', + f'gcloud compute instance-templates list | grep "sky-it-{name}"', + # Launch again with the same region. The original instance template + # should be removed. + f'sky launch -y -c {name} --gpus L4 --num-nodes 2 --region {region} nvidia-smi', + f'sky logs {name} 1 | grep "L4"', + f'sky down -y {name}', + f'gcloud compute instance-templates list | grep "sky-it-{name}" && exit 1 || true', + ], + f'sky down -y {name}', + env={'SKYPILOT_CONFIG': 'tests/test_yamls/use_mig_config.yaml'}) + run_one_test(test) + + @pytest.mark.aws def test_image_no_conda(): name = _get_cluster_name() @@ -2324,6 +2365,9 @@ def test_managed_jobs(generic_cloud: str): f'{_JOB_QUEUE_WAIT}| grep {name}-1 | head -n1 | grep "CANCELLING\|CANCELLED"', 'sleep 200', f'{_JOB_QUEUE_WAIT}| grep {name}-1 | head -n1 | grep CANCELLED', + # Test the functionality for logging. + f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', + f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Successfully provisioned cluster:"', f'{_JOB_QUEUE_WAIT}| grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"', ], # TODO(zhwu): Change to _JOB_CANCEL_WAIT.format(job_name=f'{name}-1 -n {name}-2') when @@ -2836,7 +2880,9 @@ def test_managed_jobs_storage(generic_cloud: str): name = _get_cluster_name() yaml_str = pathlib.Path( 'examples/managed_job_with_storage.yaml').read_text() - storage_name = f'sky-test-{int(time.time())}' + timestamp = int(time.time()) + storage_name = f'sky-test-{timestamp}' + output_storage_name = f'sky-test-output-{timestamp}' # Also perform region testing for bucket creation to validate if buckets are # created in the correct region and correctly mounted in managed jobs. @@ -2851,16 +2897,32 @@ def test_managed_jobs_storage(generic_cloud: str): region_cmd = TestStorageWithCredentials.cli_region_cmd( storage_lib.StoreType.S3, storage_name) region_validation_cmd = f'{region_cmd} | grep {region}' + s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( + storage_lib.StoreType.S3, output_storage_name, 'output.txt') + output_check_cmd = f'{s3_check_file_count} | grep 1' elif generic_cloud == 'gcp': region = 'us-west2' region_flag = f' --region {region}' region_cmd = TestStorageWithCredentials.cli_region_cmd( storage_lib.StoreType.GCS, storage_name) region_validation_cmd = f'{region_cmd} | grep {region}' + gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( + storage_lib.StoreType.GCS, output_storage_name, 'output.txt') + output_check_cmd = f'{gcs_check_file_count} | grep 1' elif generic_cloud == 'kubernetes': + # With Kubernetes, we don't know which object storage provider is used. 
+ # Check both S3 and GCS to see if the bucket exists in either. + s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( + storage_lib.StoreType.S3, output_storage_name, 'output.txt') + s3_output_check_cmd = f'{s3_check_file_count} | grep 1' + gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( + storage_lib.StoreType.GCS, output_storage_name, 'output.txt') + gcs_output_check_cmd = f'{gcs_check_file_count} | grep 1' + output_check_cmd = f'{s3_output_check_cmd} || {gcs_output_check_cmd}' use_spot = ' --no-use-spot' yaml_str = yaml_str.replace('sky-workdir-zhwu', storage_name) + yaml_str = yaml_str.replace('sky-output-bucket', output_storage_name) with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: f.write(yaml_str) f.flush() @@ -2873,9 +2935,12 @@ region_validation_cmd, # Check if the bucket is created in the correct region 'sleep 60', # Wait the spot queue to be updated f'{_JOB_QUEUE_WAIT}| grep {name} | grep SUCCEEDED', - f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]' + f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', + # Check if file was written to the mounted output bucket + output_check_cmd ], - _JOB_CANCEL_WAIT.format(job_name=name), + (_JOB_CANCEL_WAIT.format(job_name=name) + f'; sky storage delete {output_storage_name} || true'), # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, ) @@ -2913,7 +2978,7 @@ def test_managed_jobs_inline_env(generic_cloud: str): test = Test( 'test-managed-jobs-inline-env', [ - f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_IPS\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_RANK\\" ]]) || exit 1"', + f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', 'sleep 20', f'{_JOB_QUEUE_WAIT} | grep {name} | grep SUCCEEDED', ], @@ -2931,10 +2996,10 @@ def test_inline_env(generic_cloud: str): test = Test( 'test-inline-env', [ - f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_IPS\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_RANK\\" ]]) || exit 1"', + f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ !
-z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', f'sky logs {name} 2 --status', ], f'sky down -y {name}', @@ -2950,9 +3015,9 @@ def test_inline_env_file(generic_cloud: str): test = Test( 'test-inline-env-file', [ - f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_IPS\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_RANK\\" ]]) || exit 1"', + f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', f'sky logs {name} 1 --status', - f'sky exec {name} --env-file examples/sample_dotenv "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_IPS\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_RANK\\" ]]) || exit 1"', + f'sky exec {name} --env-file examples/sample_dotenv "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', f'sky logs {name} 2 --status', ], f'sky down -y {name}', @@ -3607,6 +3672,47 @@ def test_skyserve_streaming(generic_cloud: str): run_one_test(test) +@pytest.mark.serve +def test_skyserve_readiness_timeout_fail(generic_cloud: str): + """Test skyserve with large readiness probe latency, expected to fail""" + name = _get_service_name() + test = Test( + f'test-skyserve-readiness-timeout-fail', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task.yaml', + # None of the readiness probe will pass, so the service will be + # terminated after the initial delay. + f's=$(sky serve status {name}); ' + f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' + 'echo "Waiting for replica to be failed..."; sleep 5; ' + f's=$(sky serve status {name}); echo "$s"; done;', + 'sleep 60', + f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep "FAILED_INITIAL_DELAY" | wc -l | grep 1;' + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +def test_skyserve_large_readiness_timeout(generic_cloud: str): + """Test skyserve with customized large readiness timeout""" + name = _get_service_name() + test = Test( + f'test-skyserve-large-readiness-timeout', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task_large_timeout.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + @pytest.mark.serve def test_skyserve_update(generic_cloud: str): """Test skyserve with update""" @@ -3684,7 +3790,7 @@ def test_skyserve_fast_update(generic_cloud: str): f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/bump_version_after.yaml', # sleep to wait for update to be registered. - 'sleep 30', + 'sleep 40', # 2 on-deamnd (ready) + 1 on-demand (provisioning). 
( _check_replica_in_status( @@ -3698,7 +3804,7 @@ def test_skyserve_fast_update(generic_cloud: str): # Test rolling update f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/bump_version_before.yaml', # sleep to wait for update to be registered. - 'sleep 15', + 'sleep 25', # 2 on-deamnd (ready) + 1 on-demand (shutting down). _check_replica_in_status(name, [(2, False, 'READY'), (1, False, 'SHUTTING_DOWN')]), @@ -3830,7 +3936,14 @@ def test_skyserve_failures(generic_cloud: str): f's=$(sky serve status {name}); ' f'until echo "$s" | grep "FAILED_PROBING"; do ' 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;' + + f's=$(sky serve status {name}); echo "$s"; done', + # Wait for the PENDING replica to appear. + 'sleep 10', + # Wait until the replica is out of PENDING. + f's=$(sky serve status {name}); ' + f'until ! echo "$s" | grep "PENDING" && ! echo "$s" | grep "Please wait for the controller to be ready."; do ' + 'echo "Waiting for replica to be out of pending..."; sleep 5; ' + f's=$(sky serve status {name}); echo "$s"; done; ' + _check_replica_in_status( name, [(1, False, 'FAILED_PROBING'), (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), diff --git a/tests/test_yamls/use_mig_config.yaml b/tests/test_yamls/use_mig_config.yaml new file mode 100644 index 00000000000..ef715191a1f --- /dev/null +++ b/tests/test_yamls/use_mig_config.yaml @@ -0,0 +1,4 @@ +gcp: + managed_instance_group: + run_duration: 36000 + provision_timeout: 900
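The MIG test above depends on the new `env` plumbing in `run_one_test`: a per-test dict (here, `SKYPILOT_CONFIG` pointing at `use_mig_config.yaml`) is layered over a copy of `os.environ` before the test commands run, so the override never leaks into other tests. A minimal sketch of that merge; `run_with_env` is an illustrative helper, not part of the actual harness:

import os
import subprocess
from typing import Dict, Optional

def run_with_env(command: str, extra_env: Optional[Dict[str, str]] = None) -> int:
    env = os.environ.copy()  # inherit the caller's environment
    if extra_env:
        env.update(extra_env)  # per-test overrides win
    proc = subprocess.run(command, shell=True, executable='/bin/bash', env=env)
    return proc.returncode

run_with_env('echo "SKYPILOT_CONFIG=$SKYPILOT_CONFIG"',
             {'SKYPILOT_CONFIG': 'tests/test_yamls/use_mig_config.yaml'})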