Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/skypilot-org/skypilot int…
Browse files Browse the repository at this point in the history
…o k8s_fix_logging
  • Loading branch information
romilbhardwaj committed Jul 18, 2024
2 parents 5eaef72 + c0246ab commit 4da7b75
Show file tree
Hide file tree
Showing 195 changed files with 9,987 additions and 3,747 deletions.
7 changes: 3 additions & 4 deletions .github/workflows/format.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ on:
branches:
- master
- 'releases/**'
merge_group:

jobs:
format:
runs-on: ubuntu-latest
Expand All @@ -33,18 +35,15 @@ jobs:
- name: Running yapf
run: |
yapf --diff --recursive ./ --exclude 'sky/skylet/ray_patches/**' \
--exclude 'sky/skylet/providers/azure/**' \
--exclude 'sky/skylet/providers/ibm/**'
- name: Running black
run: |
black --diff --check sky/skylet/providers/azure/ \
sky/skylet/providers/ibm/
black --diff --check sky/skylet/providers/ibm/
- name: Running isort for black formatted files
run: |
isort --diff --check --profile black -l 88 -m 3 \
sky/skylet/providers/ibm/
- name: Running isort for yapf formatted files
run: |
isort --diff --check ./ --sg 'sky/skylet/ray_patches/**' \
--sg 'sky/skylet/providers/azure/**' \
--sg 'sky/skylet/providers/ibm/**'
2 changes: 2 additions & 0 deletions .github/workflows/mypy-generic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ on:
branches:
- master
- 'releases/**'
merge_group:

jobs:
mypy:
runs-on: ubuntu-latest
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/pylint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ on:
branches:
- master
- 'releases/**'
merge_group:

jobs:
pylint:
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/pytest-generic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ on:
branches:
- master
- 'releases/**'
merge_group:

jobs:
python-test:
runs-on: ubuntu-latest
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ on:
branches:
- master
- 'releases/**'
merge_group:

jobs:
python-test:
strategy:
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/test-doc-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ on:
branches:
- master
- 'releases/**'
merge_group:

jobs:
format:
runs-on: ubuntu-latest
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/test-poetry-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ on:
branches:
- master
- 'releases/**'
merge_group:

jobs:
poetry-build-test:
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ RUN conda install -c conda-forge google-cloud-sdk && \
rm -rf /var/lib/apt/lists/*

# Install sky
RUN pip install --no-cache-dir "skypilot[all]==0.5.0"
RUN pip install --no-cache-dir "skypilot[all]==0.6.0"
28 changes: 16 additions & 12 deletions Dockerfile_k8s
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ FROM continuumio/miniconda3:23.3.1-0
# TODO(romilb): Investigate if this image can be consolidated with the skypilot
# client image (`Dockerfile`)

ARG DEBIAN_FRONTEND=noninteractive

# Initialize conda for root user, install ssh and other local dependencies
RUN apt update -y && \
apt install gcc rsync sudo patch openssh-server pciutils nano fuse socat netcat curl -y && \
apt install git gcc rsync sudo patch openssh-server pciutils nano fuse socat netcat curl -y && \
rm -rf /var/lib/apt/lists/* && \
apt remove -y python3 && \
conda init
Expand All @@ -25,14 +27,20 @@ RUN useradd -m -s /bin/bash sky && \
# Switch to sky user
USER sky

# Set HOME environment variable for sky user
ENV HOME /home/sky

# Set current working directory
WORKDIR /home/sky

# Install SkyPilot pip dependencies preemptively to speed up provisioning time
RUN pip install wheel Click colorama cryptography jinja2 jsonschema && \
pip install networkx oauth2client pandas pendulum PrettyTable && \
pip install ray[default]==2.9.3 rich tabulate filelock && \
pip install packaging 'protobuf<4.0.0' pulp && \
pip install pycryptodome==3.12.0 && \
pip install docker kubernetes==28.1.0 && \
pip install grpcio==1.51.3 python-dotenv==1.0.1
RUN conda init && \
pip install wheel Click colorama cryptography jinja2 jsonschema networkx \
oauth2client pandas pendulum PrettyTable rich tabulate filelock packaging \
'protobuf<4.0.0' pulp pycryptodome==3.12.0 docker kubernetes==28.1.0 \
grpcio==1.51.3 python-dotenv==1.0.1 ray[default]==2.9.3 && \
curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

# Add /home/sky/.local/bin/ to PATH
RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc
Expand All @@ -43,7 +51,3 @@ COPY --chown=sky . /skypilot/sky/

# Set PYTHONUNBUFFERED=1 to have Python print to stdout/stderr immediately
ENV PYTHONUNBUFFERED=1

# Set WORKDIR and initialize conda for sky user
WORKDIR /home/sky
RUN conda init
70 changes: 36 additions & 34 deletions Dockerfile_k8s_gpu
Original file line number Diff line number Diff line change
@@ -1,46 +1,52 @@
# TODO(romilb) - The base image used here (ray) is very large (11.4GB).
# as a result, this built image is about 13.5GB. We need to pick a lighter base
# image.
FROM rayproject/ray:2.9.3-py310-gpu
# We use the cuda runtime image instead of devel image to reduce size (1.3GB vs 3.6GB)
FROM nvidia/cuda:12.1.1-runtime-ubuntu20.04

# Initialize conda for root user, install ssh and other local dependencies
ARG DEBIAN_FRONTEND=noninteractive

# Install ssh and other local dependencies
# We remove cuda lists to avoid conflicts with the cuda version installed by ray
RUN sudo rm -rf /etc/apt/sources.list.d/cuda* && \
sudo apt update -y && \
sudo apt install gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat curl -y && \
sudo rm -rf /var/lib/apt/lists/* && \
sudo apt remove -y python3 && \
conda init
RUN rm -rf /etc/apt/sources.list.d/cuda* && \
apt update -y && \
apt install git gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat curl -y && \
rm -rf /var/lib/apt/lists/*

# Setup SSH and generate hostkeys
RUN sudo mkdir -p /var/run/sshd && \
sudo sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
sudo sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
cd /etc/ssh/ && \
sudo ssh-keygen -A

# Setup new user named sky and add to sudoers. \
# Also add /opt/conda/bin to sudo path and give sky user access to /home/ray
# Also add /opt/conda/bin to sudo path and give sky user permission to run sudo without password
RUN sudo useradd -m -s /bin/bash sky && \
sudo /bin/bash -c 'echo "sky ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers' && \
sudo /bin/bash -c "echo 'Defaults secure_path=\"/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin\"' > /etc/sudoers.d/sky" && \
sudo chmod -R a+rwx /home/ray
sudo /bin/bash -c "echo 'Defaults secure_path=\"/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin\"' > /etc/sudoers.d/sky"

# Switch to sky user
USER sky

# Set HOME environment variable for sky user, otherwise Ray base image HOME overrides
# Set HOME environment variable for sky user
ENV HOME /home/sky

# Setup SSH and generate hostkeys
RUN sudo mkdir -p /var/run/sshd && \
sudo chmod 0755 /var/run/sshd && \
sudo sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
sudo sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
cd /etc/ssh/ && \
ssh-keygen -A
# Set current working directory
WORKDIR /home/sky

# Install SkyPilot pip dependencies
RUN pip install wheel Click colorama cryptography jinja2 jsonschema && \
pip install networkx oauth2client pandas pendulum PrettyTable && \
pip install rich tabulate filelock && \
pip install packaging 'protobuf<4.0.0' pulp && \
pip install pycryptodome==3.12.0 && \
pip install docker kubernetes==28.1.0 && \
pip install grpcio==1.51.3 python-dotenv==1.0.1
SHELL ["/bin/bash", "-c"]

# Install conda and other dependencies
# Keep the conda and Ray versions below in sync with the ones in skylet.constants
RUN curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && \
bash Miniconda3-Linux-x86_64.sh -b && \
eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true && conda activate base && \
grep "# >>> conda initialize >>>" ~/.bashrc || { conda init && source ~/.bashrc; } && \
rm Miniconda3-Linux-x86_64.sh && \
pip install wheel Click colorama cryptography jinja2 jsonschema networkx \
oauth2client pandas pendulum PrettyTable rich tabulate filelock packaging \
'protobuf<4.0.0' pulp pycryptodome==3.12.0 docker kubernetes==28.1.0 \
grpcio==1.51.3 python-dotenv==1.0.1 ray[default]==2.9.3 && \
curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

# Add /home/sky/.local/bin/ to PATH
RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc
Expand All @@ -51,7 +57,3 @@ COPY --chown=sky . /skypilot/sky/

# Set PYTHONUNBUFFERED=1 to have Python print to stdout/stderr immediately
ENV PYTHONUNBUFFERED=1

# Set WORKDIR and initialize conda for sky user
WORKDIR /home/sky
RUN conda init
19 changes: 11 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,11 @@

----
:fire: *News* :fire:
- [Jun, 2024] Reproduce **GPT** with [llm.c](https://github.com/karpathy/llm.c/discussions/481) on any cloud: [**guide**](./llm/gpt-2/)
- [Apr, 2024] Serve and finetune [**Llama 3**](https://skypilot.readthedocs.io/en/latest/gallery/llms/llama-3.html) on any cloud or Kubernetes: [**example**](./llm/llama-3/)
- [Apr, 2024] Serve [**Qwen-110B**](https://qwenlm.github.io/blog/qwen1.5-110b/) on your infra: [**example**](./llm/qwen/)
- [Apr, 2024] Using [**Ollama**](https://github.com/ollama/ollama) to deploy quantized LLMs on CPUs and GPUs: [**example**](./llm/ollama/)
- [Mar, 2024] Serve and deploy [**Databricks DBRX**](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) on your infra: [**example**](./llm/dbrx/)
- [Feb, 2024] Deploying and scaling [**Gemma**](https://blog.google/technology/developers/gemma-open-models/) with SkyServe: [**example**](./llm/gemma/)
- [Feb, 2024] Speed up your LLM deployments with [**SGLang**](https://github.com/sgl-project/sglang) for 5x throughput on SkyServe: [**example**](./llm/sglang/)
- [Feb, 2024] Serving [**Code Llama 70B**](https://ai.meta.com/blog/code-llama-large-language-model-coding/) with vLLM and SkyServe: [**example**](./llm/codellama/)
- [Dec, 2023] [**Mixtral 8x7B**](https://mistral.ai/news/mixtral-of-experts/), a high quality sparse mixture-of-experts model, was released by Mistral AI! Deploy via SkyPilot on any cloud: [**example**](./llm/mixtral/)
- [Nov, 2023] Using [**Axolotl**](https://github.com/OpenAccess-AI-Collective/axolotl) to finetune Mistral 7B on the cloud (on-demand and spot): [**example**](./llm/axolotl/)
Expand All @@ -43,6 +42,8 @@
<details>
<summary>Archived</summary>

- [Mar, 2024] Serve and deploy [**Databricks DBRX**](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) on your infra: [**example**](./llm/dbrx/)
- [Feb, 2024] Speed up your LLM deployments with [**SGLang**](https://github.com/sgl-project/sglang) for 5x throughput on SkyServe: [**example**](./llm/sglang/)
- [Dec, 2023] Using [**LoRAX**](https://github.com/predibase/lorax) to serve 1000s of finetuned LLMs on a single instance in the cloud: [**example**](./llm/lorax/)
- [Sep, 2023] [**Mistral 7B**](https://mistral.ai/news/announcing-mistral-7b/), a high-quality open LLM, was released! Deploy via SkyPilot on any cloud: [**Mistral docs**](https://docs.mistral.ai/self-deployment/skypilot)
- [July, 2023] Self-Hosted **Llama-2 Chatbot** on Any Cloud: [**example**](./llm/llama-2/)
Expand All @@ -69,13 +70,13 @@ SkyPilot **cuts your cloud costs**:

SkyPilot supports your existing GPU, TPU, and CPU workloads, with no code changes.

Install with pip (we recommend the nightly build for the latest features or [from source](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)):
Install with pip:
```bash
pip install "skypilot-nightly[aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp,kubernetes]" # choose your clouds
pip install -U "skypilot[aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp,kubernetes]" # choose your clouds
```
To get the last release, use:
To get the latest features and fixes, use the nightly build or [install from source](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html):
```bash
pip install -U "skypilot[aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp,kubernetes]" # choose your clouds
pip install "skypilot-nightly[aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp,kubernetes]" # choose your clouds
```

Current supported providers (AWS, Azure, GCP, OCI, Lambda Cloud, RunPod, Fluidstack, Paperspace, Cudo, IBM, Samsung, Cloudflare, any Kubernetes cluster):
Expand Down Expand Up @@ -153,8 +154,9 @@ To learn more, see our [Documentation](https://skypilot.readthedocs.io/en/latest
<!-- Keep this section in sync with index.rst in SkyPilot Docs -->
Runnable examples:
- LLMs on SkyPilot
- [GPT-2 via `llm.c`](./llm/gpt-2/)
- [Llama 3](./llm/llama-3/)
- [Qwen](./llm/qwen/)
- [Qwen](./llm/qwen/)
- [Databricks DBRX](./llm/dbrx/)
- [Gemma](./llm/gemma/)
- [Mixtral 8x7B](./llm/mixtral/); [Mistral 7B](https://docs.mistral.ai/self-deployment/skypilot/) (from official Mistral team)
Expand All @@ -172,7 +174,7 @@ Runnable examples:
- [LocalGPT](./llm/localgpt)
- [Falcon](./llm/falcon)
- Add yours here & see more in [`llm/`](./llm)!
- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama) and [many more (`examples/`)](./examples).
- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2) and [many more (`examples/`)](./examples).

Follow updates:
- [Twitter](https://twitter.com/skypilot_org)
Expand All @@ -183,6 +185,7 @@ Read the research:
- [SkyPilot paper](https://www.usenix.org/system/files/nsdi23-yang-zongheng.pdf) and [talk](https://www.usenix.org/conference/nsdi23/presentation/yang-zongheng) (NSDI 2023)
- [Sky Computing whitepaper](https://arxiv.org/abs/2205.07147)
- [Sky Computing vision paper](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s02-stoica.pdf) (HotOS 2021)
- [Policy for Managed Spot Jobs](https://www.usenix.org/conference/nsdi24/presentation/wu-zhanghao) (NSDI 2024)

## Support and Questions
We are excited to hear your feedback!
Expand Down
2 changes: 2 additions & 0 deletions docs/source/_gallery_original/index.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.. _ai-gallery:

AI Gallery
====================

Expand Down
1 change: 0 additions & 1 deletion docs/source/_static/custom.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ document.addEventListener('DOMContentLoaded', () => {
{ selector: '.caption-text', text: 'SkyServe: Model Serving' },
{ selector: '.toctree-l1 > a', text: 'Managed Jobs' },
{ selector: '.toctree-l1 > a', text: 'Running on Kubernetes' },
{ selector: '.toctree-l1 > a', text: 'DBRX (Databricks)' },
{ selector: '.toctree-l1 > a', text: 'Ollama' },
{ selector: '.toctree-l1 > a', text: 'Llama-3 (Meta)' },
{ selector: '.toctree-l1 > a', text: 'Qwen (Alibaba)' },
Expand Down
19 changes: 16 additions & 3 deletions docs/source/cloud-setup/cloud-permissions/gcp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ User
resourcemanager.projects.getIamPolicy
.. note::

For custom VPC users (with :code:`gcp.vpc_name` specified in :code:`~/.sky/config.yaml`, check `here <#_gcp-bring-your-vpc>`_), :code:`compute.firewalls.create` and :code:`compute.firewalls.delete` are not necessary unless opening ports is needed via `resources.ports` in task yaml.

.. note::
Expand Down Expand Up @@ -145,7 +145,7 @@ User
8. **Optional**: If the user needs to use custom machine images with ``sky launch --image-id``, you can additionally add the following permissions:

.. code-block:: text
compute.disks.get
compute.disks.resize
compute.images.get
Expand Down Expand Up @@ -297,7 +297,7 @@ To do so, you can use SkyPilot's global config file ``~/.sky/config.yaml`` to sp
use_internal_ips: true
# VPC with NAT setup, see below
vpc_name: my-vpc-name
ssh_proxy_command: ssh -W %h:%p -o StrictHostKeyChecking=no myself@my.proxy
ssh_proxy_command: ssh -W %h:%p -o StrictHostKeyChecking=no myself@my.proxy
The ``gcp.ssh_proxy_command`` field is optional. If SkyPilot is run on a machine that can directly access the internal IPs of the instances, it can be omitted. Otherwise, it should be set to a command that can be used to proxy SSH connections to the internal IPs of the instances.

Expand Down Expand Up @@ -338,3 +338,16 @@ If proxy is not needed, but the regions need to be limited, you can set the ``gc
ssh_proxy_command:
us-west1: null
us-east1: null
Force Enable External IPs
~~~~~~~~~~~~~~~~~~~~~~~~~

If instances in a VPC are reached via their internal IPs but still need access to the public internet, an alternative to setting up Cloud NAT is to force them to be created with an external IP address.

.. code-block:: yaml
gcp:
use_internal_ips: true
vpc_name: my-vpc-name
force_enable_external_ips: true
Loading

0 comments on commit 4da7b75

Please sign in to comment.