Skip to content

Commit

Permalink
Merge branch 'master' into shethhriday29-rclone-cache
Browse files Browse the repository at this point in the history
  • Loading branch information
landscapepainter committed Jul 7, 2024
2 parents 3818a05 + 994d35a commit d393206
Show file tree
Hide file tree
Showing 125 changed files with 5,602 additions and 1,982 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ RUN conda install -c conda-forge google-cloud-sdk && \
rm -rf /var/lib/apt/lists/*

# Install sky
RUN pip install --no-cache-dir "skypilot[all]==0.5.0"
RUN pip install --no-cache-dir "skypilot[all]==0.6.0"
28 changes: 16 additions & 12 deletions Dockerfile_k8s
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ FROM continuumio/miniconda3:23.3.1-0
# TODO(romilb): Investigate if this image can be consolidated with the skypilot
# client image (`Dockerfile`)

ARG DEBIAN_FRONTEND=noninteractive

# Initialize conda for root user, install ssh and other local dependencies
RUN apt update -y && \
apt install gcc rsync sudo patch openssh-server pciutils nano fuse socat netcat curl -y && \
apt install git gcc rsync sudo patch openssh-server pciutils nano fuse socat netcat curl -y && \
rm -rf /var/lib/apt/lists/* && \
apt remove -y python3 && \
conda init
Expand All @@ -25,14 +27,20 @@ RUN useradd -m -s /bin/bash sky && \
# Switch to sky user
USER sky

# Set HOME environment variable for sky user
ENV HOME /home/sky

# Set current working directory
WORKDIR /home/sky

# Install SkyPilot pip dependencies preemptively to speed up provisioning time
RUN pip install wheel Click colorama cryptography jinja2 jsonschema && \
pip install networkx oauth2client pandas pendulum PrettyTable && \
pip install ray[default]==2.9.3 rich tabulate filelock && \
pip install packaging 'protobuf<4.0.0' pulp && \
pip install pycryptodome==3.12.0 && \
pip install docker kubernetes==28.1.0 && \
pip install grpcio==1.51.3 python-dotenv==1.0.1
RUN conda init && \
pip install wheel Click colorama cryptography jinja2 jsonschema networkx \
oauth2client pandas pendulum PrettyTable rich tabulate filelock packaging \
'protobuf<4.0.0' pulp pycryptodome==3.12.0 docker kubernetes==28.1.0 \
grpcio==1.51.3 python-dotenv==1.0.1 ray[default]==2.9.3 && \
curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

# Add /home/sky/.local/bin/ to PATH
RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc
Expand All @@ -43,7 +51,3 @@ COPY --chown=sky . /skypilot/sky/

# Set PYTHONUNBUFFERED=1 to have Python print to stdout/stderr immediately
ENV PYTHONUNBUFFERED=1

# Set WORKDIR and initialize conda for sky user
WORKDIR /home/sky
RUN conda init
70 changes: 36 additions & 34 deletions Dockerfile_k8s_gpu
Original file line number Diff line number Diff line change
@@ -1,46 +1,52 @@
# TODO(romilb) - The base image used here (ray) is very large (11.4GB).
# as a result, this built image is about 13.5GB. We need to pick a lighter base
# image.
FROM rayproject/ray:2.9.3-py310-gpu
# We use the cuda runtime image instead of devel image to reduce size (1.3GB vs 3.6GB)
FROM nvidia/cuda:12.1.1-runtime-ubuntu20.04

# Initialize conda for root user, install ssh and other local dependencies
ARG DEBIAN_FRONTEND=noninteractive

# Install ssh and other local dependencies
# We remove cuda lists to avoid conflicts with the cuda version installed by ray
RUN sudo rm -rf /etc/apt/sources.list.d/cuda* && \
sudo apt update -y && \
sudo apt install gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat curl -y && \
sudo rm -rf /var/lib/apt/lists/* && \
sudo apt remove -y python3 && \
conda init
RUN rm -rf /etc/apt/sources.list.d/cuda* && \
apt update -y && \
apt install git gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat curl -y && \
rm -rf /var/lib/apt/lists/*

# Setup SSH and generate hostkeys
RUN sudo mkdir -p /var/run/sshd && \
sudo sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
sudo sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
cd /etc/ssh/ && \
sudo ssh-keygen -A

# Setup new user named sky and add to sudoers. \
# Also add /opt/conda/bin to sudo path and give sky user access to /home/ray
# Also add /opt/conda/bin to sudo path and give sky user permission to run sudo without password
RUN sudo useradd -m -s /bin/bash sky && \
sudo /bin/bash -c 'echo "sky ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers' && \
sudo /bin/bash -c "echo 'Defaults secure_path=\"/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin\"' > /etc/sudoers.d/sky" && \
sudo chmod -R a+rwx /home/ray
sudo /bin/bash -c "echo 'Defaults secure_path=\"/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin\"' > /etc/sudoers.d/sky"

# Switch to sky user
USER sky

# Set HOME environment variable for sky user, otherwise Ray base image HOME overrides
# Set HOME environment variable for sky user
ENV HOME /home/sky

# Setup SSH and generate hostkeys
RUN sudo mkdir -p /var/run/sshd && \
sudo chmod 0755 /var/run/sshd && \
sudo sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
sudo sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
cd /etc/ssh/ && \
ssh-keygen -A
# Set current working directory
WORKDIR /home/sky

# Install SkyPilot pip dependencies
RUN pip install wheel Click colorama cryptography jinja2 jsonschema && \
pip install networkx oauth2client pandas pendulum PrettyTable && \
pip install rich tabulate filelock && \
pip install packaging 'protobuf<4.0.0' pulp && \
pip install pycryptodome==3.12.0 && \
pip install docker kubernetes==28.1.0 && \
pip install grpcio==1.51.3 python-dotenv==1.0.1
SHELL ["/bin/bash", "-c"]

# Install conda and other dependencies
# Keep the conda and Ray versions below in sync with the ones in skylet.constants
RUN curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && \
bash Miniconda3-Linux-x86_64.sh -b && \
eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true && conda activate base && \
grep "# >>> conda initialize >>>" ~/.bashrc || { conda init && source ~/.bashrc; } && \
rm Miniconda3-Linux-x86_64.sh && \
pip install wheel Click colorama cryptography jinja2 jsonschema networkx \
oauth2client pandas pendulum PrettyTable rich tabulate filelock packaging \
'protobuf<4.0.0' pulp pycryptodome==3.12.0 docker kubernetes==28.1.0 \
grpcio==1.51.3 python-dotenv==1.0.1 ray[default]==2.9.3 && \
curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

# Add /home/sky/.local/bin/ to PATH
RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc
Expand All @@ -51,7 +57,3 @@ COPY --chown=sky . /skypilot/sky/

# Set PYTHONUNBUFFERED=1 to have Python print to stdout/stderr immediately
ENV PYTHONUNBUFFERED=1

# Set WORKDIR and initialize conda for sky user
WORKDIR /home/sky
RUN conda init
11 changes: 7 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,11 @@

----
:fire: *News* :fire:
- [Jun, 2024] Reproduce **GPT** with [llm.c](https://github.com/karpathy/llm.c/discussions/481) on any cloud: [**guide**](./llm/gpt-2/)
- [Apr, 2024] Serve and finetune [**Llama 3**](https://skypilot.readthedocs.io/en/latest/gallery/llms/llama-3.html) on any cloud or Kubernetes: [**example**](./llm/llama-3/)
- [Apr, 2024] Serve [**Qwen-110B**](https://qwenlm.github.io/blog/qwen1.5-110b/) on your infra: [**example**](./llm/qwen/)
- [Apr, 2024] Using [**Ollama**](https://github.com/ollama/ollama) to deploy quantized LLMs on CPUs and GPUs: [**example**](./llm/ollama/)
- [Mar, 2024] Serve and deploy [**Databricks DBRX**](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) on your infra: [**example**](./llm/dbrx/)
- [Feb, 2024] Deploying and scaling [**Gemma**](https://blog.google/technology/developers/gemma-open-models/) with SkyServe: [**example**](./llm/gemma/)
- [Feb, 2024] Speed up your LLM deployments with [**SGLang**](https://github.com/sgl-project/sglang) for 5x throughput on SkyServe: [**example**](./llm/sglang/)
- [Feb, 2024] Serving [**Code Llama 70B**](https://ai.meta.com/blog/code-llama-large-language-model-coding/) with vLLM and SkyServe: [**example**](./llm/codellama/)
- [Dec, 2023] [**Mixtral 8x7B**](https://mistral.ai/news/mixtral-of-experts/), a high quality sparse mixture-of-experts model, was released by Mistral AI! Deploy via SkyPilot on any cloud: [**example**](./llm/mixtral/)
- [Nov, 2023] Using [**Axolotl**](https://github.com/OpenAccess-AI-Collective/axolotl) to finetune Mistral 7B on the cloud (on-demand and spot): [**example**](./llm/axolotl/)
Expand All @@ -43,6 +42,8 @@
<details>
<summary>Archived</summary>

- [Mar, 2024] Serve and deploy [**Databricks DBRX**](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) on your infra: [**example**](./llm/dbrx/)
- [Feb, 2024] Speed up your LLM deployments with [**SGLang**](https://github.com/sgl-project/sglang) for 5x throughput on SkyServe: [**example**](./llm/sglang/)
- [Dec, 2023] Using [**LoRAX**](https://github.com/predibase/lorax) to serve 1000s of finetuned LLMs on a single instance in the cloud: [**example**](./llm/lorax/)
- [Sep, 2023] [**Mistral 7B**](https://mistral.ai/news/announcing-mistral-7b/), a high-quality open LLM, was released! Deploy via SkyPilot on any cloud: [**Mistral docs**](https://docs.mistral.ai/self-deployment/skypilot)
- [July, 2023] Self-Hosted **Llama-2 Chatbot** on Any Cloud: [**example**](./llm/llama-2/)
Expand Down Expand Up @@ -153,8 +154,9 @@ To learn more, see our [Documentation](https://skypilot.readthedocs.io/en/latest
<!-- Keep this section in sync with index.rst in SkyPilot Docs -->
Runnable examples:
- LLMs on SkyPilot
- [GPT-2 via `llm.c`](./llm/gpt-2/)
- [Llama 3](./llm/llama-3/)
- [Qwen](./llm/qwen/)
- [Qwen](./llm/qwen/)
- [Databricks DBRX](./llm/dbrx/)
- [Gemma](./llm/gemma/)
- [Mixtral 8x7B](./llm/mixtral/); [Mistral 7B](https://docs.mistral.ai/self-deployment/skypilot/) (from official Mistral team)
Expand All @@ -172,7 +174,7 @@ Runnable examples:
- [LocalGPT](./llm/localgpt)
- [Falcon](./llm/falcon)
- Add yours here & see more in [`llm/`](./llm)!
- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama) and [many more (`examples/`)](./examples).
- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2) and [many more (`examples/`)](./examples).

Follow updates:
- [Twitter](https://twitter.com/skypilot_org)
Expand All @@ -183,6 +185,7 @@ Read the research:
- [SkyPilot paper](https://www.usenix.org/system/files/nsdi23-yang-zongheng.pdf) and [talk](https://www.usenix.org/conference/nsdi23/presentation/yang-zongheng) (NSDI 2023)
- [Sky Computing whitepaper](https://arxiv.org/abs/2205.07147)
- [Sky Computing vision paper](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s02-stoica.pdf) (HotOS 2021)
- [Policy for Managed Spot Jobs](https://www.usenix.org/conference/nsdi24/presentation/wu-zhanghao) (NSDI 2024)

## Support and Questions
We are excited to hear your feedback!
Expand Down
2 changes: 2 additions & 0 deletions docs/source/_gallery_original/index.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.. _ai-gallery:

AI Gallery
====================

Expand Down
1 change: 0 additions & 1 deletion docs/source/_static/custom.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ document.addEventListener('DOMContentLoaded', () => {
{ selector: '.caption-text', text: 'SkyServe: Model Serving' },
{ selector: '.toctree-l1 > a', text: 'Managed Jobs' },
{ selector: '.toctree-l1 > a', text: 'Running on Kubernetes' },
{ selector: '.toctree-l1 > a', text: 'DBRX (Databricks)' },
{ selector: '.toctree-l1 > a', text: 'Ollama' },
{ selector: '.toctree-l1 > a', text: 'Llama-3 (Meta)' },
{ selector: '.toctree-l1 > a', text: 'Qwen (Alibaba)' },
Expand Down
Loading

0 comments on commit d393206

Please sign in to comment.