diff --git a/README.md b/README.md index b803a998924..a6f1df49c91 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ ---- :fire: *News* :fire: +- [Jul, 2024] [Finetune](./llm/llama-3_1-finetuning/) and [serve](./llm/llama-3_1/) **Llama 3.1** on your infra - [Jun, 2024] Reproduce **GPT** with [llm.c](https://github.com/karpathy/llm.c/discussions/481) on any cloud: [**guide**](./llm/gpt-2/) - [Apr, 2024] Serve and finetune [**Llama 3**](https://skypilot.readthedocs.io/en/latest/gallery/llms/llama-3.html) on any cloud or Kubernetes: [**example**](./llm/llama-3/) - [Apr, 2024] Serve [**Qwen-110B**](https://qwenlm.github.io/blog/qwen1.5-110b/) on your infra: [**example**](./llm/qwen/) diff --git a/docs/source/_gallery_original/index.rst b/docs/source/_gallery_original/index.rst index e8a540c883c..56ff51a889e 100644 --- a/docs/source/_gallery_original/index.rst +++ b/docs/source/_gallery_original/index.rst @@ -39,6 +39,7 @@ Contents DBRX (Databricks) Llama-2 (Meta) Llama-3 (Meta) + Llama-3.1 (Meta) Qwen (Alibaba) CodeLlama (Meta) Gemma (Google) diff --git a/docs/source/_gallery_original/llms/llama-3_1.md b/docs/source/_gallery_original/llms/llama-3_1.md new file mode 120000 index 00000000000..27589363fcb --- /dev/null +++ b/docs/source/_gallery_original/llms/llama-3_1.md @@ -0,0 +1 @@ +../../../../llm/llama-3_1/README.md \ No newline at end of file diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 5630793d8ff..e0de1b50d51 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -28,9 +28,7 @@ document.addEventListener('DOMContentLoaded', () => { { selector: '.caption-text', text: 'SkyServe: Model Serving' }, { selector: '.toctree-l1 > a', text: 'Managed Jobs' }, { selector: '.toctree-l1 > a', text: 'Running on Kubernetes' }, - { selector: '.toctree-l1 > a', text: 'Ollama' }, - { selector: '.toctree-l1 > a', text: 'Llama-3 (Meta)' }, - { selector: '.toctree-l1 > a', text: 'Qwen (Alibaba)' }, + { selector: '.toctree-l1 > a', text: 'Llama-3.1 (Meta)' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index b4bd66fba6f..5b8d144af70 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -69,6 +69,7 @@ Runnable examples: * **LLMs on SkyPilot** + * `Llama 3.1 finetuning `_ and `serving `_ * `GPT-2 via llm.c `_ * `Llama 3 `_ * `Qwen `_ diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index cb06a28cdf0..fc5eddd6a47 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -185,6 +185,14 @@ Available fields and semantics: # - "*": my-default-security-group security_group_name: my-security-group + # Encrypted boot disk (optional). + # + # Set to true to encrypt the boot disk of all AWS instances launched by + # SkyPilot. This is useful for compliance with data protection regulations. + # + # Default: false. + disk_encrypted: false + # Identity to use for AWS instances (optional). 
# # LOCAL_CREDENTIALS: The user's local credential files will be uploaded to diff --git a/llm/llama-3_1-finetuning/configs/70B-lora.yaml b/llm/llama-3_1-finetuning/configs/70B-lora.yaml new file mode 100644 index 00000000000..612048536a3 --- /dev/null +++ b/llm/llama-3_1-finetuning/configs/70B-lora.yaml @@ -0,0 +1,99 @@ +# Config for multi-device LoRA in lora_finetune_distributed.py +# using a Llama3.1 70B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3.1-70B-Instruct --output-dir /tmp/Meta-Llama-3.1-70B-Instruct --ignore-patterns "original/consolidated*" +# +# This config needs 8 GPUs to run +# tune run --nproc_per_node 8 lora_finetune_distributed --config llama3_1/70B_lora + +# Model Arguments +model: + _component_: torchtune.models.llama3_1.lora_llama3_1_70b + lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 16 + lora_alpha: 32 + +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3.1-70B-Instruct/original/tokenizer.model + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3.1-70B-Instruct/ + checkpoint_files: [ + model-00001-of-00030.safetensors, + model-00002-of-00030.safetensors, + model-00003-of-00030.safetensors, + model-00004-of-00030.safetensors, + model-00005-of-00030.safetensors, + model-00006-of-00030.safetensors, + model-00007-of-00030.safetensors, + model-00008-of-00030.safetensors, + model-00009-of-00030.safetensors, + model-00010-of-00030.safetensors, + model-00011-of-00030.safetensors, + model-00012-of-00030.safetensors, + model-00013-of-00030.safetensors, + model-00014-of-00030.safetensors, + model-00015-of-00030.safetensors, + model-00016-of-00030.safetensors, + model-00017-of-00030.safetensors, + model-00018-of-00030.safetensors, + model-00019-of-00030.safetensors, + model-00020-of-00030.safetensors, + model-00021-of-00030.safetensors, + model-00022-of-00030.safetensors, + model-00023-of-00030.safetensors, + model-00024-of-00030.safetensors, + model-00025-of-00030.safetensors, + model-00026-of-00030.safetensors, + model-00027-of-00030.safetensors, + model-00028-of-00030.safetensors, + model-00029-of-00030.safetensors, + model-00030-of-00030.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3.1-70B-Instruct/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_dataset +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 1 + +# Logging +output_dir: /tmp/lora_finetune_output +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True diff --git a/llm/llama-3_1-finetuning/configs/8B-lora.yaml b/llm/llama-3_1-finetuning/configs/8B-lora.yaml new file mode 100644 index 00000000000..d3e3be5af8e --- /dev/null +++ b/llm/llama-3_1-finetuning/configs/8B-lora.yaml @@ -0,0 +1,83 @@ +# Config for multi-device LoRA finetuning in 
lora_finetune_distributed.py +# using a Llama3.1 8B Instruct model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth" +# +# To launch on 2 devices, run the following command from root: +# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_1/8B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_1/8B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# For single device LoRA finetuning please use 8B_lora_single_device.yaml +# or 8B_qlora_single_device.yaml + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model + +# Model Arguments +model: + _component_: torchtune.models.llama3_1.lora_llama3_1_8b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3.1-8B-Instruct/ + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3.1-8B-Instruct/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 + +# Logging +output_dir: /tmp/lora_finetune_output +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False diff --git a/llm/llama-3_1-finetuning/lora.yaml b/llm/llama-3_1-finetuning/lora.yaml new file mode 100644 index 00000000000..35b0fc6faad --- /dev/null +++ b/llm/llama-3_1-finetuning/lora.yaml @@ -0,0 +1,58 @@ +# LoRA finetuning Meta Llama-3.1 on any of your own infra. 
+# +# Usage: +# +# HF_TOKEN=xxx sky launch lora.yaml -c llama31 --env HF_TOKEN +# +# To finetune a 70B model: +# +# HF_TOKEN=xxx sky launch lora.yaml -c llama31-70 --env HF_TOKEN --env MODEL_SIZE=70B + +envs: + MODEL_SIZE: 8B + HF_TOKEN: + DATASET: "yahma/alpaca-cleaned" + # Change this to your own checkpoint bucket + CHECKPOINT_BUCKET_NAME: sky-llama-31-checkpoints + + +resources: + accelerators: A100:8 + disk_tier: best + use_spot: true + +file_mounts: + /configs: ./configs + /output: + name: $CHECKPOINT_BUCKET_NAME + mode: MOUNT + # Optionally, specify the store to enforce to use one of the stores below: + # r2/azure/gcs/s3/cos + # store: r2 + +setup: | + pip install torch torchvision + + # Install torch tune from source for the latest Llama-3.1 model + pip install git+https://github.com/pytorch/torchtune.git@58255001bd0b1e3a81a6302201024e472af05379 + # pip install torchtune + + tune download meta-llama/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \ + --hf-token $HF_TOKEN \ + --output-dir /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \ + --ignore-patterns "original/consolidated*" + +run: | + tune run --nproc_per_node $SKYPILOT_NUM_GPUS_PER_NODE \ + lora_finetune_distributed \ + --config /configs/${MODEL_SIZE}-lora.yaml \ + dataset.source=$DATASET + + # Remove the checkpoint files to save space, LoRA serving only needs the + # adapter files. + rm /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/*.pt + rm /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/*.safetensors + + mkdir -p /output/$MODEL_SIZE-lora + rsync -Pavz /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct /output/$MODEL_SIZE-lora + cp -r /tmp/lora_finetune_output /output/$MODEL_SIZE-lora/ diff --git a/llm/llama-3_1-finetuning/readme.md b/llm/llama-3_1-finetuning/readme.md new file mode 100644 index 00000000000..836f3bf1b3b --- /dev/null +++ b/llm/llama-3_1-finetuning/readme.md @@ -0,0 +1,267 @@ +# Finetune Llama 3.1 on your infra + +
+
+ +
+ 

On July 23, 2024, Meta released the [Llama 3.1 model family](https://ai.meta.com/blog/meta-llama-3-1/), including a 405B parameter model in both base and instruction-tuned variants. Llama 3.1 405B became _the first open LLM that closely rivals top proprietary models_ like GPT-4o and Claude 3.5 Sonnet.

This guide shows how to use [SkyPilot](https://github.com/skypilot-org/skypilot) and [torchtune](https://pytorch.org/torchtune/stable/index.html) to **finetune Llama 3.1 on your own data and infra**. Everything is packaged in a simple [SkyPilot YAML](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html) that can be launched with one command on your infra:
- Local GPU workstation
- Kubernetes cluster
- Cloud accounts ([12 clouds supported](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html))
+
+ +
+ + + +## Let's finetune Llama 3.1 +We will use [torchtune](https://pytorch.org/torchtune/stable/index.html) to finetune Llama 3.1. The example below uses the [`yahma/alpaca-cleaned`](https://huggingface.co/datasets/yahma/alpaca-cleaned) dataset, which you can replace with your own dataset later. + +To set up the environment for launching the finetuning job, finish the [Appendix: Preparation](#appendix-preparation) section first. + +The finetuning job is packaged in a SkyPilot YAML. It can be launched on any of your own infra, such as Kubernetes or any cloud, with the same interface: + +
+ + SkyPilot YAML for finetuning Llama 3.1: lora.yaml + + +```yaml +# LoRA finetuning Meta Llama 3.1 on any of your own infra. +# +# Usage: +# +# HF_TOKEN=xxx sky launch lora.yaml -c llama31 --env HF_TOKEN +# +# To finetune a 70B model: +# +# HF_TOKEN=xxx sky launch lora.yaml -c llama31-70 --env HF_TOKEN --env MODEL_SIZE=70B + +envs: + MODEL_SIZE: 8B + HF_TOKEN: + DATASET: "yahma/alpaca-cleaned" + # Change this to your own checkpoint bucket + CHECKPOINT_BUCKET_NAME: sky-llama-31-checkpoints + + +resources: + accelerators: A100:8 + disk_tier: best + use_spot: true + +file_mounts: + /configs: ./configs + /output: + name: $CHECKPOINT_BUCKET_NAME + mode: MOUNT + # Optionally, specify the store to enforce to use one of the stores below: + # r2/azure/gcs/s3/cos + # store: r2 + +setup: | + pip install torch torchvision + + # Install torch tune from source for the latest Llama 3.1 model + pip install git+https://github.com/pytorch/torchtune.git@58255001bd0b1e3a81a6302201024e472af05379 + # pip install torchtune + + tune download meta-llama/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \ + --hf-token $HF_TOKEN \ + --output-dir /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \ + --ignore-patterns "original/consolidated*" + +run: | + tune run --nproc_per_node $SKYPILOT_NUM_GPUS_PER_NODE \ + lora_finetune_distributed \ + --config /configs/${MODEL_SIZE}-lora.yaml \ + dataset.source=$DATASET + + # Remove the checkpoint files to save space, LoRA serving only needs the + # adapter files. + rm /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/*.pt + rm /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/*.safetensors + + mkdir -p /output/$MODEL_SIZE-lora + rsync -Pavz /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct /output/$MODEL_SIZE-lora + cp -r /tmp/lora_finetune_output /output/$MODEL_SIZE-lora/ +``` + +
+

Run the following on your local machine:

```bash
# Download the files for Llama 3.1 finetuning
git clone https://github.com/skypilot-org/skypilot
cd skypilot/llm/llama-3_1-finetuning

export HF_TOKEN=xxxx

# It takes about 40 mins on 8 A100 GPUs to finetune an 8B
# Llama 3.1 model with LoRA on the Alpaca dataset.
sky launch -c llama31 lora.yaml \
  --env HF_TOKEN --env MODEL_SIZE=8B \
  --env CHECKPOINT_BUCKET_NAME="your-own-bucket-name"
```

To finetune the larger 70B model, simply change `MODEL_SIZE`:
```bash
sky launch -c llama31-70 lora.yaml \
  --env HF_TOKEN --env MODEL_SIZE=70B \
  --env CHECKPOINT_BUCKET_NAME="your-own-bucket-name"
```

**Finetuning Llama 3.1 405B**: Work in progress! If you want to follow the work, join the [SkyPilot community Slack](https://slack.skypilot.co/) for discussions.

## Use your custom data
The example above finetunes Llama 3.1 on the Alpaca dataset ([`yahma/alpaca-cleaned`](https://huggingface.co/datasets/yahma/alpaca-cleaned)), but for real use cases, you may want to finetune it on your own dataset.

You can do so by pointing `DATASET` to the Hugging Face path of your own dataset (we use [`gbharti/finance-alpaca`](https://huggingface.co/datasets/gbharti/finance-alpaca) as an example below):
```bash
# It takes about 1 hour on 8 A100 GPUs to finetune an 8B
# Llama 3.1 model with LoRA on the finance dataset.
sky launch -c llama31 lora.yaml \
  --env HF_TOKEN --env MODEL_SIZE=8B \
  --env CHECKPOINT_BUCKET_NAME="your-own-bucket-name" \
  --env DATASET="gbharti/finance-alpaca"
```
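While the finetuning job runs, the standard SkyPilot CLI can be used to monitor it and to clean up afterwards. A short sketch (the cluster name `llama31` comes from the launch commands above; checkpoints persist in the mounted bucket, not on the cluster):

```bash
# Stream the logs of the finetuning job on the cluster.
sky logs llama31

# List jobs and their states on the cluster.
sky queue llama31

# Tear down the cluster when finetuning is done; checkpoints remain in the
# bucket mounted at /output (CHECKPOINT_BUCKET_NAME).
sky down llama31
```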
*(Figure: Training Loss of LoRA finetuning Llama 3.1)*
+

## Serve the finetuned model

With Llama 3.1 finetuned on your dataset, you can now serve it with a single command:

> Note: `CHECKPOINT_BUCKET_NAME` should be the bucket you used for storing checkpoints in the previous finetuning step.

```bash
sky launch -c serve-llama31 serve.yaml \
  --env LORA_NAME="my-finance-lora" \
  --env CHECKPOINT_BUCKET_NAME="your-own-bucket-name"
```

You can interact with the model in a terminal:
```console
ENDPOINT=$(sky status --endpoint 8081 serve-llama31)
curl http://$ENDPOINT/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "my-finance-lora",
    "messages": [
      {
        "role": "system",
        "content": "You are a helpful assistant."
      },
      {
        "role": "user",
        "content": "For a car, what scams can be plotted with 0% financing vs rebate?"
      }
    ]
  }' | jq .
```

:tada: **Congratulations!** You now have a finetuned Llama 3.1 8B model that is well versed in finance topics. To recap, all model checkpoints and replicas **stay in your own private infrastructure**.
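Beyond `curl`, the same OpenAI-compatible endpoint can be queried programmatically. Below is a minimal sketch using the `openai` Python package (which `serve.yaml` installs); the `my-finance-lora` adapter name comes from the launch command above, and the endpoint value is a hypothetical placeholder, so substitute the output of `sky status --endpoint 8081 serve-llama31`:

```python
# Minimal sketch: query the finetuned LoRA adapter through the
# OpenAI-compatible API exposed by vLLM.
import openai

# Hypothetical placeholder; use the output of
# `sky status --endpoint 8081 serve-llama31`.
ENDPOINT = '1.2.3.4:8081'

client = openai.OpenAI(base_url=f'http://{ENDPOINT}/v1', api_key='EMPTY')
response = client.chat.completions.create(
    model='my-finance-lora',  # the LORA_NAME passed to serve.yaml
    messages=[
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user',
         'content': 'For a car, what scams can be plotted with '
                    '0% financing vs rebate?'},
    ],
)
print(response.choices[0].message.content)
```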
+ SkyPilot YAML serve.yaml for serving the finetuned model + +```yaml +# Serve a LoRA finetuned Meta Llama 3.1. +# +# Usage: +# +# HF_TOKEN=xxx sky launch serve.yaml -c llama31-serve --env HF_TOKEN + +envs: + MODEL_SIZE: 8B + HF_TOKEN: + # Change this to your checkpoint bucket created in lora.yaml + CHECKPOINT_BUCKET_NAME: your-checkpoint-bucket + LORA_NAME: my-finance-lora + +resources: + accelerators: L4 + ports: 8081 + cpus: 32+ + +file_mounts: + /checkpoints: + name: $CHECKPOINT_BUCKET_NAME + mode: MOUNT + +setup: | + pip install vllm==0.5.3post1 + pip install vllm-flash-attn==2.5.9.post1 + pip install openai + +run: | + vllm serve meta-llama/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE --enable-lora \ + --lora-modules $LORA_NAME=/checkpoints/${MODEL_SIZE}-lora/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/ \ + --max-model-len=2048 --port 8081 +``` + +
+ +## Appendix: Preparation +1. Request the access to [Llama 3.1 weights on huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) (Click on the blue box and follow the steps): +![](https://imgur.com/snIQhr9.png) + +2. Get your [huggingface access token](https://huggingface.co/settings/tokens): +![](https://imgur.com/3idBgHn.png) + + +3. Add huggingface token to your environment variable: +```bash +export HF_TOKEN="xxxx" +``` + +4. Install SkyPilot for launching the finetuning: +```bash +pip install skypilot-nightly[aws,gcp,kubernetes] +# or other clouds (12 clouds + kubernetes supported) you have setup +# See: https://skypilot.readthedocs.io/en/latest/getting-started/installation.html +``` + +5. Check your infra setup: +```console +sky check + +🎉 Enabled clouds 🎉 + ✔ AWS + ✔ GCP + ✔ Azure + ✔ OCI + ✔ Lambda + ✔ RunPod + ✔ Paperspace + ✔ Fluidstack + ✔ Cudo + ✔ IBM + ✔ SCP + ✔ vSphere + ✔ Cloudflare (for R2 object store) + ✔ Kubernetes +``` + + + +## What's next + +* [AI on Kubernetes Without the Pain](https://blog.skypilot.co/ai-on-kubernetes/) +* [SkyPilot AI Gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html) +* [SkyPilot Docs](https://skypilot.readthedocs.io/en/latest/docs/index.html) +* [SkyPilot GitHub](https://github.com/skypilot-org/skypilot) diff --git a/llm/llama-3_1-finetuning/serve.yaml b/llm/llama-3_1-finetuning/serve.yaml new file mode 100644 index 00000000000..c1df6b6b8c7 --- /dev/null +++ b/llm/llama-3_1-finetuning/serve.yaml @@ -0,0 +1,33 @@ +# Serve a LoRA finetuned Meta Llama-3.1. +# +# Usage: +# +# HF_TOKEN=xxx sky launch serve.yaml -c llama31-serve --env HF_TOKEN + +envs: + MODEL_SIZE: 8B + HF_TOKEN: + # Change this to your checkpoint bucket created in lora.yaml + CHECKPOINT_BUCKET_NAME: your-checkpoint-bucket + LORA_NAME: my-finance-lora + +resources: + accelerators: L4 + ports: 8081 + cpus: 32+ + +file_mounts: + /checkpoints: + name: $CHECKPOINT_BUCKET_NAME + mode: MOUNT + +setup: | + pip install vllm==0.5.3post1 + pip install vllm-flash-attn==2.5.9.post1 + pip install openai + +run: | + vllm serve meta-llama/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE --enable-lora \ + --lora-modules $LORA_NAME=/checkpoints/${MODEL_SIZE}-lora/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/ \ + --max-model-len=2048 --port 8081 diff --git a/llm/llama-3_1/README.md b/llm/llama-3_1/README.md new file mode 100644 index 00000000000..6cfeb8dc5f9 --- /dev/null +++ b/llm/llama-3_1/README.md @@ -0,0 +1,308 @@ +# Serve Llama 3.1 on Your Own Infrastructure + + +


+ +On July 23, 2024, Meta AI released the [Llama 3.1 model family](https://ai.meta.com/blog/meta-llama-3-1/), including a 405B parameter model in both base model and instruction-tuned forms. + +Llama 3.1 405B became the most capable open LLM model to date. This is **the first time an open LLM closely rivals state-of-the-art proprietary models** like GPT-4o and Claude 3.5 Sonnet. + +This guide walks through how to serve Llama 3.1 models **completely on your infrastructure** (cluster or cloud VPC). Supported infra: + +- Local GPU workstation +- Kubernetes cluster +- Cloud accounts ([12 clouds supported](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)) + +SkyPilot will be used as the unified framework to launch serving on any (or multiple) infra that you bring. + +## Serving Llama 3.1 on your infra + +Below is a step-by-step guide to using SkyPilot for testing a new model on a GPU dev node, and then packaging it for one-click deployment across any infrastructure. + +**To skip directly to the packaged deployment YAML for Llama 3.1, see [Step 3: Package and deploy using SkyPilot](#step-3-package-and-deploy-using-skypilot).** + +### GPUs required for serving Llama 3.1 + +Llama 3.1 comes in different sizes, and each size has different GPU requirements. Here is the model-GPU compatibility matrix (applies to both pretrained and instruction tuned models): + +| **GPU** | **Meta-Llama-3.1-8B** | **Meta-Llama-3.1-70B** | **Meta-Llama-3.1-405B-FP8** | +|----------------- |------------------------------ |------------------------ |------------------------------ | +| **L4:1** | ✅, with `--max-model-len 4096` | ❌ | ❌ | +| **L4:8** | ✅ | ❌ | ❌ | +| **A100:8** | ✅ | ✅ | ❌ | +| **A100-80GB:8** | ✅ | ✅ | ✅, with `--max-model-len 4096` | + + +### Step 0: Bring your infra + +Install SkyPilot on your local machine: + +```bash +pip install 'skypilot-nightly[all]' +``` + +Pick one of the following depending on what infra you want to run Llama 3.1 on: + +**If your local machine is a GPU node**: use this command to up a lightweight kubernetes cluster: + +```bash +sky local up +``` + +**If you have a Kubernetes GPU cluster** (e.g., on-prem, EKS / GKE / AKS / ...): + +```bash +# Should show Enabled if you have ~/.kube/config set up. +sky check kubernetes +``` + +**If you want to use clouds** (e.g., reserved instances): 12+ clouds are supported: + +```bash +sky check +``` + +See [docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html) for details. + +### Step 1: Get a GPU dev node (pod or VM) + +> **Tip:** If you simply want the final deployment YAML, skip directly to [Step 3](#step-3-package-and-deploy-using-skypilot). + +One command to get a GPU dev pod/VM: +```bash +sky launch -c llama --gpus A100-80GB:8 +``` +If you are using local machine or Kubernetes, the above will create a pod. If you are using clouds, the above will create a VM. + +You can add a `-r / --retry-until-up` flag to have SkyPilot auto-retry to guard against out-of-capacity errors. + + +> **Tip:** Vary the `--gpus` flag to get different GPU types and counts. For example, `--gpus H100:8` gets you a pod with 8x H100 GPUs. +> +> You can run `sky show-gpus` to see all available GPU types on your infra. + + +Once provisioned, you can easily connect to it to start dev work. 
Two recommended methods:
- Open up VSCode, click bottom left, `Connect to Host`, type `llama`
- Or, SSH into it with `ssh llama`

### Step 2: Inside the dev node, test serving

Once logged in, run the following to install vLLM and log in to HuggingFace (model weights are pulled from HuggingFace automatically when serving starts):
```bash
pip install vllm==0.5.3.post1 huggingface_hub

# Paste your HuggingFace token to get access to Meta Llama repos:
# https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f
huggingface-cli login
```

We are now ready to start serving. For example, if you have 8 GPUs:
```bash
vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct --tensor-parallel-size 8
```
Change the `--tensor-parallel-size` to the number of GPUs you have.

Tip: available model names can be found [here](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f) and below.
- Pretrained:
  - Meta-Llama-3.1-8B
  - Meta-Llama-3.1-70B
  - Meta-Llama-3.1-405B-FP8
- Instruction tuned:
  - Meta-Llama-3.1-8B-Instruct
  - Meta-Llama-3.1-70B-Instruct
  - Meta-Llama-3.1-405B-Instruct-FP8

The full precision 405B model Meta-Llama-3.1-405B requires multi-node inference and is work in progress; join the [SkyPilot community Slack](https://slack.skypilot.co/) for discussions.

Test that `curl` works from within the node:
```bash
ENDPOINT=127.0.0.1:8000
curl http://$ENDPOINT/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "messages": [
      {
        "role": "system",
        "content": "You are a helpful assistant."
      },
      {
        "role": "user",
        "content": "Who are you?"
      }
    ]
  }' | jq
```

🎉 Voila! You should be getting results like this:

*(Image: Llama-3.1 on SkyPilot)*

+ +When you are done, terminate your cluster with: +``` +sky down llama +``` + +### Step 3: Package and deploy using SkyPilot + +Now that we verified the model is working, let's package it for hands-free deployment. + +Whichever infra you use for GPUs, SkyPilot abstracts away the mundane infra tasks (e.g., setting up services on K8s, opening up ports for cloud VMs), making AI models super easy to deploy via one command. + +[Deploying via SkyPilot](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) has several key benefits: +- Control node & replicas completely stay in your infra +- Automatic load-balancing across multiple replicas +- Automatic recovery of replicas +- Replicas can use different infras to save significant costs + - e.g., a mix of clouds, or a mix of reserved & spot GPUs + +
+Click to see the YAML: serve.yaml. + +```yaml + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3.1-8B-Instruct + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. + +service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + +resources: + accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8} + # accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + cpus: 32+ + disk_size: 1000 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +setup: | + pip install vllm==0.5.3post1 + pip install vllm-flash-attn==2.5.9.post1 + # Install Gradio for web UI. + pip install gradio openai + +run: | + echo 'Starting vllm api server...' + + vllm serve $MODEL_NAME \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + --max-model-len 4096 \ + --port 8081 \ + 2>&1 | tee api_server.log & + + while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do + echo 'Waiting for vllm api server to start...' + sleep 5 + done + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://localhost:8081/v1 +``` + +
+ +You can also get the full YAML file [here](https://github.com/skypilot-org/skypilot/blob/master/llm/llama-3_1/llama-3_1.yaml). + +Launch a fully managed service with load-balancing and auto-recovery: + +``` +HF_TOKEN=xxx sky serve up llama-3_1.yaml -n llama31 --env HF_TOKEN --gpus L4:1 --env MODEL_NAME=meta-llama/Meta-Llama-3.1-8B-Instruct +``` + +Wait until the service is ready: + +``` +watch -n10 sky serve status llama31 +``` + +Get a single endpoint that load-balances across replicas: + +``` +ENDPOINT=$(sky serve status --endpoint llama31) +``` + +Query the endpoint in a terminal: +``` +curl -L http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ] + }' | jq . +``` + + +
+Click to see the output + +```console +{ + "id": "chat-5cdbc2091c934e619e56efd0ed85e28f", + "object": "chat.completion", + "created": 1721784853, + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "I am a helpful assistant, here to provide information and assist with tasks to the best of my abilities. I'm a computer program designed to simulate conversation and answer questions on a wide range of topics. I can help with things like:\n\n* Providing definitions and explanations\n* Answering questions on history, science, and technology\n* Generating text and ideas\n* Translating languages\n* Offering suggestions and recommendations\n* And more!\n\nI'm constantly learning and improving, so feel free to ask me anything. What can I help you with today?", + "tool_calls": [] + }, + "logprobs": null, + "finish_reason": "stop", + "stop_reason": null + } + ], + "usage": { + "prompt_tokens": 25, + "total_tokens": 136, + "completion_tokens": 111 + } +} +``` + +
+ +🎉 **Congratulations!** You are now serving a Llama 3.1 8B model across two replicas. To recap, all model replicas **stay in your own private infrastructure** and SkyPilot ensures they are **healthy and available**. + + +Details on autoscaling, rolling updates, and more in [SkyServe docs](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). + +When you are done, shut down all resources: + +``` +sky serve down llama31 +``` + +## Bonus: Finetuning Llama 3.1 +You can also finetune Llama 3.1 on your infra with SkyPilot. Check out our [blog](https://blog.skypilot.co/finetune-llama-3_1-on-your-infra/) for more details. diff --git a/llm/llama-3_1/llama-3_1.yaml b/llm/llama-3_1/llama-3_1.yaml new file mode 100644 index 00000000000..a86d3e51666 --- /dev/null +++ b/llm/llama-3_1/llama-3_1.yaml @@ -0,0 +1,109 @@ +# Serving Meta Llama-3.1 on your own infra. +# +# Usage: +# +# # Launch Llama-3.1 8B on a single L4 GPU: +# HF_TOKEN=xxx sky launch llama-31.yaml -c llama31 --gpus L4:1 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3.1-8B-Instruct +# +# # Launch Llama-3.1 405B-FP8 on a A100-80GB:8 GPU: +# HF_TOKEN=xxx sky launch llama-31.yaml -c llama31 --gpus A100-80GB:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 +# +# curl /v1/chat/completions: +# +# ENDPOINT=$(sky status --endpoint 8081 llama31) +# +# curl http://$ENDPOINT/v1/chat/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", +# "messages": [ +# { +# "role": "system", +# "content": "You are a helpful assistant." +# }, +# { +# "role": "user", +# "content": "Who are you?" +# } +# ] +# }' +# +# Chat with model with Gradio UI (URLs printed in logs): +# +# Running on local URL: http://127.0.0.1:8811 +# Running on public URL: https://.gradio.live +# +# Scale up with SkyServe: +# HF_TOKEN=xxx sky serve up llama-31.yaml -n llama31 --env HF_TOKEN --gpus L4:1 --env MODEL_NAME=meta-llama/Meta-Llama-3.1-8B-Instruct +# +# curl /v1/chat/completions: +# +# ENDPOINT=$(sky serve status --endpoint llama31) +# curl -L $ENDPOINT/v1/models +# curl -L http://$ENDPOINT/v1/chat/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "meta-llama/Meta-Llama-3-8B-Instruct", +# "messages": [ +# { +# "role": "system", +# "content": "You are a helpful assistant." +# }, +# { +# "role": "user", +# "content": "Who are you?" +# } +# ] +# }' + + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3.1-8B-Instruct + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. + +service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + +resources: + accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8} + # accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + cpus: 32+ + disk_size: 1000 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +setup: | + pip install vllm==0.5.3post1 + pip install vllm-flash-attn==2.5.9.post1 + # Install Gradio for web UI. + pip install gradio openai + +run: | + echo 'Starting vllm api server...' + + vllm serve $MODEL_NAME \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + --max-model-len 4096 \ + --port 8081 \ + 2>&1 | tee api_server.log & + + while ! 
`cat api_server.log | grep -q 'Uvicorn running on'`; do + echo 'Waiting for vllm api server to start...' + sleep 5 + done + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://localhost:8081/v1 diff --git a/sky/adaptors/azure.py b/sky/adaptors/azure.py index 731d7e836c3..0cadb0b2bd8 100644 --- a/sky/adaptors/azure.py +++ b/sky/adaptors/azure.py @@ -186,7 +186,7 @@ def get_client(name: str, if 'ERROR: AADSTS50020' in str(e): with ux_utils.print_exception_no_traceback(): raise sky_exceptions.StorageBucketGetError( - 'Attempted to fetch a non-existant public ' + 'Attempted to fetch a non-existent public ' 'container name: ' f'{container_client.container_name}. ' 'Please check if the name is correct.') diff --git a/sky/adaptors/kubernetes.py b/sky/adaptors/kubernetes.py index 7f52a099f56..52eb339d213 100644 --- a/sky/adaptors/kubernetes.py +++ b/sky/adaptors/kubernetes.py @@ -4,6 +4,7 @@ import logging import os +from typing import Any, Callable, Set from sky.adaptors import common from sky.sky_logging import set_logging_level @@ -30,11 +31,19 @@ API_TIMEOUT = 5 -def _decorate_methods(obj, decorator): +def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str): for attr_name in dir(obj): attr = getattr(obj, attr_name) + # Skip methods starting with '__' since they are invoked through one + # of the main methods, which are already decorated. if callable(attr) and not attr_name.startswith('__'): - setattr(obj, attr_name, decorator(attr)) + decorated_types: Set[str] = getattr(attr, '_sky_decorator_types', + set()) + if decoration_type not in decorated_types: + decorated_attr = decorator(attr) + decorated_attr._sky_decorator_types = ( # pylint: disable=protected-access + decorated_types | {decoration_type}) + setattr(obj, attr_name, decorated_attr) return obj @@ -49,7 +58,7 @@ def decorated_api(api): def wrapped(*args, **kwargs): obj = api(*args, **kwargs) - _decorate_methods(obj, set_logging_level(logger, level)) + _decorate_methods(obj, set_logging_level(logger, level), 'api_log') return obj return wrapped diff --git a/sky/benchmark/benchmark_utils.py b/sky/benchmark/benchmark_utils.py index 2ef6825eaa0..11160332209 100644 --- a/sky/benchmark/benchmark_utils.py +++ b/sky/benchmark/benchmark_utils.py @@ -20,6 +20,7 @@ import sky from sky import backends +from sky import clouds from sky import data from sky import global_user_state from sky import sky_logging @@ -170,8 +171,13 @@ def _create_benchmark_bucket() -> Tuple[str, str]: # Select the bucket type. enabled_clouds = storage_lib.get_cached_enabled_storage_clouds_or_refresh( raise_if_no_cloud_access=True) - # Already checked by raise_if_no_cloud_access=True. - assert enabled_clouds + # Sky Benchmark only supports S3 (see _download_remote_dir and + # _delete_remote_dir). + enabled_clouds = [ + cloud for cloud in enabled_clouds if cloud in [str(clouds.AWS())] + ] + assert enabled_clouds, ('No enabled cloud storage found. Sky Benchmark ' + 'requires GCP or AWS to store logs.') bucket_type = data.StoreType.from_cloud(enabled_clouds[0]).value # Create a benchmark bucket. 
@@ -242,14 +248,8 @@ def _download_remote_dir(remote_dir: str, local_dir: str, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True) - elif bucket_type == data.StoreType.GCS: - remote_dir = f'gs://{remote_dir}' - subprocess.run(['gsutil', '-m', 'cp', '-r', remote_dir, local_dir], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - check=True) else: - raise RuntimeError('Azure Blob Storage is not supported yet.') + raise RuntimeError(f'{bucket_type} is not supported yet.') def _delete_remote_dir(remote_dir: str, bucket_type: data.StoreType) -> None: @@ -260,20 +260,8 @@ def _delete_remote_dir(remote_dir: str, bucket_type: data.StoreType) -> None: stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True) - elif bucket_type == data.StoreType.GCS: - remote_dir = f'gs://{remote_dir}' - proc = subprocess.run(['gsutil', '-m', 'rm', '-r', remote_dir], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - check=False) - if proc.returncode != 0: - stderr = proc.stderr.decode('utf-8') - if 'BucketNotFoundException: 404' in stderr: - logger.warning(f'Bucket {remote_dir} does not exist. Skip') - else: - raise RuntimeError(f'Failed to delete {remote_dir}: {stderr}') else: - raise RuntimeError('Azure Blob Storage is not supported yet.') + raise RuntimeError(f'{bucket_type} is not supported yet.') def _read_timestamp(path: str) -> float: diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index fc001ea75c0..021f243da70 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -444,7 +444,7 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> resources_utils.FeasibleResources: if resources.instance_type is not None: assert resources.is_launchable(), resources # Check the instance type is valid in the cloud @@ -455,10 +455,12 @@ def _get_feasible_launchable_resources( region=resources.region, zone=resources.zone) if not regions: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) # Treat Resources(AWS, p3.2x, V100) as Resources(AWS, p3.2x). resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -484,9 +486,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -501,8 +504,10 @@ def _make(instance_list): zone=resources.zone, clouds='aws') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod @functools.lru_cache(maxsize=1) # Cache since getting identity is slow. 
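The `aws.py` hunk above, and the analogous hunks in the other cloud modules below, change `_get_feasible_launchable_resources()` to return a `FeasibleResources` object instead of a bare `(resources_list, fuzzy_candidate_list)` tuple. As rough orientation only, a dataclass matching the positional calls `FeasibleResources(resources_list, fuzzy_candidate_list, hint)` could look like the sketch below; the actual definition in `sky/utils/resources_utils.py` is not shown in this diff and may differ.

```python
# Hypothetical sketch of the FeasibleResources container (not the actual
# SkyPilot definition). Field names and positional order follow the calls
# and the sky/clouds/cloud.py docstring in this diff.
import dataclasses
from typing import Any, List, Optional


@dataclasses.dataclass
class FeasibleResources:
    # Feasible, launchable resources; assumed sorted by price (the optimizer
    # hunk below relies on resources_list[0] being the cheapest).
    resources_list: List[Any]
    # Fuzzy matches when the exact request is unavailable, e.g.
    # ['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8'].
    fuzzy_candidate_list: List[str]
    # Optional explanation of why nothing is feasible; per the TODOs in this
    # diff, currently only the Kubernetes implementation populates it.
    hint: Optional[str] = None
```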
diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index a035ff256c1..928ceb5cc52 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -378,17 +378,19 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]: def _get_feasible_launchable_resources( self, resources: 'resources.Resources' - ) -> Tuple[List['resources.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources ok, _ = Azure.check_disk_tier(resources.instance_type, resources.disk_tier) if not ok: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) # Treat Resources(Azure, Standard_NC4as_T4_v3, T4) as # Resources(Azure, Standard_NC4as_T4_v3). resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -418,9 +420,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -435,8 +438,10 @@ def _make(instance_list): zone=resources.zone, clouds='azure') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 93048a84e74..ce9c2ae602d 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -341,11 +341,10 @@ def is_label_valid(cls, label_key: str, return True, None def get_feasible_launchable_resources( - self, - resources: 'resources_lib.Resources', - num_nodes: int = 1 - ) -> Tuple[List['resources_lib.Resources'], List[str]]: - """Returns ([feasible and launchable resources], [fuzzy candidates]). + self, + resources: 'resources_lib.Resources', + num_nodes: int = 1) -> 'resources_utils.FeasibleResources': + """Returns FeasibleResources for the given resources. Feasible resources refer to an offering respecting the resource requirements. Currently, this function implements "filtering" the @@ -353,10 +352,15 @@ def get_feasible_launchable_resources( Launchable resources require a cloud and an instance type be assigned. - Fuzzy candidates example: when the requested GPU is A100:1 but is not - available in a cloud/region, the fuzzy candidates are results of a fuzzy - search in the catalog that are offered in the location. E.g., - ['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8'] + The returned dataclass object FeasibleResources contains three fields: + + - resources_list: a list of resources that are feasible to launch + - fuzzy_candidate_list: a list of resources that loosely match requested + resources. E.g., when A100:1 GPU is requested but is not available + in a cloud/region, the fuzzy candidates are results of a fuzzy + search in the catalog that are offered in the location. 
E.g., + ['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8'] + - hint: an optional string hint if no feasible resources are found. """ if resources.is_launchable(): self._check_instance_type_accelerators_combination(resources) @@ -372,13 +376,18 @@ def get_feasible_launchable_resources( # TODO(zhwu): The resources are now silently filtered out. We # should have some logging telling the user why the resources # are not considered. - return ([], []) + return resources_utils.FeasibleResources(resources_list=[], + fuzzy_candidate_list=[], + hint=None) return self._get_feasible_launchable_resources(resources) def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': """See get_feasible_launchable_resources().""" + # TODO: Currently only the Kubernetes implementation of this method + # returns hints when no feasible resources are found. This should be + # implemented for all clouds. raise NotImplementedError def get_reservations_available_resources( diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py index 8f7d4eaf923..8f100caebad 100644 --- a/sky/clouds/cudo.py +++ b/sky/clouds/cudo.py @@ -214,13 +214,16 @@ def make_deploy_resources_variables( } def _get_feasible_launchable_resources( - self, resources: 'resources_lib.Resources'): + self, resources: 'resources_lib.Resources' + ) -> 'resources_utils.FeasibleResources': if resources.use_spot: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -243,9 +246,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -260,8 +264,10 @@ def _make(instance_list): zone=resources.zone, clouds='cudo') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/fluidstack.py b/sky/clouds/fluidstack.py index 40166e06d09..4bc4fca9d8a 100644 --- a/sky/clouds/fluidstack.py +++ b/sky/clouds/fluidstack.py @@ -202,7 +202,9 @@ def _get_feasible_launchable_resources( assert resources.is_launchable(), resources # Accelerators are part of the instance type in Fluidstack Cloud resources = resources.copy(accelerators=None) - return ([resources], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. 
+ return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -230,9 +232,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -247,8 +250,10 @@ def _make(instance_list): zone=resources.zone, clouds='fluidstack') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 050fda07fe4..e24e67b2486 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -526,10 +526,10 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources.Resources' - ) -> Tuple[List['resources.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) if resources.accelerators is None: # Return a default instance type with the given number of vCPUs. @@ -538,7 +538,9 @@ def _get_feasible_launchable_resources( memory=resources.memory, disk_tier=resources.disk_tier) if host_vm_type is None: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) else: r = resources.copy( cloud=GCP(), @@ -547,7 +549,7 @@ def _get_feasible_launchable_resources( cpus=None, memory=None, ) - return ([r], []) + return resources_utils.FeasibleResources([r], [], None) # Find instance candidates to meet user's requirements assert len(resources.accelerators.items() @@ -569,7 +571,8 @@ def _get_feasible_launchable_resources( clouds='gcp') if instance_list is None: - return ([], fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) assert len( instance_list ) == 1, f'More than one instance type matched, {instance_list}' @@ -584,11 +587,13 @@ def _get_feasible_launchable_resources( if resources.cpus.endswith('+'): cpus = float(resources.cpus[:-1]) if cpus > num_cpus_in_tpu_vm: - return ([], fuzzy_candidate_list) + return resources_utils.FeasibleResources( + [], fuzzy_candidate_list, None) else: cpus = float(resources.cpus) if cpus != num_cpus_in_tpu_vm: - return ([], fuzzy_candidate_list) + return resources_utils.FeasibleResources( + [], fuzzy_candidate_list, None) # FIXME(woosuk, wei-lin): This leverages the fact that TPU VMs # have 334 GB RAM, and 400 GB RAM for tpu-v4. We need to move # this to service catalog, instead. 
@@ -597,11 +602,13 @@ def _get_feasible_launchable_resources( if resources.memory.endswith('+'): memory = float(resources.memory[:-1]) if memory > memory_in_tpu_vm: - return ([], fuzzy_candidate_list) + return resources_utils.FeasibleResources( + [], fuzzy_candidate_list, None) else: memory = float(resources.memory) if memory != memory_in_tpu_vm: - return ([], fuzzy_candidate_list) + return resources_utils.FeasibleResources( + [], fuzzy_candidate_list, None) else: host_vm_type = instance_list[0] @@ -613,7 +620,8 @@ def _get_feasible_launchable_resources( cpus=None, memory=None, ) - return ([r], fuzzy_candidate_list) + return resources_utils.FeasibleResources([r], fuzzy_candidate_list, + None) @classmethod def get_accelerators_from_instance_type( diff --git a/sky/clouds/ibm.py b/sky/clouds/ibm.py index e468fecf00f..b78cc4287c0 100644 --- a/sky/clouds/ibm.py +++ b/sky/clouds/ibm.py @@ -266,12 +266,15 @@ def get_default_instance_type( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': fuzzy_candidate_list: List[str] = [] if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], fuzzy_candidate_list) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([resources], + fuzzy_candidate_list, None) def _make(instance_list): resource_list = [] @@ -296,9 +299,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -312,8 +316,10 @@ def _make(instance_list): zone=resources.zone, clouds='ibm') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def get_default_image(cls, region) -> str: diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 113774142c9..4dd1fe8ce75 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -342,12 +342,13 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': fuzzy_candidate_list: List[str] = [] if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], fuzzy_candidate_list) + return resources_utils.FeasibleResources([resources], + fuzzy_candidate_list, None) def _make(instance_list): resource_list = [] @@ -403,10 +404,11 @@ def _make(instance_list): logger.debug(f'Instance type {chosen_instance_type} does ' 'not fit in the Kubernetes cluster. 
' f'Reason: {reason}') - return [], [] + return resources_utils.FeasibleResources([], [], reason) # No fuzzy lists for Kubernetes - return _make([chosen_instance_type]), [] + return resources_utils.FeasibleResources(_make([chosen_instance_type]), + [], None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/lambda_cloud.py b/sky/clouds/lambda_cloud.py index 036f5a23979..ce45f087296 100644 --- a/sky/clouds/lambda_cloud.py +++ b/sky/clouds/lambda_cloud.py @@ -178,12 +178,14 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources # Accelerators are part of the instance type in Lambda Cloud resources = resources.copy(accelerators=None) - return ([resources], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -209,9 +211,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -226,8 +229,10 @@ def _make(instance_list): zone=resources.zone, clouds='lambda') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index a911c3f38d0..7875e26d9cc 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -295,11 +295,13 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. 
+ return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -326,9 +328,10 @@ def _make(instance_list): disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources @@ -344,9 +347,11 @@ def _make(instance_list): zone=resources.zone, clouds='oci') if instance_list is None: - return ([], fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/paperspace.py b/sky/clouds/paperspace.py index efa1afee781..171bcf33f16 100644 --- a/sky/clouds/paperspace.py +++ b/sky/clouds/paperspace.py @@ -196,11 +196,13 @@ def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources'): """Returns a list of feasible resources for the given resources.""" if resources.use_spot: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -223,9 +225,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -241,8 +244,10 @@ def _make(instance_list): clouds='paperspace', )) if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/runpod.py b/sky/clouds/runpod.py index 3486330b8b3..4fc4bfce85b 100644 --- a/sky/clouds/runpod.py +++ b/sky/clouds/runpod.py @@ -187,12 +187,12 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': """Returns a list of feasible resources for the given resources.""" if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -215,9 +215,12 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + # TODO: Add hints to all return values in this 
method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -231,8 +234,10 @@ def _make(instance_list): zone=resources.zone, clouds='runpod') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/scp.py b/sky/clouds/scp.py index da45a7e143e..9cfbd5129f6 100644 --- a/sky/clouds/scp.py +++ b/sky/clouds/scp.py @@ -251,16 +251,18 @@ def _get_default_ami(cls, region_name: str, instance_type: str) -> str: def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': # Check if the host VM satisfies the min/max disk size limits. is_allowed = self._is_disk_size_allowed(resources) if not is_allowed: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) if resources.instance_type is not None: assert resources.is_launchable(), resources # Accelerators are part of the instance type in SCP Cloud resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -287,9 +289,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -304,8 +307,10 @@ def _make(instance_list): zone=resources.zone, clouds='scp') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/vsphere.py b/sky/clouds/vsphere.py index 968368ff0aa..6e7e1abeb04 100644 --- a/sky/clouds/vsphere.py +++ b/sky/clouds/vsphere.py @@ -197,11 +197,13 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources'): if resources.use_spot: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. 
+ return resources_utils.FeasibleResources([], [], None) if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -226,9 +228,10 @@ def _make(instance_list): disk_tier=resources.disk_tier, ) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -246,8 +249,10 @@ def _make(instance_list): clouds=_CLOUD_VSPHERE, ) if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/data/storage.py b/sky/data/storage.py index d2f052edb8c..0caeef2bc7a 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -2537,7 +2537,7 @@ def _get_bucket(self) -> Tuple[str, bool]: if 'Name or service not known' in error_message: with ux_utils.print_exception_no_traceback(): raise exceptions.StorageBucketGetError( - 'Attempted to fetch the container from non-existant ' + 'Attempted to fetch the container from non-existent ' 'storage account ' f'name: {self.storage_account_name}. Please check ' 'if the name is correct.') diff --git a/sky/optimizer.py b/sky/optimizer.py index 9c11511a38b..7b4b29e3bce 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -348,10 +348,6 @@ def _estimate_nodes_cost_or_time( for orig_resources in node.resources): source_hint = 'kubernetes cluster' - # TODO(romilb): When `sky show-gpus` supports Kubernetes, - # add a hint to run `sky show-gpus --kubernetes` to list - # available accelerators on Kubernetes. - bold = colorama.Style.BRIGHT cyan = colorama.Fore.CYAN reset = colorama.Style.RESET_ALL @@ -1239,21 +1235,25 @@ def _fill_in_launchable_resources( continue clouds_list = ([resources.cloud] if resources.cloud is not None else enabled_clouds) + # If clouds provide hints, store them for later printing. + hints: Dict[clouds.Cloud, str] = {} for cloud in clouds_list: - (feasible_resources, - fuzzy_candidate_list) = cloud.get_feasible_launchable_resources( - resources, num_nodes=task.num_nodes) - if len(feasible_resources) > 0: + feasible_resources = cloud.get_feasible_launchable_resources( + resources, num_nodes=task.num_nodes) + if feasible_resources.hint is not None: + hints[cloud] = feasible_resources.hint + if len(feasible_resources.resources_list) > 0: # Assume feasible_resources is sorted by prices. Guaranteed by # the implementation of get_feasible_launchable_resources and # the underlying service_catalog filtering - cheapest = feasible_resources[0] + cheapest = feasible_resources.resources_list[0] # Generate region/zone-specified resources. 
launchable[resources].extend( _make_launchables_for_valid_region_zones(cheapest)) - cloud_candidates[cloud] = feasible_resources + cloud_candidates[cloud] = feasible_resources.resources_list else: - all_fuzzy_candidates.update(fuzzy_candidate_list) + all_fuzzy_candidates.update( + feasible_resources.fuzzy_candidate_list) if len(launchable[resources]) == 0: clouds_str = str(clouds_list) if len(clouds_list) > 1 else str( clouds_list[0]) @@ -1269,6 +1269,8 @@ def _fill_in_launchable_resources( f'{colorama.Fore.CYAN}' f'{sorted(all_fuzzy_candidates)}' f'{colorama.Style.RESET_ALL}') + for cloud, hint in hints.items(): + logger.info(f'{repr(cloud)}: {hint}') else: if resources.cpus is not None: logger.info('Try specifying a different CPU count, ' diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 7668c7348aa..a5996abe028 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -222,6 +222,32 @@ def _wait_for_pods_to_run(namespace, new_nodes): Pods may be pulling images or may be in the process of container creation. """ + + def _check_init_containers(pod): + # Check if any of the init containers failed + # to start. Could be because the init container + # command failed or failed to pull image etc. + for init_status in pod.status.init_container_statuses: + init_terminated = init_status.state.terminated + if init_terminated: + if init_terminated.exit_code != 0: + msg = init_terminated.message if ( + init_terminated.message) else str(init_terminated) + raise config_lib.KubernetesError( + 'Failed to run init container for pod ' + f'{pod.metadata.name}. Error details: {msg}.') + continue + init_waiting = init_status.state.waiting + if (init_waiting is not None and init_waiting.reason + not in ['ContainerCreating', 'PodInitializing']): + # TODO(romilb): There may be more states to check for. Add + # them as needed. + msg = init_waiting.message if ( + init_waiting.message) else str(init_waiting) + raise config_lib.KubernetesError( + 'Failed to create init container for pod ' + f'{pod.metadata.name}. Error details: {msg}.') + while True: all_pods_running = True # Iterate over each pod to check their status @@ -246,12 +272,15 @@ def _wait_for_pods_to_run(namespace, new_nodes): # See list of possible reasons for waiting here: # https://stackoverflow.com/a/57886025 waiting = container_status.state.waiting - if (waiting is not None and - waiting.reason != 'ContainerCreating'): - raise config_lib.KubernetesError( - 'Failed to create container while launching ' - 'the node. Error details: ' - f'{container_status.state.waiting.message}.') + if waiting is not None: + if waiting.reason == 'PodInitializing': + _check_init_containers(pod) + elif waiting.reason != 'ContainerCreating': + msg = waiting.message if waiting.message else str( + waiting) + raise config_lib.KubernetesError( + 'Failed to create container while launching ' + f'the node. Error details: {msg}.') # Reaching this point means that one of the pods had an issue, # so break out of the loop, and wait until next second. 
break diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 80bc96ddb94..f042750d627 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -426,11 +426,16 @@ def check_cpu_mem_fits(candidate_instance_type: 'KubernetesInstanceType', ] assert len(gpu_nodes) > 0, 'GPU nodes not found' candidate_nodes = gpu_nodes - not_fit_reason_prefix = (f'GPU nodes with {acc_type} do not have ' - 'enough CPU and/or memory. ') + not_fit_reason_prefix = ( + f'GPU nodes with {acc_type} do not have ' + f'enough CPU (> {k8s_instance_type.cpus} CPUs) and/or ' + f'memory (> {k8s_instance_type.memory} G). ') else: candidate_nodes = nodes - not_fit_reason_prefix = 'No nodes found with enough CPU and/or memory. ' + not_fit_reason_prefix = (f'No nodes found with enough ' + f'CPU (> {k8s_instance_type.cpus} CPUs) ' + 'and/or memory ' + f'(> {k8s_instance_type.memory} G). ') # Check if CPU and memory requirements are met on at least one # candidate node. fits, reason = check_cpu_mem_fits(k8s_instance_type, candidate_nodes) @@ -928,7 +933,8 @@ def construct_ssh_jump_command( ssh_jump_port: Optional[int] = None, ssh_jump_user: str = 'sky', proxy_cmd_path: Optional[str] = None, - proxy_cmd_target_pod: Optional[str] = None) -> str: + proxy_cmd_target_pod: Optional[str] = None, + current_kube_context: Optional[str] = None) -> str: ssh_jump_proxy_command = (f'ssh -tt -i {private_key_path} ' '-o StrictHostKeyChecking=no ' '-o UserKnownHostsFile=/dev/null ' @@ -940,8 +946,11 @@ def construct_ssh_jump_command( proxy_cmd_path = os.path.expanduser(proxy_cmd_path) # adding execution permission to the proxy command script os.chmod(proxy_cmd_path, os.stat(proxy_cmd_path).st_mode | 0o111) + kube_context_flag = f' {current_kube_context}' if (current_kube_context + is not None) else '' ssh_jump_proxy_command += (f' -o ProxyCommand=\'{proxy_cmd_path} ' - f'{proxy_cmd_target_pod}\' ') + f'{proxy_cmd_target_pod}' + f'{kube_context_flag}\'') return ssh_jump_proxy_command @@ -1006,12 +1015,14 @@ def get_ssh_proxy_command( private_key_path, ssh_jump_ip, ssh_jump_port=ssh_jump_port) else: ssh_jump_proxy_command_path = create_proxy_command_script() + current_context = get_current_kube_config_context_name() ssh_jump_proxy_command = construct_ssh_jump_command( private_key_path, ssh_jump_ip, ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER, proxy_cmd_path=ssh_jump_proxy_command_path, - proxy_cmd_target_pod=k8s_ssh_target) + proxy_cmd_target_pod=k8s_ssh_target, + current_kube_context=current_context) return ssh_jump_proxy_command diff --git a/sky/task.py b/sky/task.py index cf26e13717a..cebc616dc6d 100644 --- a/sky/task.py +++ b/sky/task.py @@ -393,6 +393,11 @@ def from_yaml_config( config['service'] = _fill_in_env_vars(config['service'], config.get('envs', {})) + # Fill in any Task.envs into workdir + if config.get('workdir') is not None: + config['workdir'] = _fill_in_env_vars(config['workdir'], + config.get('envs', {})) + task = Task( config.pop('name', None), run=config.pop('run', None), diff --git a/sky/templates/kubernetes-port-forward-proxy-command.sh b/sky/templates/kubernetes-port-forward-proxy-command.sh index d9e409b5545..27580ffbe04 100644 --- a/sky/templates/kubernetes-port-forward-proxy-command.sh +++ b/sky/templates/kubernetes-port-forward-proxy-command.sh @@ -2,12 +2,13 @@ set -uo pipefail # Check if pod name is passed as an argument -if [ $# -eq 0 ]; then - echo "Usage: $0 " >&2 +if [ $# -lt 1 ]; then + echo "Usage: $0 [kube_context]" >&2 exit 1 fi POD_NAME="$1" 
# The first argument is the name of the pod +KUBE_CONTEXT="${2:-}" # The second argument is the kube context, default is empty # Checks if socat is installed if ! command -v socat > /dev/null; then @@ -26,7 +27,11 @@ fi # This is preferred because of socket re-use issues in kubectl port-forward, # see - https://github.com/kubernetes/kubernetes/issues/74551#issuecomment-769185879 KUBECTL_OUTPUT=$(mktemp) -kubectl port-forward pod/"${POD_NAME}" :22 > "${KUBECTL_OUTPUT}" 2>&1 & +if [ -n "$KUBE_CONTEXT" ]; then + kubectl --context="$KUBE_CONTEXT" port-forward pod/"${POD_NAME}" :22 > "${KUBECTL_OUTPUT}" 2>&1 & +else + kubectl port-forward pod/"${POD_NAME}" :22 > "${KUBECTL_OUTPUT}" 2>&1 & +fi # Capture the PID for the backgrounded kubectl command K8S_PORT_FWD_PID=$! @@ -60,4 +65,4 @@ done # Establishes two directional byte streams to handle stdin/stdout between # terminal and the jump pod. # socat process terminates when port-forward terminates. -socat - tcp:127.0.0.1:"${local_port}" \ No newline at end of file +socat - tcp:127.0.0.1:"${local_port}" diff --git a/sky/utils/resources_utils.py b/sky/utils/resources_utils.py index 87a62dab95b..95c784143cc 100644 --- a/sky/utils/resources_utils.py +++ b/sky/utils/resources_utils.py @@ -10,6 +10,7 @@ if typing.TYPE_CHECKING: from sky import backends + from sky import resources as resources_lib _PORT_RANGE_HINT_MSG = ('Invalid port range {}. Please use the format ' '"from-to", in which from <= to. e.g. "1-3".') @@ -157,3 +158,21 @@ def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle', launched_resource_str) return f'{handle.launched_nodes}x {launched_resource_str}' return _DEFAULT_MESSAGE_HANDLE_INITIALIZING + + +@dataclasses.dataclass +class FeasibleResources: + """Feasible resources returned by cloud. + + Used to represent a collection of feasible resources returned by cloud, + any fuzzy candidates, and optionally a string hint if no feasible resources + are found. + + Fuzzy candidates example: when the requested GPU is A100:1 but is not + available in a cloud/region, the fuzzy candidates are results of a fuzzy + search in the catalog that are offered in the location. E.g., + ['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8'] + """ + resources_list: List['resources_lib.Resources'] + fuzzy_candidate_list: List[str] + hint: Optional[str] diff --git a/tests/test_smoke.py b/tests/test_smoke.py index c4347f53a21..952832718fb 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -47,6 +47,7 @@ from sky import global_user_state from sky import jobs from sky import serve +from sky import skypilot_config from sky.adaptors import cloudflare from sky.adaptors import ibm from sky.clouds import AWS @@ -1191,9 +1192,22 @@ def test_docker_storage_mounts(generic_cloud: str, image_id: str): template = jinja2.Template(template_str) # ubuntu 18.04 does not support fuse3, and blobfuse2 depends on fuse3. azure_mount_unsupported_ubuntu_version = '18.04' + # Commands to verify bucket upload. We need to check all three + # storage types because the optimizer may pick any of them. + s3_command = f'aws s3 ls {storage_name}/hello.txt' + gsutil_command = f'gsutil ls gs://{storage_name}/hello.txt' + azure_blob_command = TestStorageWithCredentials.cli_ls_cmd( + storage_lib.StoreType.AZURE, storage_name, suffix='hello.txt') if azure_mount_unsupported_ubuntu_version in image_id: + # The store for mount_private_mount is not specified in the template. + # If we're running on Azure, the private mount will be created on + # azure blob. 
That will not be supported on the ubuntu 18.04 image + # and thus fail. For other clouds, the private mount on other + # storage types (GCS/S3) should succeed. + include_private_mount = False if generic_cloud == 'azure' else True content = template.render(storage_name=storage_name, - include_azure_mount=False) + include_azure_mount=False, + include_private_mount=include_private_mount) else: content = template.render(storage_name=storage_name,) with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: @@ -1204,8 +1218,10 @@ def test_docker_storage_mounts(generic_cloud: str, image_id: str): *STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. - f'aws s3 ls {storage_name}/hello.txt || ' - f'gsutil ls gs://{storage_name}/hello.txt', + # Check AWS, GCP, or Azure storage mount. + f'{s3_command} || ' + f'{gsutil_command} || ' + f'{azure_blob_command}', ] test = Test( 'docker_storage_mounts', @@ -3132,7 +3148,8 @@ def test_kubernetes_custom_image(image_id): run_one_test(test) -@pytest.mark.no_fluidstack + +@pytest.mark.azure def test_azure_start_stop_two_nodes(): name = _get_cluster_name() test = Test( @@ -4299,7 +4316,11 @@ def cli_ls_cmd(store_type, bucket_name, suffix=''): return f'gsutil ls {url}' if store_type == storage_lib.StoreType.AZURE: default_region = 'eastus' - storage_account_name = ( + config_storage_account = skypilot_config.get_nested( + ('azure', 'storage_account'), None) + storage_account_name = config_storage_account if ( + config_storage_account is not None + ) else ( storage_lib.AzureBlobStore.DEFAULT_STORAGE_ACCOUNT_NAME.format( region=default_region, user_hash=common_utils.get_user_hash())) @@ -4897,10 +4918,14 @@ def test_private_bucket(self, private_bucket): private_bucket).path.strip('/') else: private_bucket_name = urllib.parse.urlsplit(private_bucket).netloc - with pytest.raises( - sky.exceptions.StorageBucketGetError, - match=storage_lib._BUCKET_FAIL_TO_CONNECT_MESSAGE.format( - name=private_bucket_name)): + match_str = storage_lib._BUCKET_FAIL_TO_CONNECT_MESSAGE.format( + name=private_bucket_name) + if store_type == 'https': + # Azure blob uses a different error string since container may + # not exist even though the bucket name is ok. + match_str = 'Attempted to fetch a non-existent public container' + with pytest.raises(sky.exceptions.StorageBucketGetError, + match=match_str): storage_obj = storage_lib.Storage(source=private_bucket) @pytest.mark.no_fluidstack @@ -5291,6 +5316,7 @@ def test_multiple_resources(): @pytest.mark.no_fluidstack # Requires other clouds to be enabled @pytest.mark.no_paperspace # Requires other clouds to be enabled @pytest.mark.no_kubernetes +@pytest.mark.aws # SkyBenchmark requires S3 access def test_sky_bench(generic_cloud: str): name = _get_cluster_name() test = Test( diff --git a/tests/test_yaml_parser.py b/tests/test_yaml_parser.py index 1453cfe1620..7d304b60633 100644 --- a/tests/test_yaml_parser.py +++ b/tests/test_yaml_parser.py @@ -146,3 +146,14 @@ def test_invalid_empty_envs(tmp_path): with pytest.raises(ValueError) as e: Task.from_yaml(config_path) assert 'Environment variable \'env_key2\' is None.' 
in e.value.args[0] + + +def test_replace_envs_in_workdir(tmpdir, tmp_path): + config_path = _create_config_file( + textwrap.dedent(f"""\ + envs: + env_key1: {tmpdir} + workdir: $env_key1 + """), tmp_path) + task = Task.from_yaml(config_path) + assert task.workdir == tmpdir diff --git a/tests/test_yamls/test_storage_mounting.yaml.j2 b/tests/test_yamls/test_storage_mounting.yaml.j2 index c61250bae14..4241c63409e 100644 --- a/tests/test_yamls/test_storage_mounting.yaml.j2 +++ b/tests/test_yamls/test_storage_mounting.yaml.j2 @@ -28,11 +28,13 @@ file_mounts: source: ['~/tmp-workdir/tmp file', '~/tmp-workdir/tmp file2'] mode: COPY + {% if include_private_mount | default(True) %} # Mounting private buckets in MOUNT mode /mount_private_mount: name: {{storage_name}} source: ~/tmp-workdir mode: MOUNT + {% endif %} run: | set -ex @@ -49,12 +51,16 @@ run: | ls -ltr /mount_private_copy/tmp\ file ls -ltr /mount_private_copy_lof/tmp\ file ls -ltr /mount_private_copy_lof/tmp\ file2 + {% if include_private_mount | default(True) %} ls -ltr /mount_private_mount/foo ls -ltr /mount_private_mount/tmp\ file + {% endif %} # Symlinks are not copied to buckets ! ls /mount_private_copy/circle-link + {% if include_private_mount | default(True) %} ! ls /mount_private_mount/circle-link # Write to private bucket in MOUNT mode should pass echo "hello" > /mount_private_mount/hello.txt + {% endif %}
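
To make the `FeasibleResources` migration above easier to follow, here is a minimal runnable sketch of the new contract; it is an illustrative appendix, not part of the patch. The three dataclass fields match the class added in `sky/utils/resources_utils.py`; everything else (the toy catalog, the `get_feasible_launchable_resources` function, the instance names) is hypothetical and only shows how a cloud can attach a hint and how the optimizer-side caller reads named fields instead of the old `(resources, fuzzy_candidates)` tuple.

```python
import dataclasses
from typing import List, Optional


@dataclasses.dataclass
class FeasibleResources:
    """Mirrors the dataclass added in sky/utils/resources_utils.py above."""
    resources_list: List[str]        # stands in for List[resources_lib.Resources]
    fuzzy_candidate_list: List[str]  # e.g. ['A100-80GB:1', 'A100-80GB:2']
    hint: Optional[str]              # human-readable reason when nothing is feasible


def get_feasible_launchable_resources(requested_acc: str) -> FeasibleResources:
    # Hypothetical in-memory "catalog"; a real cloud filters its service
    # catalog by accelerator, CPU, memory, zone, etc.
    catalog = {'A100:8': 'a2-highgpu-8g', 'V100:1': 'p3.2xlarge'}
    if requested_acc in catalog:
        return FeasibleResources([catalog[requested_acc]], [], None)
    # Nothing feasible: return fuzzy candidates plus an optional hint instead
    # of a bare ([], []) tuple, so the caller can print the reason.
    gpu_name = requested_acc.split(':')[0]
    fuzzy = [k for k in catalog if k.startswith(gpu_name)]
    return FeasibleResources([], fuzzy,
                             f'{requested_acc} is not offered in this catalog.')


# Caller side, mirroring the optimizer change: use named fields, not a tuple.
feasible = get_feasible_launchable_resources('A100:1')
if feasible.resources_list:
    cheapest = feasible.resources_list[0]
elif feasible.hint is not None:
    print(feasible.hint)                  # 'A100:1 is not offered in this catalog.'
    print(feasible.fuzzy_candidate_list)  # ['A100:8']
```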
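
The `_check_init_containers` helper added to `sky/provision/kubernetes/instance.py` follows a common pattern with the official `kubernetes` Python client; below is a standalone sketch of that pattern. It assumes a reachable cluster with a valid kubeconfig and a pod named `my-pod` in the `default` namespace, and it raises `RuntimeError` instead of `config_lib.KubernetesError`.

```python
from kubernetes import client, config


def check_init_containers(pod) -> None:
    """Raise if any init container terminated badly or is stuck."""
    for status in (pod.status.init_container_statuses or []):
        terminated = status.state.terminated
        if terminated is not None:
            if terminated.exit_code != 0:
                raise RuntimeError(
                    f'Init container {status.name} failed: '
                    f'{terminated.message or terminated.reason}')
            continue  # exited cleanly
        waiting = status.state.waiting
        # 'ContainerCreating' and 'PodInitializing' are normal transient
        # states; anything else (e.g. 'ErrImagePull') is treated as fatal.
        if waiting is not None and waiting.reason not in (
                'ContainerCreating', 'PodInitializing'):
            raise RuntimeError(
                f'Init container {status.name} cannot start: '
                f'{waiting.message or waiting.reason}')


config.load_kube_config()
pod = client.CoreV1Api().read_namespaced_pod(name='my-pod', namespace='default')
check_init_containers(pod)
```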
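
The port-forward proxy script above now accepts an optional kube context and passes it to `kubectl` via `--context`. Here is a small Python sketch of the same command construction, assuming only that `kubectl` is on `PATH`; the `build_port_forward_cmd` helper and the pod/context names are made up for illustration.

```python
from typing import List, Optional


def build_port_forward_cmd(pod_name: str,
                           kube_context: Optional[str] = None) -> List[str]:
    """Build `kubectl port-forward` args, adding --context only when given."""
    cmd = ['kubectl']
    if kube_context:
        cmd += ['--context', kube_context]
    # ':22' lets kubectl pick a free local port and forward it to pod port 22,
    # matching the behaviour of the shell template above.
    cmd += ['port-forward', f'pod/{pod_name}', ':22']
    return cmd


print(build_port_forward_cmd('skypilot-head'))
# ['kubectl', 'port-forward', 'pod/skypilot-head', ':22']
print(build_port_forward_cmd('skypilot-head', 'my-cluster'))
# ['kubectl', '--context', 'my-cluster', 'port-forward', 'pod/skypilot-head', ':22']
```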
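
`Task.from_yaml` now applies the same env substitution to `workdir` that it already applies to `run` and `service`, which is what `test_replace_envs_in_workdir` exercises. The internal `_fill_in_env_vars` helper is not shown in this diff, so the sketch below is a simplified stand-in: the `fill_in_env_vars` function and its regex are assumptions, not SkyPilot's actual implementation.

```python
import re
from typing import Dict


def fill_in_env_vars(text: str, envs: Dict[str, str]) -> str:
    """Replace $KEY / ${KEY} references with values from the task's envs.

    Simplified stand-in for SkyPilot's internal helper; the real one may
    differ in edge cases (unset keys, escaping, nested structures).
    """
    def sub(match: 're.Match') -> str:
        key = match.group(1) or match.group(2)
        return envs.get(key, match.group(0))  # leave unknown keys untouched
    return re.sub(r'\$\{(\w+)\}|\$(\w+)', sub, text)


envs = {'env_key1': '/tmp/my-workdir'}
print(fill_in_env_vars('$env_key1', envs))        # -> /tmp/my-workdir
print(fill_in_env_vars('${env_key1}/src', envs))  # -> /tmp/my-workdir/src
```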
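
Finally, the storage-mounting smoke-test template now guards the private MOUNT-mode section behind `include_private_mount | default(True)`, so the block stays in place unless the test explicitly disables it (as it does for Azure on ubuntu 18.04 images). A toy `jinja2` example of that guard, using a made-up template rather than the real `test_storage_mounting.yaml.j2`:

```python
import jinja2

# Toy template using the same `| default(True)` guard as the smoke-test YAML.
template = jinja2.Template("""\
file_mounts:
  /mount_private_copy: {{ storage_name }}
{% if include_private_mount | default(True) %}
  /mount_private_mount: {{ storage_name }}
{% endif %}
""")

# Omitting the flag keeps the private mount block (default True) ...
print(template.render(storage_name='my-bucket'))
# ... while passing False drops it.
print(template.render(storage_name='my-bucket', include_private_mount=False))
```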