diff --git a/README.md b/README.md index b803a998924..3b7608f4764 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ ---- :fire: *News* :fire: +- [Jul, 2024] Finetune **Llama 3.1** on your infra: [**example**](./llm/llama-3_1-finetuning/). - [Jun, 2024] Reproduce **GPT** with [llm.c](https://github.com/karpathy/llm.c/discussions/481) on any cloud: [**guide**](./llm/gpt-2/) - [Apr, 2024] Serve and finetune [**Llama 3**](https://skypilot.readthedocs.io/en/latest/gallery/llms/llama-3.html) on any cloud or Kubernetes: [**example**](./llm/llama-3/) - [Apr, 2024] Serve [**Qwen-110B**](https://qwenlm.github.io/blog/qwen1.5-110b/) on your infra: [**example**](./llm/qwen/) diff --git a/llm/llama-3_1-finetuning/configs/70B-lora.yaml b/llm/llama-3_1-finetuning/configs/70B-lora.yaml new file mode 100644 index 00000000000..612048536a3 --- /dev/null +++ b/llm/llama-3_1-finetuning/configs/70B-lora.yaml @@ -0,0 +1,99 @@ +# Config for multi-device LoRA in lora_finetune_distributed.py +# using a Llama3.1 70B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3.1-70B-Instruct --output-dir /tmp/Meta-Llama-3.1-70B-Instruct --ignore-patterns "original/consolidated*" +# +# This config needs 8 GPUs to run +# tune run --nproc_per_node 8 lora_finetune_distributed --config llama3_1/70B_lora + +# Model Arguments +model: + _component_: torchtune.models.llama3_1.lora_llama3_1_70b + lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 16 + lora_alpha: 32 + +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3.1-70B-Instruct/original/tokenizer.model + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3.1-70B-Instruct/ + checkpoint_files: [ + model-00001-of-00030.safetensors, + model-00002-of-00030.safetensors, + model-00003-of-00030.safetensors, + model-00004-of-00030.safetensors, + model-00005-of-00030.safetensors, + model-00006-of-00030.safetensors, + model-00007-of-00030.safetensors, + model-00008-of-00030.safetensors, + model-00009-of-00030.safetensors, + model-00010-of-00030.safetensors, + model-00011-of-00030.safetensors, + model-00012-of-00030.safetensors, + model-00013-of-00030.safetensors, + model-00014-of-00030.safetensors, + model-00015-of-00030.safetensors, + model-00016-of-00030.safetensors, + model-00017-of-00030.safetensors, + model-00018-of-00030.safetensors, + model-00019-of-00030.safetensors, + model-00020-of-00030.safetensors, + model-00021-of-00030.safetensors, + model-00022-of-00030.safetensors, + model-00023-of-00030.safetensors, + model-00024-of-00030.safetensors, + model-00025-of-00030.safetensors, + model-00026-of-00030.safetensors, + model-00027-of-00030.safetensors, + model-00028-of-00030.safetensors, + model-00029-of-00030.safetensors, + model-00030-of-00030.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3.1-70B-Instruct/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_dataset +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null 
+gradient_accumulation_steps: 1 + +# Logging +output_dir: /tmp/lora_finetune_output +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True diff --git a/llm/llama-3_1-finetuning/configs/8B-lora.yaml b/llm/llama-3_1-finetuning/configs/8B-lora.yaml new file mode 100644 index 00000000000..d3e3be5af8e --- /dev/null +++ b/llm/llama-3_1-finetuning/configs/8B-lora.yaml @@ -0,0 +1,83 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Llama3.1 8B Instruct model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth" +# +# To launch on 2 devices, run the following command from root: +# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_1/8B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_1/8B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# For single device LoRA finetuning please use 8B_lora_single_device.yaml +# or 8B_qlora_single_device.yaml + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model + +# Model Arguments +model: + _component_: torchtune.models.llama3_1.lora_llama3_1_8b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3.1-8B-Instruct/ + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3.1-8B-Instruct/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 + +# Logging +output_dir: /tmp/lora_finetune_output +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False diff --git a/llm/llama-3_1-finetuning/lora.yaml b/llm/llama-3_1-finetuning/lora.yaml new file mode 100644 index 00000000000..35b0fc6faad --- /dev/null +++ b/llm/llama-3_1-finetuning/lora.yaml @@ -0,0 +1,58 @@ +# LoRA finetuning Meta Llama-3.1 on any of your own infra. 
+#
+# Usage:
+#
+#  HF_TOKEN=xxx sky launch lora.yaml -c llama31 --env HF_TOKEN
+#
+# To finetune a 70B model:
+#
+#  HF_TOKEN=xxx sky launch lora.yaml -c llama31-70 --env HF_TOKEN --env MODEL_SIZE=70B
+
+envs:
+  MODEL_SIZE: 8B
+  HF_TOKEN:
+  DATASET: "yahma/alpaca-cleaned"
+  # Change this to your own checkpoint bucket
+  CHECKPOINT_BUCKET_NAME: sky-llama-31-checkpoints
+
+
+resources:
+  accelerators: A100:8
+  disk_tier: best
+  use_spot: true
+
+file_mounts:
+  /configs: ./configs
+  /output:
+    name: $CHECKPOINT_BUCKET_NAME
+    mode: MOUNT
+    # Optionally, specify the store to enforce to use one of the stores below:
+    #   r2/azure/gcs/s3/cos
+    # store: r2
+
+setup: |
+  pip install torch torchvision
+
+  # Install torchtune from source for the latest Llama-3.1 model
+  pip install git+https://github.com/pytorch/torchtune.git@58255001bd0b1e3a81a6302201024e472af05379
+  # pip install torchtune
+
+  tune download meta-llama/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \
+    --hf-token $HF_TOKEN \
+    --output-dir /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \
+    --ignore-patterns "original/consolidated*"
+
+run: |
+  tune run --nproc_per_node $SKYPILOT_NUM_GPUS_PER_NODE \
+    lora_finetune_distributed \
+    --config /configs/${MODEL_SIZE}-lora.yaml \
+    dataset.source=$DATASET
+
+  # Remove the checkpoint files to save space; LoRA serving only needs the
+  # adapter files.
+  rm /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/*.pt
+  rm /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/*.safetensors
+
+  mkdir -p /output/$MODEL_SIZE-lora
+  rsync -Pavz /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct /output/$MODEL_SIZE-lora
+  cp -r /tmp/lora_finetune_output /output/$MODEL_SIZE-lora/
diff --git a/llm/llama-3_1-finetuning/readme.md b/llm/llama-3_1-finetuning/readme.md
new file mode 100644
index 00000000000..836f3bf1b3b
--- /dev/null
+++ b/llm/llama-3_1-finetuning/readme.md
@@ -0,0 +1,267 @@
+# Finetune Llama 3.1 on your infra
+
+
+ +
On July 23, 2024, Meta released the [Llama 3.1 model family](https://ai.meta.com/blog/meta-llama-3-1/), including a 405B-parameter model in both base and instruction-tuned forms. Llama 3.1 405B became _the first open LLM that closely rivals top proprietary models_ like GPT-4o and Claude 3.5 Sonnet.

This guide shows how to use [SkyPilot](https://github.com/skypilot-org/skypilot) and [torchtune](https://pytorch.org/torchtune/stable/index.html) to **finetune Llama 3.1 on your own data and infra**. Everything is packaged in a simple [SkyPilot YAML](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html) that can be launched with one command on your infra:
- Local GPU workstation
- Kubernetes cluster
- Cloud accounts ([12 clouds supported](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html))
+
+ +
+ + + +## Let's finetune Llama 3.1 +We will use [torchtune](https://pytorch.org/torchtune/stable/index.html) to finetune Llama 3.1. The example below uses the [`yahma/alpaca-cleaned`](https://huggingface.co/datasets/yahma/alpaca-cleaned) dataset, which you can replace with your own dataset later. + +To set up the environment for launching the finetuning job, finish the [Appendix: Preparation](#appendix-preparation) section first. + +The finetuning job is packaged in a SkyPilot YAML. It can be launched on any of your own infra, such as Kubernetes or any cloud, with the same interface: + +
**SkyPilot YAML for finetuning Llama 3.1: `lora.yaml`**

```yaml
# LoRA finetuning Meta Llama 3.1 on any of your own infra.
#
# Usage:
#
#  HF_TOKEN=xxx sky launch lora.yaml -c llama31 --env HF_TOKEN
#
# To finetune a 70B model:
#
#  HF_TOKEN=xxx sky launch lora.yaml -c llama31-70 --env HF_TOKEN --env MODEL_SIZE=70B

envs:
  MODEL_SIZE: 8B
  HF_TOKEN:
  DATASET: "yahma/alpaca-cleaned"
  # Change this to your own checkpoint bucket
  CHECKPOINT_BUCKET_NAME: sky-llama-31-checkpoints


resources:
  accelerators: A100:8
  disk_tier: best
  use_spot: true

file_mounts:
  /configs: ./configs
  /output:
    name: $CHECKPOINT_BUCKET_NAME
    mode: MOUNT
    # Optionally, specify the store to enforce to use one of the stores below:
    #   r2/azure/gcs/s3/cos
    # store: r2

setup: |
  pip install torch torchvision

  # Install torchtune from source for the latest Llama 3.1 model
  pip install git+https://github.com/pytorch/torchtune.git@58255001bd0b1e3a81a6302201024e472af05379
  # pip install torchtune

  tune download meta-llama/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \
    --hf-token $HF_TOKEN \
    --output-dir /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \
    --ignore-patterns "original/consolidated*"

run: |
  tune run --nproc_per_node $SKYPILOT_NUM_GPUS_PER_NODE \
    lora_finetune_distributed \
    --config /configs/${MODEL_SIZE}-lora.yaml \
    dataset.source=$DATASET

  # Remove the checkpoint files to save space; LoRA serving only needs the
  # adapter files.
  rm /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/*.pt
  rm /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/*.safetensors

  mkdir -p /output/$MODEL_SIZE-lora
  rsync -Pavz /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct /output/$MODEL_SIZE-lora
  cp -r /tmp/lora_finetune_output /output/$MODEL_SIZE-lora/
```
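Note that the YAML sets `use_spot: true` and persists checkpoints in the bucket mounted at `/output`. If your SkyPilot version includes managed jobs, you can optionally launch the same YAML with automatic recovery from spot preemptions; a minimal sketch (same flags as the launch commands below):

```bash
# Optional: run the finetuning as a managed job so SkyPilot re-provisions
# spot instances after a preemption. Note that the torchtune configs set
# resume_from_checkpoint: False, so a recovered job restarts training from
# the beginning.
HF_TOKEN=xxx sky jobs launch lora.yaml \
  --env HF_TOKEN --env MODEL_SIZE=8B \
  --env CHECKPOINT_BUCKET_NAME="your-own-bucket-name"
```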
Run the following on your local machine:

```bash
# Download the files for Llama 3.1 finetuning
git clone https://github.com/skypilot-org/skypilot
cd skypilot/llm/llama-3_1-finetuning

export HF_TOKEN=xxxx

# It takes about 40 minutes on 8 A100 GPUs to finetune an 8B
# Llama 3.1 model with LoRA on the Alpaca dataset.
sky launch -c llama31 lora.yaml \
  --env HF_TOKEN --env MODEL_SIZE=8B \
  --env CHECKPOINT_BUCKET_NAME="your-own-bucket-name"
```

To finetune the larger 70B model, simply change the environment variables:
```bash
sky launch -c llama31-70 lora.yaml \
  --env HF_TOKEN --env MODEL_SIZE=70B \
  --env CHECKPOINT_BUCKET_NAME="your-own-bucket-name"
```

**Finetuning Llama 3.1 405B**: Work in progress! If you want to follow the work, join the [SkyPilot community Slack](https://slack.skypilot.co/) for discussions.

## Use your custom data
The example above finetunes Llama 3.1 on the Alpaca dataset ([`yahma/alpaca-cleaned`](https://huggingface.co/datasets/yahma/alpaca-cleaned)), but for real use cases you will likely want to finetune it on your own dataset.

You can do so by specifying the Hugging Face path to your own dataset as follows (we use [`gbharti/finance-alpaca`](https://huggingface.co/datasets/gbharti/finance-alpaca) as an example). Note that the configs use torchtune's Alpaca dataset component, so your dataset should follow the Alpaca schema (`instruction`/`input`/`output` columns):
```bash
# It takes about 1 hour on 8 A100 GPUs to finetune an 8B
# Llama 3.1 model with LoRA on the finance dataset.
sky launch -c llama31 lora.yaml \
  --env HF_TOKEN --env MODEL_SIZE=8B \
  --env CHECKPOINT_BUCKET_NAME="your-own-bucket-name" \
  --env DATASET="gbharti/finance-alpaca"
```
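While the job is running, you can stream its logs from your local machine and, after it finishes, inspect what was written to the checkpoint bucket; a minimal sketch (the cluster name matches the `-c llama31` flag above):

```bash
# Tail the training logs of the finetuning job.
sky logs llama31

# SkyPilot sets up an SSH alias for the cluster; list the adapter files and
# training outputs synced to the bucket, which is mounted at /output.
ssh llama31 ls -R /output/8B-lora
```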
+
+ + + +
*Figure: Training loss of LoRA finetuning Llama 3.1.*
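Because the checkpoint bucket lives in your own cloud account, you can also pull the adapter files down to your machine with your object store's own CLI; a hedged sketch, assuming an S3-backed bucket and the 8B output path (use the gsutil/az/R2 equivalent for other stores):

```bash
# Download the LoRA adapter and training outputs from the checkpoint bucket.
aws s3 sync s3://your-own-bucket-name/8B-lora ./8B-lora
```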
+
## Serve the finetuned model

With Llama 3.1 finetuned on your dataset, you can now serve the model with a single command:

> Note: `CHECKPOINT_BUCKET_NAME` should be the bucket you used for storing checkpoints in the previous finetuning step.

```bash
sky launch -c serve-llama31 serve.yaml \
  --env HF_TOKEN \
  --env LORA_NAME="my-finance-lora" \
  --env CHECKPOINT_BUCKET_NAME="your-own-bucket-name"
```

You can interact with the model in a terminal:
```console
ENDPOINT=$(sky status --endpoint 8081 serve-llama31)
curl http://$ENDPOINT/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
      "model": "my-finance-lora",
      "messages": [
        {
          "role": "system",
          "content": "You are a helpful assistant."
        },
        {
          "role": "user",
          "content": "For a car, what scams can be plotted with 0% financing vs rebate?"
        }
      ]
    }' | jq .
```

:tada: **Congratulations!** You now have a finetuned Llama 3.1 8B model that is well versed in finance topics. To recap, all model checkpoints and replicas **stay in your own private infrastructure**.
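To double-check that the LoRA adapter was registered by the vLLM server, you can also list the served models; the adapter should appear under the name passed via `--lora-modules` in `serve.yaml`. A minimal sketch:

```console
ENDPOINT=$(sky status --endpoint 8081 serve-llama31)
# Expect both the base model and "my-finance-lora" in the response.
curl http://$ENDPOINT/v1/models | jq .
```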
**SkyPilot YAML for serving the finetuned model: `serve.yaml`**

```yaml
# Serve a LoRA finetuned Meta Llama 3.1.
#
# Usage:
#
#  HF_TOKEN=xxx sky launch serve.yaml -c llama31-serve --env HF_TOKEN

envs:
  MODEL_SIZE: 8B
  HF_TOKEN:
  # Change this to your checkpoint bucket created in lora.yaml
  CHECKPOINT_BUCKET_NAME: your-checkpoint-bucket
  LORA_NAME: my-finance-lora

resources:
  accelerators: L4
  ports: 8081
  cpus: 32+

file_mounts:
  /checkpoints:
    name: $CHECKPOINT_BUCKET_NAME
    mode: MOUNT

setup: |
  pip install vllm==0.5.3post1
  pip install vllm-flash-attn==2.5.9.post1
  pip install openai

run: |
  vllm serve meta-llama/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \
    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE --enable-lora \
    --lora-modules $LORA_NAME=/checkpoints/${MODEL_SIZE}-lora/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/ \
    --max-model-len=2048 --port 8081
```
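When you are done, tear down the clusters so you stop paying for idle resources; a minimal sketch (cluster names match the `-c` flags used in this guide):

```bash
# Terminate the finetuning and serving clusters.
sky down llama31 serve-llama31

# Optionally, delete the checkpoint bucket after copying out the adapter.
# This permanently removes the finetuned weights stored in it.
sky storage delete sky-llama-31-checkpoints
```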
## Appendix: Preparation
1. Request access to the [Llama 3.1 weights on Hugging Face](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) (click the blue box and follow the steps):
![](https://imgur.com/snIQhr9.png)

2. Get your [Hugging Face access token](https://huggingface.co/settings/tokens):
![](https://imgur.com/3idBgHn.png)


3. Add the Hugging Face token to your environment variables:
```bash
export HF_TOKEN="xxxx"
```

4. Install SkyPilot for launching the finetuning:
```bash
pip install "skypilot-nightly[aws,gcp,kubernetes]"
# or other clouds (12 clouds + Kubernetes supported) you have set up
# See: https://skypilot.readthedocs.io/en/latest/getting-started/installation.html
```

5. Check your infra setup:
```console
sky check

🎉 Enabled clouds 🎉
    ✔ AWS
    ✔ GCP
    ✔ Azure
    ✔ OCI
    ✔ Lambda
    ✔ RunPod
    ✔ Paperspace
    ✔ Fluidstack
    ✔ Cudo
    ✔ IBM
    ✔ SCP
    ✔ vSphere
    ✔ Cloudflare (for R2 object store)
    ✔ Kubernetes
```


## What's next

* [AI on Kubernetes Without the Pain](https://blog.skypilot.co/ai-on-kubernetes/)
* [SkyPilot AI Gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html)
* [SkyPilot Docs](https://skypilot.readthedocs.io/en/latest/docs/index.html)
* [SkyPilot GitHub](https://github.com/skypilot-org/skypilot)
diff --git a/llm/llama-3_1-finetuning/serve.yaml b/llm/llama-3_1-finetuning/serve.yaml
new file mode 100644
index 00000000000..c1df6b6b8c7
--- /dev/null
+++ b/llm/llama-3_1-finetuning/serve.yaml
@@ -0,0 +1,33 @@
+# Serve a LoRA finetuned Meta Llama-3.1.
+#
+# Usage:
+#
+#  HF_TOKEN=xxx sky launch serve.yaml -c llama31-serve --env HF_TOKEN
+
+envs:
+  MODEL_SIZE: 8B
+  HF_TOKEN:
+  # Change this to your checkpoint bucket created in lora.yaml
+  CHECKPOINT_BUCKET_NAME: your-checkpoint-bucket
+  LORA_NAME: my-finance-lora
+
+resources:
+  accelerators: L4
+  ports: 8081
+  cpus: 32+
+
+file_mounts:
+  /checkpoints:
+    name: $CHECKPOINT_BUCKET_NAME
+    mode: MOUNT
+
+setup: |
+  pip install vllm==0.5.3post1
+  pip install vllm-flash-attn==2.5.9.post1
+  pip install openai
+
+run: |
+  vllm serve meta-llama/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \
+    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE --enable-lora \
+    --lora-modules $LORA_NAME=/checkpoints/${MODEL_SIZE}-lora/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/ \
+    --max-model-len=2048 --port 8081