From 4c4493563bebe1a7d02b216ea594fb2105c7a751 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Fri, 31 May 2024 00:02:42 +0000
Subject: [PATCH] address comments

---
 llm/gpt-2/README.md      | 50 +++++++++++++++++-----
 llm/gpt-2/gpt2-data.yaml |  2 +-
 llm/gpt-2/gpt2.yaml      | 89 ++++++++++++++++++++++++++++++++++++++++
 sky/utils/schemas.py     |  6 ++-
 4 files changed, 134 insertions(+), 13 deletions(-)
 create mode 100644 llm/gpt-2/gpt2.yaml

diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md
index 404d3bdcfcc..9cdfd76f462 100644
--- a/llm/gpt-2/README.md
+++ b/llm/gpt-2/README.md
@@ -7,7 +7,7 @@ With SkyPilot, you can run GPT-2 (124M) training on any cloud.
 1. Install [SkyPilot](https://github.com/skypilot-org/skypilot):
 ```bash
-pip install skypilot-nightly
+pip install "skypilot-nightly[aws,gcp,azure,kubernetes,lambda,fluidstack]"  # Choose the clouds you want to enable
 ```
 2. Enable clouds for SkyPilot:
 ```bash
@@ -15,21 +15,49 @@
 sky check
 ```
 Please check the instructions for enabling clouds at [SkyPilot doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
 
-3. Download the YAMLs in this directory for data processing and training:
+3. Download the YAML for starting the training:
 ```bash
-wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-data.yaml
-wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-train.yaml
+wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/llm/gpt-2/gpt2.yaml
 ```
 
-## Data processing
+## Run GPT-2 training
+
+Run the following command to start GPT-2 (124M) training on a GPU VM with 8 A100 GPUs:
+
+```bash
+sky launch -c gpt2 gpt2.yaml
+```
+
+Or, you can train the model with a single A100 by adding `--gpus A100`:
+```bash
+sky launch -c gpt2 gpt2.yaml --gpus A100
+```
+
+## Advanced: Run GPT-2 training in two stages
+
+The data processing for GPT-2 training is CPU-bound, while the training is GPU-bound, so running the data processing on a GPU VM is not cost-effective. With SkyPilot, you can easily separate the two into a data-processing stage and a training stage, and either execute them sequentially yourself or let SkyPilot manage the dependency between them.
+
+This way, the data processing runs on cheaper CPU VMs (e.g., ~\$1.5/hour), while the training runs on more expensive GPU VMs (e.g., ~\$1.3-\$3.6/hour for a single A100 GPU, or \$10.3-\$32.8/hour for 8 A100 GPUs).
+
+We can run the data processing on a CPU VM and store the processed data in a cloud bucket. Then, we can run the training on a GPU VM with the processed data.
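+
+Both stages exchange data through a cloud bucket that SkyPilot mounts into the VMs via `file_mounts`. A minimal sketch of the relevant snippet is below (see `gpt2-data.yaml` for the exact configuration; the `name` and `mode` values here are illustrative):
+
+```yaml
+envs:
+  BUCKET_NAME: # Fill in your bucket name
+
+file_mounts:
+  /cache:
+    # Processed tokens are written to /cache and synced to the bucket.
+    name: $BUCKET_NAME
+    mode: MOUNT
+```
+
+Download the YAMLs for the two stages: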
+
+```bash
+wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/llm/gpt-2/gpt2-data.yaml
+wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/llm/gpt-2/gpt2-train.yaml
+```
+
+### Run two stages manually
+
+#### Data processing
+
 Run the following command to process the training data on a CPU VM and store it in a cloud bucket for future use (replace `your-bucket-name` with your bucket name):
+
 ```bash
 sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name
 ```
 
-## Training
+#### Training
 
 After the data is processed, you can then train the model on a GPU VM with 8 A100 GPUs (replace `your-bucket-name` with your bucket name):
@@ -43,14 +71,14 @@ sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpus A100 --env BUCKET_NAME=your-bucket-name
 ```
 
-## Run in a Pipeline
+### Run in a Pipeline
 
 We can also combine the two steps into a single SkyPilot job and let SkyPilot handle the dependencies between the two steps. Here is an example of how to do this (replace `your-bucket-name` with your bucket name):
 
 ```bash
-cat gpt2-data.yaml > gpt2.yaml
-echo "---" >> gpt2.yaml
-cat gpt2-train.yaml >> gpt2.yaml
-sky jobs launch -n gpt2 gpt2.yaml --env BUCKET_NAME=your-bucket-name
+cat gpt2-data.yaml > gpt2-pipeline.yaml
+echo "---" >> gpt2-pipeline.yaml
+cat gpt2-train.yaml >> gpt2-pipeline.yaml
+sky jobs launch -n gpt2 gpt2-pipeline.yaml --env BUCKET_NAME=your-bucket-name
 ```
 
 SkyPilot will first download and process the dataset on a CPU VM and store the
diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml
index 42daee00584..3bd082d02e2 100644
--- a/llm/gpt-2/gpt2-data.yaml
+++ b/llm/gpt-2/gpt2-data.yaml
@@ -4,7 +4,7 @@ envs:
   BUCKET_NAME: # Fill in your bucket name
 
 resources:
-  cpus: 64+
+  cpus: 32+
 
 file_mounts:
   /cache:
diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml
new file mode 100644
index 00000000000..0698073e3a0
--- /dev/null
+++ b/llm/gpt-2/gpt2.yaml
@@ -0,0 +1,89 @@
+name: train
+
+resources:
+  accelerators: A100:8
+  # Use a Docker image to get a recent g++ for compiling llm.c.
+  image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+  any_of:
+    # Docker is not supported on Lambda yet, so unset the Docker image
+    # there; the base VM image works.
+    - cloud: lambda
+      image_id: null
+    - cloud: aws
+    - cloud: gcp
+    - cloud: azure
+    - cloud: fluidstack
+    - cloud: kubernetes
+
+
+setup: |
+  cd ~
+  pip install tqdm tiktoken requests datasets
+
+  # Training dependencies.
+  # Install cuDNN so we can use FlashAttention and run fast (optional):
+  # https://developer.nvidia.com/cudnn-downloads
+  # This assumes CUDA 12 (check with `nvcc --version`) on Linux x86_64, Ubuntu 22.04.
+  if [ -f ./CUDNN_INSTALLED ]; then
+    echo "cudnn already installed"
+  else
+    system=$(lsb_release -si | tr '[:upper:]' '[:lower:]')
+    # Get the OS version and remove the dot.
+    version=$(lsb_release -sr | tr -d .)
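+    # For example, on Ubuntu 22.04 this gives system="ubuntu" and
+    # version="2204", so system_version below becomes "ubuntu2204",
+    # matching the naming of NVIDIA's cuDNN local-repo installers.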
+    export system_version="${system}${version}"
+    wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb
+    sudo dpkg -i cudnn-installer.deb
+    sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/
+    # Remove problematic kubernetes.list source.
+    sudo apt-get update --allow-releaseinfo-change || true
+
+    sudo apt-get -y install cudnn-cuda-12
+
+    touch ./CUDNN_INSTALLED
+  fi
+
+  # "Install" cudnn-frontend to ~/.
+  sudo apt -y install git
+  git clone https://github.com/NVIDIA/cudnn-frontend.git || true
+
+  # Install MPI (optional, needed if you intend to use multiple GPUs).
+  # SkyPilot does not install MPI by default, as it requires NCCL, which
+  # needs to be installed manually.
+  sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev
+  # Install NCCL.
+  pip install nvidia-nccl-cu12
+  export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib
+  export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include
+
+  git clone https://github.com/karpathy/llm.c.git || true
+  cd llm.c
+  # Compile llm.c (mixed precision, with cuDNN flash-attention).
+  # The first compilation takes ~1 minute, mostly due to cuDNN.
+  make train_gpt2cu USE_CUDNN=1
+
+
+run: |
+  cd ~/llm.c
+  # Process the data:
+  # tokenize the FineWeb dataset 10B-token sample (takes ~1 hour, get lunch?);
+  # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B
+  # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb.
+  python dev/data/fineweb.py --version 10B
+
+  # Start training on multiple GPUs. The hyperparameters below follow the
+  # GPT-2 (124M) reproduction from the llm.c repository.
+  mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \
+    -i "dev/data/fineweb10B/fineweb_train_*.bin" \
+    -j "dev/data/fineweb10B/fineweb_val_*.bin" \
+    -o log124M \
+    -e "d12" \
+    -b 64 -t 1024 \
+    -d 524288 \
+    -r 1 \
+    -z 1 \
+    -c 0.1 \
+    -l 0.0006 \
+    -q 0.0 \
+    -u 700 \
+    -n 5000 \
+    -v 250 -s 20000 \
+    -h 1
diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py
index 8bbe1d54e60..02c8452ef2a 100644
--- a/sky/utils/schemas.py
+++ b/sky/utils/schemas.py
@@ -141,7 +141,9 @@ def _get_single_resources_schema():
         }, {
             'type': 'object',
             'required': [],
-        }]
+        }, {
+            'type': 'null',
+        }]
     },
     # The following fields are for internal use only.
     '_docker_login_config': {