From 374f0f29a13b15033640c4955a81f49524d630bc Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 28 May 2024 23:00:38 +0000 Subject: [PATCH 01/29] add gpt-2 example --- llm/gpt-2/README.md | 12 ++++++ llm/gpt-2/gpt2.yaml | 91 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 llm/gpt-2/README.md create mode 100644 llm/gpt-2/gpt2.yaml diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md new file mode 100644 index 00000000000..33b5453569e --- /dev/null +++ b/llm/gpt-2/README.md @@ -0,0 +1,12 @@ +# GPT-2 (124M) in llm.c in 90 minutes + +https://github.com/karpathy/llm.c/discussions/481 + +```bash +sky jobs launch -n gpt2 gpt2.yaml +``` + +SkyPilot will first download and process the dataset on a CPU VM and store the +processed data in a GCS bucket. Then, it will launch a GPT-2 training job on a +GPU VM. The training job will train GPT-2 (124M) on the processed data. + diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml new file mode 100644 index 00000000000..75cc753a609 --- /dev/null +++ b/llm/gpt-2/gpt2.yaml @@ -0,0 +1,91 @@ +# name: gpt2-data + +# resources: +# cpus: 64+ + +# file_mounts: +# /cache: +# name: gpt2-data-skypilot +# store: gcs +# mode: MOUNT + +# setup: | +# pip install tqdm tiktoken requests datasets +# # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) +# # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B +# # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb +# git clone https://github.com/karpathy/llm.c.git || true + +# run: | +# cd llm.c +# python dev/data/fineweb.py --version 10B + +# rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ +# rsync -Pavz dev/data/fineweb10B /cache/ + +# --- + +name: train + + +resources: + accelerators: A100:8 + use_spot: true + +file_mounts: + ~/.cache/huggingface: gs://gpt2-data-skypilot + +setup: | + cd ~ + pip install tqdm tiktoken requests datasets + + # install cudnn so we can use FlashAttention and run fast (optional) + # https://developer.nvidia.com/cudnn-downloads + # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 + if [ -f ./CUDNN_INSTALLED ]; then + echo "cudnn already installed" + else + system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') + version=$(lsb_release -sr) + system_version="${system}${version}" + wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb + sudo dpkg -i cudnn-installer.deb + sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ + sudo apt-get update + sudo apt-get -y install cudnn-cuda-12 + touch ./CUDNN_INSTALLED + fi + + # "install" cudnn-frontend to ~/ + git clone https://github.com/NVIDIA/cudnn-frontend.git || true + + # install MPI (optional, if you intend to use multiple GPUs) + sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + + git clone https://github.com/karpathy/llm.c.git || true + cd llm.c + mv ~/.cache/huggingface/fineweb10B dev/data/ + # compile llm.c (mixed precision, with cuDNN flash-attention) + # first compilation is ~1 minute, mostly due to cuDNN + make train_gpt2cu USE_CUDNN=1 + + +run: | + cd llm.c + # train on multiple GPUs + mpirun -np 8 ./train_gpt2cu \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -o log124M \ + -e "d12" \ + -b 64 -t 1024 \ + -d 524288 \ + -r 1 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + 
-n 5000 \ + -v 250 -s 20000 \ + -h 1 From 79323f7d729661acab474234b18865c9dc23190f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 01:59:30 +0000 Subject: [PATCH 02/29] Use ubuntu for GCP --- llm/gpt-2/gpt2.yaml | 35 ++++++++++++++++++++++++++++------- sky/clouds/gcp.py | 6 +++--- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml index 75cc753a609..d2798cf19cf 100644 --- a/llm/gpt-2/gpt2.yaml +++ b/llm/gpt-2/gpt2.yaml @@ -29,13 +29,24 @@ name: train resources: - accelerators: A100:8 + accelerators: A100:1 use_spot: true -file_mounts: - ~/.cache/huggingface: gs://gpt2-data-skypilot +# file_mounts: +# ~/.cache/huggingface: gs://gpt2-data-skypilot setup: | + export PATH="$PATH:$HOME/.local/bin" + # Create a gpt2 conda version with the latest gxx + # conda activate gpt2 + # if [ $? -eq 0 ]; then + # echo "gpt2 environment already exists" + # else + # conda create -n gpt2 gxx=12 -y + # conda activate gpt2 + # fi + + cd ~ pip install tqdm tiktoken requests datasets @@ -46,13 +57,17 @@ setup: | echo "cudnn already installed" else system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') - version=$(lsb_release -sr) - system_version="${system}${version}" + # Get version and remove the dot + version=$(lsb_release -sr | tr -d .) + export system_version="${system}${version}" wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb sudo dpkg -i cudnn-installer.deb sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ - sudo apt-get update + # Remove problematic kubernetes.list source + sudo apt-get update --allow-releaseinfo-change + sudo apt-get -y install cudnn-cuda-12 + touch ./CUDNN_INSTALLED fi @@ -60,7 +75,13 @@ setup: | git clone https://github.com/NVIDIA/cudnn-frontend.git || true # install MPI (optional, if you intend to use multiple GPUs) + # SkyPilot do not install MPI as that requires NCCL which needs to be manually + # installed. sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + # install nccl + pip install nvidia-nccl-cu12 + export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include git clone https://github.com/karpathy/llm.c.git || true cd llm.c @@ -71,7 +92,7 @@ setup: | run: | - cd llm.c + cd ~/llm.c # train on multiple GPUs mpirun -np 8 ./train_gpt2cu \ -i "dev/data/fineweb10B/fineweb_train_*.bin" \ diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 93260533f27..215fa043a08 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -404,7 +404,7 @@ def make_deploy_resources_variables( # --no-standard-images # We use the debian image, as the ubuntu image has some connectivity # issue when first booted. - image_id = 'skypilot:cpu-debian-11' + image_id = 'skypilot:cpu-ubuntu-2204' r = resources # Find GPU spec, if any. 
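The two hunks in this file switch GCP's default images from Debian 11 to Ubuntu 22.04; later commits in this series give the motivation (llm.c needs a newer C++ toolchain and CUDA stack than the Debian 11 image ships). A quick way to sanity-check whether a given image is new enough to build llm.c might look like the shell sketch below; it is illustrative only and not part of this patch.

```bash
# Illustrative sanity check before compiling llm.c on a VM image
# (not part of this patch series).
lsb_release -ds               # e.g. "Ubuntu 22.04 LTS"
gcc --version | head -n 1     # llm.c's cuDNN build needs a reasonably recent g++
nvcc --version | tail -n 1    # CUDA toolkit version; the example assumes CUDA 12
nvidia-smi --query-gpu=driver_version --format=csv,noheader   # driver present?
```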
@@ -455,8 +455,8 @@ def make_deploy_resources_variables( # CUDA driver version 470.57.02, CUDA Library 11.4 image_id = 'skypilot:k80-debian-10' else: - # CUDA driver version 535.86.10, CUDA Library 12.2 - image_id = 'skypilot:gpu-debian-11' + # CUDA driver version 550.54.15, CUDA Library 12.4 + image_id = 'skypilot:gpu-ubuntu-2204' if (resources.image_id is not None and resources.extract_docker_image() is None): From 03623ee3c13e3cda8b34f4070c1540d571b1c7a6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 02:22:12 +0000 Subject: [PATCH 03/29] fix ncl --- llm/gpt-2/gpt2.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml index d2798cf19cf..a1ba2954ea9 100644 --- a/llm/gpt-2/gpt2.yaml +++ b/llm/gpt-2/gpt2.yaml @@ -64,7 +64,7 @@ setup: | sudo dpkg -i cudnn-installer.deb sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ # Remove problematic kubernetes.list source - sudo apt-get update --allow-releaseinfo-change + sudo apt-get update --allow-releaseinfo-change || true sudo apt-get -y install cudnn-cuda-12 @@ -94,7 +94,7 @@ setup: | run: | cd ~/llm.c # train on multiple GPUs - mpirun -np 8 ./train_gpt2cu \ + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE ./train_gpt2cu \ -i "dev/data/fineweb10B/fineweb_train_*.bin" \ -j "dev/data/fineweb10B/fineweb_val_*.bin" \ -o log124M \ From 3636ea680b2dc80958b7980edda05d4a75372051 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 02:23:20 +0000 Subject: [PATCH 04/29] Fix GPT-2 --- llm/gpt-2/gpt2.yaml | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml index a1ba2954ea9..030e30e16f3 100644 --- a/llm/gpt-2/gpt2.yaml +++ b/llm/gpt-2/gpt2.yaml @@ -32,21 +32,10 @@ resources: accelerators: A100:1 use_spot: true -# file_mounts: -# ~/.cache/huggingface: gs://gpt2-data-skypilot +file_mounts: + ~/.cache/huggingface: gs://gpt2-data-skypilot setup: | - export PATH="$PATH:$HOME/.local/bin" - # Create a gpt2 conda version with the latest gxx - # conda activate gpt2 - # if [ $? -eq 0 ]; then - # echo "gpt2 environment already exists" - # else - # conda create -n gpt2 gxx=12 -y - # conda activate gpt2 - # fi - - cd ~ pip install tqdm tiktoken requests datasets From 1694ecd3873ebb3a4eae355d427db1a2162a16ea Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 02:35:18 +0000 Subject: [PATCH 05/29] add train and data --- llm/gpt-2/gpt2-data.yaml | 24 +++++++ llm/gpt-2/{gpt2.yaml => gpt2-pipeline.yaml} | 40 +++++------ llm/gpt-2/gpt2-train.yaml | 74 +++++++++++++++++++++ 3 files changed, 118 insertions(+), 20 deletions(-) create mode 100644 llm/gpt-2/gpt2-data.yaml rename llm/gpt-2/{gpt2.yaml => gpt2-pipeline.yaml} (77%) create mode 100644 llm/gpt-2/gpt2-train.yaml diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml new file mode 100644 index 00000000000..d04c9e59e65 --- /dev/null +++ b/llm/gpt-2/gpt2-data.yaml @@ -0,0 +1,24 @@ +name: gpt2-data + +resources: + cpus: 64+ + +file_mounts: + /cache: + name: gpt2-data-skypilot + store: gcs + mode: MOUNT + +setup: | + pip install tqdm tiktoken requests datasets + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) 
+ # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb + git clone https://github.com/karpathy/llm.c.git || true + +run: | + cd llm.c + python dev/data/fineweb.py --version 10B + + rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ + rsync -Pavz dev/data/fineweb10B /cache/ diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2-pipeline.yaml similarity index 77% rename from llm/gpt-2/gpt2.yaml rename to llm/gpt-2/gpt2-pipeline.yaml index 030e30e16f3..9bea67e630c 100644 --- a/llm/gpt-2/gpt2.yaml +++ b/llm/gpt-2/gpt2-pipeline.yaml @@ -1,29 +1,29 @@ -# name: gpt2-data +name: gpt2-data -# resources: -# cpus: 64+ +resources: + cpus: 64+ -# file_mounts: -# /cache: -# name: gpt2-data-skypilot -# store: gcs -# mode: MOUNT +file_mounts: + /cache: + name: gpt2-data-skypilot + store: gcs + mode: MOUNT -# setup: | -# pip install tqdm tiktoken requests datasets -# # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) -# # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B -# # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb -# git clone https://github.com/karpathy/llm.c.git || true +setup: | + pip install tqdm tiktoken requests datasets + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) + # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb + git clone https://github.com/karpathy/llm.c.git || true -# run: | -# cd llm.c -# python dev/data/fineweb.py --version 10B +run: | + cd llm.c + python dev/data/fineweb.py --version 10B -# rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ -# rsync -Pavz dev/data/fineweb10B /cache/ + rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ + rsync -Pavz dev/data/fineweb10B /cache/ -# --- +--- name: train diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml new file mode 100644 index 00000000000..44d9400d0a2 --- /dev/null +++ b/llm/gpt-2/gpt2-train.yaml @@ -0,0 +1,74 @@ +name: train + + +resources: + accelerators: A100:1 + use_spot: true + +file_mounts: + ~/.cache/huggingface: gs://gpt2-data-skypilot + +setup: | + cd ~ + pip install tqdm tiktoken requests datasets + + # install cudnn so we can use FlashAttention and run fast (optional) + # https://developer.nvidia.com/cudnn-downloads + # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 + if [ -f ./CUDNN_INSTALLED ]; then + echo "cudnn already installed" + else + system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') + # Get version and remove the dot + version=$(lsb_release -sr | tr -d .) + export system_version="${system}${version}" + wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb + sudo dpkg -i cudnn-installer.deb + sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ + # Remove problematic kubernetes.list source + sudo apt-get update --allow-releaseinfo-change || true + + sudo apt-get -y install cudnn-cuda-12 + + touch ./CUDNN_INSTALLED + fi + + # "install" cudnn-frontend to ~/ + git clone https://github.com/NVIDIA/cudnn-frontend.git || true + + # install MPI (optional, if you intend to use multiple GPUs) + # SkyPilot do not install MPI as that requires NCCL which needs to be manually + # installed. 
+ sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + # install nccl + pip install nvidia-nccl-cu12 + export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include + + git clone https://github.com/karpathy/llm.c.git || true + cd llm.c + mv ~/.cache/huggingface/fineweb10B dev/data/ + # compile llm.c (mixed precision, with cuDNN flash-attention) + # first compilation is ~1 minute, mostly due to cuDNN + make train_gpt2cu USE_CUDNN=1 + + +run: | + cd ~/llm.c + # train on multiple GPUs + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE ./train_gpt2cu \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -o log124M \ + -e "d12" \ + -b 64 -t 1024 \ + -d 524288 \ + -r 1 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + -n 5000 \ + -v 250 -s 20000 \ + -h 1 From 2c80dcbe58f785ebf552e2dbc5e5dda9fd6e086c Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 02:37:26 +0000 Subject: [PATCH 06/29] use 8 gpus --- llm/gpt-2/gpt2-pipeline.yaml | 3 +-- llm/gpt-2/gpt2-train.yaml | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/llm/gpt-2/gpt2-pipeline.yaml b/llm/gpt-2/gpt2-pipeline.yaml index 9bea67e630c..d69cc4beb4b 100644 --- a/llm/gpt-2/gpt2-pipeline.yaml +++ b/llm/gpt-2/gpt2-pipeline.yaml @@ -29,8 +29,7 @@ name: train resources: - accelerators: A100:1 - use_spot: true + accelerators: A100:8 file_mounts: ~/.cache/huggingface: gs://gpt2-data-skypilot diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 44d9400d0a2..60fc29ea3be 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -2,8 +2,7 @@ name: train resources: - accelerators: A100:1 - use_spot: true + accelerators: A100:8 file_mounts: ~/.cache/huggingface: gs://gpt2-data-skypilot From 1bef7981b68fcc3bcc96c4da2fafb36986dfe053 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 02:44:20 +0000 Subject: [PATCH 07/29] revert gcp change --- sky/clouds/gcp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 215fa043a08..93260533f27 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -404,7 +404,7 @@ def make_deploy_resources_variables( # --no-standard-images # We use the debian image, as the ubuntu image has some connectivity # issue when first booted. - image_id = 'skypilot:cpu-ubuntu-2204' + image_id = 'skypilot:cpu-debian-11' r = resources # Find GPU spec, if any. 
@@ -455,8 +455,8 @@ def make_deploy_resources_variables( # CUDA driver version 470.57.02, CUDA Library 11.4 image_id = 'skypilot:k80-debian-10' else: - # CUDA driver version 550.54.15, CUDA Library 12.4 - image_id = 'skypilot:gpu-ubuntu-2204' + # CUDA driver version 535.86.10, CUDA Library 12.2 + image_id = 'skypilot:gpu-debian-11' if (resources.image_id is not None and resources.extract_docker_image() is None): From 92828733ca4aa40a3486d2b812f5d22d1077e3c6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 02:48:36 +0000 Subject: [PATCH 08/29] update readme --- llm/gpt-2/README.md | 19 +++++++ llm/gpt-2/gpt2-pipeline.yaml | 100 ----------------------------------- llm/gpt-2/gpt2-train.yaml | 2 +- 3 files changed, 20 insertions(+), 101 deletions(-) delete mode 100644 llm/gpt-2/gpt2-pipeline.yaml diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 33b5453569e..1c4ea1205c9 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -2,7 +2,26 @@ https://github.com/karpathy/llm.c/discussions/481 +## Data processing + +```bash +sky launch -c gpt2-data gpt2-data.yaml -y +``` + + +## Training + +```bash +sky launch -c gpt2-train gpt2-train.yaml -y +``` + + +## Run in a Pipeline +We can also combine the two steps into a single SkyPilot job: ```bash +cat gpt2-data.yaml > gpt2.yaml +echo "---" >> gpt2.yaml +cat gpt2-train.yaml >> gpt2.yaml sky jobs launch -n gpt2 gpt2.yaml ``` diff --git a/llm/gpt-2/gpt2-pipeline.yaml b/llm/gpt-2/gpt2-pipeline.yaml deleted file mode 100644 index d69cc4beb4b..00000000000 --- a/llm/gpt-2/gpt2-pipeline.yaml +++ /dev/null @@ -1,100 +0,0 @@ -name: gpt2-data - -resources: - cpus: 64+ - -file_mounts: - /cache: - name: gpt2-data-skypilot - store: gcs - mode: MOUNT - -setup: | - pip install tqdm tiktoken requests datasets - # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) - # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B - # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb - git clone https://github.com/karpathy/llm.c.git || true - -run: | - cd llm.c - python dev/data/fineweb.py --version 10B - - rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ - rsync -Pavz dev/data/fineweb10B /cache/ - ---- - -name: train - - -resources: - accelerators: A100:8 - -file_mounts: - ~/.cache/huggingface: gs://gpt2-data-skypilot - -setup: | - cd ~ - pip install tqdm tiktoken requests datasets - - # install cudnn so we can use FlashAttention and run fast (optional) - # https://developer.nvidia.com/cudnn-downloads - # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 - if [ -f ./CUDNN_INSTALLED ]; then - echo "cudnn already installed" - else - system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') - # Get version and remove the dot - version=$(lsb_release -sr | tr -d .) 
- export system_version="${system}${version}" - wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb - sudo dpkg -i cudnn-installer.deb - sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ - # Remove problematic kubernetes.list source - sudo apt-get update --allow-releaseinfo-change || true - - sudo apt-get -y install cudnn-cuda-12 - - touch ./CUDNN_INSTALLED - fi - - # "install" cudnn-frontend to ~/ - git clone https://github.com/NVIDIA/cudnn-frontend.git || true - - # install MPI (optional, if you intend to use multiple GPUs) - # SkyPilot do not install MPI as that requires NCCL which needs to be manually - # installed. - sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev - # install nccl - pip install nvidia-nccl-cu12 - export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib - export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include - - git clone https://github.com/karpathy/llm.c.git || true - cd llm.c - mv ~/.cache/huggingface/fineweb10B dev/data/ - # compile llm.c (mixed precision, with cuDNN flash-attention) - # first compilation is ~1 minute, mostly due to cuDNN - make train_gpt2cu USE_CUDNN=1 - - -run: | - cd ~/llm.c - # train on multiple GPUs - mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE ./train_gpt2cu \ - -i "dev/data/fineweb10B/fineweb_train_*.bin" \ - -j "dev/data/fineweb10B/fineweb_val_*.bin" \ - -o log124M \ - -e "d12" \ - -b 64 -t 1024 \ - -d 524288 \ - -r 1 \ - -z 1 \ - -c 0.1 \ - -l 0.0006 \ - -q 0.0 \ - -u 700 \ - -n 5000 \ - -v 250 -s 20000 \ - -h 1 diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 60fc29ea3be..a075d58717c 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -46,7 +46,7 @@ setup: | git clone https://github.com/karpathy/llm.c.git || true cd llm.c - mv ~/.cache/huggingface/fineweb10B dev/data/ + ln -s ~/.cache/huggingface/fineweb10B dev/data/ # compile llm.c (mixed precision, with cuDNN flash-attention) # first compilation is ~1 minute, mostly due to cuDNN make train_gpt2cu USE_CUDNN=1 From 0ee942c8af27a2eaad15a51d08a600bdcfe113fa Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 03:00:38 +0000 Subject: [PATCH 09/29] Add GCP image --- llm/gpt-2/gpt2-train.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index a075d58717c..1ba7c2b20ea 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -3,6 +3,18 @@ name: train resources: accelerators: A100:8 + any_of: + - cloud: gcp + # Need to switch to Ubuntu image on GCP for C++ dependencies + image_id: projects/deeplearning-platform-release/global/images/common-cu122-v20240514-ubuntu-2204-py310 + # List all possible cloud below. 
+ - cloud: aws + - cloud: azure + - cloud: lambda + - cloud: runpod + - cloud: paperspace + - cloud: fluidstack + - cloud: kubernetes file_mounts: ~/.cache/huggingface: gs://gpt2-data-skypilot From 5af0d933e57afc082e09a7ab3f640a0e867c6252 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 03:14:19 +0000 Subject: [PATCH 10/29] make file_mounts more general --- llm/gpt-2/gpt2-train.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 1ba7c2b20ea..0e1340c8236 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -17,7 +17,9 @@ resources: - cloud: kubernetes file_mounts: - ~/.cache/huggingface: gs://gpt2-data-skypilot + ~/.cache/huggingface: + name: gpt2-data-skypilot + mode: COPY setup: | cd ~ From 71bcdd0fed9d3160efc4b25fc11d766c26fb97b4 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 03:45:37 +0000 Subject: [PATCH 11/29] avoid any_of --- llm/gpt-2/gpt2-train.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 0e1340c8236..33e9a61423a 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -3,18 +3,6 @@ name: train resources: accelerators: A100:8 - any_of: - - cloud: gcp - # Need to switch to Ubuntu image on GCP for C++ dependencies - image_id: projects/deeplearning-platform-release/global/images/common-cu122-v20240514-ubuntu-2204-py310 - # List all possible cloud below. - - cloud: aws - - cloud: azure - - cloud: lambda - - cloud: runpod - - cloud: paperspace - - cloud: fluidstack - - cloud: kubernetes file_mounts: ~/.cache/huggingface: From 488347f12696298da1d35feeaa2ab5de74654f1e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 16:20:17 +0000 Subject: [PATCH 12/29] change back to use ubuntu image with wait for GPU --- llm/gpt-2/gpt2-train.yaml | 12 ++++++++++++ sky/templates/gcp-ray.yml.j2 | 6 ++++++ 2 files changed, 18 insertions(+) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index a075d58717c..5ac258fabc7 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -3,6 +3,18 @@ name: train resources: accelerators: A100:8 + any_of: + - cloud: gcp + # Use ubuntu 22.04 as the C++ compiler is not latest enough in SkyPillot's + # default debian 11 image. + image_id: projects/deeplearning-platform-release/global/images/common-cu122-v20240514-ubuntu-2204-py310 + - cloud: aws + - cloud: azure + - cloud: lambda + - cloud: runpod + - cloud: fluidstack + - cloud: ibm + - cloud: oci file_mounts: ~/.cache/huggingface: gs://gpt2-data-skypilot diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 9c2092bdfaf..0100c38cad3 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -174,6 +174,9 @@ setup_commands: # Line 'sudo systemctl stop jupyter ..': stop jupyter service to avoid port conflict on 8080 # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` + # Line 'while ! 
nvidia-smi; do sleep 2; echo': wait for CUDA driver to be installed, as for + # some deep learning images with Ubuntu, CUDA driver will not be immediately available, due + # to `install-nvidia-driver` - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; }; {%- if docker_image is none %} sudo systemctl stop unattended-upgrades || true; @@ -205,6 +208,9 @@ setup_commands: {%- endif %} mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); + {%- if gpu is not none %} + nvidia-smi || { while ! nvidia-smi; do sleep 2; echo "Waiting for CUDA driver to be installed"; done; } + {% endif %} # Command to start ray clusters are now placed in `sky.provision.instance_setup`. # We do not need to list it here anymore. From 2e5bacf9a4fe3e029fd0a28bfe9f4e8e2937b9a1 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 16:27:46 +0000 Subject: [PATCH 13/29] wait cuda installation --- sky/templates/gcp-ray.yml.j2 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 0100c38cad3..b9f72942f5e 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -209,7 +209,8 @@ setup_commands: mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); {%- if gpu is not none %} - nvidia-smi || { while ! nvidia-smi; do sleep 2; echo "Waiting for CUDA driver to be installed"; done; } + curl -L https://github.com/GoogleCloudPlatform/compute-gpu-installation/releases/download/cuda-installer-v1.0.0/cuda_installer.pyz --output cuda_installer.pyz + while ! python3 cuda_installer.pyz verify_cuda; do sleep 2; echo "Waiting for CUDA driver to be installed"; done; {% endif %} # Command to start ray clusters are now placed in `sky.provision.instance_setup`. 
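The last two commits both deal with the same race: on GCP's Ubuntu deep-learning images the NVIDIA driver is installed asynchronously on first boot, so `nvidia-smi` (or the `cuda_installer.pyz verify_cuda` check above) can fail right after provisioning. A standalone sketch of that wait pattern is below, with a bounded timeout added for illustration; the function name and the 300-second limit are assumptions, not what the template itself does.

```bash
# Wait until the NVIDIA driver answers, but give up after a timeout instead of
# looping forever. Illustrative sketch only; the template above simply retries
# every 2 seconds until nvidia-smi succeeds.
wait_for_gpu_driver() {
  local timeout_secs=${1:-300}
  local waited=0
  until nvidia-smi > /dev/null 2>&1; do
    if [ "$waited" -ge "$timeout_secs" ]; then
      echo "CUDA driver still unavailable after ${timeout_secs}s" >&2
      return 1
    fi
    echo "Waiting for CUDA driver to be installed..."
    sleep 2
    waited=$((waited + 2))
  done
  nvidia-smi --query-gpu=name,driver_version --format=csv,noheader
}

wait_for_gpu_driver 300
```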
From c070da0bd4b0866d7f9ef73f364b193ca7c074c4 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 16:57:26 +0000 Subject: [PATCH 14/29] Add retry for file mount and use env for bucket name --- llm/gpt-2/gpt2-data.yaml | 6 ++++-- llm/gpt-2/gpt2-train.yaml | 9 +++++--- sky/backends/backend_utils.py | 40 ++++++++++++++++++++++------------- sky/templates/gcp-ray.yml.j2 | 5 ++--- sky/utils/command_runner.pyi | 6 ++++-- 5 files changed, 41 insertions(+), 25 deletions(-) diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml index d04c9e59e65..42daee00584 100644 --- a/llm/gpt-2/gpt2-data.yaml +++ b/llm/gpt-2/gpt2-data.yaml @@ -1,12 +1,14 @@ name: gpt2-data +envs: + BUCKET_NAME: # Fill in your bucket name + resources: cpus: 64+ file_mounts: /cache: - name: gpt2-data-skypilot - store: gcs + name: $BUCKET_NAME mode: MOUNT setup: | diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 8ddd670c5a0..182c95fa1aa 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -1,13 +1,16 @@ name: train +envs: + BUCKET_NAME: # Fill in your bucket name resources: accelerators: A100:8 any_of: - cloud: gcp # Use ubuntu 22.04 as the C++ compiler is not latest enough in SkyPillot's - # default debian 11 image. - image_id: projects/deeplearning-platform-release/global/images/common-cu122-v20240514-ubuntu-2204-py310 + # default debian 11 image. We use our custom image with CUDA 12.2 + # installed. + image_id: projects/skypilot-375900/global/images/llm-c - cloud: aws - cloud: azure - cloud: lambda @@ -18,7 +21,7 @@ resources: file_mounts: ~/.cache/huggingface: - name: gpt2-data-skypilot + name: $BUCKET_NAME mode: COPY setup: | diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index b1598c7c039..6759aa30d3f 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1273,21 +1273,30 @@ def parallel_data_transfer_to_nodes( def _sync_node(runner: 'command_runner.CommandRunner') -> None: if cmd is not None: - rc, stdout, stderr = runner.run(cmd, - log_path=log_path, - stream_logs=stream_logs, - require_outputs=True, - source_bashrc=source_bashrc) - err_msg = ('Failed to run command before rsync ' - f'{origin_source} -> {target}. ' - 'Ensure that the network is stable, then retry. ' - f'{cmd}') - if log_path != os.devnull: - err_msg += f' See logs in {log_path}' - subprocess_utils.handle_returncode(rc, - cmd, - err_msg, - stderr=stdout + stderr) + retry_cnt = 0 + while retry_cnt < 3: + rc, stdout, stderr = runner.run(cmd, + log_path=log_path, + stream_logs=stream_logs, + require_outputs=True, + source_bashrc=source_bashrc) + if rc == 255: + retry_cnt += 1 + logger.warning( + f'Failed to run command on {runner.node_id}, likely ' + f'due to a reboot. Retrying... (Attempt {retry_cnt})') + time.sleep(5) + continue + err_msg = ('Failed to run command before rsync ' + f'{origin_source} -> {target}. ' + 'Ensure that the network is stable, then retry. 
' + f'{cmd}') + if log_path != os.devnull: + err_msg += f' See logs in {log_path}' + subprocess_utils.handle_returncode(rc, + cmd, + err_msg, + stderr=stdout + stderr) if run_rsync: assert source is not None @@ -1299,6 +1308,7 @@ def _sync_node(runner: 'command_runner.CommandRunner') -> None: up=True, log_path=log_path, stream_logs=stream_logs, + max_retry=3, ) num_nodes = len(runners) diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index b9f72942f5e..b111c6c1cb3 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -174,7 +174,7 @@ setup_commands: # Line 'sudo systemctl stop jupyter ..': stop jupyter service to avoid port conflict on 8080 # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - # Line 'while ! nvidia-smi; do sleep 2; echo': wait for CUDA driver to be installed, as for + # Line 'while ! nvidia-smi; do sleep 2...', as for # some deep learning images with Ubuntu, CUDA driver will not be immediately available, due # to `install-nvidia-driver` - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; }; @@ -209,8 +209,7 @@ setup_commands: mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); {%- if gpu is not none %} - curl -L https://github.com/GoogleCloudPlatform/compute-gpu-installation/releases/download/cuda-installer-v1.0.0/cuda_installer.pyz --output cuda_installer.pyz - while ! python3 cuda_installer.pyz verify_cuda; do sleep 2; echo "Waiting for CUDA driver to be installed"; done; + while ! nvidia-smi; do sleep 2; echo "Waiting for CUDA driver to be installed"; done; {% endif %} # Command to start ray clusters are now placed in `sky.provision.instance_setup`. diff --git a/sky/utils/command_runner.pyi b/sky/utils/command_runner.pyi index 9fbad243775..77e5a8959cf 100644 --- a/sky/utils/command_runner.pyi +++ b/sky/utils/command_runner.pyi @@ -101,7 +101,8 @@ class CommandRunner: *, up: bool, log_path: str = ..., - stream_logs: bool = ...) -> None: + stream_logs: bool = ..., + max_retry: int = 1) -> None: ... @classmethod @@ -191,5 +192,6 @@ class SSHCommandRunner(CommandRunner): *, up: bool, log_path: str = ..., - stream_logs: bool = ...) -> None: + stream_logs: bool = ..., + max_retry: int = 1) -> None: ... 
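This commit retries the pre-rsync command when the runner exits with code 255 (ssh's exit code when the connection itself fails, e.g. while the VM is still rebooting) and passes `max_retry=3` to rsync. The same idea as a self-contained shell sketch; the function name and the example invocation are illustrative, not SkyPilot API.

```bash
# Rerun a command executed over SSH while it exits with 255, i.e. the
# connection failed rather than the remote command. Mirrors the 3-attempt /
# 5-second-sleep behavior of the patch above; illustrative only.
run_with_ssh_retry() {
  local max_attempts=3 attempt=1 rc=0
  while true; do
    "$@"
    rc=$?
    if [ "$rc" -ne 255 ] || [ "$attempt" -ge "$max_attempts" ]; then
      return "$rc"
    fi
    echo "Connection dropped (exit 255); retrying ($attempt/$max_attempts)..." >&2
    sleep 5
    attempt=$((attempt + 1))
  done
}

# Hypothetical usage:
# run_with_ssh_retry ssh gpt2-train 'echo ready'
```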
From 87d2a3ca0b392126ca16752540ccd8d9140bead7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 17:11:38 +0000 Subject: [PATCH 15/29] revert retries --- llm/gpt-2/README.md | 11 +++++----- llm/gpt-2/gpt2-train.yaml | 2 +- sky/backends/backend_utils.py | 40 +++++++++++++---------------------- 3 files changed, 22 insertions(+), 31 deletions(-) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 1c4ea1205c9..9e0e4dd5be7 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -1,18 +1,19 @@ -# GPT-2 (124M) in llm.c in 90 minutes +# Run GPT-2 (124M) in llm.c on any cloud with SkyPilot -https://github.com/karpathy/llm.c/discussions/481 +This is a reproducible package of llm.c's GPT-2 (124M) training by @karpathy (https://github.com/karpathy/llm.c/discussions/481) +With SkyPilot, you can run GPT-2 (124M) training on any cloud. ## Data processing ```bash -sky launch -c gpt2-data gpt2-data.yaml -y +sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name ``` ## Training ```bash -sky launch -c gpt2-train gpt2-train.yaml -y +sky launch -c gpt2-train gpt2-train.yaml --env BUCKET_NAME=your-bucket-name ``` @@ -22,7 +23,7 @@ We can also combine the two steps into a single SkyPilot job: cat gpt2-data.yaml > gpt2.yaml echo "---" >> gpt2.yaml cat gpt2-train.yaml >> gpt2.yaml -sky jobs launch -n gpt2 gpt2.yaml +sky jobs launch -n gpt2 gpt2.yaml --env BUCKET_NAME=your-bucket-name ``` SkyPilot will first download and process the dataset on a CPU VM and store the diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 182c95fa1aa..31bffa370b5 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -10,7 +10,7 @@ resources: # Use ubuntu 22.04 as the C++ compiler is not latest enough in SkyPillot's # default debian 11 image. We use our custom image with CUDA 12.2 # installed. - image_id: projects/skypilot-375900/global/images/llm-c + image_id: projects/skypilot-375900/global/images/gpu - cloud: aws - cloud: azure - cloud: lambda diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 6759aa30d3f..b1598c7c039 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1273,30 +1273,21 @@ def parallel_data_transfer_to_nodes( def _sync_node(runner: 'command_runner.CommandRunner') -> None: if cmd is not None: - retry_cnt = 0 - while retry_cnt < 3: - rc, stdout, stderr = runner.run(cmd, - log_path=log_path, - stream_logs=stream_logs, - require_outputs=True, - source_bashrc=source_bashrc) - if rc == 255: - retry_cnt += 1 - logger.warning( - f'Failed to run command on {runner.node_id}, likely ' - f'due to a reboot. Retrying... (Attempt {retry_cnt})') - time.sleep(5) - continue - err_msg = ('Failed to run command before rsync ' - f'{origin_source} -> {target}. ' - 'Ensure that the network is stable, then retry. ' - f'{cmd}') - if log_path != os.devnull: - err_msg += f' See logs in {log_path}' - subprocess_utils.handle_returncode(rc, - cmd, - err_msg, - stderr=stdout + stderr) + rc, stdout, stderr = runner.run(cmd, + log_path=log_path, + stream_logs=stream_logs, + require_outputs=True, + source_bashrc=source_bashrc) + err_msg = ('Failed to run command before rsync ' + f'{origin_source} -> {target}. ' + 'Ensure that the network is stable, then retry. 
' + f'{cmd}') + if log_path != os.devnull: + err_msg += f' See logs in {log_path}' + subprocess_utils.handle_returncode(rc, + cmd, + err_msg, + stderr=stdout + stderr) if run_rsync: assert source is not None @@ -1308,7 +1299,6 @@ def _sync_node(runner: 'command_runner.CommandRunner') -> None: up=True, log_path=log_path, stream_logs=stream_logs, - max_retry=3, ) num_nodes = len(runners) From d6e9554ba329a21a81db53ca31afa86716075dd5 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 17:52:59 +0000 Subject: [PATCH 16/29] update the image --- llm/gpt-2/gpt2-train.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 31bffa370b5..fef617ce97f 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -10,14 +10,16 @@ resources: # Use ubuntu 22.04 as the C++ compiler is not latest enough in SkyPillot's # default debian 11 image. We use our custom image with CUDA 12.2 # installed. - image_id: projects/skypilot-375900/global/images/gpu - - cloud: aws - - cloud: azure + image_id: projects/skypilot-375900/global/images/ubuntu2204-gpu - cloud: lambda + - cloud: azure - cloud: runpod - cloud: fluidstack - cloud: ibm - cloud: oci + # TODO: AWS's C++ version is too old to compile llm.c. A update of base AMI + # is needed. + # - cloud: aws file_mounts: ~/.cache/huggingface: From ef26ecd5fe6a2c8c3fefed277307919ca5fd118c Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 21:29:30 +0000 Subject: [PATCH 17/29] change to docker for better dependency --- llm/gpt-2/README.md | 2 +- llm/gpt-2/gpt2-train.yaml | 21 +++++---------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 9e0e4dd5be7..ce50011810e 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -13,7 +13,7 @@ sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name ## Training ```bash -sky launch -c gpt2-train gpt2-train.yaml --env BUCKET_NAME=your-bucket-name +sky launch -c gpt2-train --detach-setup gpt2-train.yaml --env BUCKET_NAME=your-bucket-name ``` diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index fef617ce97f..d65df9cb8fa 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -5,21 +5,9 @@ envs: resources: accelerators: A100:8 - any_of: - - cloud: gcp - # Use ubuntu 22.04 as the C++ compiler is not latest enough in SkyPillot's - # default debian 11 image. We use our custom image with CUDA 12.2 - # installed. - image_id: projects/skypilot-375900/global/images/ubuntu2204-gpu - - cloud: lambda - - cloud: azure - - cloud: runpod - - cloud: fluidstack - - cloud: ibm - - cloud: oci - # TODO: AWS's C++ version is too old to compile llm.c. A update of base AMI - # is needed. - # - cloud: aws + # Use docker image for latest version g++ to enable the compilation of llm.c. 
+ image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + file_mounts: ~/.cache/huggingface: @@ -52,6 +40,7 @@ setup: | fi # "install" cudnn-frontend to ~/ + sudo apt -y install git git clone https://github.com/NVIDIA/cudnn-frontend.git || true # install MPI (optional, if you intend to use multiple GPUs) @@ -74,7 +63,7 @@ setup: | run: | cd ~/llm.c # train on multiple GPUs - mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE ./train_gpt2cu \ + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \ -i "dev/data/fineweb10B/fineweb_train_*.bin" \ -j "dev/data/fineweb10B/fineweb_val_*.bin" \ -o log124M \ From 2b0a085d4a2b9b62148804d751d221fec8ba445e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 21:33:22 +0000 Subject: [PATCH 18/29] revert changes in gcp template --- sky/templates/gcp-ray.yml.j2 | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index b111c6c1cb3..9c2092bdfaf 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -174,9 +174,6 @@ setup_commands: # Line 'sudo systemctl stop jupyter ..': stop jupyter service to avoid port conflict on 8080 # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - # Line 'while ! nvidia-smi; do sleep 2...', as for - # some deep learning images with Ubuntu, CUDA driver will not be immediately available, due - # to `install-nvidia-driver` - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; }; {%- if docker_image is none %} sudo systemctl stop unattended-upgrades || true; @@ -208,9 +205,6 @@ setup_commands: {%- endif %} mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); - {%- if gpu is not none %} - while ! nvidia-smi; do sleep 2; echo "Waiting for CUDA driver to be installed"; done; - {% endif %} # Command to start ray clusters are now placed in `sky.provision.instance_setup`. # We do not need to list it here anymore. From aa8ecfe4583c03516fb747d264815ac00e0da6db Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 22:10:26 +0000 Subject: [PATCH 19/29] avoid using docker on lambda --- llm/gpt-2/gpt2-train.yaml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index d65df9cb8fa..7c79f53ee44 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -7,12 +7,20 @@ resources: accelerators: A100:8 # Use docker image for latest version g++ to enable the compilation of llm.c. image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 - - + image_id: + # Avoid using docker image for lambda due to the docker is not supported on + # Lambda yet, but the base image works. 
+ - cloud: lambda + image_id: null + - cloud: aws + - cloud: gcp + - cloud: azure + - cloud: fluidstack + file_mounts: - ~/.cache/huggingface: - name: $BUCKET_NAME - mode: COPY + ~/.cache/huggingface: gs://$BUCKET_NAME + # name: $BUCKET_NAME + # mode: COPY setup: | cd ~ From 265e43c42a6b1da4993fd954e22b48e53b9bf81d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 22:17:05 +0000 Subject: [PATCH 20/29] Add single GPU --- llm/gpt-2/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index ce50011810e..45b616e09df 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -16,6 +16,11 @@ sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name sky launch -c gpt2-train --detach-setup gpt2-train.yaml --env BUCKET_NAME=your-bucket-name ``` +Or you can train the model with a single A100: +```bash +sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpu a100 --env BUCKET_NAME=your-bucket-name +``` + ## Run in a Pipeline We can also combine the two steps into a single SkyPilot job: From 598dca52e03928665e48dd6a5894cfea39459407 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 22:43:06 +0000 Subject: [PATCH 21/29] Elaborate readme --- llm/gpt-2/README.md | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 45b616e09df..324d7d856df 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -1,10 +1,28 @@ -# Run GPT-2 (124M) in llm.c on any cloud with SkyPilot +# Run GPT-2 in llm.c on any cloud with SkyPilot This is a reproducible package of llm.c's GPT-2 (124M) training by @karpathy (https://github.com/karpathy/llm.c/discussions/481) With SkyPilot, you can run GPT-2 (124M) training on any cloud. +## Prerequisites + +1. Install [SkyPilot](https://github.com/skypilot-org/skypilot): +```bash +pip install skypilot-nightly +``` +2. Enable clouds for SkyPilot: +```bash +sky check +``` +Please check the instructions for enabling clouds at [SkyPilot doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +3. 
Download the YAMLs in this directory for data processing and training: +```bash +wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-data.yaml +wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-train.yaml +``` + ## Data processing +Run the following command to process the training data on a CPU VM and store it in a cloud bucket for future use (replace `your-bucket-name` with your bucket name): ```bash sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name ``` @@ -12,18 +30,21 @@ sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name ## Training +After the data is processed, you can then train the model on a GPU VM with 8 A100 GPUs (replace `your-bucket-name` with your bucket name): + ```bash sky launch -c gpt2-train --detach-setup gpt2-train.yaml --env BUCKET_NAME=your-bucket-name ``` -Or you can train the model with a single A100: +Or, you can train the model with a single A100, by adding `--gpu A100`: ```bash -sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpu a100 --env BUCKET_NAME=your-bucket-name +sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpu A100 --env BUCKET_NAME=your-bucket-name ``` ## Run in a Pipeline -We can also combine the two steps into a single SkyPilot job: + +We can also combine the two steps into a single SkyPilot job, and let SkyPilot to handle the dependencies between the two steps. Here is an example of how to do this (replace `your-bucket-name` with your bucket name): ```bash cat gpt2-data.yaml > gpt2.yaml echo "---" >> gpt2.yaml From 3056c2c8a0a2704db4c1d7436e88fe9f18ef1b86 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 16:31:21 -0700 Subject: [PATCH 22/29] Update llm/gpt-2/README.md Co-authored-by: Romil Bhardwaj --- llm/gpt-2/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 324d7d856df..404d3bdcfcc 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -14,6 +14,7 @@ pip install skypilot-nightly sky check ``` Please check the instructions for enabling clouds at [SkyPilot doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). + 3. Download the YAMLs in this directory for data processing and training: ```bash wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-data.yaml From 815d23ce0a939b1286efbe7cb333836e082e3fb5 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 23:33:31 +0000 Subject: [PATCH 23/29] fix --- llm/gpt-2/gpt2-train.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 7c79f53ee44..7c7e58eeaf9 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -7,7 +7,7 @@ resources: accelerators: A100:8 # Use docker image for latest version g++ to enable the compilation of llm.c. image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 - image_id: + any_of: # Avoid using docker image for lambda due to the docker is not supported on # Lambda yet, but the base image works. 
- cloud: lambda @@ -16,11 +16,12 @@ resources: - cloud: gcp - cloud: azure - cloud: fluidstack + - cloud: kubernetes file_mounts: - ~/.cache/huggingface: gs://$BUCKET_NAME - # name: $BUCKET_NAME - # mode: COPY + ~/.cache/huggingface: + name: $BUCKET_NAME + mode: COPY setup: | cd ~ From 4c4493563bebe1a7d02b216ea594fb2105c7a751 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 31 May 2024 00:02:42 +0000 Subject: [PATCH 24/29] address comments --- llm/gpt-2/README.md | 50 +++++++++++++++++----- llm/gpt-2/gpt2-data.yaml | 2 +- llm/gpt-2/gpt2.yaml | 89 ++++++++++++++++++++++++++++++++++++++++ sky/utils/schemas.py | 6 ++- 4 files changed, 134 insertions(+), 13 deletions(-) create mode 100644 llm/gpt-2/gpt2.yaml diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 404d3bdcfcc..9cdfd76f462 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -7,7 +7,7 @@ With SkyPilot, you can run GPT-2 (124M) training on any cloud. 1. Install [SkyPilot](https://github.com/skypilot-org/skypilot): ```bash -pip install skypilot-nightly +pip install "skypilot-nightly[aws,gcp,azure,kubernetes,lambda,fluidstack]" # Choose the clouds you want to enable ``` 2. Enable clouds for SkyPilot: ```bash @@ -15,21 +15,49 @@ sky check ``` Please check the instructions for enabling clouds at [SkyPilot doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). -3. Download the YAMLs in this directory for data processing and training: +3. Download the YAML for starting the training: ```bash -wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-data.yaml -wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-train.yaml +wget https://raw.githubusercontent.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2.yaml ``` -## Data processing +## Run GPT-2 training + +Run the following command to start GPT-2 (124M) training on a GPU VM with 8 A100 GPUs (replace `your-bucket-name` with your bucket name): + +```bash +sky launch -c gpt2 gpt2.yaml +``` + +Or, you can train the model with a single A100, by adding `--gpu A100`: +```bash +sky launch -c gpt2 gpt2.yaml --gpu A100 +``` + +## Advanced: Run GPT-2 training in two stages + +The data processing for GPT-2 training is CPU-bound, while the training is GPU-bound. Having the data processing on a GPU VM is not cost-effective. With SkyPilot, you can easily +separate the data processing and training into two stages and execute them sequantially manually, or let SkyPilot manage the dependencies between the two stages. + +With this data processing can be run on cheaper CPU VMs (e.g., ~\$1.5/hour), and run the training on more expensive GPU VMs (e.g., ~\$1.3-\$3.6/hour for a single A100 GPU, or \$10.3-\$32.8/hour for 8 A100 GPUs). + +We can run the data processing on a CPU VM and store the processed data in a cloud bucket. Then, we can run the training on a GPU VM with the processed data. 
+ +```bash +wget https://raw.githubusercontent.com//skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-data.yaml +wget https://raw.githubusercontent.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-train.yaml +``` + +### Run two stages manually +#### Data processing Run the following command to process the training data on a CPU VM and store it in a cloud bucket for future use (replace `your-bucket-name` with your bucket name): + ```bash sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name ``` -## Training +#### Training After the data is processed, you can then train the model on a GPU VM with 8 A100 GPUs (replace `your-bucket-name` with your bucket name): @@ -43,14 +71,14 @@ sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpu A100 --env BUCKET_ ``` -## Run in a Pipeline +### Run in a Pipeline We can also combine the two steps into a single SkyPilot job, and let SkyPilot to handle the dependencies between the two steps. Here is an example of how to do this (replace `your-bucket-name` with your bucket name): ```bash -cat gpt2-data.yaml > gpt2.yaml -echo "---" >> gpt2.yaml -cat gpt2-train.yaml >> gpt2.yaml -sky jobs launch -n gpt2 gpt2.yaml --env BUCKET_NAME=your-bucket-name +cat gpt2-data.yaml > gpt2-pipeline.yaml +echo "---" >> gpt2-pipeline.yaml +cat gpt2-train.yaml >> gpt2-pipeline.yaml +sky jobs launch -n gpt2 gpt2-pipeline.yaml --env BUCKET_NAME=your-bucket-name ``` SkyPilot will first download and process the dataset on a CPU VM and store the diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml index 42daee00584..3bd082d02e2 100644 --- a/llm/gpt-2/gpt2-data.yaml +++ b/llm/gpt-2/gpt2-data.yaml @@ -4,7 +4,7 @@ envs: BUCKET_NAME: # Fill in your bucket name resources: - cpus: 64+ + cpus: 32+ file_mounts: /cache: diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml new file mode 100644 index 00000000000..0698073e3a0 --- /dev/null +++ b/llm/gpt-2/gpt2.yaml @@ -0,0 +1,89 @@ +name: train + +resources: + accelerators: A100:8 + # Use docker image for latest version g++ to enable the compilation of llm.c. + image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + any_of: + # Avoid using docker image for lambda due to the docker is not supported on + # Lambda yet, but the base image works. + - cloud: lambda + image_id: null + - cloud: aws + - cloud: gcp + - cloud: azure + - cloud: fluidstack + - cloud: kubernetes + + +setup: | + cd ~ + pip install tqdm tiktoken requests datasets + + # Training dependencies + # install cudnn so we can use FlashAttention and run fast (optional) + # https://developer.nvidia.com/cudnn-downloads + # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 + if [ -f ./CUDNN_INSTALLED ]; then + echo "cudnn already installed" + else + system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') + # Get version and remove the dot + version=$(lsb_release -sr | tr -d .) 
+ export system_version="${system}${version}" + wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb + sudo dpkg -i cudnn-installer.deb + sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ + # Remove problematic kubernetes.list source + sudo apt-get update --allow-releaseinfo-change || true + + sudo apt-get -y install cudnn-cuda-12 + + touch ./CUDNN_INSTALLED + fi + + # "install" cudnn-frontend to ~/ + sudo apt -y install git + git clone https://github.com/NVIDIA/cudnn-frontend.git || true + + # install MPI (optional, if you intend to use multiple GPUs) + # SkyPilot do not install MPI as that requires NCCL which needs to be manually + # installed. + sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + # install nccl + pip install nvidia-nccl-cu12 + export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include + + git clone https://github.com/karpathy/llm.c.git || true + cd llm.c + # compile llm.c (mixed precision, with cuDNN flash-attention) + # first compilation is ~1 minute, mostly due to cuDNN + make train_gpt2cu USE_CUDNN=1 + + +run: | + cd ~/llm.c + # Processing data + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) + # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb + python dev/data/fineweb.py --version 10B + + # Start training on multiple GPUs + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -o log124M \ + -e "d12" \ + -b 64 -t 1024 \ + -d 524288 \ + -r 1 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + -n 5000 \ + -v 250 -s 20000 \ + -h 1 diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 8bbe1d54e60..02c8452ef2a 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -141,7 +141,11 @@ def _get_single_resources_schema(): }, { 'type': 'object', 'required': [], - }] + }, + { + 'type': 'null', + } + ] }, # The following fields are for internal use only. '_docker_login_config': { From 3b7312e67e9f29beb25e683f9de5d4ff6a1a08cc Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 31 May 2024 00:27:25 +0000 Subject: [PATCH 25/29] Fix data fetching --- llm/gpt-2/README.md | 4 ++++ llm/gpt-2/gpt2-data.yaml | 14 ++++++++++---- llm/gpt-2/gpt2.yaml | 6 ++++++ sky/utils/schemas.py | 6 ++---- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 9cdfd76f462..bdc6931ddbf 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -28,11 +28,15 @@ Run the following command to start GPT-2 (124M) training on a GPU VM with 8 A100 sky launch -c gpt2 gpt2.yaml ``` +![GPT-2 training with 8 A100 GPUs](https://imgur.com/v8SGpsF.png) + Or, you can train the model with a single A100, by adding `--gpu A100`: ```bash sky launch -c gpt2 gpt2.yaml --gpu A100 ``` +![GPT-2 training with a single A100](https://imgur.com/hN65g4r.png) + ## Advanced: Run GPT-2 training in two stages The data processing for GPT-2 training is CPU-bound, while the training is GPU-bound. Having the data processing on a GPU VM is not cost-effective. 
With SkyPilot, you can easily diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml index 3bd082d02e2..bc4789c0a54 100644 --- a/llm/gpt-2/gpt2-data.yaml +++ b/llm/gpt-2/gpt2-data.yaml @@ -13,13 +13,19 @@ file_mounts: setup: | pip install tqdm tiktoken requests datasets - # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) - # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B - # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb - git clone https://github.com/karpathy/llm.c.git || true + git clone https://github.com/karpathy/llm.c.git@ed37d9261ba13ef212c01e2de8b309cbb46a2aa7 || true + + # Adding revision to fix the dataset version, as the latest fineweb + # dataset removed the samples, causing error: + # Please pass `features` or at least one example when writing data + sed -i 's/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train")/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train", revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94")/' dev/data/fineweb.py + run: | cd llm.c + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) + # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb python dev/data/fineweb.py --version 10B rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml index 0698073e3a0..8e203772128 100644 --- a/llm/gpt-2/gpt2.yaml +++ b/llm/gpt-2/gpt2.yaml @@ -57,6 +57,12 @@ setup: | git clone https://github.com/karpathy/llm.c.git || true cd llm.c + + # add revision to fix the dataset version, as the latest fineweb + # dataset removed the samples, causing error: + # Please pass `features` or at least one example when writing data + sed -i 's/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train")/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train", revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94")/' dev/data/fineweb.py + # compile llm.c (mixed precision, with cuDNN flash-attention) # first compilation is ~1 minute, mostly due to cuDNN make train_gpt2cu USE_CUDNN=1 diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 02c8452ef2a..5bc011abaaa 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -141,11 +141,9 @@ def _get_single_resources_schema(): }, { 'type': 'object', 'required': [], - }, - { + }, { 'type': 'null', - } - ] + }] }, # The following fields are for internal use only. '_docker_login_config': { From b6566d75ad41e51d54ece3662b7f480fe1ccfa4b Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 31 May 2024 01:03:32 +0000 Subject: [PATCH 26/29] Add visualization --- llm/gpt-2/README.md | 16 ++++++++++++++++ llm/gpt-2/gpt2-train.yaml | 3 +++ 2 files changed, 19 insertions(+) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index bdc6931ddbf..dc6ed780c3c 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -37,6 +37,20 @@ sky launch -c gpt2 gpt2.yaml --gpu A100 ![GPT-2 training with a single A100](https://imgur.com/hN65g4r.png) +### ## Download logs and visualizations + +After the training is finished, you can download the logs and visualizations with the following command: +```bash +scp -r gpt2:~/llm.c/log124M . +``` +We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). 
+We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). (Note: we cut off the training after 8000 steps, which already achieves a validation loss similar to the OpenAI GPT-2 checkpoint.)
+
+![Training progress](https://imgur.com/qeNNlIB.png)
+
+> Yes! We are able to reproduce the training of GPT-2 (124M) on any cloud with SkyPilot.
+
+
+
 ## Advanced: Run GPT-2 training in two stages
 
 The data processing for GPT-2 training is CPU-bound, while the training is GPU-bound. Having the data processing on a GPU VM is not cost-effective. With SkyPilot, you can easily
@@ -89,3 +103,5 @@ SkyPilot will first download and process the dataset on a CPU VM and store the
 processed data in a GCS bucket. Then, it will launch a GPT-2 training job on a
 GPU VM. The training job will train GPT-2 (124M) on the processed data.
 
+
+
diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml
index 7c7e58eeaf9..e907d28d781 100644
--- a/llm/gpt-2/gpt2-train.yaml
+++ b/llm/gpt-2/gpt2-train.yaml
@@ -88,3 +88,6 @@ run: |
     -n 5000 \
     -v 250 -s 20000 \
     -h 1
+
+  # Upload the log and model to the bucket
+  rsync -Pavz log124M ~/.cache/huggingface

From bea72d565d48d2881d20062a75090eeb7db1a424 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Fri, 31 May 2024 01:05:54 +0000
Subject: [PATCH 27/29] update

---
 llm/gpt-2/README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md
index dc6ed780c3c..b16d4d6a5c6 100644
--- a/llm/gpt-2/README.md
+++ b/llm/gpt-2/README.md
@@ -37,7 +37,7 @@ sky launch -c gpt2 gpt2.yaml --gpus A100
 
 ![GPT-2 training with a single A100](https://imgur.com/hN65g4r.png)
 
-### ## Download logs and visualizations
+### Download logs and visualizations
 
 After the training is finished, you can download the logs and visualizations with the following command:
 ```bash
 scp -r gpt2:~/llm.c/log124M .
 ```
 We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). (Note: we cut off the training after 8000 steps, which already achieves a validation loss similar to the OpenAI GPT-2 checkpoint.)
 
-![Training progress](https://imgur.com/qeNNlIB.png)
+
+
+
 > Yes! We are able to reproduce the training of GPT-2 (124M) on any cloud with SkyPilot.

From 888743509025e5233afaf4ee2cf362eccce5ed64 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Fri, 31 May 2024 01:08:24 +0000
Subject: [PATCH 28/29] reduce cpu cost

---
 llm/gpt-2/README.md      | 2 +-
 llm/gpt-2/gpt2-data.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md
index b16d4d6a5c6..eea1a727dd5 100644
--- a/llm/gpt-2/README.md
+++ b/llm/gpt-2/README.md
@@ -58,7 +58,7 @@ We can visualize the training progress with the notebook provided in [llm.c](htt
 The data processing for GPT-2 training is CPU-bound, while the training is GPU-bound. Having the data processing on a GPU VM is not cost-effective. With SkyPilot, you can easily
 separate the data processing and training into two stages and manually execute them in sequence, or let SkyPilot manage the dependencies between the two stages.
 
-With this, data processing can be run on cheaper CPU VMs (e.g., ~\$1.5/hour), and the training on more expensive GPU VMs (e.g., ~\$1.3-\$3.6/hour for a single A100 GPU, or \$10.3-\$32.8/hour for 8 A100 GPUs).
+With this, data processing can be run on cheaper CPU VMs (e.g., ~\$0.4/hour), and the training on more expensive GPU VMs (e.g., ~\$1.3-\$3.6/hour for a single A100 GPU, or \$10.3-\$32.8/hour for 8 A100 GPUs).
 
 We can run the data processing on a CPU VM and store the processed data in a cloud bucket. Then, we can run the training on a GPU VM with the processed data.
 
diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml
index bc4789c0a54..8346e37ccb6 100644
--- a/llm/gpt-2/gpt2-data.yaml
+++ b/llm/gpt-2/gpt2-data.yaml
@@ -4,7 +4,7 @@ envs:
   BUCKET_NAME: # Fill in your bucket name
 
 resources:
-  cpus: 32+
+  cpus: 8+
 
 file_mounts:
   /cache:

From 7609990c5ee9d7bd2179d1d955406563efc116c4 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Fri, 31 May 2024 01:11:48 +0000
Subject: [PATCH 29/29] update loss curve

---
 llm/gpt-2/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md
index eea1a727dd5..e6a213283d8 100644
--- a/llm/gpt-2/README.md
+++ b/llm/gpt-2/README.md
@@ -43,10 +43,10 @@ After the training is finished, you can download the logs and visualizations wit
 ```bash
 scp -r gpt2:~/llm.c/log124M .
 ```
-We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). (Note: we cut off the training after 8000 steps, which already achieves a validation loss similar to the OpenAI GPT-2 checkpoint.)
+We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). (Note: we cut off the training after 10K steps, which already achieves a validation loss similar to the OpenAI GPT-2 checkpoint.)
-
+
 > Yes! We are able to reproduce the training of GPT-2 (124M) on any cloud with SkyPilot.
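
For reference, a minimal sketch of launching the two stages from this series by hand with the SkyPilot CLI. The cluster names are arbitrary, `my-gpt2-bucket` is a placeholder for your own bucket, and it assumes both `gpt2-data.yaml` and `gpt2-train.yaml` read the `BUCKET_NAME` environment variable:

```bash
# Stage 1: tokenize FineWeb on a cheap CPU VM; the processed tokens land in the bucket.
sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=my-gpt2-bucket

# Stage 2: train on a GPU VM; the same bucket is mounted, so the tokens are already there.
sky launch -c gpt2-train gpt2-train.yaml --env BUCKET_NAME=my-gpt2-bucket

# Tear down both clusters once training has finished and the logs have been uploaded.
sky down gpt2-data gpt2-train
```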
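The `sed` added in [PATCH 25/29] pins the `HuggingFaceFW/fineweb` dataset to revision `9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94` so that `dev/data/fineweb.py` keeps working after upstream removed samples. A hypothetical sanity check that the pin actually landed, run from inside the `llm.c` checkout, might look like:

```bash
# Fails loudly if the setup-time sed did not rewrite the load_dataset() call.
if grep -q 'revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94"' dev/data/fineweb.py; then
  echo "fineweb dataset revision is pinned"
else
  echo "revision pin missing, check the sed command in setup" >&2
  exit 1
fi
```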
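As a quick check on the training flags used in the run command: assuming the llm.c flags mean what the discussion linked in the README describes (`-b` per-GPU micro-batch in sequences, `-t` sequence length, `-d` total batch size in tokens), the settings here imply no gradient accumulation on 8 GPUs:

```bash
# -b 64 sequences * -t 1024 tokens * 8 GPUs = 524288 tokens, which equals -d,
# so each optimizer step is a single forward/backward pass per GPU.
echo $(( 64 * 1024 * 8 ))   # prints 524288
```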
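Because [PATCH 26/29] also rsyncs `log124M` into the mounted bucket, the training logs outlive the spot GPU VM. A hedged sketch of pulling them to a local machine, assuming the bucket is on GCS and the MOUNT maps the bucket root to `~/.cache/huggingface` so the directory ends up at the bucket root:

```bash
# Copy the uploaded training log directory out of the GCS bucket.
gsutil -m rsync -r gs://my-gpt2-bucket/log124M ./log124M
```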