diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md
new file mode 100644
index 00000000000..e6a213283d8
--- /dev/null
+++ b/llm/gpt-2/README.md
@@ -0,0 +1,109 @@
+# Run GPT-2 in llm.c on any cloud with SkyPilot
+
+This is a reproducible package for llm.c's GPT-2 (124M) training by @karpathy (https://github.com/karpathy/llm.c/discussions/481).
+With SkyPilot, you can run GPT-2 (124M) training on any cloud.
+
+## Prerequisites
+
+1. Install [SkyPilot](https://github.com/skypilot-org/skypilot):
+```bash
+pip install "skypilot-nightly[aws,gcp,azure,kubernetes,lambda,fluidstack]" # Choose the clouds you want to enable
+```
+2. Enable clouds for SkyPilot:
+```bash
+sky check
+```
+Please check the instructions for enabling clouds at [SkyPilot doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+
+3. Download the YAML for starting the training:
+```bash
+wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/llm/gpt-2/gpt2.yaml
+```
+
+## Run GPT-2 training
+
+Run the following command to start GPT-2 (124M) training on a GPU VM with 8 A100 GPUs:
+
+```bash
+sky launch -c gpt2 gpt2.yaml
+```
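+
+SkyPilot streams the setup and training logs to your terminal. If you detach from the log stream (e.g., with Ctrl-C, which does not stop the remote job), you can check on the cluster and re-attach at any time:
+```bash
+# Check the status of launched clusters
+sky status
+
+# Re-attach to the training logs of the "gpt2" cluster
+sky logs gpt2
+```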
+
+![GPT-2 training with 8 A100 GPUs](https://imgur.com/v8SGpsF.png)
+
+Or, you can train the model with a single A100 by adding `--gpus A100`:
+```bash
+sky launch -c gpt2 gpt2.yaml --gpus A100
+```
+
+![GPT-2 training with a single A100](https://imgur.com/hN65g4r.png)
+
+### Download logs and visualizations
+
+After the training is finished, you can download the logs and visualizations with the following command:
+```bash
+scp -r gpt2:~/llm.c/log124M .
+```
+We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). (Note: we cut the training off after 10K steps, which already achieves a validation loss similar to the OpenAI GPT-2 checkpoint.)
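+
+When you are done, you can terminate the cluster to stop incurring charges:
+```bash
+sky down gpt2
+```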
+
+> Yes! We are able to reproduce the training of GPT-2 (124M) on any cloud with SkyPilot.
+
+## Advanced: Run GPT-2 training in two stages
+
+The data processing for GPT-2 training is CPU-bound, while the training itself is GPU-bound, so keeping a GPU VM busy with data processing is not cost-effective. With SkyPilot, you can easily separate data processing and training into two stages, and either run them sequentially yourself or let SkyPilot manage the dependency between them.
+
+With this separation, data processing runs on cheap CPU VMs (e.g., ~\$0.4/hour), while only the training occupies the more expensive GPU VMs (e.g., ~\$1.3-\$3.6/hour for a single A100 GPU, or ~\$10.3-\$32.8/hour for 8 A100 GPUs). Since tokenizing the dataset takes about an hour, doing it on a CPU VM costs well under \$1, versus \$10-\$33 of GPU-VM time.
+
+We can run the data processing on a CPU VM and store the processed data in a cloud bucket. Then, we can run the training on a GPU VM with the processed data.
+
+```bash
+wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/llm/gpt-2/gpt2-data.yaml
+wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/llm/gpt-2/gpt2-train.yaml
+```
+
+### Run two stages manually
+#### Data processing
+
+Run the following command to process the training data on a CPU VM and store it in a cloud bucket for future use (replace `your-bucket-name` with your bucket name):
+
+```bash
+sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name
+```
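+
+Once the processing finishes, the CPU VM is no longer needed; you can terminate it while keeping the processed data in the bucket:
+```bash
+sky down gpt2-data
+```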
+
+
+#### Training
+
+After the data is processed, you can then train the model on a GPU VM with 8 A100 GPUs (replace `your-bucket-name` with your bucket name):
+
+```bash
+sky launch -c gpt2-train --detach-setup gpt2-train.yaml --env BUCKET_NAME=your-bucket-name
+```
+
+Or, you can train the model with a single A100 by adding `--gpus A100`:
+```bash
+sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpus A100 --env BUCKET_NAME=your-bucket-name
+```
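+
+As in the single-job version above, the logs and checkpoints are written to `~/llm.c/log124M` on the cluster, so you can download them and tear the cluster down the same way:
+```bash
+scp -r gpt2-train:~/llm.c/log124M .
+sky down gpt2-train
+```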
+
+
+### Run in a Pipeline
+
+We can also combine the two stages into a single SkyPilot pipeline and let SkyPilot handle the dependency between them. Here is an example of how to do this (replace `your-bucket-name` with your bucket name):
+```bash
+cat gpt2-data.yaml > gpt2-pipeline.yaml
+echo "---" >> gpt2-pipeline.yaml
+cat gpt2-train.yaml >> gpt2-pipeline.yaml
+sky jobs launch -n gpt2 gpt2-pipeline.yaml --env BUCKET_NAME=your-bucket-name
+```
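+
+The pipeline runs as a SkyPilot managed job; you can check on its stages and stream its logs with the managed-jobs CLI:
+```bash
+# List managed jobs and their statuses
+sky jobs queue
+
+# Stream the logs of the pipeline (find JOB_ID in the queue output)
+sky jobs logs JOB_ID
+```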
+
+SkyPilot will first download and process the dataset on a CPU VM and store the
+processed data in the cloud bucket. Then, it will launch a GPT-2 training job
+on a GPU VM, which trains GPT-2 (124M) on the processed data.
+
+
+
diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml
new file mode 100644
index 00000000000..8346e37ccb6
--- /dev/null
+++ b/llm/gpt-2/gpt2-data.yaml
@@ -0,0 +1,32 @@
+name: gpt2-data
+
+envs:
+ BUCKET_NAME: # Fill in your bucket name
+
+resources:
+ cpus: 8+
+
+file_mounts:
+ /cache:
+ name: $BUCKET_NAME
+ mode: MOUNT
+
+setup: |
+ pip install tqdm tiktoken requests datasets
+  git clone https://github.com/karpathy/llm.c.git || true
+  cd llm.c
+  # Pin llm.c to a known-good commit for reproducibility
+  git checkout ed37d9261ba13ef212c01e2de8b309cbb46a2aa7
+
+  # Pin the dataset revision: the latest fineweb dataset removed the samples,
+  # causing the error:
+  # "Please pass `features` or at least one example when writing data"
+  sed -i 's/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train")/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train", revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94")/' dev/data/fineweb.py
+
+
+run: |
+ cd llm.c
+ # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?)
+ # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B
+ # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb
+ python dev/data/fineweb.py --version 10B
+
+ rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/
+ rsync -Pavz dev/data/fineweb10B /cache/
diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml
new file mode 100644
index 00000000000..e907d28d781
--- /dev/null
+++ b/llm/gpt-2/gpt2-train.yaml
@@ -0,0 +1,93 @@
+name: train
+
+envs:
+ BUCKET_NAME: # Fill in your bucket name
+
+resources:
+ accelerators: A100:8
+  # Use a docker image that provides a recent g++, needed to compile llm.c.
+ image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+ any_of:
+    # Don't use the docker image on Lambda, as docker is not supported there
+    # yet; Lambda's base image works.
+ - cloud: lambda
+ image_id: null
+ - cloud: aws
+ - cloud: gcp
+ - cloud: azure
+ - cloud: fluidstack
+ - cloud: kubernetes
+
+file_mounts:
+ ~/.cache/huggingface:
+ name: $BUCKET_NAME
+ mode: COPY
+
+setup: |
+ cd ~
+ pip install tqdm tiktoken requests datasets
+
+ # install cudnn so we can use FlashAttention and run fast (optional)
+ # https://developer.nvidia.com/cudnn-downloads
+ # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04
+ if [ -f ./CUDNN_INSTALLED ]; then
+ echo "cudnn already installed"
+ else
+ system=$(lsb_release -si | tr '[:upper:]' '[:lower:]')
+ # Get version and remove the dot
+ version=$(lsb_release -sr | tr -d .)
+ export system_version="${system}${version}"
+ wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb
+ sudo dpkg -i cudnn-installer.deb
+ sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/
+    # Refresh apt sources, tolerating problematic ones (e.g., a stale
+    # kubernetes.list).
+ sudo apt-get update --allow-releaseinfo-change || true
+
+ sudo apt-get -y install cudnn-cuda-12
+
+ touch ./CUDNN_INSTALLED
+ fi
+
+ # "install" cudnn-frontend to ~/
+ sudo apt -y install git
+ git clone https://github.com/NVIDIA/cudnn-frontend.git || true
+
+ # install MPI (optional, if you intend to use multiple GPUs)
+  # SkyPilot does not install MPI by default, as it requires NCCL, which
+  # needs to be installed manually.
+ sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev
+ # install nccl
+ pip install nvidia-nccl-cu12
+ export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib
+ export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include
+
+ git clone https://github.com/karpathy/llm.c.git || true
+ cd llm.c
+ ln -s ~/.cache/huggingface/fineweb10B dev/data/
+ # compile llm.c (mixed precision, with cuDNN flash-attention)
+ # first compilation is ~1 minute, mostly due to cuDNN
+ make train_gpt2cu USE_CUDNN=1
+
+
+run: |
+ cd ~/llm.c
+ # train on multiple GPUs
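+  # Flag summary, per the llm.c GPT-2 (124M) reproduction discussion linked
+  # in the README (see llm.c itself for the authoritative flag docs):
+  #   -i/-j : train/val data file patterns
+  #   -o    : output directory for logs and checkpoints
+  #   -e    : model config ("d12" = depth-12 GPT-2, i.e., 124M)
+  #   -b/-t : micro batch size / sequence length
+  #   -d    : total batch size in tokens (524288 = 2^19, ~0.5M)
+  #   -r    : recompute activations in backward to save memory
+  #   -z    : ZeRO-1 optimizer state sharding across GPUs
+  #   -c/-l/-q/-u : weight decay / max LR / final LR fraction / warmup steps
+  #   -n/-v/-s : checkpoint / eval val loss / sample every N steps
+  #   -h    : evaluate HellaSwag accuracy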
+ mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \
+ -i "dev/data/fineweb10B/fineweb_train_*.bin" \
+ -j "dev/data/fineweb10B/fineweb_val_*.bin" \
+ -o log124M \
+ -e "d12" \
+ -b 64 -t 1024 \
+ -d 524288 \
+ -r 1 \
+ -z 1 \
+ -c 0.1 \
+ -l 0.0006 \
+ -q 0.0 \
+ -u 700 \
+ -n 5000 \
+ -v 250 -s 20000 \
+ -h 1
+
+ # Upload the log and model to the bucket
+ rsync -Pavz log124M ~/.cache/huggingface
diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml
new file mode 100644
index 00000000000..8e203772128
--- /dev/null
+++ b/llm/gpt-2/gpt2.yaml
@@ -0,0 +1,95 @@
+name: train
+
+resources:
+ accelerators: A100:8
+  # Use a docker image that provides a recent g++, needed to compile llm.c.
+ image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+ any_of:
+    # Don't use the docker image on Lambda, as docker is not supported there
+    # yet; Lambda's base image works.
+ - cloud: lambda
+ image_id: null
+ - cloud: aws
+ - cloud: gcp
+ - cloud: azure
+ - cloud: fluidstack
+ - cloud: kubernetes
+
+
+setup: |
+ cd ~
+ pip install tqdm tiktoken requests datasets
+
+ # Training dependencies
+ # install cudnn so we can use FlashAttention and run fast (optional)
+ # https://developer.nvidia.com/cudnn-downloads
+ # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04
+ if [ -f ./CUDNN_INSTALLED ]; then
+ echo "cudnn already installed"
+ else
+ system=$(lsb_release -si | tr '[:upper:]' '[:lower:]')
+ # Get version and remove the dot
+ version=$(lsb_release -sr | tr -d .)
+ export system_version="${system}${version}"
+ wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb
+ sudo dpkg -i cudnn-installer.deb
+ sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/
+    # Refresh apt sources, tolerating problematic ones (e.g., a stale
+    # kubernetes.list).
+ sudo apt-get update --allow-releaseinfo-change || true
+
+ sudo apt-get -y install cudnn-cuda-12
+
+ touch ./CUDNN_INSTALLED
+ fi
+
+ # "install" cudnn-frontend to ~/
+ sudo apt -y install git
+ git clone https://github.com/NVIDIA/cudnn-frontend.git || true
+
+ # install MPI (optional, if you intend to use multiple GPUs)
+  # SkyPilot does not install MPI by default, as it requires NCCL, which
+  # needs to be installed manually.
+ sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev
+ # install nccl
+ pip install nvidia-nccl-cu12
+ export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib
+ export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include
+
+ git clone https://github.com/karpathy/llm.c.git || true
+ cd llm.c
+
+  # Pin the dataset revision: the latest fineweb dataset removed the samples,
+  # causing the error:
+  # "Please pass `features` or at least one example when writing data"
+ sed -i 's/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train")/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train", revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94")/' dev/data/fineweb.py
+
+ # compile llm.c (mixed precision, with cuDNN flash-attention)
+ # first compilation is ~1 minute, mostly due to cuDNN
+ make train_gpt2cu USE_CUDNN=1
+
+
+run: |
+ cd ~/llm.c
+ # Processing data
+ # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?)
+ # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B
+ # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb
+ python dev/data/fineweb.py --version 10B
+
+ # Start training on multiple GPUs
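+  # Flag summary, per the llm.c GPT-2 (124M) reproduction discussion linked
+  # in the README (see llm.c itself for the authoritative flag docs):
+  #   -i/-j : train/val data file patterns
+  #   -o    : output directory for logs and checkpoints
+  #   -e    : model config ("d12" = depth-12 GPT-2, i.e., 124M)
+  #   -b/-t : micro batch size / sequence length
+  #   -d    : total batch size in tokens (524288 = 2^19, ~0.5M)
+  #   -r    : recompute activations in backward to save memory
+  #   -z    : ZeRO-1 optimizer state sharding across GPUs
+  #   -c/-l/-q/-u : weight decay / max LR / final LR fraction / warmup steps
+  #   -n/-v/-s : checkpoint / eval val loss / sample every N steps
+  #   -h    : evaluate HellaSwag accuracy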
+ mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \
+ -i "dev/data/fineweb10B/fineweb_train_*.bin" \
+ -j "dev/data/fineweb10B/fineweb_val_*.bin" \
+ -o log124M \
+ -e "d12" \
+ -b 64 -t 1024 \
+ -d 524288 \
+ -r 1 \
+ -z 1 \
+ -c 0.1 \
+ -l 0.0006 \
+ -q 0.0 \
+ -u 700 \
+ -n 5000 \
+ -v 250 -s 20000 \
+ -h 1
diff --git a/sky/utils/command_runner.pyi b/sky/utils/command_runner.pyi
index 9fbad243775..77e5a8959cf 100644
--- a/sky/utils/command_runner.pyi
+++ b/sky/utils/command_runner.pyi
@@ -101,7 +101,8 @@ class CommandRunner:
*,
up: bool,
log_path: str = ...,
- stream_logs: bool = ...) -> None:
+ stream_logs: bool = ...,
+ max_retry: int = 1) -> None:
...
@classmethod
@@ -191,5 +192,6 @@ class SSHCommandRunner(CommandRunner):
*,
up: bool,
log_path: str = ...,
- stream_logs: bool = ...) -> None:
+ stream_logs: bool = ...,
+ max_retry: int = 1) -> None:
...
diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py
index 8bbe1d54e60..5bc011abaaa 100644
--- a/sky/utils/schemas.py
+++ b/sky/utils/schemas.py
@@ -141,6 +141,8 @@ def _get_single_resources_schema():
}, {
'type': 'object',
'required': [],
+ }, {
+ 'type': 'null',
}]
},
# The following fields are for internal use only.