From 374f0f29a13b15033640c4955a81f49524d630bc Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 28 May 2024 23:00:38 +0000 Subject: [PATCH 01/29] add gpt-2 example --- llm/gpt-2/README.md | 12 ++++++ llm/gpt-2/gpt2.yaml | 91 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 llm/gpt-2/README.md create mode 100644 llm/gpt-2/gpt2.yaml diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md new file mode 100644 index 00000000000..33b5453569e --- /dev/null +++ b/llm/gpt-2/README.md @@ -0,0 +1,12 @@ +# GPT-2 (124M) in llm.c in 90 minutes + +https://github.com/karpathy/llm.c/discussions/481 + +```bash +sky jobs launch -n gpt2 gpt2.yaml +``` + +SkyPilot will first download and process the dataset on a CPU VM and store the +processed data in a GCS bucket. Then, it will launch a GPT-2 training job on a +GPU VM. The training job will train GPT-2 (124M) on the processed data. + diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml new file mode 100644 index 00000000000..75cc753a609 --- /dev/null +++ b/llm/gpt-2/gpt2.yaml @@ -0,0 +1,91 @@ +# name: gpt2-data + +# resources: +# cpus: 64+ + +# file_mounts: +# /cache: +# name: gpt2-data-skypilot +# store: gcs +# mode: MOUNT + +# setup: | +# pip install tqdm tiktoken requests datasets +# # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) +# # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B +# # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb +# git clone https://github.com/karpathy/llm.c.git || true + +# run: | +# cd llm.c +# python dev/data/fineweb.py --version 10B + +# rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ +# rsync -Pavz dev/data/fineweb10B /cache/ + +# --- + +name: train + + +resources: + accelerators: A100:8 + use_spot: true + +file_mounts: + ~/.cache/huggingface: gs://gpt2-data-skypilot + +setup: | + cd ~ + pip install tqdm tiktoken requests datasets + + # install cudnn so we can use FlashAttention and run fast (optional) + # https://developer.nvidia.com/cudnn-downloads + # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 + if [ -f ./CUDNN_INSTALLED ]; then + echo "cudnn already installed" + else + system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') + version=$(lsb_release -sr) + system_version="${system}${version}" + wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb + sudo dpkg -i cudnn-installer.deb + sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ + sudo apt-get update + sudo apt-get -y install cudnn-cuda-12 + touch ./CUDNN_INSTALLED + fi + + # "install" cudnn-frontend to ~/ + git clone https://github.com/NVIDIA/cudnn-frontend.git || true + + # install MPI (optional, if you intend to use multiple GPUs) + sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + + git clone https://github.com/karpathy/llm.c.git || true + cd llm.c + mv ~/.cache/huggingface/fineweb10B dev/data/ + # compile llm.c (mixed precision, with cuDNN flash-attention) + # first compilation is ~1 minute, mostly due to cuDNN + make train_gpt2cu USE_CUDNN=1 + + +run: | + cd llm.c + # train on multiple GPUs + mpirun -np 8 ./train_gpt2cu \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -o log124M \ + -e "d12" \ + -b 64 -t 1024 \ + -d 524288 \ + -r 1 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + 
-n 5000 \ + -v 250 -s 20000 \ + -h 1 From 79323f7d729661acab474234b18865c9dc23190f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 01:59:30 +0000 Subject: [PATCH 02/29] Use ubuntu for GCP --- llm/gpt-2/gpt2.yaml | 35 ++++++++++++++++++++++++++++------- sky/clouds/gcp.py | 6 +++--- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml index 75cc753a609..d2798cf19cf 100644 --- a/llm/gpt-2/gpt2.yaml +++ b/llm/gpt-2/gpt2.yaml @@ -29,13 +29,24 @@ name: train resources: - accelerators: A100:8 + accelerators: A100:1 use_spot: true -file_mounts: - ~/.cache/huggingface: gs://gpt2-data-skypilot +# file_mounts: +# ~/.cache/huggingface: gs://gpt2-data-skypilot setup: | + export PATH="$PATH:$HOME/.local/bin" + # Create a gpt2 conda version with the latest gxx + # conda activate gpt2 + # if [ $? -eq 0 ]; then + # echo "gpt2 environment already exists" + # else + # conda create -n gpt2 gxx=12 -y + # conda activate gpt2 + # fi + + cd ~ pip install tqdm tiktoken requests datasets @@ -46,13 +57,17 @@ setup: | echo "cudnn already installed" else system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') - version=$(lsb_release -sr) - system_version="${system}${version}" + # Get version and remove the dot + version=$(lsb_release -sr | tr -d .) + export system_version="${system}${version}" wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb sudo dpkg -i cudnn-installer.deb sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ - sudo apt-get update + # Remove problematic kubernetes.list source + sudo apt-get update --allow-releaseinfo-change + sudo apt-get -y install cudnn-cuda-12 + touch ./CUDNN_INSTALLED fi @@ -60,7 +75,13 @@ setup: | git clone https://github.com/NVIDIA/cudnn-frontend.git || true # install MPI (optional, if you intend to use multiple GPUs) + # SkyPilot do not install MPI as that requires NCCL which needs to be manually + # installed. sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + # install nccl + pip install nvidia-nccl-cu12 + export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include git clone https://github.com/karpathy/llm.c.git || true cd llm.c @@ -71,7 +92,7 @@ setup: | run: | - cd llm.c + cd ~/llm.c # train on multiple GPUs mpirun -np 8 ./train_gpt2cu \ -i "dev/data/fineweb10B/fineweb_train_*.bin" \ diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 93260533f27..215fa043a08 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -404,7 +404,7 @@ def make_deploy_resources_variables( # --no-standard-images # We use the debian image, as the ubuntu image has some connectivity # issue when first booted. - image_id = 'skypilot:cpu-debian-11' + image_id = 'skypilot:cpu-ubuntu-2204' r = resources # Find GPU spec, if any. 
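The two hunks in this file switch GCP's default images from Debian 11 to Ubuntu 22.04; later commits in this series give the motivation (llm.c needs a newer C++ toolchain and CUDA stack than the Debian 11 image ships). A quick way to sanity-check whether a given image is new enough to build llm.c might look like the shell sketch below; it is illustrative only and not part of this patch.

```bash
# Illustrative sanity check before compiling llm.c on a VM image
# (not part of this patch series).
lsb_release -ds               # e.g. "Ubuntu 22.04 LTS"
gcc --version | head -n 1     # llm.c's cuDNN build needs a reasonably recent g++
nvcc --version | tail -n 1    # CUDA toolkit version; the example assumes CUDA 12
nvidia-smi --query-gpu=driver_version --format=csv,noheader   # driver present?
```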
@@ -455,8 +455,8 @@ def make_deploy_resources_variables( # CUDA driver version 470.57.02, CUDA Library 11.4 image_id = 'skypilot:k80-debian-10' else: - # CUDA driver version 535.86.10, CUDA Library 12.2 - image_id = 'skypilot:gpu-debian-11' + # CUDA driver version 550.54.15, CUDA Library 12.4 + image_id = 'skypilot:gpu-ubuntu-2204' if (resources.image_id is not None and resources.extract_docker_image() is None): From 03623ee3c13e3cda8b34f4070c1540d571b1c7a6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 02:22:12 +0000 Subject: [PATCH 03/29] fix ncl --- llm/gpt-2/gpt2.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml index d2798cf19cf..a1ba2954ea9 100644 --- a/llm/gpt-2/gpt2.yaml +++ b/llm/gpt-2/gpt2.yaml @@ -64,7 +64,7 @@ setup: | sudo dpkg -i cudnn-installer.deb sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ # Remove problematic kubernetes.list source - sudo apt-get update --allow-releaseinfo-change + sudo apt-get update --allow-releaseinfo-change || true sudo apt-get -y install cudnn-cuda-12 @@ -94,7 +94,7 @@ setup: | run: | cd ~/llm.c # train on multiple GPUs - mpirun -np 8 ./train_gpt2cu \ + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE ./train_gpt2cu \ -i "dev/data/fineweb10B/fineweb_train_*.bin" \ -j "dev/data/fineweb10B/fineweb_val_*.bin" \ -o log124M \ From 3636ea680b2dc80958b7980edda05d4a75372051 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 02:23:20 +0000 Subject: [PATCH 04/29] Fix GPT-2 --- llm/gpt-2/gpt2.yaml | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml index a1ba2954ea9..030e30e16f3 100644 --- a/llm/gpt-2/gpt2.yaml +++ b/llm/gpt-2/gpt2.yaml @@ -32,21 +32,10 @@ resources: accelerators: A100:1 use_spot: true -# file_mounts: -# ~/.cache/huggingface: gs://gpt2-data-skypilot +file_mounts: + ~/.cache/huggingface: gs://gpt2-data-skypilot setup: | - export PATH="$PATH:$HOME/.local/bin" - # Create a gpt2 conda version with the latest gxx - # conda activate gpt2 - # if [ $? -eq 0 ]; then - # echo "gpt2 environment already exists" - # else - # conda create -n gpt2 gxx=12 -y - # conda activate gpt2 - # fi - - cd ~ pip install tqdm tiktoken requests datasets From 1694ecd3873ebb3a4eae355d427db1a2162a16ea Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 02:35:18 +0000 Subject: [PATCH 05/29] add train and data --- llm/gpt-2/gpt2-data.yaml | 24 +++++++ llm/gpt-2/{gpt2.yaml => gpt2-pipeline.yaml} | 40 +++++------ llm/gpt-2/gpt2-train.yaml | 74 +++++++++++++++++++++ 3 files changed, 118 insertions(+), 20 deletions(-) create mode 100644 llm/gpt-2/gpt2-data.yaml rename llm/gpt-2/{gpt2.yaml => gpt2-pipeline.yaml} (77%) create mode 100644 llm/gpt-2/gpt2-train.yaml diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml new file mode 100644 index 00000000000..d04c9e59e65 --- /dev/null +++ b/llm/gpt-2/gpt2-data.yaml @@ -0,0 +1,24 @@ +name: gpt2-data + +resources: + cpus: 64+ + +file_mounts: + /cache: + name: gpt2-data-skypilot + store: gcs + mode: MOUNT + +setup: | + pip install tqdm tiktoken requests datasets + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) 
+ # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb + git clone https://github.com/karpathy/llm.c.git || true + +run: | + cd llm.c + python dev/data/fineweb.py --version 10B + + rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ + rsync -Pavz dev/data/fineweb10B /cache/ diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2-pipeline.yaml similarity index 77% rename from llm/gpt-2/gpt2.yaml rename to llm/gpt-2/gpt2-pipeline.yaml index 030e30e16f3..9bea67e630c 100644 --- a/llm/gpt-2/gpt2.yaml +++ b/llm/gpt-2/gpt2-pipeline.yaml @@ -1,29 +1,29 @@ -# name: gpt2-data +name: gpt2-data -# resources: -# cpus: 64+ +resources: + cpus: 64+ -# file_mounts: -# /cache: -# name: gpt2-data-skypilot -# store: gcs -# mode: MOUNT +file_mounts: + /cache: + name: gpt2-data-skypilot + store: gcs + mode: MOUNT -# setup: | -# pip install tqdm tiktoken requests datasets -# # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) -# # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B -# # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb -# git clone https://github.com/karpathy/llm.c.git || true +setup: | + pip install tqdm tiktoken requests datasets + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) + # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb + git clone https://github.com/karpathy/llm.c.git || true -# run: | -# cd llm.c -# python dev/data/fineweb.py --version 10B +run: | + cd llm.c + python dev/data/fineweb.py --version 10B -# rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ -# rsync -Pavz dev/data/fineweb10B /cache/ + rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ + rsync -Pavz dev/data/fineweb10B /cache/ -# --- +--- name: train diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml new file mode 100644 index 00000000000..44d9400d0a2 --- /dev/null +++ b/llm/gpt-2/gpt2-train.yaml @@ -0,0 +1,74 @@ +name: train + + +resources: + accelerators: A100:1 + use_spot: true + +file_mounts: + ~/.cache/huggingface: gs://gpt2-data-skypilot + +setup: | + cd ~ + pip install tqdm tiktoken requests datasets + + # install cudnn so we can use FlashAttention and run fast (optional) + # https://developer.nvidia.com/cudnn-downloads + # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 + if [ -f ./CUDNN_INSTALLED ]; then + echo "cudnn already installed" + else + system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') + # Get version and remove the dot + version=$(lsb_release -sr | tr -d .) + export system_version="${system}${version}" + wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb + sudo dpkg -i cudnn-installer.deb + sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ + # Remove problematic kubernetes.list source + sudo apt-get update --allow-releaseinfo-change || true + + sudo apt-get -y install cudnn-cuda-12 + + touch ./CUDNN_INSTALLED + fi + + # "install" cudnn-frontend to ~/ + git clone https://github.com/NVIDIA/cudnn-frontend.git || true + + # install MPI (optional, if you intend to use multiple GPUs) + # SkyPilot do not install MPI as that requires NCCL which needs to be manually + # installed. 
+ sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + # install nccl + pip install nvidia-nccl-cu12 + export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include + + git clone https://github.com/karpathy/llm.c.git || true + cd llm.c + mv ~/.cache/huggingface/fineweb10B dev/data/ + # compile llm.c (mixed precision, with cuDNN flash-attention) + # first compilation is ~1 minute, mostly due to cuDNN + make train_gpt2cu USE_CUDNN=1 + + +run: | + cd ~/llm.c + # train on multiple GPUs + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE ./train_gpt2cu \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -o log124M \ + -e "d12" \ + -b 64 -t 1024 \ + -d 524288 \ + -r 1 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + -n 5000 \ + -v 250 -s 20000 \ + -h 1 From 2c80dcbe58f785ebf552e2dbc5e5dda9fd6e086c Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 02:37:26 +0000 Subject: [PATCH 06/29] use 8 gpus --- llm/gpt-2/gpt2-pipeline.yaml | 3 +-- llm/gpt-2/gpt2-train.yaml | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/llm/gpt-2/gpt2-pipeline.yaml b/llm/gpt-2/gpt2-pipeline.yaml index 9bea67e630c..d69cc4beb4b 100644 --- a/llm/gpt-2/gpt2-pipeline.yaml +++ b/llm/gpt-2/gpt2-pipeline.yaml @@ -29,8 +29,7 @@ name: train resources: - accelerators: A100:1 - use_spot: true + accelerators: A100:8 file_mounts: ~/.cache/huggingface: gs://gpt2-data-skypilot diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 44d9400d0a2..60fc29ea3be 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -2,8 +2,7 @@ name: train resources: - accelerators: A100:1 - use_spot: true + accelerators: A100:8 file_mounts: ~/.cache/huggingface: gs://gpt2-data-skypilot From 1bef7981b68fcc3bcc96c4da2fafb36986dfe053 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 02:44:20 +0000 Subject: [PATCH 07/29] revert gcp change --- sky/clouds/gcp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 215fa043a08..93260533f27 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -404,7 +404,7 @@ def make_deploy_resources_variables( # --no-standard-images # We use the debian image, as the ubuntu image has some connectivity # issue when first booted. - image_id = 'skypilot:cpu-ubuntu-2204' + image_id = 'skypilot:cpu-debian-11' r = resources # Find GPU spec, if any. 
@@ -455,8 +455,8 @@ def make_deploy_resources_variables( # CUDA driver version 470.57.02, CUDA Library 11.4 image_id = 'skypilot:k80-debian-10' else: - # CUDA driver version 550.54.15, CUDA Library 12.4 - image_id = 'skypilot:gpu-ubuntu-2204' + # CUDA driver version 535.86.10, CUDA Library 12.2 + image_id = 'skypilot:gpu-debian-11' if (resources.image_id is not None and resources.extract_docker_image() is None): From 92828733ca4aa40a3486d2b812f5d22d1077e3c6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 02:48:36 +0000 Subject: [PATCH 08/29] update readme --- llm/gpt-2/README.md | 19 +++++++ llm/gpt-2/gpt2-pipeline.yaml | 100 ----------------------------------- llm/gpt-2/gpt2-train.yaml | 2 +- 3 files changed, 20 insertions(+), 101 deletions(-) delete mode 100644 llm/gpt-2/gpt2-pipeline.yaml diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 33b5453569e..1c4ea1205c9 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -2,7 +2,26 @@ https://github.com/karpathy/llm.c/discussions/481 +## Data processing + +```bash +sky launch -c gpt2-data gpt2-data.yaml -y +``` + + +## Training + +```bash +sky launch -c gpt2-train gpt2-train.yaml -y +``` + + +## Run in a Pipeline +We can also combine the two steps into a single SkyPilot job: ```bash +cat gpt2-data.yaml > gpt2.yaml +echo "---" >> gpt2.yaml +cat gpt2-train.yaml >> gpt2.yaml sky jobs launch -n gpt2 gpt2.yaml ``` diff --git a/llm/gpt-2/gpt2-pipeline.yaml b/llm/gpt-2/gpt2-pipeline.yaml deleted file mode 100644 index d69cc4beb4b..00000000000 --- a/llm/gpt-2/gpt2-pipeline.yaml +++ /dev/null @@ -1,100 +0,0 @@ -name: gpt2-data - -resources: - cpus: 64+ - -file_mounts: - /cache: - name: gpt2-data-skypilot - store: gcs - mode: MOUNT - -setup: | - pip install tqdm tiktoken requests datasets - # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) - # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B - # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb - git clone https://github.com/karpathy/llm.c.git || true - -run: | - cd llm.c - python dev/data/fineweb.py --version 10B - - rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ - rsync -Pavz dev/data/fineweb10B /cache/ - ---- - -name: train - - -resources: - accelerators: A100:8 - -file_mounts: - ~/.cache/huggingface: gs://gpt2-data-skypilot - -setup: | - cd ~ - pip install tqdm tiktoken requests datasets - - # install cudnn so we can use FlashAttention and run fast (optional) - # https://developer.nvidia.com/cudnn-downloads - # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 - if [ -f ./CUDNN_INSTALLED ]; then - echo "cudnn already installed" - else - system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') - # Get version and remove the dot - version=$(lsb_release -sr | tr -d .) 
- export system_version="${system}${version}" - wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb - sudo dpkg -i cudnn-installer.deb - sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ - # Remove problematic kubernetes.list source - sudo apt-get update --allow-releaseinfo-change || true - - sudo apt-get -y install cudnn-cuda-12 - - touch ./CUDNN_INSTALLED - fi - - # "install" cudnn-frontend to ~/ - git clone https://github.com/NVIDIA/cudnn-frontend.git || true - - # install MPI (optional, if you intend to use multiple GPUs) - # SkyPilot do not install MPI as that requires NCCL which needs to be manually - # installed. - sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev - # install nccl - pip install nvidia-nccl-cu12 - export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib - export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include - - git clone https://github.com/karpathy/llm.c.git || true - cd llm.c - mv ~/.cache/huggingface/fineweb10B dev/data/ - # compile llm.c (mixed precision, with cuDNN flash-attention) - # first compilation is ~1 minute, mostly due to cuDNN - make train_gpt2cu USE_CUDNN=1 - - -run: | - cd ~/llm.c - # train on multiple GPUs - mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE ./train_gpt2cu \ - -i "dev/data/fineweb10B/fineweb_train_*.bin" \ - -j "dev/data/fineweb10B/fineweb_val_*.bin" \ - -o log124M \ - -e "d12" \ - -b 64 -t 1024 \ - -d 524288 \ - -r 1 \ - -z 1 \ - -c 0.1 \ - -l 0.0006 \ - -q 0.0 \ - -u 700 \ - -n 5000 \ - -v 250 -s 20000 \ - -h 1 diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 60fc29ea3be..a075d58717c 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -46,7 +46,7 @@ setup: | git clone https://github.com/karpathy/llm.c.git || true cd llm.c - mv ~/.cache/huggingface/fineweb10B dev/data/ + ln -s ~/.cache/huggingface/fineweb10B dev/data/ # compile llm.c (mixed precision, with cuDNN flash-attention) # first compilation is ~1 minute, mostly due to cuDNN make train_gpt2cu USE_CUDNN=1 From 0ee942c8af27a2eaad15a51d08a600bdcfe113fa Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 03:00:38 +0000 Subject: [PATCH 09/29] Add GCP image --- llm/gpt-2/gpt2-train.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index a075d58717c..1ba7c2b20ea 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -3,6 +3,18 @@ name: train resources: accelerators: A100:8 + any_of: + - cloud: gcp + # Need to switch to Ubuntu image on GCP for C++ dependencies + image_id: projects/deeplearning-platform-release/global/images/common-cu122-v20240514-ubuntu-2204-py310 + # List all possible cloud below. 
+ - cloud: aws + - cloud: azure + - cloud: lambda + - cloud: runpod + - cloud: paperspace + - cloud: fluidstack + - cloud: kubernetes file_mounts: ~/.cache/huggingface: gs://gpt2-data-skypilot From 5af0d933e57afc082e09a7ab3f640a0e867c6252 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 03:14:19 +0000 Subject: [PATCH 10/29] make file_mounts more general --- llm/gpt-2/gpt2-train.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 1ba7c2b20ea..0e1340c8236 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -17,7 +17,9 @@ resources: - cloud: kubernetes file_mounts: - ~/.cache/huggingface: gs://gpt2-data-skypilot + ~/.cache/huggingface: + name: gpt2-data-skypilot + mode: COPY setup: | cd ~ From 71bcdd0fed9d3160efc4b25fc11d766c26fb97b4 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 29 May 2024 03:45:37 +0000 Subject: [PATCH 11/29] avoid any_of --- llm/gpt-2/gpt2-train.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 0e1340c8236..33e9a61423a 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -3,18 +3,6 @@ name: train resources: accelerators: A100:8 - any_of: - - cloud: gcp - # Need to switch to Ubuntu image on GCP for C++ dependencies - image_id: projects/deeplearning-platform-release/global/images/common-cu122-v20240514-ubuntu-2204-py310 - # List all possible cloud below. - - cloud: aws - - cloud: azure - - cloud: lambda - - cloud: runpod - - cloud: paperspace - - cloud: fluidstack - - cloud: kubernetes file_mounts: ~/.cache/huggingface: From 488347f12696298da1d35feeaa2ab5de74654f1e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 16:20:17 +0000 Subject: [PATCH 12/29] change back to use ubuntu image with wait for GPU --- llm/gpt-2/gpt2-train.yaml | 12 ++++++++++++ sky/templates/gcp-ray.yml.j2 | 6 ++++++ 2 files changed, 18 insertions(+) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index a075d58717c..5ac258fabc7 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -3,6 +3,18 @@ name: train resources: accelerators: A100:8 + any_of: + - cloud: gcp + # Use ubuntu 22.04 as the C++ compiler is not latest enough in SkyPillot's + # default debian 11 image. + image_id: projects/deeplearning-platform-release/global/images/common-cu122-v20240514-ubuntu-2204-py310 + - cloud: aws + - cloud: azure + - cloud: lambda + - cloud: runpod + - cloud: fluidstack + - cloud: ibm + - cloud: oci file_mounts: ~/.cache/huggingface: gs://gpt2-data-skypilot diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 9c2092bdfaf..0100c38cad3 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -174,6 +174,9 @@ setup_commands: # Line 'sudo systemctl stop jupyter ..': stop jupyter service to avoid port conflict on 8080 # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` + # Line 'while ! 
nvidia-smi; do sleep 2; echo': wait for CUDA driver to be installed, as for + # some deep learning images with Ubuntu, CUDA driver will not be immediately available, due + # to `install-nvidia-driver` - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; }; {%- if docker_image is none %} sudo systemctl stop unattended-upgrades || true; @@ -205,6 +208,9 @@ setup_commands: {%- endif %} mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); + {%- if gpu is not none %} + nvidia-smi || { while ! nvidia-smi; do sleep 2; echo "Waiting for CUDA driver to be installed"; done; } + {% endif %} # Command to start ray clusters are now placed in `sky.provision.instance_setup`. # We do not need to list it here anymore. From 2e5bacf9a4fe3e029fd0a28bfe9f4e8e2937b9a1 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 16:27:46 +0000 Subject: [PATCH 13/29] wait cuda installation --- sky/templates/gcp-ray.yml.j2 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 0100c38cad3..b9f72942f5e 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -209,7 +209,8 @@ setup_commands: mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); {%- if gpu is not none %} - nvidia-smi || { while ! nvidia-smi; do sleep 2; echo "Waiting for CUDA driver to be installed"; done; } + curl -L https://github.com/GoogleCloudPlatform/compute-gpu-installation/releases/download/cuda-installer-v1.0.0/cuda_installer.pyz --output cuda_installer.pyz + while ! python3 cuda_installer.pyz verify_cuda; do sleep 2; echo "Waiting for CUDA driver to be installed"; done; {% endif %} # Command to start ray clusters are now placed in `sky.provision.instance_setup`. 
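The last two commits both deal with the same race: on GCP's Ubuntu deep-learning images the NVIDIA driver is installed asynchronously on first boot, so `nvidia-smi` (or the `cuda_installer.pyz verify_cuda` check above) can fail right after provisioning. A standalone sketch of that wait pattern is below, with a bounded timeout added for illustration; the function name and the 300-second limit are assumptions, not what the template itself does.

```bash
# Wait until the NVIDIA driver answers, but give up after a timeout instead of
# looping forever. Illustrative sketch only; the template above simply retries
# every 2 seconds until nvidia-smi succeeds.
wait_for_gpu_driver() {
  local timeout_secs=${1:-300}
  local waited=0
  until nvidia-smi > /dev/null 2>&1; do
    if [ "$waited" -ge "$timeout_secs" ]; then
      echo "CUDA driver still unavailable after ${timeout_secs}s" >&2
      return 1
    fi
    echo "Waiting for CUDA driver to be installed..."
    sleep 2
    waited=$((waited + 2))
  done
  nvidia-smi --query-gpu=name,driver_version --format=csv,noheader
}

wait_for_gpu_driver 300
```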
From c070da0bd4b0866d7f9ef73f364b193ca7c074c4 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 16:57:26 +0000 Subject: [PATCH 14/29] Add retry for file mount and use env for bucket name --- llm/gpt-2/gpt2-data.yaml | 6 ++++-- llm/gpt-2/gpt2-train.yaml | 9 +++++--- sky/backends/backend_utils.py | 40 ++++++++++++++++++++++------------- sky/templates/gcp-ray.yml.j2 | 5 ++--- sky/utils/command_runner.pyi | 6 ++++-- 5 files changed, 41 insertions(+), 25 deletions(-) diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml index d04c9e59e65..42daee00584 100644 --- a/llm/gpt-2/gpt2-data.yaml +++ b/llm/gpt-2/gpt2-data.yaml @@ -1,12 +1,14 @@ name: gpt2-data +envs: + BUCKET_NAME: # Fill in your bucket name + resources: cpus: 64+ file_mounts: /cache: - name: gpt2-data-skypilot - store: gcs + name: $BUCKET_NAME mode: MOUNT setup: | diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 8ddd670c5a0..182c95fa1aa 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -1,13 +1,16 @@ name: train +envs: + BUCKET_NAME: # Fill in your bucket name resources: accelerators: A100:8 any_of: - cloud: gcp # Use ubuntu 22.04 as the C++ compiler is not latest enough in SkyPillot's - # default debian 11 image. - image_id: projects/deeplearning-platform-release/global/images/common-cu122-v20240514-ubuntu-2204-py310 + # default debian 11 image. We use our custom image with CUDA 12.2 + # installed. + image_id: projects/skypilot-375900/global/images/llm-c - cloud: aws - cloud: azure - cloud: lambda @@ -18,7 +21,7 @@ resources: file_mounts: ~/.cache/huggingface: - name: gpt2-data-skypilot + name: $BUCKET_NAME mode: COPY setup: | diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index b1598c7c039..6759aa30d3f 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1273,21 +1273,30 @@ def parallel_data_transfer_to_nodes( def _sync_node(runner: 'command_runner.CommandRunner') -> None: if cmd is not None: - rc, stdout, stderr = runner.run(cmd, - log_path=log_path, - stream_logs=stream_logs, - require_outputs=True, - source_bashrc=source_bashrc) - err_msg = ('Failed to run command before rsync ' - f'{origin_source} -> {target}. ' - 'Ensure that the network is stable, then retry. ' - f'{cmd}') - if log_path != os.devnull: - err_msg += f' See logs in {log_path}' - subprocess_utils.handle_returncode(rc, - cmd, - err_msg, - stderr=stdout + stderr) + retry_cnt = 0 + while retry_cnt < 3: + rc, stdout, stderr = runner.run(cmd, + log_path=log_path, + stream_logs=stream_logs, + require_outputs=True, + source_bashrc=source_bashrc) + if rc == 255: + retry_cnt += 1 + logger.warning( + f'Failed to run command on {runner.node_id}, likely ' + f'due to a reboot. Retrying... (Attempt {retry_cnt})') + time.sleep(5) + continue + err_msg = ('Failed to run command before rsync ' + f'{origin_source} -> {target}. ' + 'Ensure that the network is stable, then retry. 
' + f'{cmd}') + if log_path != os.devnull: + err_msg += f' See logs in {log_path}' + subprocess_utils.handle_returncode(rc, + cmd, + err_msg, + stderr=stdout + stderr) if run_rsync: assert source is not None @@ -1299,6 +1308,7 @@ def _sync_node(runner: 'command_runner.CommandRunner') -> None: up=True, log_path=log_path, stream_logs=stream_logs, + max_retry=3, ) num_nodes = len(runners) diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index b9f72942f5e..b111c6c1cb3 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -174,7 +174,7 @@ setup_commands: # Line 'sudo systemctl stop jupyter ..': stop jupyter service to avoid port conflict on 8080 # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - # Line 'while ! nvidia-smi; do sleep 2; echo': wait for CUDA driver to be installed, as for + # Line 'while ! nvidia-smi; do sleep 2...', as for # some deep learning images with Ubuntu, CUDA driver will not be immediately available, due # to `install-nvidia-driver` - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; }; @@ -209,8 +209,7 @@ setup_commands: mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); {%- if gpu is not none %} - curl -L https://github.com/GoogleCloudPlatform/compute-gpu-installation/releases/download/cuda-installer-v1.0.0/cuda_installer.pyz --output cuda_installer.pyz - while ! python3 cuda_installer.pyz verify_cuda; do sleep 2; echo "Waiting for CUDA driver to be installed"; done; + while ! nvidia-smi; do sleep 2; echo "Waiting for CUDA driver to be installed"; done; {% endif %} # Command to start ray clusters are now placed in `sky.provision.instance_setup`. diff --git a/sky/utils/command_runner.pyi b/sky/utils/command_runner.pyi index 9fbad243775..77e5a8959cf 100644 --- a/sky/utils/command_runner.pyi +++ b/sky/utils/command_runner.pyi @@ -101,7 +101,8 @@ class CommandRunner: *, up: bool, log_path: str = ..., - stream_logs: bool = ...) -> None: + stream_logs: bool = ..., + max_retry: int = 1) -> None: ... @classmethod @@ -191,5 +192,6 @@ class SSHCommandRunner(CommandRunner): *, up: bool, log_path: str = ..., - stream_logs: bool = ...) -> None: + stream_logs: bool = ..., + max_retry: int = 1) -> None: ... 
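This commit retries the pre-rsync command when the runner exits with code 255 (ssh's exit code when the connection itself fails, e.g. while the VM is still rebooting) and passes `max_retry=3` to rsync. The same idea as a self-contained shell sketch; the function name and the example invocation are illustrative, not SkyPilot API.

```bash
# Rerun a command executed over SSH while it exits with 255, i.e. the
# connection failed rather than the remote command. Mirrors the 3-attempt /
# 5-second-sleep behavior of the patch above; illustrative only.
run_with_ssh_retry() {
  local max_attempts=3 attempt=1 rc=0
  while true; do
    "$@"
    rc=$?
    if [ "$rc" -ne 255 ] || [ "$attempt" -ge "$max_attempts" ]; then
      return "$rc"
    fi
    echo "Connection dropped (exit 255); retrying ($attempt/$max_attempts)..." >&2
    sleep 5
    attempt=$((attempt + 1))
  done
}

# Hypothetical usage:
# run_with_ssh_retry ssh gpt2-train 'echo ready'
```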
From 87d2a3ca0b392126ca16752540ccd8d9140bead7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 17:11:38 +0000 Subject: [PATCH 15/29] revert retries --- llm/gpt-2/README.md | 11 +++++----- llm/gpt-2/gpt2-train.yaml | 2 +- sky/backends/backend_utils.py | 40 +++++++++++++---------------------- 3 files changed, 22 insertions(+), 31 deletions(-) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 1c4ea1205c9..9e0e4dd5be7 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -1,18 +1,19 @@ -# GPT-2 (124M) in llm.c in 90 minutes +# Run GPT-2 (124M) in llm.c on any cloud with SkyPilot -https://github.com/karpathy/llm.c/discussions/481 +This is a reproducible package of llm.c's GPT-2 (124M) training by @karpathy (https://github.com/karpathy/llm.c/discussions/481) +With SkyPilot, you can run GPT-2 (124M) training on any cloud. ## Data processing ```bash -sky launch -c gpt2-data gpt2-data.yaml -y +sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name ``` ## Training ```bash -sky launch -c gpt2-train gpt2-train.yaml -y +sky launch -c gpt2-train gpt2-train.yaml --env BUCKET_NAME=your-bucket-name ``` @@ -22,7 +23,7 @@ We can also combine the two steps into a single SkyPilot job: cat gpt2-data.yaml > gpt2.yaml echo "---" >> gpt2.yaml cat gpt2-train.yaml >> gpt2.yaml -sky jobs launch -n gpt2 gpt2.yaml +sky jobs launch -n gpt2 gpt2.yaml --env BUCKET_NAME=your-bucket-name ``` SkyPilot will first download and process the dataset on a CPU VM and store the diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 182c95fa1aa..31bffa370b5 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -10,7 +10,7 @@ resources: # Use ubuntu 22.04 as the C++ compiler is not latest enough in SkyPillot's # default debian 11 image. We use our custom image with CUDA 12.2 # installed. - image_id: projects/skypilot-375900/global/images/llm-c + image_id: projects/skypilot-375900/global/images/gpu - cloud: aws - cloud: azure - cloud: lambda diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 6759aa30d3f..b1598c7c039 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1273,30 +1273,21 @@ def parallel_data_transfer_to_nodes( def _sync_node(runner: 'command_runner.CommandRunner') -> None: if cmd is not None: - retry_cnt = 0 - while retry_cnt < 3: - rc, stdout, stderr = runner.run(cmd, - log_path=log_path, - stream_logs=stream_logs, - require_outputs=True, - source_bashrc=source_bashrc) - if rc == 255: - retry_cnt += 1 - logger.warning( - f'Failed to run command on {runner.node_id}, likely ' - f'due to a reboot. Retrying... (Attempt {retry_cnt})') - time.sleep(5) - continue - err_msg = ('Failed to run command before rsync ' - f'{origin_source} -> {target}. ' - 'Ensure that the network is stable, then retry. ' - f'{cmd}') - if log_path != os.devnull: - err_msg += f' See logs in {log_path}' - subprocess_utils.handle_returncode(rc, - cmd, - err_msg, - stderr=stdout + stderr) + rc, stdout, stderr = runner.run(cmd, + log_path=log_path, + stream_logs=stream_logs, + require_outputs=True, + source_bashrc=source_bashrc) + err_msg = ('Failed to run command before rsync ' + f'{origin_source} -> {target}. ' + 'Ensure that the network is stable, then retry. 
' + f'{cmd}') + if log_path != os.devnull: + err_msg += f' See logs in {log_path}' + subprocess_utils.handle_returncode(rc, + cmd, + err_msg, + stderr=stdout + stderr) if run_rsync: assert source is not None @@ -1308,7 +1299,6 @@ def _sync_node(runner: 'command_runner.CommandRunner') -> None: up=True, log_path=log_path, stream_logs=stream_logs, - max_retry=3, ) num_nodes = len(runners) From d6e9554ba329a21a81db53ca31afa86716075dd5 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 17:52:59 +0000 Subject: [PATCH 16/29] update the image --- llm/gpt-2/gpt2-train.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 31bffa370b5..fef617ce97f 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -10,14 +10,16 @@ resources: # Use ubuntu 22.04 as the C++ compiler is not latest enough in SkyPillot's # default debian 11 image. We use our custom image with CUDA 12.2 # installed. - image_id: projects/skypilot-375900/global/images/gpu - - cloud: aws - - cloud: azure + image_id: projects/skypilot-375900/global/images/ubuntu2204-gpu - cloud: lambda + - cloud: azure - cloud: runpod - cloud: fluidstack - cloud: ibm - cloud: oci + # TODO: AWS's C++ version is too old to compile llm.c. A update of base AMI + # is needed. + # - cloud: aws file_mounts: ~/.cache/huggingface: From ef26ecd5fe6a2c8c3fefed277307919ca5fd118c Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 21:29:30 +0000 Subject: [PATCH 17/29] change to docker for better dependency --- llm/gpt-2/README.md | 2 +- llm/gpt-2/gpt2-train.yaml | 21 +++++---------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 9e0e4dd5be7..ce50011810e 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -13,7 +13,7 @@ sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name ## Training ```bash -sky launch -c gpt2-train gpt2-train.yaml --env BUCKET_NAME=your-bucket-name +sky launch -c gpt2-train --detach-setup gpt2-train.yaml --env BUCKET_NAME=your-bucket-name ``` diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index fef617ce97f..d65df9cb8fa 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -5,21 +5,9 @@ envs: resources: accelerators: A100:8 - any_of: - - cloud: gcp - # Use ubuntu 22.04 as the C++ compiler is not latest enough in SkyPillot's - # default debian 11 image. We use our custom image with CUDA 12.2 - # installed. - image_id: projects/skypilot-375900/global/images/ubuntu2204-gpu - - cloud: lambda - - cloud: azure - - cloud: runpod - - cloud: fluidstack - - cloud: ibm - - cloud: oci - # TODO: AWS's C++ version is too old to compile llm.c. A update of base AMI - # is needed. - # - cloud: aws + # Use docker image for latest version g++ to enable the compilation of llm.c. 
+ image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + file_mounts: ~/.cache/huggingface: @@ -52,6 +40,7 @@ setup: | fi # "install" cudnn-frontend to ~/ + sudo apt -y install git git clone https://github.com/NVIDIA/cudnn-frontend.git || true # install MPI (optional, if you intend to use multiple GPUs) @@ -74,7 +63,7 @@ setup: | run: | cd ~/llm.c # train on multiple GPUs - mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE ./train_gpt2cu \ + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \ -i "dev/data/fineweb10B/fineweb_train_*.bin" \ -j "dev/data/fineweb10B/fineweb_val_*.bin" \ -o log124M \ From 2b0a085d4a2b9b62148804d751d221fec8ba445e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 21:33:22 +0000 Subject: [PATCH 18/29] revert changes in gcp template --- sky/templates/gcp-ray.yml.j2 | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index b111c6c1cb3..9c2092bdfaf 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -174,9 +174,6 @@ setup_commands: # Line 'sudo systemctl stop jupyter ..': stop jupyter service to avoid port conflict on 8080 # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - # Line 'while ! nvidia-smi; do sleep 2...', as for - # some deep learning images with Ubuntu, CUDA driver will not be immediately available, due - # to `install-nvidia-driver` - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; }; {%- if docker_image is none %} sudo systemctl stop unattended-upgrades || true; @@ -208,9 +205,6 @@ setup_commands: {%- endif %} mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); - {%- if gpu is not none %} - while ! nvidia-smi; do sleep 2; echo "Waiting for CUDA driver to be installed"; done; - {% endif %} # Command to start ray clusters are now placed in `sky.provision.instance_setup`. # We do not need to list it here anymore. From aa8ecfe4583c03516fb747d264815ac00e0da6db Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 22:10:26 +0000 Subject: [PATCH 19/29] avoid using docker on lambda --- llm/gpt-2/gpt2-train.yaml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index d65df9cb8fa..7c79f53ee44 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -7,12 +7,20 @@ resources: accelerators: A100:8 # Use docker image for latest version g++ to enable the compilation of llm.c. image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 - - + image_id: + # Avoid using docker image for lambda due to the docker is not supported on + # Lambda yet, but the base image works. 
+ - cloud: lambda + image_id: null + - cloud: aws + - cloud: gcp + - cloud: azure + - cloud: fluidstack + file_mounts: - ~/.cache/huggingface: - name: $BUCKET_NAME - mode: COPY + ~/.cache/huggingface: gs://$BUCKET_NAME + # name: $BUCKET_NAME + # mode: COPY setup: | cd ~ From 265e43c42a6b1da4993fd954e22b48e53b9bf81d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 22:17:05 +0000 Subject: [PATCH 20/29] Add single GPU --- llm/gpt-2/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index ce50011810e..45b616e09df 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -16,6 +16,11 @@ sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name sky launch -c gpt2-train --detach-setup gpt2-train.yaml --env BUCKET_NAME=your-bucket-name ``` +Or you can train the model with a single A100: +```bash +sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpu a100 --env BUCKET_NAME=your-bucket-name +``` + ## Run in a Pipeline We can also combine the two steps into a single SkyPilot job: From 598dca52e03928665e48dd6a5894cfea39459407 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 22:43:06 +0000 Subject: [PATCH 21/29] Elaborate readme --- llm/gpt-2/README.md | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 45b616e09df..324d7d856df 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -1,10 +1,28 @@ -# Run GPT-2 (124M) in llm.c on any cloud with SkyPilot +# Run GPT-2 in llm.c on any cloud with SkyPilot This is a reproducible package of llm.c's GPT-2 (124M) training by @karpathy (https://github.com/karpathy/llm.c/discussions/481) With SkyPilot, you can run GPT-2 (124M) training on any cloud. +## Prerequisites + +1. Install [SkyPilot](https://github.com/skypilot-org/skypilot): +```bash +pip install skypilot-nightly +``` +2. Enable clouds for SkyPilot: +```bash +sky check +``` +Please check the instructions for enabling clouds at [SkyPilot doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +3. 
Download the YAMLs in this directory for data processing and training: +```bash +wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-data.yaml +wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-train.yaml +``` + ## Data processing +Run the following command to process the training data on a CPU VM and store it in a cloud bucket for future use (replace `your-bucket-name` with your bucket name): ```bash sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name ``` @@ -12,18 +30,21 @@ sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name ## Training +After the data is processed, you can then train the model on a GPU VM with 8 A100 GPUs (replace `your-bucket-name` with your bucket name): + ```bash sky launch -c gpt2-train --detach-setup gpt2-train.yaml --env BUCKET_NAME=your-bucket-name ``` -Or you can train the model with a single A100: +Or, you can train the model with a single A100, by adding `--gpu A100`: ```bash -sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpu a100 --env BUCKET_NAME=your-bucket-name +sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpu A100 --env BUCKET_NAME=your-bucket-name ``` ## Run in a Pipeline -We can also combine the two steps into a single SkyPilot job: + +We can also combine the two steps into a single SkyPilot job, and let SkyPilot to handle the dependencies between the two steps. Here is an example of how to do this (replace `your-bucket-name` with your bucket name): ```bash cat gpt2-data.yaml > gpt2.yaml echo "---" >> gpt2.yaml From 3056c2c8a0a2704db4c1d7436e88fe9f18ef1b86 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 16:31:21 -0700 Subject: [PATCH 22/29] Update llm/gpt-2/README.md Co-authored-by: Romil Bhardwaj --- llm/gpt-2/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 324d7d856df..404d3bdcfcc 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -14,6 +14,7 @@ pip install skypilot-nightly sky check ``` Please check the instructions for enabling clouds at [SkyPilot doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). + 3. Download the YAMLs in this directory for data processing and training: ```bash wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-data.yaml From 815d23ce0a939b1286efbe7cb333836e082e3fb5 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 30 May 2024 23:33:31 +0000 Subject: [PATCH 23/29] fix --- llm/gpt-2/gpt2-train.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index 7c79f53ee44..7c7e58eeaf9 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -7,7 +7,7 @@ resources: accelerators: A100:8 # Use docker image for latest version g++ to enable the compilation of llm.c. image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 - image_id: + any_of: # Avoid using docker image for lambda due to the docker is not supported on # Lambda yet, but the base image works. 
- cloud: lambda @@ -16,11 +16,12 @@ resources: - cloud: gcp - cloud: azure - cloud: fluidstack + - cloud: kubernetes file_mounts: - ~/.cache/huggingface: gs://$BUCKET_NAME - # name: $BUCKET_NAME - # mode: COPY + ~/.cache/huggingface: + name: $BUCKET_NAME + mode: COPY setup: | cd ~ From 4c4493563bebe1a7d02b216ea594fb2105c7a751 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 31 May 2024 00:02:42 +0000 Subject: [PATCH 24/29] address comments --- llm/gpt-2/README.md | 50 +++++++++++++++++----- llm/gpt-2/gpt2-data.yaml | 2 +- llm/gpt-2/gpt2.yaml | 89 ++++++++++++++++++++++++++++++++++++++++ sky/utils/schemas.py | 6 ++- 4 files changed, 134 insertions(+), 13 deletions(-) create mode 100644 llm/gpt-2/gpt2.yaml diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 404d3bdcfcc..9cdfd76f462 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -7,7 +7,7 @@ With SkyPilot, you can run GPT-2 (124M) training on any cloud. 1. Install [SkyPilot](https://github.com/skypilot-org/skypilot): ```bash -pip install skypilot-nightly +pip install "skypilot-nightly[aws,gcp,azure,kubernetes,lambda,fluidstack]" # Choose the clouds you want to enable ``` 2. Enable clouds for SkyPilot: ```bash @@ -15,21 +15,49 @@ sky check ``` Please check the instructions for enabling clouds at [SkyPilot doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). -3. Download the YAMLs in this directory for data processing and training: +3. Download the YAML for starting the training: ```bash -wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-data.yaml -wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-train.yaml +wget https://raw.githubusercontent.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2.yaml ``` -## Data processing +## Run GPT-2 training + +Run the following command to start GPT-2 (124M) training on a GPU VM with 8 A100 GPUs (replace `your-bucket-name` with your bucket name): + +```bash +sky launch -c gpt2 gpt2.yaml +``` + +Or, you can train the model with a single A100, by adding `--gpu A100`: +```bash +sky launch -c gpt2 gpt2.yaml --gpu A100 +``` + +## Advanced: Run GPT-2 training in two stages + +The data processing for GPT-2 training is CPU-bound, while the training is GPU-bound. Having the data processing on a GPU VM is not cost-effective. With SkyPilot, you can easily +separate the data processing and training into two stages and execute them sequantially manually, or let SkyPilot manage the dependencies between the two stages. + +With this data processing can be run on cheaper CPU VMs (e.g., ~\$1.5/hour), and run the training on more expensive GPU VMs (e.g., ~\$1.3-\$3.6/hour for a single A100 GPU, or \$10.3-\$32.8/hour for 8 A100 GPUs). + +We can run the data processing on a CPU VM and store the processed data in a cloud bucket. Then, we can run the training on a GPU VM with the processed data. 
+ +```bash +wget https://raw.githubusercontent.com//skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-data.yaml +wget https://raw.githubusercontent.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-train.yaml +``` + +### Run two stages manually +#### Data processing Run the following command to process the training data on a CPU VM and store it in a cloud bucket for future use (replace `your-bucket-name` with your bucket name): + ```bash sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name ``` -## Training +#### Training After the data is processed, you can then train the model on a GPU VM with 8 A100 GPUs (replace `your-bucket-name` with your bucket name): @@ -43,14 +71,14 @@ sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpu A100 --env BUCKET_ ``` -## Run in a Pipeline +### Run in a Pipeline We can also combine the two steps into a single SkyPilot job, and let SkyPilot to handle the dependencies between the two steps. Here is an example of how to do this (replace `your-bucket-name` with your bucket name): ```bash -cat gpt2-data.yaml > gpt2.yaml -echo "---" >> gpt2.yaml -cat gpt2-train.yaml >> gpt2.yaml -sky jobs launch -n gpt2 gpt2.yaml --env BUCKET_NAME=your-bucket-name +cat gpt2-data.yaml > gpt2-pipeline.yaml +echo "---" >> gpt2-pipeline.yaml +cat gpt2-train.yaml >> gpt2-pipeline.yaml +sky jobs launch -n gpt2 gpt2-pipeline.yaml --env BUCKET_NAME=your-bucket-name ``` SkyPilot will first download and process the dataset on a CPU VM and store the diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml index 42daee00584..3bd082d02e2 100644 --- a/llm/gpt-2/gpt2-data.yaml +++ b/llm/gpt-2/gpt2-data.yaml @@ -4,7 +4,7 @@ envs: BUCKET_NAME: # Fill in your bucket name resources: - cpus: 64+ + cpus: 32+ file_mounts: /cache: diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml new file mode 100644 index 00000000000..0698073e3a0 --- /dev/null +++ b/llm/gpt-2/gpt2.yaml @@ -0,0 +1,89 @@ +name: train + +resources: + accelerators: A100:8 + # Use docker image for latest version g++ to enable the compilation of llm.c. + image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + any_of: + # Avoid using docker image for lambda due to the docker is not supported on + # Lambda yet, but the base image works. + - cloud: lambda + image_id: null + - cloud: aws + - cloud: gcp + - cloud: azure + - cloud: fluidstack + - cloud: kubernetes + + +setup: | + cd ~ + pip install tqdm tiktoken requests datasets + + # Training dependencies + # install cudnn so we can use FlashAttention and run fast (optional) + # https://developer.nvidia.com/cudnn-downloads + # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 + if [ -f ./CUDNN_INSTALLED ]; then + echo "cudnn already installed" + else + system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') + # Get version and remove the dot + version=$(lsb_release -sr | tr -d .) 
+ export system_version="${system}${version}" + wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb + sudo dpkg -i cudnn-installer.deb + sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ + # Remove problematic kubernetes.list source + sudo apt-get update --allow-releaseinfo-change || true + + sudo apt-get -y install cudnn-cuda-12 + + touch ./CUDNN_INSTALLED + fi + + # "install" cudnn-frontend to ~/ + sudo apt -y install git + git clone https://github.com/NVIDIA/cudnn-frontend.git || true + + # install MPI (optional, if you intend to use multiple GPUs) + # SkyPilot do not install MPI as that requires NCCL which needs to be manually + # installed. + sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + # install nccl + pip install nvidia-nccl-cu12 + export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include + + git clone https://github.com/karpathy/llm.c.git || true + cd llm.c + # compile llm.c (mixed precision, with cuDNN flash-attention) + # first compilation is ~1 minute, mostly due to cuDNN + make train_gpt2cu USE_CUDNN=1 + + +run: | + cd ~/llm.c + # Processing data + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) + # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb + python dev/data/fineweb.py --version 10B + + # Start training on multiple GPUs + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -o log124M \ + -e "d12" \ + -b 64 -t 1024 \ + -d 524288 \ + -r 1 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + -n 5000 \ + -v 250 -s 20000 \ + -h 1 diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 8bbe1d54e60..02c8452ef2a 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -141,7 +141,11 @@ def _get_single_resources_schema(): }, { 'type': 'object', 'required': [], - }] + }, + { + 'type': 'null', + } + ] }, # The following fields are for internal use only. '_docker_login_config': { From 3b7312e67e9f29beb25e683f9de5d4ff6a1a08cc Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 31 May 2024 00:27:25 +0000 Subject: [PATCH 25/29] Fix data fetching --- llm/gpt-2/README.md | 4 ++++ llm/gpt-2/gpt2-data.yaml | 14 ++++++++++---- llm/gpt-2/gpt2.yaml | 6 ++++++ sky/utils/schemas.py | 6 ++---- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 9cdfd76f462..bdc6931ddbf 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -28,11 +28,15 @@ Run the following command to start GPT-2 (124M) training on a GPU VM with 8 A100 sky launch -c gpt2 gpt2.yaml ``` +![GPT-2 training with 8 A100 GPUs](https://imgur.com/v8SGpsF.png) + Or, you can train the model with a single A100, by adding `--gpu A100`: ```bash sky launch -c gpt2 gpt2.yaml --gpu A100 ``` +![GPT-2 training with a single A100](https://imgur.com/hN65g4r.png) + ## Advanced: Run GPT-2 training in two stages The data processing for GPT-2 training is CPU-bound, while the training is GPU-bound. Having the data processing on a GPU VM is not cost-effective. 
With SkyPilot, you can easily diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml index 3bd082d02e2..bc4789c0a54 100644 --- a/llm/gpt-2/gpt2-data.yaml +++ b/llm/gpt-2/gpt2-data.yaml @@ -13,13 +13,19 @@ file_mounts: setup: | pip install tqdm tiktoken requests datasets - # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) - # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B - # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb - git clone https://github.com/karpathy/llm.c.git || true + git clone https://github.com/karpathy/llm.c.git@ed37d9261ba13ef212c01e2de8b309cbb46a2aa7 || true + + # Adding revision to fix the dataset version, as the latest fineweb + # dataset removed the samples, causing error: + # Please pass `features` or at least one example when writing data + sed -i 's/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train")/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train", revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94")/' dev/data/fineweb.py + run: | cd llm.c + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) + # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb python dev/data/fineweb.py --version 10B rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml index 0698073e3a0..8e203772128 100644 --- a/llm/gpt-2/gpt2.yaml +++ b/llm/gpt-2/gpt2.yaml @@ -57,6 +57,12 @@ setup: | git clone https://github.com/karpathy/llm.c.git || true cd llm.c + + # add revision to fix the dataset version, as the latest fineweb + # dataset removed the samples, causing error: + # Please pass `features` or at least one example when writing data + sed -i 's/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train")/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train", revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94")/' dev/data/fineweb.py + # compile llm.c (mixed precision, with cuDNN flash-attention) # first compilation is ~1 minute, mostly due to cuDNN make train_gpt2cu USE_CUDNN=1 diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 02c8452ef2a..5bc011abaaa 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -141,11 +141,9 @@ def _get_single_resources_schema(): }, { 'type': 'object', 'required': [], - }, - { + }, { 'type': 'null', - } - ] + }] }, # The following fields are for internal use only. '_docker_login_config': { From b6566d75ad41e51d54ece3662b7f480fe1ccfa4b Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 31 May 2024 01:03:32 +0000 Subject: [PATCH 26/29] Add visualization --- llm/gpt-2/README.md | 16 ++++++++++++++++ llm/gpt-2/gpt2-train.yaml | 3 +++ 2 files changed, 19 insertions(+) diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index bdc6931ddbf..dc6ed780c3c 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -37,6 +37,20 @@ sky launch -c gpt2 gpt2.yaml --gpu A100 ![GPT-2 training with a single A100](https://imgur.com/hN65g4r.png) +### ## Download logs and visualizations + +After the training is finished, you can download the logs and visualizations with the following command: +```bash +scp -r gpt2:~/llm.c/log124M . +``` +We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). 
+We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). (Note: we cut off the training after 8000 steps, which already achieves a validation loss similar to the OpenAI GPT-2 checkpoint.)
+
+![Training progress](https://imgur.com/qeNNlIB.png)
+
+> Yes! We are able to reproduce the training of GPT-2 (124M) on any cloud with SkyPilot.
+
+
+
 ## Advanced: Run GPT-2 training in two stages
 
 The data processing for GPT-2 training is CPU-bound, while the training is GPU-bound. Having the data processing on a GPU VM is not cost-effective. With SkyPilot, you can easily
@@ -89,3 +103,5 @@ SkyPilot will first download and process the dataset on a CPU VM and store the
 processed data in a GCS bucket. Then, it will launch a GPT-2 training job on a
 GPU VM. The training job will train GPT-2 (124M) on the processed data.
 
+
+
diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml
index 7c7e58eeaf9..e907d28d781 100644
--- a/llm/gpt-2/gpt2-train.yaml
+++ b/llm/gpt-2/gpt2-train.yaml
@@ -88,3 +88,6 @@ run: |
     -n 5000 \
     -v 250 -s 20000 \
     -h 1
+
+  # Upload the log and model to the bucket
+  rsync -Pavz log124M ~/.cache/huggingface

From bea72d565d48d2881d20062a75090eeb7db1a424 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Fri, 31 May 2024 01:05:54 +0000
Subject: [PATCH 27/29] update

---
 llm/gpt-2/README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md
index dc6ed780c3c..b16d4d6a5c6 100644
--- a/llm/gpt-2/README.md
+++ b/llm/gpt-2/README.md
@@ -37,7 +37,7 @@ sky launch -c gpt2 gpt2.yaml --gpus A100
 
 ![GPT-2 training with a single A100](https://imgur.com/hN65g4r.png)
 
-### ## Download logs and visualizations
+### Download logs and visualizations
 
 After the training is finished, you can download the logs and visualizations with the following command:
 ```bash
 scp -r gpt2:~/llm.c/log124M .
 ```
 We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). (Note: we cut off the training after 8000 steps, which already achieves a validation loss similar to the OpenAI GPT-2 checkpoint.)
 
-![Training progress](https://imgur.com/qeNNlIB.png)
+
+
+
 > Yes! We are able to reproduce the training of GPT-2 (124M) on any cloud with SkyPilot.

From 888743509025e5233afaf4ee2cf362eccce5ed64 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Fri, 31 May 2024 01:08:24 +0000
Subject: [PATCH 28/29] reduce cpu cost

---
 llm/gpt-2/README.md      | 2 +-
 llm/gpt-2/gpt2-data.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md
index b16d4d6a5c6..eea1a727dd5 100644
--- a/llm/gpt-2/README.md
+++ b/llm/gpt-2/README.md
@@ -58,7 +58,7 @@ We can visualize the training progress with the notebook provided in [llm.c](htt
 The data processing for GPT-2 training is CPU-bound, while the training is GPU-bound. Having the data processing on a GPU VM is not cost-effective. With SkyPilot, you can easily
 separate the data processing and training into two stages and manually execute them in sequence, or let SkyPilot manage the dependencies between the two stages.
 
-With this, data processing can be run on cheaper CPU VMs (e.g., ~\$1.5/hour), and the training on more expensive GPU VMs (e.g., ~\$1.3-\$3.6/hour for a single A100 GPU, or \$10.3-\$32.8/hour for 8 A100 GPUs).
+With this, data processing can be run on cheaper CPU VMs (e.g., ~\$0.4/hour), and the training on more expensive GPU VMs (e.g., ~\$1.3-\$3.6/hour for a single A100 GPU, or \$10.3-\$32.8/hour for 8 A100 GPUs).
 
 We can run the data processing on a CPU VM and store the processed data in a cloud bucket. Then, we can run the training on a GPU VM with the processed data.
 
diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml
index bc4789c0a54..8346e37ccb6 100644
--- a/llm/gpt-2/gpt2-data.yaml
+++ b/llm/gpt-2/gpt2-data.yaml
@@ -4,7 +4,7 @@ envs:
   BUCKET_NAME: # Fill in your bucket name
 
 resources:
-  cpus: 32+
+  cpus: 8+
 
 file_mounts:
   /cache:

From 7609990c5ee9d7bd2179d1d955406563efc116c4 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Fri, 31 May 2024 01:11:48 +0000
Subject: [PATCH 29/29] update loss curve

---
 llm/gpt-2/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md
index eea1a727dd5..e6a213283d8 100644
--- a/llm/gpt-2/README.md
+++ b/llm/gpt-2/README.md
@@ -43,10 +43,10 @@ After the training is finished, you can download the logs and visualizations wit
 ```bash
 scp -r gpt2:~/llm.c/log124M .
 ```
-We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). (Note: we cut off the training after 8000 steps, which already achieves a validation loss similar to the OpenAI GPT-2 checkpoint.)
+We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). (Note: we cut off the training after 10K steps, which already achieves a validation loss similar to the OpenAI GPT-2 checkpoint.)
-
+
 > Yes! We are able to reproduce the training of GPT-2 (124M) on any cloud with SkyPilot.
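
For reference, a minimal sketch of launching the two stages from this series by hand with the SkyPilot CLI. The cluster names are arbitrary, `my-gpt2-bucket` is a placeholder for your own bucket, and it assumes both `gpt2-data.yaml` and `gpt2-train.yaml` read the `BUCKET_NAME` environment variable:

```bash
# Stage 1: tokenize FineWeb on a cheap CPU VM; the processed tokens land in the bucket.
sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=my-gpt2-bucket

# Stage 2: train on a GPU VM; the same bucket is mounted, so the tokens are already there.
sky launch -c gpt2-train gpt2-train.yaml --env BUCKET_NAME=my-gpt2-bucket

# Tear down both clusters once training has finished and the logs have been uploaded.
sky down gpt2-data gpt2-train
```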
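The `sed` added in [PATCH 25/29] pins the `HuggingFaceFW/fineweb` dataset to revision `9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94` so that `dev/data/fineweb.py` keeps working after upstream removed samples. A hypothetical sanity check that the pin actually landed, run from inside the `llm.c` checkout, might look like:

```bash
# Fails loudly if the setup-time sed did not rewrite the load_dataset() call.
if grep -q 'revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94"' dev/data/fineweb.py; then
  echo "fineweb dataset revision is pinned"
else
  echo "revision pin missing, check the sed command in setup" >&2
  exit 1
fi
```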
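As a quick check on the training flags used in the run command: assuming the llm.c flags mean what the discussion linked in the README describes (`-b` per-GPU micro-batch in sequences, `-t` sequence length, `-d` total batch size in tokens), the settings here imply no gradient accumulation on 8 GPUs:

```bash
# -b 64 sequences * -t 1024 tokens * 8 GPUs = 524288 tokens, which equals -d,
# so each optimizer step is a single forward/backward pass per GPU.
echo $(( 64 * 1024 * 8 ))   # prints 524288
```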
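Because [PATCH 26/29] also rsyncs `log124M` into the mounted bucket, the training logs outlive the spot GPU VM. A hedged sketch of pulling them to a local machine, assuming the bucket is on GCS and the MOUNT maps the bucket root to `~/.cache/huggingface` so the directory ends up at the bucket root:

```bash
# Copy the uploaded training log directory out of the GCS bucket.
gsutil -m rsync -r gs://my-gpt2-bucket/log124M ./log124M
```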