From 4c4493563bebe1a7d02b216ea594fb2105c7a751 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Fri, 31 May 2024 00:02:42 +0000
Subject: [PATCH] address comments

---
 llm/gpt-2/README.md      | 50 +++++++++++++++++-----
 llm/gpt-2/gpt2-data.yaml |  2 +-
 llm/gpt-2/gpt2.yaml      | 89 ++++++++++++++++++++++++++++++++++++++++
 sky/utils/schemas.py     |  6 ++-
 4 files changed, 134 insertions(+), 13 deletions(-)
 create mode 100644 llm/gpt-2/gpt2.yaml

diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md
index 404d3bdcfcc..9cdfd76f462 100644
--- a/llm/gpt-2/README.md
+++ b/llm/gpt-2/README.md
@@ -7,7 +7,7 @@ With SkyPilot, you can run GPT-2 (124M) training on any cloud.
 1. Install [SkyPilot](https://github.com/skypilot-org/skypilot):
 ```bash
-pip install skypilot-nightly
+pip install "skypilot-nightly[aws,gcp,azure,kubernetes,lambda,fluidstack]"  # Choose the clouds you want to enable
 ```
 2. Enable clouds for SkyPilot:
 ```bash
@@ -15,21 +15,49 @@
 sky check
 ```
 Please check the instructions for enabling clouds at [SkyPilot doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
 
-3. Download the YAMLs in this directory for data processing and training:
+3. Download the YAML for starting the training:
 ```bash
-wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-data.yaml
-wget https://github.com/skypilot-org/skypilot/blob/master/llm/gpt-2/gpt2-train.yaml
+wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/llm/gpt-2/gpt2.yaml
 ```
 
-## Data processing
+## Run GPT-2 training
+
+Run the following command to start GPT-2 (124M) training on a GPU VM with 8 A100 GPUs:
+
+```bash
+sky launch -c gpt2 gpt2.yaml
+```
+
+Or, you can train the model with a single A100 by adding `--gpus A100`:
+```bash
+sky launch -c gpt2 gpt2.yaml --gpus A100
+```
+
+## Advanced: Run GPT-2 training in two stages
+
+The data processing for GPT-2 training is CPU-bound, while the training is GPU-bound, so running the data processing on a GPU VM is not cost-effective. With SkyPilot, you can easily separate the two into a data-processing stage and a training stage, and either execute them sequentially yourself or let SkyPilot manage the dependency between them.
+
+This way, the data processing runs on cheaper CPU VMs (e.g., ~\$1.5/hour), while the training runs on more expensive GPU VMs (e.g., ~\$1.3-\$3.6/hour for a single A100 GPU, or \$10.3-\$32.8/hour for 8 A100 GPUs).
+
+We can run the data processing on a CPU VM and store the processed data in a cloud bucket. Then, we can run the training on a GPU VM with the processed data.
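+
+Both stages exchange data through a cloud bucket that SkyPilot mounts into the VMs via `file_mounts`. A minimal sketch of the relevant snippet is below (see `gpt2-data.yaml` for the exact configuration; the `name` and `mode` values here are illustrative):
+
+```yaml
+envs:
+  BUCKET_NAME: # Fill in your bucket name
+
+file_mounts:
+  /cache:
+    # Processed tokens are written to /cache and synced to the bucket.
+    name: $BUCKET_NAME
+    mode: MOUNT
+```
+
+Download the YAMLs for the two stages: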
+
+```bash
+wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/llm/gpt-2/gpt2-data.yaml
+wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/llm/gpt-2/gpt2-train.yaml
+```
+
+### Run two stages manually
+
+#### Data processing
+
 Run the following command to process the training data on a CPU VM and store it in a cloud bucket for future use (replace `your-bucket-name` with your bucket name):
+
 ```bash
 sky launch -c gpt2-data gpt2-data.yaml --env BUCKET_NAME=your-bucket-name
 ```
 
-## Training
+#### Training
 
 After the data is processed, you can then train the model on a GPU VM with 8 A100 GPUs (replace `your-bucket-name` with your bucket name):
@@ -43,14 +71,14 @@ sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpus A100 --env BUCKET_NAME=your-bucket-name
 ```
 
-## Run in a Pipeline
+### Run in a Pipeline
 
 We can also combine the two steps into a single SkyPilot job and let SkyPilot handle the dependencies between the two steps. Here is an example of how to do this (replace `your-bucket-name` with your bucket name):
 
 ```bash
-cat gpt2-data.yaml > gpt2.yaml
-echo "---" >> gpt2.yaml
-cat gpt2-train.yaml >> gpt2.yaml
-sky jobs launch -n gpt2 gpt2.yaml --env BUCKET_NAME=your-bucket-name
+cat gpt2-data.yaml > gpt2-pipeline.yaml
+echo "---" >> gpt2-pipeline.yaml
+cat gpt2-train.yaml >> gpt2-pipeline.yaml
+sky jobs launch -n gpt2 gpt2-pipeline.yaml --env BUCKET_NAME=your-bucket-name
 ```
 
 SkyPilot will first download and process the dataset on a CPU VM and store the
diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml
index 42daee00584..3bd082d02e2 100644
--- a/llm/gpt-2/gpt2-data.yaml
+++ b/llm/gpt-2/gpt2-data.yaml
@@ -4,7 +4,7 @@ envs:
   BUCKET_NAME: # Fill in your bucket name
 
 resources:
-  cpus: 64+
+  cpus: 32+
 
 file_mounts:
   /cache:
diff --git a/llm/gpt-2/gpt2.yaml b/llm/gpt-2/gpt2.yaml
new file mode 100644
index 00000000000..0698073e3a0
--- /dev/null
+++ b/llm/gpt-2/gpt2.yaml
@@ -0,0 +1,89 @@
+name: train
+
+resources:
+  accelerators: A100:8
+  # Use a Docker image to get a recent g++ for compiling llm.c.
+  image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+  any_of:
+    # Docker is not supported on Lambda yet, so unset the Docker image
+    # there; the base VM image works.
+    - cloud: lambda
+      image_id: null
+    - cloud: aws
+    - cloud: gcp
+    - cloud: azure
+    - cloud: fluidstack
+    - cloud: kubernetes
+
+
+setup: |
+  cd ~
+  pip install tqdm tiktoken requests datasets
+
+  # Training dependencies.
+  # Install cuDNN so we can use FlashAttention and run fast (optional):
+  # https://developer.nvidia.com/cudnn-downloads
+  # This assumes CUDA 12 (check with `nvcc --version`) on Linux x86_64, Ubuntu 22.04.
+  if [ -f ./CUDNN_INSTALLED ]; then
+    echo "cudnn already installed"
+  else
+    system=$(lsb_release -si | tr '[:upper:]' '[:lower:]')
+    # Get the OS version and remove the dot.
+    version=$(lsb_release -sr | tr -d .)
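+    # For example, on Ubuntu 22.04 this gives system="ubuntu" and
+    # version="2204", so system_version below becomes "ubuntu2204",
+    # matching the naming of NVIDIA's cuDNN local-repo installers.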
+    export system_version="${system}${version}"
+    wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb
+    sudo dpkg -i cudnn-installer.deb
+    sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/
+    # Remove problematic kubernetes.list source.
+    sudo apt-get update --allow-releaseinfo-change || true
+
+    sudo apt-get -y install cudnn-cuda-12
+
+    touch ./CUDNN_INSTALLED
+  fi
+
+  # "Install" cudnn-frontend to ~/.
+  sudo apt -y install git
+  git clone https://github.com/NVIDIA/cudnn-frontend.git || true
+
+  # Install MPI (optional, needed if you intend to use multiple GPUs).
+  # SkyPilot does not install MPI by default, as it requires NCCL, which
+  # needs to be installed manually.
+  sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev
+  # Install NCCL.
+  pip install nvidia-nccl-cu12
+  export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib
+  export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include
+
+  git clone https://github.com/karpathy/llm.c.git || true
+  cd llm.c
+  # Compile llm.c (mixed precision, with cuDNN flash-attention).
+  # The first compilation takes ~1 minute, mostly due to cuDNN.
+  make train_gpt2cu USE_CUDNN=1
+
+
+run: |
+  cd ~/llm.c
+  # Process the data:
+  # tokenize the FineWeb dataset 10B-token sample (takes ~1 hour, get lunch?);
+  # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B
+  # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb.
+  python dev/data/fineweb.py --version 10B
+
+  # Start training on multiple GPUs. The hyperparameters below follow the
+  # GPT-2 (124M) reproduction from the llm.c repository.
+  mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \
+    -i "dev/data/fineweb10B/fineweb_train_*.bin" \
+    -j "dev/data/fineweb10B/fineweb_val_*.bin" \
+    -o log124M \
+    -e "d12" \
+    -b 64 -t 1024 \
+    -d 524288 \
+    -r 1 \
+    -z 1 \
+    -c 0.1 \
+    -l 0.0006 \
+    -q 0.0 \
+    -u 700 \
+    -n 5000 \
+    -v 250 -s 20000 \
+    -h 1
diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py
index 8bbe1d54e60..02c8452ef2a 100644
--- a/sky/utils/schemas.py
+++ b/sky/utils/schemas.py
@@ -141,7 +141,9 @@ def _get_single_resources_schema():
         }, {
             'type': 'object',
             'required': [],
-        }]
+        }, {
+            'type': 'null',
+        }]
     },
     # The following fields are for internal use only.
     '_docker_login_config': {