diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index e6a213283d8..bc9893fec5b 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -1,7 +1,7 @@ # Run GPT-2 in llm.c on any cloud with SkyPilot -This is a reproducible package of llm.c's GPT-2 (124M) training by @karpathy (https://github.com/karpathy/llm.c/discussions/481) -With SkyPilot, you can run GPT-2 (124M) training on any cloud. +This is a reproducible package of llm.c's GPT-2 (124M) training by @karpathy (https://github.com/karpathy/llm.c/discussions/481). +With SkyPilot, you can run GPT-2 (124M) training on any cloud. SkyPilot looks for the cheapest resources available on the clouds enabled for a user, launches and manages the whole data processing and training pipeline, leading to a close to ~\$20 target cost as @karpathy mentioned in the discussion. ## Prerequisites @@ -30,13 +30,21 @@ sky launch -c gpt2 gpt2.yaml ![GPT-2 training with 8 A100 GPUs](https://imgur.com/v8SGpsF.png) -Or, you can train the model with a single A100, by adding `--gpu A100`: +Or, you can train the model with a single A100, by adding `--gpus A100`: ```bash -sky launch -c gpt2 gpt2.yaml --gpu A100 +sky launch -c gpt2 gpt2.yaml --gpus A100 ``` ![GPT-2 training with a single A100](https://imgur.com/hN65g4r.png) + +It is also possible to speed up the training of the model on 8 H100 (2.3x more tok/s than 8x A100s): +```bash +sky launch -c gpt2 gpt2.yaml --gpus H100:8 +``` + +![GPT-2 training with 8 H100](https://imgur.com/STbi80b.png) + ### Download logs and visualizations After the training is finished, you can download the logs and visualizations with the following command: @@ -85,9 +93,9 @@ After the data is processed, you can then train the model on a GPU VM with 8 A10 sky launch -c gpt2-train --detach-setup gpt2-train.yaml --env BUCKET_NAME=your-bucket-name ``` -Or, you can train the model with a single A100, by adding `--gpu A100`: +Or, you can train the model with a single A100, by adding `--gpus A100`: ```bash 
-sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpu A100 --env BUCKET_NAME=your-bucket-name +sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpus A100 --env BUCKET_NAME=your-bucket-name ``` @@ -95,12 +103,14 @@ sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpu A100 --env BUCKET_ We can also combine the two steps into a single SkyPilot job, and let SkyPilot to handle the dependencies between the two steps. Here is an example of how to do this (replace `your-bucket-name` with your bucket name): ```bash -cat gpt2-data.yaml > gpt2-pipeline.yaml -echo "---" >> gpt2-pipeline.yaml -cat gpt2-train.yaml >> gpt2-pipeline.yaml sky jobs launch -n gpt2 gpt2-pipeline.yaml --env BUCKET_NAME=your-bucket-name ``` +> Note: the pipeline yaml can be generated with the following command: ```bash +cat gpt2-data.yaml > gpt2-pipeline.yaml; echo "---" >> gpt2-pipeline.yaml; cat gpt2-train.yaml >> gpt2-pipeline.yaml ``` + SkyPilot will first download and process the dataset on a CPU VM and store the processed data in a GCS bucket. Then, it will launch a GPT-2 training job on a GPU VM. The training job will train GPT-2 (124M) on the processed data. diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml index 8346e37ccb6..fc7bb02bf95 100644 --- a/llm/gpt-2/gpt2-data.yaml +++ b/llm/gpt-2/gpt2-data.yaml @@ -1,7 +1,8 @@ name: gpt2-data envs: - BUCKET_NAME: # Fill in your bucket name + BUCKET_NAME: # TODO: Fill in your bucket name + BUCKET_STORE: s3 # Can be s3, gcs, or r2. resources: cpus: 8+ @@ -9,6 +10,7 @@ resources: file_mounts: /cache: name: $BUCKET_NAME + store: $BUCKET_STORE mode: MOUNT setup: | diff --git a/llm/gpt-2/gpt2-pipeline.yaml b/llm/gpt-2/gpt2-pipeline.yaml new file mode 100644 index 00000000000..e5ea05f7948 --- /dev/null +++ b/llm/gpt-2/gpt2-pipeline.yaml @@ -0,0 +1,129 @@ +name: gpt2-data + +envs: + BUCKET_NAME: # TODO: Fill in your bucket name + BUCKET_STORE: s3 # Can be s3, gcs, or r2. 
+ +resources: + cpus: 8+ + +file_mounts: + /cache: + name: $BUCKET_NAME + store: $BUCKET_STORE + mode: MOUNT + +setup: | + pip install tqdm tiktoken requests datasets + git clone https://github.com/karpathy/llm.c.git@ed37d9261ba13ef212c01e2de8b309cbb46a2aa7 || true + + # Adding revision to fix the dataset version, as the latest fineweb + # dataset removed the samples, causing error: + # Please pass `features` or at least one example when writing data + sed -i 's/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train")/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train", revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94")/' dev/data/fineweb.py + + +run: | + cd llm.c + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) + # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb + python dev/data/fineweb.py --version 10B + + rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ + rsync -Pavz dev/data/fineweb10B /cache/ +--- +name: gpt2-train + +envs: + BUCKET_NAME: # TODO: Fill in your bucket name + BUCKET_STORE: s3 # Can be s3, gcs, or r2. + +resources: + accelerators: A100:8 + # Use docker image for latest version g++ to enable the compilation of llm.c. + image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + any_of: + # Avoid using docker image for lambda due to the docker is not supported on + # Lambda yet, but the base image works. 
+ - cloud: lambda + image_id: null + - cloud: aws + - cloud: gcp + - cloud: azure + - cloud: fluidstack + - cloud: kubernetes + +file_mounts: + ~/.cache/huggingface: + name: $BUCKET_NAME + store: $BUCKET_STORE + mode: COPY + +setup: | + cd ~ + + # install cudnn so we can use FlashAttention and run fast (optional) + # https://developer.nvidia.com/cudnn-downloads + # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 + if [ -f ./CUDNN_INSTALLED ]; then + echo "cudnn already installed" + else + system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') + # Get version and remove the dot + version=$(lsb_release -sr | tr -d .) + export system_version="${system}${version}" + wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb + sudo dpkg -i cudnn-installer.deb + sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ + # Remove problematic kubernetes.list source + sudo apt-get update --allow-releaseinfo-change || true + + sudo apt-get -y install cudnn-cuda-12 + + touch ./CUDNN_INSTALLED + fi + + # "install" cudnn-frontend to ~/ + sudo apt -y install git + git clone https://github.com/NVIDIA/cudnn-frontend.git || true + + # install MPI (optional, if you intend to use multiple GPUs) + # SkyPilot do not install MPI as that requires NCCL which needs to be manually + # installed. 
+ sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + # install nccl + pip install nvidia-nccl-cu12 + export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include + + git clone https://github.com/karpathy/llm.c.git || true + cd llm.c + ln -s ~/.cache/huggingface/fineweb10B dev/data/ + # compile llm.c (mixed precision, with cuDNN flash-attention) + # first compilation is ~1 minute, mostly due to cuDNN + make train_gpt2cu USE_CUDNN=1 + + +run: | + cd ~/llm.c + # train on multiple GPUs + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -o log124M \ + -e "d12" \ + -b 64 -t 1024 \ + -d 524288 \ + -r 1 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + -n 5000 \ + -v 250 -s 20000 \ + -h 1 + + # Upload the log and model to the bucket + rsync -Pavz log124M ~/.cache/huggingface diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index e907d28d781..3a4e8c28d14 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -1,7 +1,8 @@ -name: train +name: gpt2-train envs: - BUCKET_NAME: # Fill in your bucket name + BUCKET_NAME: # TODO: Fill in your bucket name + BUCKET_STORE: s3 # Can be s3, gcs, or r2. 
resources: accelerators: A100:8 @@ -21,11 +22,11 @@ resources: file_mounts: ~/.cache/huggingface: name: $BUCKET_NAME + store: $BUCKET_STORE mode: COPY setup: | cd ~ - pip install tqdm tiktoken requests datasets # install cudnn so we can use FlashAttention and run fast (optional) # https://developer.nvidia.com/cudnn-downloads diff --git a/sky/cli.py b/sky/cli.py index 23c5ba7a3cd..db5291d949c 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -772,6 +772,8 @@ def _make_task_or_dag_from_entrypoint_with_overrides( else: task = sky.Task(name='sky-cmd', run=entrypoint) task.set_resources({sky.Resources()}) + # env update has been done for DAG in load_chain_dag_from_yaml for YAML. + task.update_envs(env) # Override. if workdir is not None: @@ -787,7 +789,6 @@ def _make_task_or_dag_from_entrypoint_with_overrides( task.num_nodes = num_nodes if name is not None: task.name = name - task.update_envs(env) return task diff --git a/sky/utils/dag_utils.py b/sky/utils/dag_utils.py index ef80bfd2a17..7a4fe90e7fb 100644 --- a/sky/utils/dag_utils.py +++ b/sky/utils/dag_utils.py @@ -70,9 +70,9 @@ def load_chain_dag_from_yaml( Has special handling for an initial section in YAML that contains only the 'name' field, which is the DAG name. - 'env_overrides' is in effect only when there's exactly one task. It is a - list of (key, value) pairs that will be used to update the task's 'envs' - section. + 'env_overrides' is a list of (key, value) pairs that will be used to update + the task's 'envs' section. If it is a chain dag, the envs will be updated + for all tasks in the chain. Returns: A chain Dag with 1 or more tasks (an empty entrypoint would create a @@ -90,12 +90,6 @@ def load_chain_dag_from_yaml( # YAML has only `name: xxx`. Still instantiate a task. configs = [{'name': dag_name}] - if len(configs) > 1: - # TODO(zongheng): in a chain DAG of N tasks, cli.py currently makes the - # decision to not apply overrides. Here we maintain this behavior. 
We - # can listen to user feedback to change this. - env_overrides = None - current_task = None with dag_lib.Dag() as dag: for task_config in configs: