From 3f3e53b3e75983e921b5bfeec9e629e43b38cc5c Mon Sep 17 00:00:00 2001
From: Kebe
Date: Mon, 18 Mar 2024 05:59:30 +0000
Subject: [PATCH] Add all-in-one GPT2 training image

---
 .github/workflows/build-train-gpt-example.yml | 44 ++++++++++++
 examples/pre-train-gpt/Dockerfile             | 15 +++++
 examples/pre-train-gpt/train-gpt.sh           | 72 ++++++++++++++++++++
 3 files changed, 131 insertions(+)
 create mode 100644 .github/workflows/build-train-gpt-example.yml
 create mode 100644 examples/pre-train-gpt/Dockerfile
 create mode 100644 examples/pre-train-gpt/train-gpt.sh

diff --git a/.github/workflows/build-train-gpt-example.yml b/.github/workflows/build-train-gpt-example.yml
new file mode 100644
index 0000000..7a73021
--- /dev/null
+++ b/.github/workflows/build-train-gpt-example.yml
@@ -0,0 +1,44 @@
+name: Build GPT2 train image
+on:
+  push:
+    paths:
+      - 'examples/pre-train-gpt/**'
+
+jobs:
+  docker-build-train-gpt-example:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: |
+            ghcr.io/BaizeAI/train-gpt2-example
+          tags: |
+            type=raw,value=latest
+            type=raw,value=${{ github.run_id }}
+
+      - name: Build
+        uses: docker/build-push-action@v4
+        with:
+          context: examples/pre-train-gpt
+          platforms: linux/amd64
+          push: true
+          provenance: false
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/examples/pre-train-gpt/Dockerfile b/examples/pre-train-gpt/Dockerfile
new file mode 100644
index 0000000..958d33f
--- /dev/null
+++ b/examples/pre-train-gpt/Dockerfile
@@ -0,0 +1,15 @@
+# Stage 1: download the GPT-2 vocab, merges, and pre-tokenized training data
+FROM busybox AS content
+
+WORKDIR /app
+
+RUN wget http://baize-ai.daocloud.io/gpt2-train-data/gpt2-vocab.json && \
+    wget http://baize-ai.daocloud.io/gpt2-train-data/gpt2-merges.txt && \
+    wget http://baize-ai.daocloud.io/gpt2-train-data/meg-gpt2_text_document.bin && \
+    wget http://baize-ai.daocloud.io/gpt2-train-data/meg-gpt2_text_document.idx
+
+# Stage 2: bundle the data and training script on top of the NGC PyTorch image
+FROM nvcr.io/nvidia/pytorch:24.02-py3
+
+COPY --from=content /app /data/gpt2-train-data
+COPY train-gpt.sh .
diff --git a/examples/pre-train-gpt/train-gpt.sh b/examples/pre-train-gpt/train-gpt.sh
new file mode 100644
index 0000000..fc7c117
--- /dev/null
+++ b/examples/pre-train-gpt/train-gpt.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# Pre-trains GPT-2: a small "nano" config by default, or the "345M" parameter model (see TRAIN_SIZE)
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+CHECKPOINT_PATH=/checkpoints
+VOCAB_FILE=/data/gpt2-train-data/gpt2-vocab.json
+MERGE_FILE=/data/gpt2-train-data/gpt2-merges.txt
+DATA_PATH=/data/gpt2-train-data/meg-gpt2_text_document
+
+# TRAIN_SIZE selects the model configuration:
+#   nano: small enough to run on a single NVIDIA P4 GPU; any other value uses the "345M" layout.
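+# Example override (values are illustrative; EXTRA_ARGS is passed through to
+# the torchrun invocation below, so any extra pretrain_gpt.py flag works):
+#   TRAIN_SIZE=345m EXTRA_ARGS="--seed 42" bash train-gpt.sh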
+TRAIN_SIZE=${TRAIN_SIZE:-nano}
+
+if [[ ${TRAIN_SIZE} == "nano" ]]; then
+    GPT_SIZE_ARGS="
+    --num-layers 12 \
+    --hidden-size 512 \
+    --num-attention-heads 8
+    "
+else
+    GPT_SIZE_ARGS="
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --num-attention-heads 16
+    "
+fi
+
+GPT_ARGS="
+    --seq-length 1024 \
+    --max-position-embeddings 1024 \
+    --micro-batch-size 4 \
+    --global-batch-size 8 \
+    --lr 0.00015 \
+    --train-iters 500000 \
+    --lr-decay-iters 320000 \
+    --lr-decay-style cosine \
+    --min-lr 1.0e-5 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --merge-file $MERGE_FILE \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS=${OUTPUT_ARGS:-"
+    --log-interval 100 \
+    --save-interval 1000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"}
+
+# NOTE: pretrain_gpt.py is Megatron-LM's pre-training entrypoint and is assumed
+# to be available in the container's working directory.
+torchrun pretrain_gpt.py \
+    $GPT_SIZE_ARGS \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $EXTRA_ARGS \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
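-- 
Usage sketch (not part of the patch): one way the published image might be run
on a GPU host. The tag, checkpoint mount, and flags below are assumptions
(GHCR lowercases image names on push), and the script additionally expects
Megatron-LM's pretrain_gpt.py in the container's working directory:

    docker run --rm --gpus all \
      -v "$PWD/checkpoints":/checkpoints \
      -e TRAIN_SIZE=nano \
      ghcr.io/baizeai/train-gpt2-example:latest \
      bash train-gpt.sh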