From 3f3e53b3e75983e921b5bfeec9e629e43b38cc5c Mon Sep 17 00:00:00 2001
From: Kebe
Date: Mon, 18 Mar 2024 05:59:30 +0000
Subject: [PATCH] Add all-in-one GPT2 training image

---
 .github/workflows/build-train-gpt-example.yml | 44 ++++++++++++
 examples/pre-train-gpt/Dockerfile             | 15 +++++
 examples/pre-train-gpt/train-gpt.sh           | 72 ++++++++++++++++++++
 3 files changed, 131 insertions(+)
 create mode 100644 .github/workflows/build-train-gpt-example.yml
 create mode 100644 examples/pre-train-gpt/Dockerfile
 create mode 100644 examples/pre-train-gpt/train-gpt.sh

diff --git a/.github/workflows/build-train-gpt-example.yml b/.github/workflows/build-train-gpt-example.yml
new file mode 100644
index 0000000..7a73021
--- /dev/null
+++ b/.github/workflows/build-train-gpt-example.yml
@@ -0,0 +1,44 @@
+name: Build GPT2 train image
+on:
+  push:
+    paths:
+      - 'examples/pre-train-gpt/**'
+
+jobs:
+  docker-build-train-gpt-example:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: |
+            ghcr.io/BaizeAI/train-gpt2-example
+          tags: |
+            type=raw,value=latest
+            type=raw,value=${{ github.run_id }}
+
+      - name: Build
+        uses: docker/build-push-action@v4
+        with:
+          context: examples/pre-train-gpt
+          platforms: linux/amd64
+          push: true
+          provenance: false
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/examples/pre-train-gpt/Dockerfile b/examples/pre-train-gpt/Dockerfile
new file mode 100644
index 0000000..958d33f
--- /dev/null
+++ b/examples/pre-train-gpt/Dockerfile
@@ -0,0 +1,15 @@
+# Stage 1: download the GPT-2 vocab, merges, and pre-tokenized training data
+FROM busybox AS content
+
+WORKDIR /app
+
+RUN wget http://baize-ai.daocloud.io/gpt2-train-data/gpt2-vocab.json && \
+    wget http://baize-ai.daocloud.io/gpt2-train-data/gpt2-merges.txt && \
+    wget http://baize-ai.daocloud.io/gpt2-train-data/meg-gpt2_text_document.bin && \
+    wget http://baize-ai.daocloud.io/gpt2-train-data/meg-gpt2_text_document.idx
+
+# Stage 2: bundle the data and training script on top of the NGC PyTorch image
+FROM nvcr.io/nvidia/pytorch:24.02-py3
+
+COPY --from=content /app /data/gpt2-train-data
+COPY train-gpt.sh .
diff --git a/examples/pre-train-gpt/train-gpt.sh b/examples/pre-train-gpt/train-gpt.sh
new file mode 100644
index 0000000..fc7c117
--- /dev/null
+++ b/examples/pre-train-gpt/train-gpt.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# Pre-trains GPT-2: a small "nano" config by default, or the "345M" parameter model (see TRAIN_SIZE)
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+CHECKPOINT_PATH=/checkpoints
+VOCAB_FILE=/data/gpt2-train-data/gpt2-vocab.json
+MERGE_FILE=/data/gpt2-train-data/gpt2-merges.txt
+DATA_PATH=/data/gpt2-train-data/meg-gpt2_text_document
+
+# TRAIN_SIZE selects the model configuration:
+#   nano: small enough to run on a single NVIDIA P4 GPU; any other value uses the "345M" layout.
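+# Example override (values are illustrative; EXTRA_ARGS is passed through to
+# the torchrun invocation below, so any extra pretrain_gpt.py flag works):
+#   TRAIN_SIZE=345m EXTRA_ARGS="--seed 42" bash train-gpt.sh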
+TRAIN_SIZE=${TRAIN_SIZE:-nano}
+
+if [[ ${TRAIN_SIZE} == "nano" ]]; then
+    GPT_SIZE_ARGS="
+    --num-layers 12 \
+    --hidden-size 512 \
+    --num-attention-heads 8
+    "
+else
+    GPT_SIZE_ARGS="
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --num-attention-heads 16
+    "
+fi
+
+GPT_ARGS="
+    --seq-length 1024 \
+    --max-position-embeddings 1024 \
+    --micro-batch-size 4 \
+    --global-batch-size 8 \
+    --lr 0.00015 \
+    --train-iters 500000 \
+    --lr-decay-iters 320000 \
+    --lr-decay-style cosine \
+    --min-lr 1.0e-5 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --merge-file $MERGE_FILE \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS=${OUTPUT_ARGS:-"
+    --log-interval 100 \
+    --save-interval 1000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"}
+
+# NOTE: pretrain_gpt.py is Megatron-LM's pre-training entrypoint and is assumed
+# to be available in the container's working directory.
+torchrun pretrain_gpt.py \
+    $GPT_SIZE_ARGS \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $EXTRA_ARGS \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
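-- 
Usage sketch (not part of the patch): one way the published image might be run
on a GPU host. The tag, checkpoint mount, and flags below are assumptions
(GHCR lowercases image names on push), and the script additionally expects
Megatron-LM's pretrain_gpt.py in the container's working directory:

    docker run --rm --gpus all \
      -v "$PWD/checkpoints":/checkpoints \
      -e TRAIN_SIZE=nano \
      ghcr.io/baizeai/train-gpt2-example:latest \
      bash train-gpt.sh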