Commit

+ add gpt2 train all-in-one image
kebe7jun committed Mar 18, 2024
1 parent f51df12 commit 3f3e53b
Showing 3 changed files with 124 additions and 0 deletions.
44 changes: 44 additions & 0 deletions .github/workflows/build-train-gpt-example.yml
@@ -0,0 +1,44 @@
name: Build GPT2 train image
on:
  push:
    paths:
      - 'examples/pre-train-gpt/**'

jobs:
  docker-build-train-gpt-example:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: |
            ghcr.io/BaizeAI/train-gpt2-example
          tags: |
            type=raw,value=latest
            type=raw,value=${{ github.run_id }}
      - name: Build
        uses: docker/build-push-action@v4
        with:
          context: examples/pre-train-gpt
          platforms: linux/amd64
          push: true
          provenance: false
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
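
Once this workflow has run, the published image can be pulled directly from GHCR. A usage sketch (GHCR serves the repository path lowercased; each build is tagged both latest and with its run id):

#!/bin/bash
# Pull the image this workflow publishes. The :latest tag tracks the most
# recent build; a specific github.run_id tag can be used instead to pin one.
docker pull ghcr.io/baizeai/train-gpt2-example:latest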
13 changes: 13 additions & 0 deletions examples/pre-train-gpt/Dockerfile
@@ -0,0 +1,13 @@
# Stage 1: fetch the GPT-2 vocab, merges, and pre-tokenized Megatron data files.
FROM busybox AS content

WORKDIR /app

RUN wget http://baize-ai.daocloud.io/gpt2-train-data/gpt2-vocab.json && \
    wget http://baize-ai.daocloud.io/gpt2-train-data/gpt2-merges.txt && \
    wget http://baize-ai.daocloud.io/gpt2-train-data/meg-gpt2_text_document.bin && \
    wget http://baize-ai.daocloud.io/gpt2-train-data/meg-gpt2_text_document.idx

# Stage 2: NVIDIA PyTorch base image, with the downloaded data baked in and
# the training entrypoint copied into the working directory.
FROM nvcr.io/nvidia/pytorch:24.02-py3

COPY --from=content /app /data/gpt2-train-data
COPY train-gpt.sh .
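
For local iteration, the same image can be built without going through CI. A sketch, run from the repository root; the tag is an arbitrary local name, not the one the workflow publishes:

#!/bin/bash
# Build the all-in-one training image from the same context the workflow uses.
docker build -t train-gpt2-example:dev examples/pre-train-gpt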
67 changes: 67 additions & 0 deletions examples/pre-train-gpt/train-gpt.sh
@@ -0,0 +1,67 @@
#!/bin/bash

# Pre-trains a GPT-2 model with Megatron-LM. The non-nano configuration
# below corresponds to the "345M" parameter model.

export CUDA_DEVICE_MAX_CONNECTIONS=1

CHECKPOINT_PATH=/checkpoints
VOCAB_FILE=/data/gpt2-train-data/gpt2-vocab.json
MERGE_FILE=/data/gpt2-train-data/gpt2-merges.txt
DATA_PATH=/data/gpt2-train-data/meg-gpt2_text_document

# TRAIN_SIZE selects the model configuration:
#   nano: small enough to run on a single P4 GPU (default).
#   any other value: the full 345M-parameter configuration.
TRAIN_SIZE=${TRAIN_SIZE:-nano}

if [[ ${TRAIN_SIZE} == "nano" ]]; then
    GPT_SIZE_ARGS="
        --num-layers 12 \
        --hidden-size 512 \
        --num-attention-heads 8
    "
else
    GPT_SIZE_ARGS="
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16
    "
fi

GPT_ARGS="
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 4 \
--global-batch-size 8 \
--lr 0.00015 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"

DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--split 949,50,1
"

OUTPUT_ARGS=${OUTPUT_ARGS:-"
    --log-interval 100 \
    --save-interval 1000 \
    --eval-interval 1000 \
    --eval-iters 10
"}

# OUTPUT_ARGS and EXTRA_ARGS may be supplied via the environment; EXTRA_ARGS
# is empty by default and is the hook for e.g. Megatron parallelism flags.
torchrun pretrain_gpt.py \
    $GPT_SIZE_ARGS \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    $EXTRA_ARGS \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH
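
To exercise the script end to end from the published image, something like the following should work (a sketch, assuming a host with the NVIDIA container toolkit; the checkpoint mount and the EXTRA_ARGS value are illustrative):

#!/bin/bash
# Run nano-size training on one GPU; /checkpoints is where the script saves
# and resumes from. The EXTRA_ARGS value is an illustrative Megatron flag.
docker run --rm --gpus all \
    -e TRAIN_SIZE=nano \
    -e EXTRA_ARGS="--tensor-model-parallel-size 1" \
    -v "$PWD/checkpoints:/checkpoints" \
    ghcr.io/baizeai/train-gpt2-example:latest \
    bash train-gpt.sh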
