Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Examples] Update nemo gpt examples #3743

Merged
merged 5 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions examples/nemo/nemo_gpt_distributed.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# Distributed training of a GPT-style model with NVIDIA NeMo on multiple nodes.
#
# Inspired by https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst
#
# Note that we provide a read-only bucket at gs://sky-wiki-data that is used to
# download preprocessed data to local disk. If you want to preprocess the data
# yourself, see nemo_gpt_preprocessing.yaml.
#
# We use a shared bucket to store the index files that are used to coordinate
# between the head and worker nodes. This shared bucket is mounted as a
# network filesystem (NFS) on the head and worker nodes.
#
# After the script completes, the model checkpoints will be saved in
# /ckpts on the head node (can be changed to /shared for cloud storage).
#
# Usage:
# sky launch --env SHARED_NFS_BUCKET_NAME=<unique_bucket_name> -c nemo_gpt nemo_gpt_distributed.yaml
#
# # Terminate cluster after you're done
# sky down nemo_gpt

resources:
  # Per-node hardware request. The values are quoted so they are always read
  # as strings ("8+", "64+") rather than being type-guessed by the YAML parser.
  cpus: '8+'
  memory: '64+'
  # One A100-80GB accelerator per node.
  accelerators: 'A100-80GB:1'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we have a test with multi-node multi-gpu as well, i.e. each node having multiple GPUs?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Confirmed it works with A100-80GB:4 on 2 nodes!

# Docker image the task runs in: the NeMo 24.05 container from nvcr.io.
# NOTE(review): this key presumably belongs to the `resources:` mapping in the
# real file; indentation is not visible in this view - confirm.
image_id: docker:nvcr.io/nvidia/nemo:24.05

# Number of nodes used by this multi-node example.
num_nodes: 2

envs:
  # Local path that the dataset bucket is copied to (see file_mounts).
  DATASET_ROOT: /wiki

  # Mount point of the shared bucket (see file_mounts).
  SHARED_NFS_ROOT: /shared

  # Enter a unique bucket name here for the shared directory - if it doesn't
  # exist SkyPilot will create it.
  SHARED_NFS_BUCKET_NAME: null

  # Store checkpoints at a local path. You can change this to /shared for
  # checkpointing to cloud bucket at every callback, but this will slow down
  # training.
  CHECKPOINT_PATH: /ckpts

file_mounts:
  # Dataset: a read-only bucket provided by SkyPilot, copied to DATASET_ROOT.
  ${DATASET_ROOT}:
    source: gs://sky-wiki-data
    mode: COPY

  # The SHARED_NFS_ROOT path acts as a network filesystem (NFS) between the
  # head and worker nodes. In NeMo, the head node writes an indexmap to this
  # shared filesystem that is read by workers.
  #
  # Note that NeMo requires this shared filesystem to be strongly consistent -
  # any writes made by the head should be immediately visible to the workers.
  #
  # We recommend using GCS in mount mode - S3 based mounts may fail with
  # "transport endpoint is not connected" error.
  ${SHARED_NFS_ROOT}:
    name: ${SHARED_NFS_BUCKET_NAME}
    store: gcs
    mode: MOUNT

setup: |
  conda deactivate

  # Clone NeMo repo if not already present
  if [ ! -d NeMo ]; then
    git clone https://github.com/NVIDIA/NeMo.git
  fi

  # Pin the checkout on every setup run, not only right after a fresh clone.
  # Otherwise a NeMo/ directory left over from an earlier (or interrupted)
  # setup would be used at whatever revision it happens to be on.
  git -C NeMo checkout 5df8e11255802a2ce2f33db6362e60990e215b64

run: |
  conda deactivate
  # ============= Training =============
  # Get the number of nodes and master address from SkyPilot envvars:
  # the line count of SKYPILOT_NODE_IPS is used as the node count and its
  # first line as the rendezvous (master) address.
  num_nodes=$(echo "$SKYPILOT_NODE_IPS" | wc -l)
  master_addr=$(echo "$SKYPILOT_NODE_IPS" | head -n1)

  # Kill any existing megatron processes
  pkill -f -9 megatron

  mkdir -p ${CHECKPOINT_PATH}

  echo "Writing checkpoints to ${CHECKPOINT_PATH}"
  echo "Writing index files to shared storage ${SHARED_NFS_ROOT}"

  # Launch one torch.distributed.run launcher per node. Everything after the
  # script path is a config override passed to megatron_gpt_pretraining.py.
  python -m torch.distributed.run \
    --nproc_per_node=${SKYPILOT_NUM_GPUS_PER_NODE} \
    --nnodes=${num_nodes} \
    --node_rank=${SKYPILOT_NODE_RANK} \
    --master_addr=${master_addr} \
    --master_port=12375 \
    NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \
    --config-path=conf \
    --config-name=megatron_gpt_config \
    trainer.devices=${SKYPILOT_NUM_GPUS_PER_NODE} \
    trainer.num_nodes=${num_nodes} \
    trainer.max_epochs=null \
    trainer.max_steps=300000 \
    trainer.val_check_interval=50 \
    trainer.log_every_n_steps=50 \
    trainer.limit_val_batches=50 \
    trainer.limit_test_batches=50 \
    trainer.accumulate_grad_batches=1 \
    trainer.precision=16 \
    model.mcore_gpt=True \
    model.micro_batch_size=6 \
    model.global_batch_size=192 \
    model.tensor_model_parallel_size=1 \
    model.pipeline_model_parallel_size=1 \
    model.max_position_embeddings=1024 \
    model.encoder_seq_length=1024 \
    model.hidden_size=768 \
    model.ffn_hidden_size=3072 \
    model.num_layers=12 \
    model.num_attention_heads=12 \
    model.init_method_std=0.021 \
    model.hidden_dropout=0.1 \
    model.layernorm_epsilon=1e-5 \
    model.tokenizer.vocab_file=${DATASET_ROOT}/gpt2-vocab.json \
    model.tokenizer.merge_file=${DATASET_ROOT}/gpt2-merges.txt \
    model.data.data_prefix=[1.0,${DATASET_ROOT}/hfbpe_gpt_training_data_text_document] \
    model.data.num_workers=2 \
    model.data.seq_length=1024 \
    model.data.splits_string=\'980,10,10\' \
    model.data.index_mapping_dir=${SHARED_NFS_ROOT} \
    model.optim.name=fused_adam \
    model.optim.lr=6e-4 \
    model.optim.betas=[0.9,0.95] \
    model.optim.weight_decay=0.1 \
    model.optim.sched.name=CosineAnnealing \
    model.optim.sched.warmup_steps=750 \
    model.optim.sched.constant_steps=80000 \
    model.optim.sched.min_lr=6e-5 \
    exp_manager.resume_if_exists=True \
    exp_manager.resume_ignore_no_checkpoint=True \
    exp_manager.create_checkpoint_callback=True \
    +exp_manager.checkpoint_callback_params.dirpath=${CHECKPOINT_PATH} \
    exp_manager.checkpoint_callback_params.monitor=val_loss \
    exp_manager.checkpoint_callback_params.save_top_k=3 \
    exp_manager.checkpoint_callback_params.mode=min \
    exp_manager.checkpoint_callback_params.always_save_nemo=True

  # Optional - if writing checkpoints to a local directory,
  # copy final checkpoints to the shared bucket at the end of training (~6 GB)
  # if [ ${SKYPILOT_NODE_RANK} -eq 0 ]; then
  #   mkdir -p ${SHARED_NFS_ROOT}/results
  #   cp -R ${CHECKPOINT_PATH} ${SHARED_NFS_ROOT}/results
  # fi
104 changes: 34 additions & 70 deletions examples/nemo/nemo_gpt_singlenode.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,98 +6,60 @@
# The specific model used here should fit on GPU with 16GB memory.
#
# After the script completes, the model checkpoints will be saved in
# ~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints on the head node.
# /ckpts (configurable through CHECKPOINT_PATH env var) on the head node.
#
# Usage:
# sky launch -s -c nemo_gpt nemo_gpt_singlenode.yaml
# sky launch -c nemo_gpt nemo_gpt_singlenode.yaml
#
# # Or try on spot A100 GPUs:
# sky launch -c nemo_gpt nemo_gpt_singlenode.yaml --use-spot --gpus A100:1
#
# # The setup will take some time (~1 hr), feel free to ctrl-c once the setup script starts
# # You can reconnect to log stream using `sky logs nemo_gpt_train`
#
# # Terminate cluster after you're done
# sky down nemo_gpt

resources:
cpus: 6+
accelerators: A100:1
cpus: 8+
memory: 64+
accelerators: A100-80GB:1
image_id: docker:nvcr.io/nvidia/nemo:24.05

num_nodes: 1

envs:
DATASET_ROOT: $HOME/wiki/
DATASET_ROOT: /wiki
CHECKPOINT_PATH: /ckpts


file_mounts:
${DATASET_ROOT}:
source: gs://sky-wiki-data # This is a read-only bucket provided by SkyPilot for the dataset
mode: COPY

setup: |
# ============== Dependency Setup ==============
conda activate nemo
if [ $? -eq 0 ]; then
echo "Nemo conda env exists"
else
echo "Setup start"

conda create -y --name nemo python==3.10.12
conda activate nemo
conda deactivate

# Install PyTorch
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install nemo
# Clone NeMo repo if not already present
if [ ! -d NeMo ]; then
git clone https://github.com/NVIDIA/NeMo.git
cd NeMo
git checkout b4ad7eaa7873d632391d6985aa6b359f39c20bab
pip install Cython
pip install .[all]
cd ..

# Install megatron-core
# We install in editable mode because setup.py does not install all
# required modules if we install in non-editable mode.
git clone https://github.com/NVIDIA/Megatron-LM
cd Megatron-LM
git checkout dc21350806361564b8ce61d4a8d247cb195cc5f0
pip install -e .
cd ..

# Install ninja for faster compilation
pip install ninja packaging

# Install transformer engine and flash-attn (Takes ~1hr to compile)
MAX_JOBS=4 pip install flash-attn==2.0.4 --no-build-isolation # Version upper capped by TransformerEngine
MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable

pip install pytorch-extension
cd NeMo
git checkout 5df8e11255802a2ce2f33db6362e60990e215b64
fi

# Install Apex
git clone https://github.com/NVIDIA/apex.git
cd apex
git checkout 52e18c894223800cb611682dce27d88050edf1de
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
cd ..

# Install gsutil if it doesn't exist
if ! command -v gsutil &> /dev/null
then
pip install gsutil
else
echo "gsutil exists"
fi
# Install gsutil if it doesn't exist
if ! command -v gsutil &> /dev/null
then
pip install gsutil
else
echo "gsutil exists"
fi

run: |
conda activate nemo
# ============= Data Download =============
# We download pre-processed data from a read-only bucket at gs://sky-wiki-data
# For more on how to pre-process data, see nemo_gpt3_preprocessing.yaml
conda deactivate

# Kill any existing megatron processes
pkill -f -9 megatron

if [ -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then
echo "Data already downloaded"
else
echo "Head node downloading data to shared bucket."
mkdir -p $DATASET_ROOT
gsutil -m cp gs://sky-wiki-data/{gpt2-merges.txt,gpt2-vocab.json,hfbpe_gpt_training_data_text_document.bin,hfbpe_gpt_training_data_text_document.idx} ${DATASET_ROOT}
fi
mkdir -p ${CHECKPOINT_PATH}

# ============= Training =============
python NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \
Expand All @@ -107,12 +69,13 @@ run: |
trainer.num_nodes=1 \
trainer.max_epochs=null \
trainer.max_steps=300000 \
trainer.val_check_interval=300 \
trainer.val_check_interval=50 \
trainer.log_every_n_steps=50 \
trainer.limit_val_batches=50 \
trainer.limit_test_batches=50 \
trainer.accumulate_grad_batches=1 \
trainer.precision=16 \
model.mcore_gpt=True \
model.micro_batch_size=6 \
model.global_batch_size=192 \
model.tensor_model_parallel_size=1 \
Expand Down Expand Up @@ -143,6 +106,7 @@ run: |
exp_manager.resume_if_exists=True \
exp_manager.resume_ignore_no_checkpoint=True \
exp_manager.create_checkpoint_callback=True \
+exp_manager.checkpoint_callback_params.dirpath=${CHECKPOINT_PATH} \
exp_manager.checkpoint_callback_params.monitor=val_loss \
exp_manager.checkpoint_callback_params.save_top_k=3 \
exp_manager.checkpoint_callback_params.mode=min \
Expand Down
Loading
Loading