Skip to content

Commit

Permalink
fix params
Browse files (browse the repository at this point in the history)
  • Loading branch information
romilbhardwaj committed Sep 14, 2023
1 parent 33dabd6 commit 5c7fe1e
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 7 deletions.
9 changes: 5 additions & 4 deletions examples/nemo/nemo_gpt3_singlenode.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,15 @@ run: |
echo "Data already downloaded"
else
echo "Head node downloading data to shared bucket."
mkdir -p $DATASET_ROOT
gsutil -m cp gs://sky-wiki-data/{gpt2-merges.txt,gpt2-vocab.json,hfbpe_gpt_training_data_text_document.bin,hfbpe_gpt_training_data_text_document.idx} ${DATASET_ROOT}
fi
# ============= Training =============
conda activate nemo
python NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \
- --config-path=NeMo/examples/nlp/language_modeling/conf \
+ --config-path=conf \
--config-name=megatron_gpt_config \
trainer.devices=${SKYPILOT_NUM_GPUS_PER_NODE} \
trainer.num_nodes=1 \
Expand All @@ -114,9 +115,9 @@ run: |
model.init_method_std=0.021 \
model.hidden_dropout=0.1 \
model.layernorm_epsilon=1e-5 \
- model.tokenizer.vocab_file=gpt2-vocab.json \
- model.tokenizer.merge_file=gpt2-merges.txt \
- model.data.data_prefix=[1.0,hfbpe_gpt_training_data_text_document] \
+ model.tokenizer.vocab_file=${DATASET_ROOT}/gpt2-vocab.json \
+ model.tokenizer.merge_file=${DATASET_ROOT}/gpt2-merges.txt \
+ model.data.data_prefix=[1.0,${DATASET_ROOT}/hfbpe_gpt_training_data_text_document] \
model.data.num_workers=2 \
model.data.seq_length=1024 \
model.data.splits_string=\'980,10,10\' \
Expand Down
6 changes: 3 additions & 3 deletions examples/nemo/nemo_gpt3_train.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,9 @@ run: |
trainer.devices=${SKYPILOT_NUM_GPUS_PER_NODE} \
trainer.num_nodes=${num_nodes} \
trainer.max_epochs=null \
- trainer.max_steps=100 \
- trainer.val_check_interval=20 \
- trainer.log_every_n_steps=10 \
+ trainer.max_steps=300000 \
+ trainer.val_check_interval=300 \
+ trainer.log_every_n_steps=50 \
trainer.limit_val_batches=50 \
trainer.limit_test_batches=50 \
trainer.accumulate_grad_batches=1 \
Expand Down

0 comments on commit 5c7fe1e

Please sign in to comment.