Add -s flag
romilbhardwaj committed Sep 13, 2023
1 parent cbcf38f commit 33dabd6
Showing 3 changed files with 22 additions and 10 deletions.
5 changes: 1 addition & 4 deletions examples/nemo/nemo_gpt3_preprocessing.yaml
@@ -8,14 +8,11 @@
# that can be downloaded to your bucket (see nemo_gpt3_train.yaml).
#
# Usage:
# sky launch -c nemo_gpt3_preprocessing nemo_gpt3_preprocessing.yaml
# sky launch -s -c nemo_gpt3_preprocessing nemo_gpt3_preprocessing.yaml
#
# # Terminate cluster after you're done
# sky down nemo_gpt3_preprocessing

resources:
accelerators: V100:1 # TODO - See if this can be run on CPU-only nodes

num_nodes: 1

envs:
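
Note on the usage change above: the -s flag passed to sky launch is assumed here to be the short form of --detach-setup, which runs the setup phase non-interactively as part of the job rather than streaming it to the terminal. A minimal sketch of the updated workflow under that assumption:

  # Launch with setup detached (assumption: -s is short for --detach-setup).
  sky launch -s -c nemo_gpt3_preprocessing nemo_gpt3_preprocessing.yaml
  # Stream the job's logs, then terminate the cluster when done.
  sky logs nemo_gpt3_preprocessing
  sky down nemo_gpt3_preprocessing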
14 changes: 10 additions & 4 deletions examples/nemo/nemo_gpt3_singlenode.yaml
@@ -1,9 +1,15 @@
# End to end Distributed training a GPT style model with Nvidia NeMo
# This script includes data download, pre-processing and training, all on a
# single node.
# Single node training a GPT style model with Nvidia NeMo
#
# This script downloads data from a read-only bucket at gs://sky-wiki-data.
# If you want to preprocess the data yourself, see nemo_gpt3_preprocessing.yaml.
#
# The specific model used here should fit on a GPU with 16 GB of memory.
#
# After the script completes, the model checkpoints will be saved in
# ~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints on the head node.
#
# Usage:
# sky launch -c nemo_gpt3 nemo_gpt3_singlenode.yaml
# sky launch -s -c nemo_gpt3 nemo_gpt3_singlenode.yaml
#
# # Or try on spot A100 GPUs:
# sky launch -c nemo_gpt3 nemo_gpt3_singlenode.yaml --use-spot --gpus A100:1
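
Since the checkpoints are written to ~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints on the head node (per the comment above), a hedged sketch for copying them to the local machine, assuming SkyPilot's usual SSH host alias for the cluster name:

  # SkyPilot adds an SSH alias matching the cluster name, so rsync can target
  # the head node directly; the local destination directory is illustrative.
  rsync -Pavz nemo_gpt3:~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints/ ./checkpoints/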
13 changes: 11 additions & 2 deletions examples/nemo/nemo_gpt3_train.yaml
@@ -6,8 +6,11 @@
# download preprocessed data to your bucket. If you want to preprocess the data
# yourself, see nemo_gpt3_preprocessing.yaml.
#
# After the script completes, the model checkpoints will be saved in
# ~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints on the head node.
#
# Usage:
# sky launch -c nemo_gpt3_train nemo_gpt3_train.yaml
# sky launch -s -c nemo_gpt3_train nemo_gpt3_train.yaml
#
# # Terminate cluster after you're done
# sky down nemo_gpt3_train
@@ -19,7 +22,7 @@ num_nodes: 2

envs:
DATASET_ROOT: /wiki
BUCKET_NAME: romil-nemo-test-bucket2 # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it
BUCKET_NAME: # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it

file_mounts:
${DATASET_ROOT}:
@@ -158,3 +161,9 @@ run: |
exp_manager.checkpoint_callback_params.save_top_k=3 \
exp_manager.checkpoint_callback_params.mode=min \
exp_manager.checkpoint_callback_params.always_save_nemo=True
# Optional - copy checkpoints to the mounted dataset bucket (~6 GB)
# if [ ${SKYPILOT_NODE_RANK} -eq 0 ]; then
# mkdir -p ${DATASET_ROOT}/results
#   cp -R ~/sky_workdir/nemo_experiments ${DATASET_ROOT}/results
# fi
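
If the optional copy above is uncommented, the checkpoints land in the bucket mounted at ${DATASET_ROOT} (/wiki), backed by the bucket named in BUCKET_NAME. A sketch for inspecting the result afterward, assuming SkyPilot created a GCS-backed bucket:

  # List SkyPilot-managed storage, then check the copied checkpoints.
  # BUCKET_NAME is whatever unique name was set in the envs section above.
  sky storage ls
  gsutil ls -r gs://$BUCKET_NAME/results/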
