Add -s flag
romilbhardwaj committed Sep 13, 2023
1 parent cbcf38f commit 33dabd6
Showing 3 changed files with 22 additions and 10 deletions.
5 changes: 1 addition & 4 deletions examples/nemo/nemo_gpt3_preprocessing.yaml
@@ -8,14 +8,11 @@
# that can be downloaded to your bucket (see nemo_gpt3_train.yaml).
#
# Usage:
# sky launch -c nemo_gpt3_preprocessing nemo_gpt3_preprocessing.yaml
# sky launch -s -c nemo_gpt3_preprocessing nemo_gpt3_preprocessing.yaml
#
# # Terminate cluster after you're done
# sky down nemo_gpt3_preprocessing

resources:
accelerators: V100:1 # TODO - See if this can be run on CPU-only nodes

num_nodes: 1

envs:
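
Note on the usage change above: the -s flag passed to sky launch is assumed here to be the short form of --detach-setup, which runs the setup phase non-interactively as part of the job rather than streaming it to the terminal. A minimal sketch of the updated workflow under that assumption:

  # Launch with setup detached (assumption: -s is short for --detach-setup).
  sky launch -s -c nemo_gpt3_preprocessing nemo_gpt3_preprocessing.yaml
  # Stream the job's logs, then terminate the cluster when done.
  sky logs nemo_gpt3_preprocessing
  sky down nemo_gpt3_preprocessing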
14 changes: 10 additions & 4 deletions examples/nemo/nemo_gpt3_singlenode.yaml
@@ -1,9 +1,15 @@
# End to end Distributed training a GPT style model with Nvidia NeMo
# This script includes data download, pre-processing and training, all on a
# single node.
# Single node training a GPT style model with Nvidia NeMo
#
# This script downloads data from a read-only bucket at gs://sky-wiki-data.
# If you want to preprocess the data yourself, see nemo_gpt3_preprocessing.yaml.
#
# The specific model used here should fit on a GPU with 16 GB of memory.
#
# After the script completes, the model checkpoints will be saved in
# ~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints on the head node.
#
# Usage:
# sky launch -c nemo_gpt3 nemo_gpt3_singlenode.yaml
# sky launch -s -c nemo_gpt3 nemo_gpt3_singlenode.yaml
#
# # Or try on spot A100 GPUs:
# sky launch -c nemo_gpt3 nemo_gpt3_singlenode.yaml --use-spot --gpus A100:1
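
Since the checkpoints are written to ~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints on the head node (per the comment above), a hedged sketch for copying them to the local machine, assuming SkyPilot's usual SSH host alias for the cluster name:

  # SkyPilot adds an SSH alias matching the cluster name, so rsync can target
  # the head node directly; the local destination directory is illustrative.
  rsync -Pavz nemo_gpt3:~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints/ ./checkpoints/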
13 changes: 11 additions & 2 deletions examples/nemo/nemo_gpt3_train.yaml
@@ -6,8 +6,11 @@
# download preprocessed data to your bucket. If you want to preprocess the data
# yourself, see nemo_gpt3_preprocessing.yaml.
#
# After the script completes, the model checkpoints will be saved in
# ~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints on the head node.
#
# Usage:
# sky launch -c nemo_gpt3_train nemo_gpt3_train.yaml
# sky launch -s -c nemo_gpt3_train nemo_gpt3_train.yaml
#
# # Terminate cluster after you're done
# sky down nemo_gpt3_train
@@ -19,7 +22,7 @@ num_nodes: 2

envs:
DATASET_ROOT: /wiki
BUCKET_NAME: romil-nemo-test-bucket2 # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it
BUCKET_NAME: # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it

file_mounts:
${DATASET_ROOT}:
@@ -158,3 +161,9 @@ run: |
exp_manager.checkpoint_callback_params.save_top_k=3 \
exp_manager.checkpoint_callback_params.mode=min \
exp_manager.checkpoint_callback_params.always_save_nemo=True
# Optional - copy checkpoints to the mounted dataset bucket (~6 GB)
# if [ ${SKYPILOT_NODE_RANK} -eq 0 ]; then
# mkdir -p ${DATASET_ROOT}/results
#   cp -R ~/sky_workdir/nemo_experiments ${DATASET_ROOT}/results
# fi
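
If the optional copy above is uncommented, the checkpoints land in the bucket mounted at ${DATASET_ROOT} (/wiki), backed by the bucket named in BUCKET_NAME. A sketch for inspecting the result afterward, assuming SkyPilot created a GCS-backed bucket:

  # List SkyPilot-managed storage, then check the copied checkpoints.
  # BUCKET_NAME is whatever unique name was set in the envs section above.
  sky storage ls
  gsutil ls -r gs://$BUCKET_NAME/results/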
