From 56b263c960f683349e306f77e6dba435dc78b011 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 9 Jul 2024 14:07:27 -0700 Subject: [PATCH 1/5] Update nemo to use container + mcoregpt --- examples/nemo/nemo_gpt_train.yaml | 77 +++++++++---------------------- 1 file changed, 22 insertions(+), 55 deletions(-) diff --git a/examples/nemo/nemo_gpt_train.yaml b/examples/nemo/nemo_gpt_train.yaml index 125e3665289..08ba718343e 100644 --- a/examples/nemo/nemo_gpt_train.yaml +++ b/examples/nemo/nemo_gpt_train.yaml @@ -19,80 +19,43 @@ # sky down nemo_gpt_train resources: - cpus: 6+ - accelerators: A100:1 + cpus: 16+ + memory: 128+ + accelerators: A100-80GB:4 + image_id: docker:nvcr.io/nvidia/nemo:24.05 num_nodes: 2 envs: DATASET_ROOT: /wiki - BUCKET_NAME: # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it + BUCKET_NAME: romil-sky-wiki # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it file_mounts: ${DATASET_ROOT}: name: ${BUCKET_NAME} store: gcs # We recommend using GCS for large datasets in mount mode - S3 based mounts may fail with "transport endpoint is not connected" error. mode: MOUNT - + setup: | # ============== Dependency Setup ============== - conda activate nemo - if [ $? -eq 0 ]; then - echo "Nemo conda env exists" - else - echo "Setup start" - - conda create -y --name nemo python==3.10.12 - conda activate nemo - - # Install PyTorch - pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 - - # Install nemo + conda deactivate + # Clone NeMo if not already present + if [ ! -d NeMo ]; then git clone https://github.com/NVIDIA/NeMo.git - cd NeMo - git checkout b4ad7eaa7873d632391d6985aa6b359f39c20bab - pip install Cython - pip install .[all] - cd .. - - # Install megatron-core - # We install in editable mode because setup.py does not install all - # required modules if we install in non-editable mode. - git clone https://github.com/NVIDIA/Megatron-LM - cd Megatron-LM - git checkout dc21350806361564b8ce61d4a8d247cb195cc5f0 - pip install -e . - cd .. - - # Install ninja for faster compilation - pip install ninja packaging - - # Install transformer engine and flash-attn (Takes ~1hr to compile) - MAX_JOBS=4 pip install flash-attn==2.0.4 --no-build-isolation # Version upper capped by TransformerEngine - MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable - - pip install pytorch-extension + git checkout 5df8e11255802a2ce2f33db6362e60990e215b64 + fi - # Install Apex - git clone https://github.com/NVIDIA/apex.git - cd apex - git checkout 52e18c894223800cb611682dce27d88050edf1de - pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ - cd .. - - # Install gsutil if it doesn't exist - if ! command -v gsutil &> /dev/null - then - pip install gsutil - else - echo "gsutil exists" - fi + # Install gsutil if it doesn't exist + if ! command -v gsutil &> /dev/null + then + pip install gsutil + else + echo "gsutil exists" fi run: | - conda activate nemo + conda deactivate # ============= Data Download ============= # We download pre-processed data from a read-only bucket at gs://sky-wiki-data # to our shared bucket at gs://${BUCKET_NAME}. 
@@ -120,6 +83,9 @@ run: | num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l` master_addr=`echo "$SKYPILOT_NODE_IPS" | head -n1` + # Kill any existing megatron processes + pkill -f -9 megatron + python -m torch.distributed.run \ --nproc_per_node=${SKYPILOT_NUM_GPUS_PER_NODE} \ --nnodes=${num_nodes} \ @@ -139,6 +105,7 @@ run: | trainer.limit_test_batches=50 \ trainer.accumulate_grad_batches=1 \ trainer.precision=16 \ + model.mcore_gpt=True \ model.micro_batch_size=6 \ model.global_batch_size=192 \ model.tensor_model_parallel_size=1 \ From c93f8be0a1844b1256fa84368b86d4c87844eb53 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 9 Jul 2024 14:51:07 -0700 Subject: [PATCH 2/5] Update nemo to use container + mcoregpt + checkpoint to bucket --- examples/nemo/nemo_gpt_train.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/nemo/nemo_gpt_train.yaml b/examples/nemo/nemo_gpt_train.yaml index 08ba718343e..7a14d0021a2 100644 --- a/examples/nemo/nemo_gpt_train.yaml +++ b/examples/nemo/nemo_gpt_train.yaml @@ -86,6 +86,11 @@ run: | # Kill any existing megatron processes pkill -f -9 megatron + # Store checkpoints on the shared dataset bucket + # Create a directory to store checkpoints + CHECKPOINT_PATH=${DATASET_ROOT}/results + mkdir -p ${CHECKPOINT_PATH} + python -m torch.distributed.run \ --nproc_per_node=${SKYPILOT_NUM_GPUS_PER_NODE} \ --nnodes=${num_nodes} \ @@ -99,7 +104,7 @@ run: | trainer.num_nodes=${num_nodes} \ trainer.max_epochs=null \ trainer.max_steps=300000 \ - trainer.val_check_interval=300 \ + trainer.val_check_interval=50 \ trainer.log_every_n_steps=50 \ trainer.limit_val_batches=50 \ trainer.limit_test_batches=50 \ @@ -136,6 +141,7 @@ run: | exp_manager.resume_if_exists=True \ exp_manager.resume_ignore_no_checkpoint=True \ exp_manager.create_checkpoint_callback=True \ + +exp_manager.checkpoint_callback_params.dirpath=${CHECKPOINT_PATH} \ exp_manager.checkpoint_callback_params.monitor=val_loss \ exp_manager.checkpoint_callback_params.save_top_k=3 \ exp_manager.checkpoint_callback_params.mode=min \ From 9ccd2462c4cdc52cf461976d3d815c7c4a85129a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 9 Jul 2024 15:46:08 -0700 Subject: [PATCH 3/5] update --- examples/nemo/nemo_gpt_train.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/nemo/nemo_gpt_train.yaml b/examples/nemo/nemo_gpt_train.yaml index 7a14d0021a2..7130260254f 100644 --- a/examples/nemo/nemo_gpt_train.yaml +++ b/examples/nemo/nemo_gpt_train.yaml @@ -43,6 +43,7 @@ setup: | # Clone NeMo if not already present if [ ! -d NeMo ]; then git clone https://github.com/NVIDIA/NeMo.git + cd NeMo git checkout 5df8e11255802a2ce2f33db6362e60990e215b64 fi From 7eac148437460e2be05edc9227f051372aa62ef9 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 9 Jul 2024 18:57:50 -0700 Subject: [PATCH 4/5] Update nemo examples --- examples/nemo/nemo_gpt_singlenode.yaml | 91 +++++++++----------------- examples/nemo/nemo_gpt_train.yaml | 38 ++++++----- 2 files changed, 53 insertions(+), 76 deletions(-) diff --git a/examples/nemo/nemo_gpt_singlenode.yaml b/examples/nemo/nemo_gpt_singlenode.yaml index 079214717e3..58612afd118 100644 --- a/examples/nemo/nemo_gpt_singlenode.yaml +++ b/examples/nemo/nemo_gpt_singlenode.yaml @@ -6,23 +6,22 @@ # The specific model used here should fit on GPU with 16GB memory. # # After the script completes, the model checkpoints will be saved in -# ~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints on the head node. +# ~/sky_workdir/results on the head node. 
# # Usage: -# sky launch -s -c nemo_gpt nemo_gpt_singlenode.yaml +# sky launch -c nemo_gpt nemo_gpt_singlenode.yaml # # # Or try on spot A100 GPUs: # sky launch -c nemo_gpt nemo_gpt_singlenode.yaml --use-spot --gpus A100:1 # -# # The setup will take some time (~1 hr), feel free to ctrl-c once the setup script starts -# # You can reconnect to log stream using `sky logs nemo_gpt_train` -# # # Terminate cluster after you're done # sky down nemo_gpt resources: - cpus: 6+ - accelerators: A100:1 + cpus: 8+ + memory: 64+ + accelerators: A100-80GB:1 + image_id: docker:nvcr.io/nvidia/nemo:24.05 num_nodes: 1 @@ -30,63 +29,25 @@ envs: DATASET_ROOT: $HOME/wiki/ setup: | - # ============== Dependency Setup ============== - conda activate nemo - if [ $? -eq 0 ]; then - echo "Nemo conda env exists" - else - echo "Setup start" - - conda create -y --name nemo python==3.10.12 - conda activate nemo + conda deactivate - # Install PyTorch - pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 - - # Install nemo + # Clone NeMo repo if not already present + if [ ! -d NeMo ]; then git clone https://github.com/NVIDIA/NeMo.git - cd NeMo - git checkout b4ad7eaa7873d632391d6985aa6b359f39c20bab - pip install Cython - pip install .[all] - cd .. - - # Install megatron-core - # We install in editable mode because setup.py does not install all - # required modules if we install in non-editable mode. - git clone https://github.com/NVIDIA/Megatron-LM - cd Megatron-LM - git checkout dc21350806361564b8ce61d4a8d247cb195cc5f0 - pip install -e . - cd .. - - # Install ninja for faster compilation - pip install ninja packaging - - # Install transformer engine and flash-attn (Takes ~1hr to compile) - MAX_JOBS=4 pip install flash-attn==2.0.4 --no-build-isolation # Version upper capped by TransformerEngine - MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable - - pip install pytorch-extension + cd NeMo + git checkout 5df8e11255802a2ce2f33db6362e60990e215b64 + fi - # Install Apex - git clone https://github.com/NVIDIA/apex.git - cd apex - git checkout 52e18c894223800cb611682dce27d88050edf1de - pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ - cd .. - - # Install gsutil if it doesn't exist - if ! command -v gsutil &> /dev/null - then - pip install gsutil - else - echo "gsutil exists" - fi + # Install gsutil if it doesn't exist + if ! command -v gsutil &> /dev/null + then + pip install gsutil + else + echo "gsutil exists" fi run: | - conda activate nemo + conda deactivate # ============= Data Download ============= # We download pre-processed data from a read-only bucket at gs://sky-wiki-data # For more on how to pre-process data, see nemo_gpt3_preprocessing.yaml @@ -94,11 +55,21 @@ run: | if [ -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then echo "Data already downloaded" else - echo "Head node downloading data to shared bucket." + echo "Head node downloading data to local path." mkdir -p $DATASET_ROOT gsutil -m cp gs://sky-wiki-data/{gpt2-merges.txt,gpt2-vocab.json,hfbpe_gpt_training_data_text_document.bin,hfbpe_gpt_training_data_text_document.idx} ${DATASET_ROOT} fi + # Kill any existing megatron processes + pkill -f -9 megatron + + # Store checkpoints at a local path. + # You can change this to the shared bucket for checkpointing to cloud bucket + # at every callback, but this will slow down training. 
+ # CHECKPOINT_PATH=${DATASET_ROOT}/results + CHECKPOINT_PATH=~/sky_workdir/results + mkdir -p ${CHECKPOINT_PATH} + # ============= Training ============= python NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \ --config-path=conf \ @@ -113,6 +84,7 @@ run: | trainer.limit_test_batches=50 \ trainer.accumulate_grad_batches=1 \ trainer.precision=16 \ + model.mcore_gpt=True \ model.micro_batch_size=6 \ model.global_batch_size=192 \ model.tensor_model_parallel_size=1 \ @@ -143,6 +115,7 @@ run: | exp_manager.resume_if_exists=True \ exp_manager.resume_ignore_no_checkpoint=True \ exp_manager.create_checkpoint_callback=True \ + +exp_manager.checkpoint_callback_params.dirpath=${CHECKPOINT_PATH} \ exp_manager.checkpoint_callback_params.monitor=val_loss \ exp_manager.checkpoint_callback_params.save_top_k=3 \ exp_manager.checkpoint_callback_params.mode=min \ diff --git a/examples/nemo/nemo_gpt_train.yaml b/examples/nemo/nemo_gpt_train.yaml index 7130260254f..db94d4ad04a 100644 --- a/examples/nemo/nemo_gpt_train.yaml +++ b/examples/nemo/nemo_gpt_train.yaml @@ -7,28 +7,25 @@ # yourself, see nemo_gpt_preprocessing.yaml. # # After the script completes, the model checkpoints will be saved in -# ~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints on the head node. +# ~/sky_workdir/results on the head node. # # Usage: -# sky launch -s -c nemo_gpt_train nemo_gpt_train.yaml -# -# # The setup will take some time (~1 hr), feel free to ctrl-c once the setup script starts -# # You can reconnect to log stream using `sky logs nemo_gpt_train` +# sky launch --env BUCKET_NAME= -c nemo_gpt_train nemo_gpt_train.yaml # # # Terminate cluster after you're done # sky down nemo_gpt_train resources: - cpus: 16+ - memory: 128+ - accelerators: A100-80GB:4 + cpus: 8+ + memory: 64+ + accelerators: A100-80GB:1 image_id: docker:nvcr.io/nvidia/nemo:24.05 num_nodes: 2 envs: DATASET_ROOT: /wiki - BUCKET_NAME: romil-sky-wiki # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it + BUCKET_NAME: # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it file_mounts: ${DATASET_ROOT}: @@ -37,10 +34,10 @@ file_mounts: mode: MOUNT -setup: | - # ============== Dependency Setup ============== +setup: | conda deactivate - # Clone NeMo if not already present + + # Clone NeMo repo if not already present if [ ! -d NeMo ]; then git clone https://github.com/NVIDIA/NeMo.git cd NeMo @@ -54,6 +51,7 @@ setup: | else echo "gsutil exists" fi + run: | conda deactivate @@ -64,6 +62,9 @@ run: | # This bucket acts as a network filesystem (NFS) between the head node and # worker nodes. In our training script, the head node writes a index # file to this shared filesystem that is read by workers. + # + # Note that NeMo requires this shared filesystem to be strongly consistent - + # any writes made by the head should be immediately visible to the workers. if [ ${SKYPILOT_NODE_RANK} -eq 0 ]; then if [ -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then @@ -87,9 +88,11 @@ run: | # Kill any existing megatron processes pkill -f -9 megatron - # Store checkpoints on the shared dataset bucket - # Create a directory to store checkpoints - CHECKPOINT_PATH=${DATASET_ROOT}/results + # Store checkpoints at a local path. + # You can change this to the shared bucket for checkpointing to cloud bucket + # at every callback, but this will slow down training. 
+ # CHECKPOINT_PATH=${DATASET_ROOT}/results + CHECKPOINT_PATH=~/sky_workdir/results mkdir -p ${CHECKPOINT_PATH} python -m torch.distributed.run \ @@ -105,7 +108,7 @@ run: | trainer.num_nodes=${num_nodes} \ trainer.max_epochs=null \ trainer.max_steps=300000 \ - trainer.val_check_interval=50 \ + trainer.val_check_interval=100 \ trainer.log_every_n_steps=50 \ trainer.limit_val_batches=50 \ trainer.limit_test_batches=50 \ @@ -148,7 +151,8 @@ run: | exp_manager.checkpoint_callback_params.mode=min \ exp_manager.checkpoint_callback_params.always_save_nemo=True - # Optional - copy checkpoints to the mounted dataset bucket (~6 GB) + # Optional - if writing checkpoints to a local directory, + # copy checkpoints to the mounted dataset bucket (~6 GB) # if [ ${SKYPILOT_NODE_RANK} -eq 0 ]; then # mkdir -p ${DATASET_ROOT}/results # cp -R ~/sky_workdir/nemo_experiments From a6c00c78be978c459bd4a153eef3ea2aa7e0559e Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 10 Jul 2024 14:09:27 -0700 Subject: [PATCH 5/5] Update nemo examples --- ...t_train.yaml => nemo_gpt_distributed.yaml} | 82 +++++++------------ examples/nemo/nemo_gpt_singlenode.yaml | 31 +++---- 2 files changed, 42 insertions(+), 71 deletions(-) rename examples/nemo/{nemo_gpt_train.yaml => nemo_gpt_distributed.yaml} (63%) diff --git a/examples/nemo/nemo_gpt_train.yaml b/examples/nemo/nemo_gpt_distributed.yaml similarity index 63% rename from examples/nemo/nemo_gpt_train.yaml rename to examples/nemo/nemo_gpt_distributed.yaml index db94d4ad04a..ac5441d4ac0 100644 --- a/examples/nemo/nemo_gpt_train.yaml +++ b/examples/nemo/nemo_gpt_distributed.yaml @@ -3,17 +3,21 @@ # Inspired from https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst # # Note that we provide a read-only bucket at gs://sky-wiki-data that is used to -# download preprocessed data to your bucket. If you want to preprocess the data +# download preprocessed data to local disk. If you want to preprocess the data # yourself, see nemo_gpt_preprocessing.yaml. # +# We use a shared bucket to store the index files that are used to coordinate +# between the head and worker nodes. This shared bucket is mounted as a +# network filesystem (NFS) on the head and worker nodes. +# # After the script completes, the model checkpoints will be saved in -# ~/sky_workdir/results on the head node. +# /ckpts on the head node (can be changed to /shared for cloud storage). # # Usage: -# sky launch --env BUCKET_NAME= -c nemo_gpt_train nemo_gpt_train.yaml +# sky launch --env SHARED_NFS_BUCKET_NAME= -c nemo_gpt nemo_gpt_distributed.yaml # # # Terminate cluster after you're done -# sky down nemo_gpt_train +# sky down nemo_gpt resources: cpus: 8+ @@ -25,14 +29,25 @@ num_nodes: 2 envs: DATASET_ROOT: /wiki - BUCKET_NAME: # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it + SHARED_NFS_ROOT: /shared + SHARED_NFS_BUCKET_NAME: # Enter a unique bucket name here for the shared directory - if it doesn't exist SkyPilot will create it + CHECKPOINT_PATH: /ckpts # Store checkpoints at a local path. You can change this to /shared for checkpointing to cloud bucket at every callback, but this will slow down training. file_mounts: ${DATASET_ROOT}: - name: ${BUCKET_NAME} - store: gcs # We recommend using GCS for large datasets in mount mode - S3 based mounts may fail with "transport endpoint is not connected" error. 
- mode: MOUNT + source: gs://sky-wiki-data # This is a read-only bucket provided by SkyPilot for the dataset + mode: COPY + # The SHARED_NFS_ROOT path acts as a network filesystem (NFS) between the + # head and worker nodes. In NeMo, the head node writes an indexmap to this + # shared filesystem that is read by workers. + # + # Note that NeMo requires this shared filesystem to be strongly consistent - + # any writes made by the head should be immediately visible to the workers. + ${SHARED_NFS_ROOT}: + name: ${SHARED_NFS_BUCKET_NAME} + store: gcs # We recommend using GCS in mount mode - S3 based mounts may fail with "transport endpoint is not connected" error. + mode: MOUNT setup: | conda deactivate @@ -43,43 +58,9 @@ setup: | cd NeMo git checkout 5df8e11255802a2ce2f33db6362e60990e215b64 fi - - # Install gsutil if it doesn't exist - if ! command -v gsutil &> /dev/null - then - pip install gsutil - else - echo "gsutil exists" - fi - run: | conda deactivate - # ============= Data Download ============= - # We download pre-processed data from a read-only bucket at gs://sky-wiki-data - # to our shared bucket at gs://${BUCKET_NAME}. - # - # This bucket acts as a network filesystem (NFS) between the head node and - # worker nodes. In our training script, the head node writes a index - # file to this shared filesystem that is read by workers. - # - # Note that NeMo requires this shared filesystem to be strongly consistent - - # any writes made by the head should be immediately visible to the workers. - - if [ ${SKYPILOT_NODE_RANK} -eq 0 ]; then - if [ -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then - echo "Data already downloaded" - else - echo "Head node downloading data to shared bucket." - gsutil -m cp gs://sky-wiki-data/{gpt2-merges.txt,gpt2-vocab.json,hfbpe_gpt_training_data_text_document.bin,hfbpe_gpt_training_data_text_document.idx} ${DATASET_ROOT} - fi - else - while [ ! -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; do - echo "Worker ${SKYPILOT_NODE_RANK} - waiting for data to be downloaded to shared bucket." - sleep 1 - done - fi - # ============= Training ============= # Get the number of nodes and master address from SkyPilot envvars num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l` @@ -88,13 +69,11 @@ run: | # Kill any existing megatron processes pkill -f -9 megatron - # Store checkpoints at a local path. - # You can change this to the shared bucket for checkpointing to cloud bucket - # at every callback, but this will slow down training. 
- # CHECKPOINT_PATH=${DATASET_ROOT}/results - CHECKPOINT_PATH=~/sky_workdir/results mkdir -p ${CHECKPOINT_PATH} + echo "Writing checkpoints to ${CHECKPOINT_PATH}" + echo "Writing index files to shared storage ${SHARED_NFS_ROOT}" + python -m torch.distributed.run \ --nproc_per_node=${SKYPILOT_NUM_GPUS_PER_NODE} \ --nnodes=${num_nodes} \ @@ -108,7 +87,7 @@ run: | trainer.num_nodes=${num_nodes} \ trainer.max_epochs=null \ trainer.max_steps=300000 \ - trainer.val_check_interval=100 \ + trainer.val_check_interval=50 \ trainer.log_every_n_steps=50 \ trainer.limit_val_batches=50 \ trainer.limit_test_batches=50 \ @@ -134,6 +113,7 @@ run: | model.data.num_workers=2 \ model.data.seq_length=1024 \ model.data.splits_string=\'980,10,10\' \ + model.data.index_mapping_dir=${SHARED_NFS_ROOT} \ model.optim.name=fused_adam \ model.optim.lr=6e-4 \ model.optim.betas=[0.9,0.95] \ @@ -152,8 +132,8 @@ run: | exp_manager.checkpoint_callback_params.always_save_nemo=True # Optional - if writing checkpoints to a local directory, - # copy checkpoints to the mounted dataset bucket (~6 GB) + # copy final checkpoints to the shared bucket at the end of training (~6 GB) # if [ ${SKYPILOT_NODE_RANK} -eq 0 ]; then - # mkdir -p ${DATASET_ROOT}/results - # cp -R ~/sky_workdir/nemo_experiments + # mkdir -p ${SHARED_NFS_ROOT}/results + # cp -R ${CHECKPOINT_PATH} # fi diff --git a/examples/nemo/nemo_gpt_singlenode.yaml b/examples/nemo/nemo_gpt_singlenode.yaml index 58612afd118..ff5798e7e13 100644 --- a/examples/nemo/nemo_gpt_singlenode.yaml +++ b/examples/nemo/nemo_gpt_singlenode.yaml @@ -6,7 +6,7 @@ # The specific model used here should fit on GPU with 16GB memory. # # After the script completes, the model checkpoints will be saved in -# ~/sky_workdir/results on the head node. +# /ckpts (configurable through CHECKPOINT_PATH env var) on the head node. # # Usage: # sky launch -c nemo_gpt nemo_gpt_singlenode.yaml @@ -26,7 +26,14 @@ resources: num_nodes: 1 envs: - DATASET_ROOT: $HOME/wiki/ + DATASET_ROOT: /wiki + CHECKPOINT_PATH: /ckpts + + +file_mounts: + ${DATASET_ROOT}: + source: gs://sky-wiki-data # This is a read-only bucket provided by SkyPilot for the dataset + mode: COPY setup: | conda deactivate @@ -48,26 +55,10 @@ setup: | run: | conda deactivate - # ============= Data Download ============= - # We download pre-processed data from a read-only bucket at gs://sky-wiki-data - # For more on how to pre-process data, see nemo_gpt3_preprocessing.yaml - - if [ -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then - echo "Data already downloaded" - else - echo "Head node downloading data to local path." - mkdir -p $DATASET_ROOT - gsutil -m cp gs://sky-wiki-data/{gpt2-merges.txt,gpt2-vocab.json,hfbpe_gpt_training_data_text_document.bin,hfbpe_gpt_training_data_text_document.idx} ${DATASET_ROOT} - fi - + # Kill any existing megatron processes pkill -f -9 megatron - # Store checkpoints at a local path. - # You can change this to the shared bucket for checkpointing to cloud bucket - # at every callback, but this will slow down training. - # CHECKPOINT_PATH=${DATASET_ROOT}/results - CHECKPOINT_PATH=~/sky_workdir/results mkdir -p ${CHECKPOINT_PATH} # ============= Training ============= @@ -78,7 +69,7 @@ run: | trainer.num_nodes=1 \ trainer.max_epochs=null \ trainer.max_steps=300000 \ - trainer.val_check_interval=300 \ + trainer.val_check_interval=50 \ trainer.log_every_n_steps=50 \ trainer.limit_val_batches=50 \ trainer.limit_test_batches=50 \
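
# =============== Usage sketch (after applying patches 1-5) ===============
# A minimal outline assembled from the usage comments in the updated YAMLs
# above, not a definitive recipe; the bucket name below is a placeholder that
# must be replaced with a unique name, and cluster names follow the examples.
#
# # Single-node training (1x A100-80GB):
# sky launch -c nemo_gpt nemo_gpt_singlenode.yaml
#
# # Distributed training (2 nodes), with a shared GCS bucket for NeMo's index files:
# sky launch --env SHARED_NFS_BUCKET_NAME=<unique-bucket-name> -c nemo_gpt nemo_gpt_distributed.yaml
#
# # Reconnect to the log stream, then tear down when done:
# sky logs nemo_gpt
# sky down nemo_gpt
#
# Checkpoints are written to the local CHECKPOINT_PATH (/ckpts by default);
# pointing CHECKPOINT_PATH at the mounted bucket path (/shared) persists every
# checkpoint callback to cloud storage, at the cost of slower training.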