From 56b263c960f683349e306f77e6dba435dc78b011 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 9 Jul 2024 14:07:27 -0700 Subject: [PATCH 1/5] Update nemo to use container + mcoregpt --- examples/nemo/nemo_gpt_train.yaml | 77 +++++++++---------------------- 1 file changed, 22 insertions(+), 55 deletions(-) diff --git a/examples/nemo/nemo_gpt_train.yaml b/examples/nemo/nemo_gpt_train.yaml index 125e3665289..08ba718343e 100644 --- a/examples/nemo/nemo_gpt_train.yaml +++ b/examples/nemo/nemo_gpt_train.yaml @@ -19,80 +19,43 @@ # sky down nemo_gpt_train resources: - cpus: 6+ - accelerators: A100:1 + cpus: 16+ + memory: 128+ + accelerators: A100-80GB:4 + image_id: docker:nvcr.io/nvidia/nemo:24.05 num_nodes: 2 envs: DATASET_ROOT: /wiki - BUCKET_NAME: # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it + BUCKET_NAME: romil-sky-wiki # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it file_mounts: ${DATASET_ROOT}: name: ${BUCKET_NAME} store: gcs # We recommend using GCS for large datasets in mount mode - S3 based mounts may fail with "transport endpoint is not connected" error. mode: MOUNT - + setup: | # ============== Dependency Setup ============== - conda activate nemo - if [ $? -eq 0 ]; then - echo "Nemo conda env exists" - else - echo "Setup start" - - conda create -y --name nemo python==3.10.12 - conda activate nemo - - # Install PyTorch - pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 - - # Install nemo + conda deactivate + # Clone NeMo if not already present + if [ ! -d NeMo ]; then git clone https://github.com/NVIDIA/NeMo.git - cd NeMo - git checkout b4ad7eaa7873d632391d6985aa6b359f39c20bab - pip install Cython - pip install .[all] - cd .. - - # Install megatron-core - # We install in editable mode because setup.py does not install all - # required modules if we install in non-editable mode. - git clone https://github.com/NVIDIA/Megatron-LM - cd Megatron-LM - git checkout dc21350806361564b8ce61d4a8d247cb195cc5f0 - pip install -e . - cd .. - - # Install ninja for faster compilation - pip install ninja packaging - - # Install transformer engine and flash-attn (Takes ~1hr to compile) - MAX_JOBS=4 pip install flash-attn==2.0.4 --no-build-isolation # Version upper capped by TransformerEngine - MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable - - pip install pytorch-extension + git checkout 5df8e11255802a2ce2f33db6362e60990e215b64 + fi - # Install Apex - git clone https://github.com/NVIDIA/apex.git - cd apex - git checkout 52e18c894223800cb611682dce27d88050edf1de - pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ - cd .. - - # Install gsutil if it doesn't exist - if ! command -v gsutil &> /dev/null - then - pip install gsutil - else - echo "gsutil exists" - fi + # Install gsutil if it doesn't exist + if ! command -v gsutil &> /dev/null + then + pip install gsutil + else + echo "gsutil exists" fi run: | - conda activate nemo + conda deactivate # ============= Data Download ============= # We download pre-processed data from a read-only bucket at gs://sky-wiki-data # to our shared bucket at gs://${BUCKET_NAME}. 
@@ -120,6 +83,9 @@ run: | num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l` master_addr=`echo "$SKYPILOT_NODE_IPS" | head -n1` + # Kill any existing megatron processes + pkill -f -9 megatron + python -m torch.distributed.run \ --nproc_per_node=${SKYPILOT_NUM_GPUS_PER_NODE} \ --nnodes=${num_nodes} \ @@ -139,6 +105,7 @@ run: | trainer.limit_test_batches=50 \ trainer.accumulate_grad_batches=1 \ trainer.precision=16 \ + model.mcore_gpt=True \ model.micro_batch_size=6 \ model.global_batch_size=192 \ model.tensor_model_parallel_size=1 \ From c93f8be0a1844b1256fa84368b86d4c87844eb53 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 9 Jul 2024 14:51:07 -0700 Subject: [PATCH 2/5] Update nemo to use container + mcoregpt + checkpoint to bucket --- examples/nemo/nemo_gpt_train.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/nemo/nemo_gpt_train.yaml b/examples/nemo/nemo_gpt_train.yaml index 08ba718343e..7a14d0021a2 100644 --- a/examples/nemo/nemo_gpt_train.yaml +++ b/examples/nemo/nemo_gpt_train.yaml @@ -86,6 +86,11 @@ run: | # Kill any existing megatron processes pkill -f -9 megatron + # Store checkpoints on the shared dataset bucket + # Create a directory to store checkpoints + CHECKPOINT_PATH=${DATASET_ROOT}/results + mkdir -p ${CHECKPOINT_PATH} + python -m torch.distributed.run \ --nproc_per_node=${SKYPILOT_NUM_GPUS_PER_NODE} \ --nnodes=${num_nodes} \ @@ -99,7 +104,7 @@ run: | trainer.num_nodes=${num_nodes} \ trainer.max_epochs=null \ trainer.max_steps=300000 \ - trainer.val_check_interval=300 \ + trainer.val_check_interval=50 \ trainer.log_every_n_steps=50 \ trainer.limit_val_batches=50 \ trainer.limit_test_batches=50 \ @@ -136,6 +141,7 @@ run: | exp_manager.resume_if_exists=True \ exp_manager.resume_ignore_no_checkpoint=True \ exp_manager.create_checkpoint_callback=True \ + +exp_manager.checkpoint_callback_params.dirpath=${CHECKPOINT_PATH} \ exp_manager.checkpoint_callback_params.monitor=val_loss \ exp_manager.checkpoint_callback_params.save_top_k=3 \ exp_manager.checkpoint_callback_params.mode=min \ From 9ccd2462c4cdc52cf461976d3d815c7c4a85129a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 9 Jul 2024 15:46:08 -0700 Subject: [PATCH 3/5] update --- examples/nemo/nemo_gpt_train.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/nemo/nemo_gpt_train.yaml b/examples/nemo/nemo_gpt_train.yaml index 7a14d0021a2..7130260254f 100644 --- a/examples/nemo/nemo_gpt_train.yaml +++ b/examples/nemo/nemo_gpt_train.yaml @@ -43,6 +43,7 @@ setup: | # Clone NeMo if not already present if [ ! -d NeMo ]; then git clone https://github.com/NVIDIA/NeMo.git + cd NeMo git checkout 5df8e11255802a2ce2f33db6362e60990e215b64 fi From 7eac148437460e2be05edc9227f051372aa62ef9 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 9 Jul 2024 18:57:50 -0700 Subject: [PATCH 4/5] Update nemo examples --- examples/nemo/nemo_gpt_singlenode.yaml | 91 +++++++++----------------- examples/nemo/nemo_gpt_train.yaml | 38 ++++++----- 2 files changed, 53 insertions(+), 76 deletions(-) diff --git a/examples/nemo/nemo_gpt_singlenode.yaml b/examples/nemo/nemo_gpt_singlenode.yaml index 079214717e3..58612afd118 100644 --- a/examples/nemo/nemo_gpt_singlenode.yaml +++ b/examples/nemo/nemo_gpt_singlenode.yaml @@ -6,23 +6,22 @@ # The specific model used here should fit on GPU with 16GB memory. # # After the script completes, the model checkpoints will be saved in -# ~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints on the head node. +# ~/sky_workdir/results on the head node. 
# # Usage: -# sky launch -s -c nemo_gpt nemo_gpt_singlenode.yaml +# sky launch -c nemo_gpt nemo_gpt_singlenode.yaml # # # Or try on spot A100 GPUs: # sky launch -c nemo_gpt nemo_gpt_singlenode.yaml --use-spot --gpus A100:1 # -# # The setup will take some time (~1 hr), feel free to ctrl-c once the setup script starts -# # You can reconnect to log stream using `sky logs nemo_gpt_train` -# # # Terminate cluster after you're done # sky down nemo_gpt resources: - cpus: 6+ - accelerators: A100:1 + cpus: 8+ + memory: 64+ + accelerators: A100-80GB:1 + image_id: docker:nvcr.io/nvidia/nemo:24.05 num_nodes: 1 @@ -30,63 +29,25 @@ envs: DATASET_ROOT: $HOME/wiki/ setup: | - # ============== Dependency Setup ============== - conda activate nemo - if [ $? -eq 0 ]; then - echo "Nemo conda env exists" - else - echo "Setup start" - - conda create -y --name nemo python==3.10.12 - conda activate nemo + conda deactivate - # Install PyTorch - pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 - - # Install nemo + # Clone NeMo repo if not already present + if [ ! -d NeMo ]; then git clone https://github.com/NVIDIA/NeMo.git - cd NeMo - git checkout b4ad7eaa7873d632391d6985aa6b359f39c20bab - pip install Cython - pip install .[all] - cd .. - - # Install megatron-core - # We install in editable mode because setup.py does not install all - # required modules if we install in non-editable mode. - git clone https://github.com/NVIDIA/Megatron-LM - cd Megatron-LM - git checkout dc21350806361564b8ce61d4a8d247cb195cc5f0 - pip install -e . - cd .. - - # Install ninja for faster compilation - pip install ninja packaging - - # Install transformer engine and flash-attn (Takes ~1hr to compile) - MAX_JOBS=4 pip install flash-attn==2.0.4 --no-build-isolation # Version upper capped by TransformerEngine - MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable - - pip install pytorch-extension + cd NeMo + git checkout 5df8e11255802a2ce2f33db6362e60990e215b64 + fi - # Install Apex - git clone https://github.com/NVIDIA/apex.git - cd apex - git checkout 52e18c894223800cb611682dce27d88050edf1de - pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ - cd .. - - # Install gsutil if it doesn't exist - if ! command -v gsutil &> /dev/null - then - pip install gsutil - else - echo "gsutil exists" - fi + # Install gsutil if it doesn't exist + if ! command -v gsutil &> /dev/null + then + pip install gsutil + else + echo "gsutil exists" fi run: | - conda activate nemo + conda deactivate # ============= Data Download ============= # We download pre-processed data from a read-only bucket at gs://sky-wiki-data # For more on how to pre-process data, see nemo_gpt3_preprocessing.yaml @@ -94,11 +55,21 @@ run: | if [ -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then echo "Data already downloaded" else - echo "Head node downloading data to shared bucket." + echo "Head node downloading data to local path." mkdir -p $DATASET_ROOT gsutil -m cp gs://sky-wiki-data/{gpt2-merges.txt,gpt2-vocab.json,hfbpe_gpt_training_data_text_document.bin,hfbpe_gpt_training_data_text_document.idx} ${DATASET_ROOT} fi + # Kill any existing megatron processes + pkill -f -9 megatron + + # Store checkpoints at a local path. + # You can change this to the shared bucket for checkpointing to cloud bucket + # at every callback, but this will slow down training. 
+ # CHECKPOINT_PATH=${DATASET_ROOT}/results + CHECKPOINT_PATH=~/sky_workdir/results + mkdir -p ${CHECKPOINT_PATH} + # ============= Training ============= python NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \ --config-path=conf \ @@ -113,6 +84,7 @@ run: | trainer.limit_test_batches=50 \ trainer.accumulate_grad_batches=1 \ trainer.precision=16 \ + model.mcore_gpt=True \ model.micro_batch_size=6 \ model.global_batch_size=192 \ model.tensor_model_parallel_size=1 \ @@ -143,6 +115,7 @@ run: | exp_manager.resume_if_exists=True \ exp_manager.resume_ignore_no_checkpoint=True \ exp_manager.create_checkpoint_callback=True \ + +exp_manager.checkpoint_callback_params.dirpath=${CHECKPOINT_PATH} \ exp_manager.checkpoint_callback_params.monitor=val_loss \ exp_manager.checkpoint_callback_params.save_top_k=3 \ exp_manager.checkpoint_callback_params.mode=min \ diff --git a/examples/nemo/nemo_gpt_train.yaml b/examples/nemo/nemo_gpt_train.yaml index 7130260254f..db94d4ad04a 100644 --- a/examples/nemo/nemo_gpt_train.yaml +++ b/examples/nemo/nemo_gpt_train.yaml @@ -7,28 +7,25 @@ # yourself, see nemo_gpt_preprocessing.yaml. # # After the script completes, the model checkpoints will be saved in -# ~/sky_workdir/nemo_experiments/megatron_gpt/checkpoints on the head node. +# ~/sky_workdir/results on the head node. # # Usage: -# sky launch -s -c nemo_gpt_train nemo_gpt_train.yaml -# -# # The setup will take some time (~1 hr), feel free to ctrl-c once the setup script starts -# # You can reconnect to log stream using `sky logs nemo_gpt_train` +# sky launch --env BUCKET_NAME= -c nemo_gpt_train nemo_gpt_train.yaml # # # Terminate cluster after you're done # sky down nemo_gpt_train resources: - cpus: 16+ - memory: 128+ - accelerators: A100-80GB:4 + cpus: 8+ + memory: 64+ + accelerators: A100-80GB:1 image_id: docker:nvcr.io/nvidia/nemo:24.05 num_nodes: 2 envs: DATASET_ROOT: /wiki - BUCKET_NAME: romil-sky-wiki # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it + BUCKET_NAME: # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it file_mounts: ${DATASET_ROOT}: @@ -37,10 +34,10 @@ file_mounts: mode: MOUNT -setup: | - # ============== Dependency Setup ============== +setup: | conda deactivate - # Clone NeMo if not already present + + # Clone NeMo repo if not already present if [ ! -d NeMo ]; then git clone https://github.com/NVIDIA/NeMo.git cd NeMo @@ -54,6 +51,7 @@ setup: | else echo "gsutil exists" fi + run: | conda deactivate @@ -64,6 +62,9 @@ run: | # This bucket acts as a network filesystem (NFS) between the head node and # worker nodes. In our training script, the head node writes a index # file to this shared filesystem that is read by workers. + # + # Note that NeMo requires this shared filesystem to be strongly consistent - + # any writes made by the head should be immediately visible to the workers. if [ ${SKYPILOT_NODE_RANK} -eq 0 ]; then if [ -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then @@ -87,9 +88,11 @@ run: | # Kill any existing megatron processes pkill -f -9 megatron - # Store checkpoints on the shared dataset bucket - # Create a directory to store checkpoints - CHECKPOINT_PATH=${DATASET_ROOT}/results + # Store checkpoints at a local path. + # You can change this to the shared bucket for checkpointing to cloud bucket + # at every callback, but this will slow down training. 
+ # CHECKPOINT_PATH=${DATASET_ROOT}/results + CHECKPOINT_PATH=~/sky_workdir/results mkdir -p ${CHECKPOINT_PATH} python -m torch.distributed.run \ @@ -105,7 +108,7 @@ run: | trainer.num_nodes=${num_nodes} \ trainer.max_epochs=null \ trainer.max_steps=300000 \ - trainer.val_check_interval=50 \ + trainer.val_check_interval=100 \ trainer.log_every_n_steps=50 \ trainer.limit_val_batches=50 \ trainer.limit_test_batches=50 \ @@ -148,7 +151,8 @@ run: | exp_manager.checkpoint_callback_params.mode=min \ exp_manager.checkpoint_callback_params.always_save_nemo=True - # Optional - copy checkpoints to the mounted dataset bucket (~6 GB) + # Optional - if writing checkpoints to a local directory, + # copy checkpoints to the mounted dataset bucket (~6 GB) # if [ ${SKYPILOT_NODE_RANK} -eq 0 ]; then # mkdir -p ${DATASET_ROOT}/results # cp -R ~/sky_workdir/nemo_experiments From a6c00c78be978c459bd4a153eef3ea2aa7e0559e Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 10 Jul 2024 14:09:27 -0700 Subject: [PATCH 5/5] Update nemo examples --- ...t_train.yaml => nemo_gpt_distributed.yaml} | 82 +++++++------------ examples/nemo/nemo_gpt_singlenode.yaml | 31 +++---- 2 files changed, 42 insertions(+), 71 deletions(-) rename examples/nemo/{nemo_gpt_train.yaml => nemo_gpt_distributed.yaml} (63%) diff --git a/examples/nemo/nemo_gpt_train.yaml b/examples/nemo/nemo_gpt_distributed.yaml similarity index 63% rename from examples/nemo/nemo_gpt_train.yaml rename to examples/nemo/nemo_gpt_distributed.yaml index db94d4ad04a..ac5441d4ac0 100644 --- a/examples/nemo/nemo_gpt_train.yaml +++ b/examples/nemo/nemo_gpt_distributed.yaml @@ -3,17 +3,21 @@ # Inspired from https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst # # Note that we provide a read-only bucket at gs://sky-wiki-data that is used to -# download preprocessed data to your bucket. If you want to preprocess the data +# download preprocessed data to local disk. If you want to preprocess the data # yourself, see nemo_gpt_preprocessing.yaml. # +# We use a shared bucket to store the index files that are used to coordinate +# between the head and worker nodes. This shared bucket is mounted as a +# network filesystem (NFS) on the head and worker nodes. +# # After the script completes, the model checkpoints will be saved in -# ~/sky_workdir/results on the head node. +# /ckpts on the head node (can be changed to /shared for cloud storage). # # Usage: -# sky launch --env BUCKET_NAME= -c nemo_gpt_train nemo_gpt_train.yaml +# sky launch --env SHARED_NFS_BUCKET_NAME= -c nemo_gpt nemo_gpt_distributed.yaml # # # Terminate cluster after you're done -# sky down nemo_gpt_train +# sky down nemo_gpt resources: cpus: 8+ @@ -25,14 +29,25 @@ num_nodes: 2 envs: DATASET_ROOT: /wiki - BUCKET_NAME: # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it + SHARED_NFS_ROOT: /shared + SHARED_NFS_BUCKET_NAME: # Enter a unique bucket name here for the shared directory - if it doesn't exist SkyPilot will create it + CHECKPOINT_PATH: /ckpts # Store checkpoints at a local path. You can change this to /shared for checkpointing to cloud bucket at every callback, but this will slow down training. file_mounts: ${DATASET_ROOT}: - name: ${BUCKET_NAME} - store: gcs # We recommend using GCS for large datasets in mount mode - S3 based mounts may fail with "transport endpoint is not connected" error. 
- mode: MOUNT + source: gs://sky-wiki-data # This is a read-only bucket provided by SkyPilot for the dataset + mode: COPY + # The SHARED_NFS_ROOT path acts as a network filesystem (NFS) between the + # head and worker nodes. In NeMo, the head node writes an indexmap to this + # shared filesystem that is read by workers. + # + # Note that NeMo requires this shared filesystem to be strongly consistent - + # any writes made by the head should be immediately visible to the workers. + ${SHARED_NFS_ROOT}: + name: ${SHARED_NFS_BUCKET_NAME} + store: gcs # We recommend using GCS in mount mode - S3 based mounts may fail with "transport endpoint is not connected" error. + mode: MOUNT setup: | conda deactivate @@ -43,43 +58,9 @@ setup: | cd NeMo git checkout 5df8e11255802a2ce2f33db6362e60990e215b64 fi - - # Install gsutil if it doesn't exist - if ! command -v gsutil &> /dev/null - then - pip install gsutil - else - echo "gsutil exists" - fi - run: | conda deactivate - # ============= Data Download ============= - # We download pre-processed data from a read-only bucket at gs://sky-wiki-data - # to our shared bucket at gs://${BUCKET_NAME}. - # - # This bucket acts as a network filesystem (NFS) between the head node and - # worker nodes. In our training script, the head node writes a index - # file to this shared filesystem that is read by workers. - # - # Note that NeMo requires this shared filesystem to be strongly consistent - - # any writes made by the head should be immediately visible to the workers. - - if [ ${SKYPILOT_NODE_RANK} -eq 0 ]; then - if [ -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then - echo "Data already downloaded" - else - echo "Head node downloading data to shared bucket." - gsutil -m cp gs://sky-wiki-data/{gpt2-merges.txt,gpt2-vocab.json,hfbpe_gpt_training_data_text_document.bin,hfbpe_gpt_training_data_text_document.idx} ${DATASET_ROOT} - fi - else - while [ ! -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; do - echo "Worker ${SKYPILOT_NODE_RANK} - waiting for data to be downloaded to shared bucket." - sleep 1 - done - fi - # ============= Training ============= # Get the number of nodes and master address from SkyPilot envvars num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l` @@ -88,13 +69,11 @@ run: | # Kill any existing megatron processes pkill -f -9 megatron - # Store checkpoints at a local path. - # You can change this to the shared bucket for checkpointing to cloud bucket - # at every callback, but this will slow down training. 
- # CHECKPOINT_PATH=${DATASET_ROOT}/results - CHECKPOINT_PATH=~/sky_workdir/results mkdir -p ${CHECKPOINT_PATH} + echo "Writing checkpoints to ${CHECKPOINT_PATH}" + echo "Writing index files to shared storage ${SHARED_NFS_ROOT}" + python -m torch.distributed.run \ --nproc_per_node=${SKYPILOT_NUM_GPUS_PER_NODE} \ --nnodes=${num_nodes} \ @@ -108,7 +87,7 @@ run: | trainer.num_nodes=${num_nodes} \ trainer.max_epochs=null \ trainer.max_steps=300000 \ - trainer.val_check_interval=100 \ + trainer.val_check_interval=50 \ trainer.log_every_n_steps=50 \ trainer.limit_val_batches=50 \ trainer.limit_test_batches=50 \ @@ -134,6 +113,7 @@ run: | model.data.num_workers=2 \ model.data.seq_length=1024 \ model.data.splits_string=\'980,10,10\' \ + model.data.index_mapping_dir=${SHARED_NFS_ROOT} \ model.optim.name=fused_adam \ model.optim.lr=6e-4 \ model.optim.betas=[0.9,0.95] \ @@ -152,8 +132,8 @@ run: | exp_manager.checkpoint_callback_params.always_save_nemo=True # Optional - if writing checkpoints to a local directory, - # copy checkpoints to the mounted dataset bucket (~6 GB) + # copy final checkpoints to the shared bucket at the end of training (~6 GB) # if [ ${SKYPILOT_NODE_RANK} -eq 0 ]; then - # mkdir -p ${DATASET_ROOT}/results - # cp -R ~/sky_workdir/nemo_experiments + # mkdir -p ${SHARED_NFS_ROOT}/results + # cp -R ${CHECKPOINT_PATH} # fi diff --git a/examples/nemo/nemo_gpt_singlenode.yaml b/examples/nemo/nemo_gpt_singlenode.yaml index 58612afd118..ff5798e7e13 100644 --- a/examples/nemo/nemo_gpt_singlenode.yaml +++ b/examples/nemo/nemo_gpt_singlenode.yaml @@ -6,7 +6,7 @@ # The specific model used here should fit on GPU with 16GB memory. # # After the script completes, the model checkpoints will be saved in -# ~/sky_workdir/results on the head node. +# /ckpts (configurable through CHECKPOINT_PATH env var) on the head node. # # Usage: # sky launch -c nemo_gpt nemo_gpt_singlenode.yaml @@ -26,7 +26,14 @@ resources: num_nodes: 1 envs: - DATASET_ROOT: $HOME/wiki/ + DATASET_ROOT: /wiki + CHECKPOINT_PATH: /ckpts + + +file_mounts: + ${DATASET_ROOT}: + source: gs://sky-wiki-data # This is a read-only bucket provided by SkyPilot for the dataset + mode: COPY setup: | conda deactivate @@ -48,26 +55,10 @@ setup: | run: | conda deactivate - # ============= Data Download ============= - # We download pre-processed data from a read-only bucket at gs://sky-wiki-data - # For more on how to pre-process data, see nemo_gpt3_preprocessing.yaml - - if [ -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then - echo "Data already downloaded" - else - echo "Head node downloading data to local path." - mkdir -p $DATASET_ROOT - gsutil -m cp gs://sky-wiki-data/{gpt2-merges.txt,gpt2-vocab.json,hfbpe_gpt_training_data_text_document.bin,hfbpe_gpt_training_data_text_document.idx} ${DATASET_ROOT} - fi - + # Kill any existing megatron processes pkill -f -9 megatron - # Store checkpoints at a local path. - # You can change this to the shared bucket for checkpointing to cloud bucket - # at every callback, but this will slow down training. - # CHECKPOINT_PATH=${DATASET_ROOT}/results - CHECKPOINT_PATH=~/sky_workdir/results mkdir -p ${CHECKPOINT_PATH} # ============= Training ============= @@ -78,7 +69,7 @@ run: | trainer.num_nodes=1 \ trainer.max_epochs=null \ trainer.max_steps=300000 \ - trainer.val_check_interval=300 \ + trainer.val_check_interval=50 \ trainer.log_every_n_steps=50 \ trainer.limit_val_batches=50 \ trainer.limit_test_batches=50 \
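
# =============== Usage sketch (after applying patches 1-5) ===============
# A minimal outline assembled from the usage comments in the updated YAMLs
# above, not a definitive recipe; the bucket name below is a placeholder that
# must be replaced with a unique name, and cluster names follow the examples.
#
# # Single-node training (1x A100-80GB):
# sky launch -c nemo_gpt nemo_gpt_singlenode.yaml
#
# # Distributed training (2 nodes), with a shared GCS bucket for NeMo's index files:
# sky launch --env SHARED_NFS_BUCKET_NAME=<unique-bucket-name> -c nemo_gpt nemo_gpt_distributed.yaml
#
# # Reconnect to the log stream, then tear down when done:
# sky logs nemo_gpt
# sky down nemo_gpt
#
# Checkpoints are written to the local CHECKPOINT_PATH (/ckpts by default);
# pointing CHECKPOINT_PATH at the mounted bucket path (/shared) persists every
# checkpoint callback to cloud storage, at the cost of slower training.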