
Commit

fixes
romilbhardwaj committed Sep 13, 2023
1 parent cd44a2b commit 082a734
Showing 3 changed files with 41 additions and 80 deletions.
4 changes: 0 additions & 4 deletions examples/nemo/nemo_gpt3_preprocessing.yaml
@@ -72,10 +72,6 @@ setup: |
git checkout 52e18c894223800cb611682dce27d88050edf1de
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
cd ..
-# Print out time when setup is done
-date
-echo "Setup done"
fi
run: |
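The pinned Apex build in the hunk above compiles the cpp_ext and cuda_ext extensions; a quick post-setup check that the CUDA modules actually built (a sketch; the amp_C module name is an assumption about a --cuda_ext build):

    python -c "import apex, amp_C; print('Apex CUDA extensions OK')"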
108 changes: 39 additions & 69 deletions examples/nemo/nemo_gpt3_singlenode.yaml
@@ -20,102 +20,72 @@ envs:
DATASET_ROOT: $HOME/wiki/

setup: |
# ============== Dependency Setup ==============
conda activate nemo
if [ $? -eq 0 ]; then
echo "Nemo conda env exists"
else
echo "Setup start"
conda create -y --name nemo python==3.10.12
conda activate nemo
# Install PyTorch
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Install nemo
sudo apt-get update
sudo apt-get install -y libsndfile1 ffmpeg
pip install Cython
pip install git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[all]
# Clone the NeMo repo to get the examples
git clone https://github.com/NVIDIA/NeMo.git
# Install Apex
git clone https://github.com/NVIDIA/apex.git
cd apex
git checkout 52e18c894223800cb611682dce27d88050edf1de
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
cd NeMo
git checkout b4ad7eaa7873d632391d6985aa6b359f39c20bab
pip install Cython
pip install .[all]
cd ..
-# Install megatron-core (TODO - write patch for setup.py)
-pip install git+https://github.com/NVIDIA/Megatron-LM.git
+# Install megatron-core
+# We install in editable mode because setup.py does not install all
+# required modules if we install in non-editable mode.
+git clone https://github.com/NVIDIA/Megatron-LM
+cd Megatron-LM
+git checkout dc21350806361564b8ce61d4a8d247cb195cc5f0
+pip install -e .
cd ..
# Install ninja for faster compilation
pip install ninja packaging
# Install transformer engine (Takes ~3hr to compile)
-pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
+MAX_JOBS=4 pip install flash-attn==2.0.4 --no-build-isolation # Version upper capped by TransformerEngine
+MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
pip install pytorch-extension
fi
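# The conda-activate exit-code check above is what makes this setup
# idempotent: re-launches skip the multi-hour dependency build. The same
# pattern in isolation (a sketch; env name and package are illustrative):
#
#   if conda activate myenv 2>/dev/null; then
#       echo "myenv exists, skipping setup"
#   else
#       conda create -y --name myenv python=3.10
#       conda activate myenv
#       pip install numpy  # stand-in for the real dependencies
#   fi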
# ======== Download and preprocess the wikipedia dataset ========
if [ -f ${DATASET_ROOT}/train_data.jsonl ]; then
echo "Dataset exists"
else
# Install axel for faster downloads
sudo apt-get install -y axel
mkdir -p ${DATASET_ROOT}
cd ${DATASET_ROOT}
# Download the wikipedia dataset (takes ~15 min)
axel -n 20 https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
# Preprocess the wikipedia dataset (takes ~2 hours)
pip install wikiextractor
python -m wikiextractor.WikiExtractor enwiki-latest-pages-articles.xml.bz2 --json
find text -name 'wiki_*' -exec cat {} \; > train_data.jsonl
fi
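# Each line of train_data.jsonl is one article as emitted by WikiExtractor's
# --json mode, roughly (fields vary by wikiextractor version; values here are
# illustrative):
#   {"id": "12", "revid": "...", "url": "https://en.wikipedia.org/wiki?curid=12", "title": "Anarchism", "text": "Anarchism is ..."}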
# ======== Download tokenizer files ========
# Check if the tokenizer files exist
if [ -f ${DATASET_ROOT}/gpt2-vocab.json ]; then
echo "Tokenizer files exist"
else
# Download the tokenizer files
cd ${DATASET_ROOT}
axel -n 20 https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
axel -n 20 https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
fi
# ======== Convert data to mmap format ========
# Check if the mmap files exist
if [ -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then
echo "Mmap files exist"
else
# Convert the data to mmap format
cd ${DATASET_ROOT}
python $HOME/sky_workdir/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \
--input=train_data.jsonl \
--json-keys=text \
--tokenizer-library=megatron \
--vocab gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file gpt2-merges.txt \
--output-prefix=hfbpe_gpt_training_data \
--append-eod \
--workers=32
-# Install Apex
-git clone https://github.com/NVIDIA/apex.git
-cd apex
-git checkout 52e18c894223800cb611682dce27d88050edf1de
-pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
-cd ..
fi
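# The preprocessing script names its outputs
# <output-prefix>_<json-key>_document.{bin,idx}, which is exactly the pair
# the if-check above looks for. A quick sanity check after conversion
# (a sketch):
#   ls -lh ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.{bin,idx}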
run: |
conda activate nemo
# ============= Data Download =============
# We download pre-processed data from a read-only bucket at gs://sky-wiki-data
# For more on how to pre-process data, see nemo_gpt3_preprocessing.yaml
# Get the number of nodes and master address from SkyPilot envvars
num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l`
master_addr=`echo "$SKYPILOT_NODE_IPS" | head -n1`
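# Illustrative: on this single-node task SKYPILOT_NODE_IPS is a single line,
# e.g. "10.128.0.2", giving num_nodes=1 and master_addr=10.128.0.2; on a
# two-node cluster it would be two newline-separated IPs and num_nodes=2.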
if [ -f ${DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then
echo "Data already downloaded"
else
echo "Head node downloading data to shared bucket."
gsutil -m cp gs://sky-wiki-data/{gpt2-merges.txt,gpt2-vocab.json,hfbpe_gpt_training_data_text_document.bin,hfbpe_gpt_training_data_text_document.idx} ${DATASET_ROOT}
fi
# ============= Training =============
conda activate nemo
python NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \
--config-path=NeMo/examples/nlp/language_modeling/conf \
--config-name=megatron_gpt_config \
-trainer.devices=1 \
+trainer.devices=${SKYPILOT_NUM_GPUS_PER_NODE} \
trainer.num_nodes=1 \
trainer.max_epochs=null \
trainer.max_steps=300000 \
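The trailing key=value flags on megatron_gpt_pretraining.py are Hydra overrides, so any other field of megatron_gpt_config can be adjusted the same way without editing the config. A sketch (these particular keys are assumed to exist in the NeMo config):

    python NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \
      --config-path=NeMo/examples/nlp/language_modeling/conf \
      --config-name=megatron_gpt_config \
      trainer.precision=16 \
      model.micro_batch_size=4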
9 changes: 2 additions & 7 deletions examples/nemo/nemo_gpt3_train.yaml
@@ -13,13 +13,13 @@
# sky down nemo_gpt3_train

resources:
-  accelerators: A100:1
+  accelerators: V100:1

num_nodes: 2

envs:
DATASET_ROOT: /wiki
-  BUCKET_NAME: # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it
+  BUCKET_NAME: romil-nemo-test-bucket2 # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it

file_mounts:
${DATASET_ROOT}:
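    # The bucket spec for this mount is collapsed in the diff; a typical
    # SkyPilot stanza for mounting a bucket at ${DATASET_ROOT} would look
    # like the following (a sketch, not this file's actual contents):
    #   name: ${BUCKET_NAME}
    #   store: gcs
    #   mode: MOUNT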
@@ -73,10 +73,6 @@ setup: |
git checkout 52e18c894223800cb611682dce27d88050edf1de
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
cd ..
-# Print out time when setup is done
-date
-echo "Setup done"
fi
run: |
@@ -155,7 +151,6 @@ run: |
model.optim.sched.warmup_steps=750 \
model.optim.sched.constant_steps=80000 \
model.optim.sched.min_lr=6e-5 \
-model.nemo_path=${DATASET_ROOT}/ \
exp_manager.resume_if_exists=True \
exp_manager.resume_ignore_no_checkpoint=True \
exp_manager.create_checkpoint_callback=True \
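With BUCKET_NAME set, the task launches against the cluster name used by the `sky down` hint in this file's header; a typical invocation (flags are standard SkyPilot CLI, bucket value illustrative):

    sky launch -c nemo_gpt3_train examples/nemo/nemo_gpt3_train.yaml --env BUCKET_NAME=my-unique-bucket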
