Merge branch 'main' into xren/cp_debug
xrennvidia committed Jul 30, 2024
2 parents 242d77c + 86bfac2 commit c7dff10
Showing 24 changed files with 736 additions and 211 deletions.
59 changes: 58 additions & 1 deletion .github/workflows/cicd-main.yml
@@ -3304,6 +3304,62 @@ jobs:
      AFTER_SCRIPT: |
        rm -rf /home/TestData/nlp/lora_tuning_tp2

  L2_Megatron_GPT_PEFT_Lora_TP2SP1:
    needs: [cicd-test-container-setup]
    uses: ./.github/workflows/_test_template.yml
    with:
      RUNNER: self-hosted-azure-gpus-2-h100
      SCRIPT: |
        rm -rf /home/TestData/nlp/lora_tuning_tp2_sp1
        CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
            trainer.devices=2 \
            trainer.log_every_n_steps=1 \
            trainer.max_epochs=9999 \
            trainer.max_steps=3 \
            trainer.val_check_interval=3 \
            ++trainer.limit_val_batches=2 \
            trainer.precision=bf16 \
            exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2_sp1 \
            +model.mcore_gpt=True \
            model.pipeline_model_parallel_size=1 \
            model.tensor_model_parallel_size=2 \
            model.sequence_parallel=True \
            model.megatron_amp_O2=True \
            model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
            +model.fp8=True \
            +model.fp8_params=True \
            +model.fp8_hybrid=True \
            +model.fp8_e4m3=False \
            +model.fp8_interval=1 \
            +model.fp8_margin=0 \
            +model.fp8_amax_history_len=32 \
            +model.fp8_amax_compute_algo=max \
            +model.reduce_amax=False \
            +model.ub_tp_comm_overlap=False \
            +model.tp_comm_overlap_ag=False \
            +model.tp_comm_overlap_rs=False \
            +model.tp_comm_overlap_disable_qkv=True \
            model.peft.peft_scheme='lora' \
            model.peft.lora_tuning.adapter_dim=16 \
            model.peft.lora_tuning.alpha=32 \
            model.peft.lora_tuning.column_init_method="kaiming" \
            +model.peft.lora_tuning.dropout_position='pre' \
            model.peft.lora_tuning.target_modules=['attention'] \
            model.peft.lora_tuning.adapter_dropout=0.1 \
            +model.peft.lora_tuning.a2a_experimental=1 \
            model.answer_only_loss=True \
            model.micro_batch_size=1 \
            model.global_batch_size=1 \
            model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
            model.data.train_ds.concat_sampling_probabilities=[1.0] \
            model.data.train_ds.num_workers=0 \
            model.data.validation_ds.num_workers=0 \
            model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
            model.data.validation_ds.names=[quarel]
      AFTER_SCRIPT: |
        rm -rf /home/TestData/nlp/lora_tuning_tp2_sp1

  L2_Megatron_GPT_Eval:
    needs: [cicd-test-container-setup]
    uses: ./.github/workflows/_test_template.yml
@@ -3858,7 +3914,7 @@ jobs:
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.pipeline_model_parallel_split_rank=1 \
model.pipeline_model_parallel_split_rank=0 \
model.seq_length=256 \
model.encoder.num_layers=4 \
model.decoder.num_layers=1 \
@@ -4631,6 +4687,7 @@ jobs:
- L2_Megatron_GPT_Embedding
- L2_Megatron_GPT_PEFT_Lora_PP2_O2
- L2_Megatron_GPT_PEFT_Lora_TP2_O1
- L2_Megatron_GPT_PEFT_Lora_TP2SP1
- L2_Megatron_GPT_Eval
- L2_Megatron_GPT_Eval_PP2
- L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len
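As background for the FP8 overrides in the new L2_Megatron_GPT_PEFT_Lora_TP2SP1 job above: the fp8_hybrid, fp8_margin, fp8_amax_history_len, and fp8_amax_compute_algo settings typically map onto a Transformer Engine delayed-scaling recipe along the lines sketched below. This is a hedged illustration of the general mechanism, not code from this commit; the layer shapes are placeholders.

# Hedged sketch: how the +model.fp8_* overrides above roughly correspond to a
# Transformer Engine delayed-scaling FP8 recipe. Not taken from this commit.
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format

fp8_recipe = DelayedScaling(
    margin=0,                  # fp8_margin=0
    fp8_format=Format.HYBRID,  # fp8_hybrid=True: E4M3 forward, E5M2 backward
    amax_history_len=32,       # fp8_amax_history_len=32
    amax_compute_algo="max",   # fp8_amax_compute_algo=max
)

layer = te.Linear(1024, 1024, bias=True).cuda()   # placeholder layer size
inp = torch.randn(16, 1024, device="cuda")

# GEMMs inside this context run in FP8 with delayed, amax-history-based scaling.
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    out = layer(inp)
out.sum().backward()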
3 changes: 1 addition & 2 deletions Dockerfile.ci
@@ -34,7 +34,7 @@ WORKDIR /workspace
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.13.0
ARG MCORE_TAG=c7a1f82d761577e6ca0338d3521eac82f2aa0904
ARG MCORE_TAG=2bbe55be32e2d478c4b2ce575af1cccb8fc3d9b9
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
@@ -90,4 +90,3 @@ chmod 777 -R /workspace
EOF

ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

1 change: 0 additions & 1 deletion examples/llm/megatron_gpt_pretraining.py
@@ -65,7 +65,6 @@ def get_args():
checkpoint_callback = ModelCheckpoint(
    every_n_train_steps=5000,
    enable_nemo_ckpt_io=False,
    async_save=False,
)
callbacks = [checkpoint_callback]

@@ -77,6 +77,11 @@ model:
vocab_file: null
merge_file: null

# embedding-specific arguments
softmax_temp: 0.02 # softmax temp for contrastive loss
global_inbatch_negatives: True # whether to use in-batch negatives from other ranks during training
backprop_type: 'global' # whether to use `global` or `local` backpropagation during training. Refer to Flava paper for details.

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
@@ -93,7 +98,7 @@ model:
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

## Activation Checkpointing
# NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed.
# These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+).
@@ -127,7 +132,7 @@ model:
# Path to data must be specified by the user.
data_train: null
data_validation: null
hard_negatives_to_train: 4
hard_negatives_to_train: 4 # number of hard negatives to use per example for training
index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
data_impl: mmap
splits_string: 900,50,50
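To make the new embedding options above concrete (softmax_temp, global_inbatch_negatives, hard_negatives_to_train), here is a minimal, self-contained sketch of a temperature-scaled contrastive loss with in-batch negatives. It illustrates the general technique only and is not the NeMo implementation; the cross-rank gathering behind "global" in-batch negatives is indicated only in a comment.

# Minimal sketch of a temperature-scaled contrastive loss with in-batch
# negatives. Illustrative only; not the NeMo implementation.
import torch
import torch.nn.functional as F

def contrastive_loss(query_emb, passage_emb, temperature=0.02):
    """query_emb, passage_emb: [batch, dim] embeddings; row i of each forms a positive pair."""
    q = F.normalize(query_emb, dim=-1)
    p = F.normalize(passage_emb, dim=-1)
    # With global in-batch negatives, q and p would first be all-gathered across
    # data-parallel ranks (e.g. via torch.distributed.all_gather), so every rank
    # scores against batch_size * world_size candidates. Mined hard negatives
    # (hard_negatives_to_train) would be appended as extra passage columns.
    logits = q @ p.t() / temperature                    # [batch, batch] similarity matrix
    labels = torch.arange(q.size(0), device=q.device)   # diagonal entries are the positives
    return F.cross_entropy(logits, labels)

loss = contrastive_loss(torch.randn(8, 128), torch.randn(8, 128))
print(loss.item())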
@@ -37,7 +37,7 @@ def main(cfg) -> None:
    model_cfg = MegatronBertEmbeddingModel.merge_cfg_with(cfg.restore_from_path, cfg)

    assert (
        model_cfg.micro_batch_size * cfg.trainer.devices == model_cfg.global_batch_size
        model_cfg.micro_batch_size * cfg.trainer.devices * cfg.trainer.num_nodes == model_cfg.global_batch_size
    ), "Gradient accumulation is not supported for contrastive learning yet"

    OmegaConf.set_struct(model_cfg, True)
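For example, with model.micro_batch_size=4, trainer.devices=8, and trainer.num_nodes=2, the updated check requires model.global_batch_size to be exactly 64; any larger global batch would imply gradient accumulation, which this script still rejects.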
@@ -0,0 +1,56 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch.multiprocessing as mp
from omegaconf.omegaconf import OmegaConf, open_dict

from nemo.collections.nlp.models.information_retrieval.megatron_bert_embedding_model import MegatronBertEmbeddingModel
from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronBertTrainerBuilder
from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="conf", config_name="megatron_bert_embedding_config")
def main(cfg) -> None:
    if cfg.model.data.dataloader_type != "LDDL":
        mp.set_start_method("spawn", force=True)

    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    trainer = MegatronBertTrainerBuilder(cfg).create_trainer()
    exp_manager(trainer, cfg.exp_manager)

    model_cfg = MegatronBertEmbeddingModel.merge_cfg_with(cfg.restore_from_path, cfg)

    OmegaConf.set_struct(model_cfg, True)
    with open_dict(model_cfg):
        model_cfg.precision = trainer.precision

    logging.info(f"Loading model from {cfg.restore_from_path}")
    model = MegatronBertEmbeddingModel.restore_from(
        restore_path=cfg.restore_from_path,
        trainer=trainer,
        save_restore_connector=NLPSaveRestoreConnector(),
        override_config_path=model_cfg,
        strict=True,
    )

    trainer.test(model)


if __name__ == '__main__':
    main()
35 changes: 1 addition & 34 deletions nemo/collections/llm/__init__.py
@@ -42,24 +42,7 @@
    gpt_data_step,
    gpt_forward_step,
)
from nemo.collections.llm.gpt.model.api import (
    code_gemma_2b,
    code_gemma_7b,
    code_llama_7b,
    code_llama_13b,
    code_llama_34b,
    code_llama_70b,
    gemma,
    gemma_2b,
    gemma_7b,
    llama2_7b,
    llama2_13b,
    llama2_70b,
    llama3_8b,
    llama3_70b,
    mistral,
    mixtral,
)
from nemo.collections.llm.recipes import * # noqa

__all__ = [
    "MockDataModule",
@@ -103,21 +86,5 @@
    "mock",
    "squad",
    "dolly",
    "mistral",
    "mixtral",
    "llama2_7b",
    "llama3_8b",
    "llama2_13b",
    "llama2_70b",
    "llama3_70b",
    "code_llama_7b",
    "code_llama_13b",
    "code_llama_34b",
    "code_llama_70b",
    "gemma",
    "gemma_2b",
    "gemma_7b",
    "code_gemma_2b",
    "code_gemma_7b",
    "peft",
]
125 changes: 0 additions & 125 deletions nemo/collections/llm/gpt/model/api.py

This file was deleted.

13 changes: 13 additions & 0 deletions nemo/collections/llm/recipes/__init__.py
@@ -0,0 +1,13 @@
from nemo.collections.llm.recipes import llama2_7b, llama3_8b, llama3_8b_16k, llama3_8b_64k, mistral
from nemo.collections.llm.recipes.log.default import default_log
from nemo.collections.llm.recipes.optim import adam

__all__ = [
    "llama3_8b",
    "llama3_8b_16k",
    "llama3_8b_64k",
    "llama2_7b",
    "mistral",
    "adam",
    "default_log",
]
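A brief, hedged illustration of the new entry point (assuming a NeMo install containing this commit): the per-model recipe modules are now reached through the recipes subpackage rather than the helper functions removed from nemo.collections.llm.__init__ above.

# Hedged sketch: the recipes subpackage re-exports the recipe modules and
# helpers listed in __all__ above. Assumes NeMo is installed with this commit.
from nemo.collections.llm import recipes

print(recipes.__all__)       # ['llama3_8b', 'llama3_8b_16k', ...]
print(recipes.llama3_8b)     # the Llama 3 8B recipe module
print(recipes.default_log)   # logging helper re-exported from recipes.log.default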
