Merge branch 'main' into xren/nsys_profiling

xrennvidia committed Sep 17, 2024
2 parents 87104c1 + 8ff8804 commit 1299eff
Showing 62 changed files with 558 additions and 390 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/cherry-pick-release-commit.yml
@@ -75,15 +75,15 @@ jobs:
-d '{"title":"Cherry-pick '$PR_ID' into '$RELEASE_BRANCH'","head":"cherry-pick-'$PR_ID'-'$RELEASE_BRANCH'","base":"'$RELEASE_BRANCH'"}'

else
- URL=https://github.com/NVIDIA/NeMo/pull/${{ github.event.number }}
+ URL="https://github.com/NVIDIA/NeMo/pull/$PR_ID"

MESSAGE='{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|#'${{ github.event.number }}'> failed"
"text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|#'$PR_ID'> failed"
}
}
]
46 changes: 43 additions & 3 deletions .github/workflows/cicd-main.yml
@@ -253,6 +253,7 @@ jobs:
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
+ IS_OPTIONAL: true

OPTIONAL_L0_Unit_Tests_CPU_Audio:
needs: [cicd-test-container-setup]
@@ -264,14 +265,16 @@ jobs:
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

- L0_Unit_Tests_CPU_Common:
+ OPTIONAL_L0_Unit_Tests_CPU_Common:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
- if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true'
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_CPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
+ IS_OPTIONAL: true

L0_Unit_Tests_CPU_LLM:
needs: [cicd-test-container-setup]
@@ -3463,6 +3466,42 @@ jobs:
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
+ OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124:
+ needs: [cicd-test-container-setup]
+ uses: ./.github/workflows/_test_template.yml
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124') || needs.cicd-test-container-setup.outputs.all == 'true'
+ with:
+ RUNNER: self-hosted-azure-gpus-1
+ SCRIPT: |
+ mkdir examples/llm/auto_configurator/auto_conf_logs
+ python examples/llm/auto_configurator/auto_config.py \
+ --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
+ --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \
+ --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \
+ --run_number=1
+ python examples/llm/auto_configurator/auto_config.py \
+ --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
+ --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \
+ --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \
+ --run_number=2
+ python examples/llm/auto_configurator/auto_config.py \
+ --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
+ --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \
+ --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \
+ --run_number=3
+ python examples/llm/auto_configurator/auto_config.py \
+ --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
+ --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \
+ --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \
+ --get_results
+ AFTER_SCRIPT: |
+ rm -rf examples/llm/auto_configurator/auto_conf_logs
+ IS_OPTIONAL: true

L2_Megatron_GPT_Finetuning_PP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -5139,7 +5178,7 @@ jobs:

#- OPTIONAL_L0_Unit_Tests_CPU_ASR
#- OPTIONAL_L0_Unit_Tests_CPU_Audio
- - L0_Unit_Tests_CPU_Common
+ #- OPTIONAL_L0_Unit_Tests_CPU_Common
- L0_Unit_Tests_CPU_LLM
- L0_Unit_Tests_CPU_Multimodal
#- OPTIONAL_L0_Unit_Tests_CPU_NLP
@@ -5214,6 +5253,7 @@ jobs:
- L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2
+ #- OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124
- L2_Megatron_GPT_Finetuning_PP2
- L2_Megatron_GPT_Finetuning_StarCoder_PP1
- L2_Megatron_GPT_Embedding
3 changes: 1 addition & 2 deletions Dockerfile.ci
@@ -38,8 +38,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMU_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.15.0

- ARG MCORE_TAG=01945b98d1ea3a2acb5e8301e181a328104f4856
+ ARG MCORE_TAG=76f9f48939ba5ecff0fed7bfbd4204df05d3e4da

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
2 changes: 1 addition & 1 deletion docs/source/features/parallelisms.rst
@@ -266,7 +266,7 @@ Implement Context Parallelism
NeMo Framework leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency.

Visit our source code for more insights into the implementation:
- - `Megatron Core wrappers for Transformer Engine <https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/custom_layers/transformer_engine.py>`_
+ - `Megatron Core wrappers for Transformer Engine <https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/extensions/transformer_engine.py>`_
- `Transformer Engine attention modules <https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py>`_


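To make the ring-topology KV exchange described in this docs change concrete, here is a minimal illustrative sketch (not part of the diff) using torch.distributed point-to-point ops. It assumes an initialized process group with one rank per sequence chunk; partial_attention is a hypothetical helper, and the log-sum-exp renormalization a real implementation applies when combining partial softmax outputs is omitted.

import torch
import torch.distributed as dist

def ring_kv_exchange(q, kv, rank, world_size):
    """Attend local queries against every KV chunk while holding only one
    chunk at a time, passing chunks around a ring of ranks."""
    out = torch.zeros_like(q)
    for _ in range(world_size):
        # Attend to the KV chunk this rank currently holds.
        out += partial_attention(q, kv)  # hypothetical helper; renormalization omitted
        # Send our chunk to the next rank, receive one from the previous rank.
        recv_buf = torch.empty_like(kv)
        ops = [
            dist.P2POp(dist.isend, kv, (rank + 1) % world_size),
            dist.P2POp(dist.irecv, recv_buf, (rank - 1) % world_size),
        ]
        for req in dist.batch_isend_irecv(ops):
            req.wait()
        kv = recv_buf
    return out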
@@ -99,8 +99,8 @@

def extract_transcriptions(hyps):
"""
- The transcribed_texts returned by CTC and RNNT models are different.
- This method would extract and return the text section of the hypothesis.
+ The transcribed_texts returned by CTC and RNNT models are different.
+ This method would extract and return the text section of the hypothesis.
"""
if isinstance(hyps[0], Hypothesis):
transcriptions = []
@@ -210,7 +210,10 @@ def perform_streaming(
def main():
parser = ArgumentParser()
parser.add_argument(
"--asr_model", type=str, required=True, help="Path to an ASR model .nemo file or name of a pretrained model.",
"--asr_model",
type=str,
required=True,
help="Path to an ASR model .nemo file or name of a pretrained model.",
)
parser.add_argument(
"--device", type=str, help="The device to load the model onto and perform the streaming", default="cuda"
@@ -431,7 +434,7 @@ def autocast():
"streaming_out_"
+ os.path.splitext(os.path.basename(args.asr_model))[0]
+ "_"
- + os.path.splitext(os.path.basename(args.test_manifest))[0]
+ + os.path.splitext(os.path.basename(args.manifest_file))[0]
+ ".json"
)

84 changes: 23 additions & 61 deletions examples/asr/transcribe_speech.py
@@ -13,37 +13,31 @@
# limitations under the License.

import contextlib
- import glob
- import json
import os
import time
from dataclasses import dataclass, field, is_dataclass
- from tempfile import NamedTemporaryFile
from typing import List, Optional, Union

import pytorch_lightning as pl
import torch
from omegaconf import OmegaConf, open_dict

- from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecMultiTaskModel
+ from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecRNNTModel
from nemo.collections.asr.models.aed_multitask_models import parse_multitask_prompt
from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig
from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig
from nemo.collections.asr.parts.submodules.multitask_decoding import MultiTaskDecoding, MultiTaskDecodingConfig
from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
from nemo.collections.asr.parts.submodules.rnnt_greedy_decoding import GreedyBatchedRNNTInferConfig
from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer
from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
from nemo.collections.asr.parts.utils.transcribe_utils import (
compute_output_filename,
prepare_audio_data,
read_and_maybe_sort_manifest,
restore_transcription_order,
setup_model,
- transcribe_partial_audio,
write_transcription,
)
- from nemo.collections.common.parts.preprocessing.manifest import get_full_path
from nemo.core.config import hydra_runner
from nemo.utils import logging

@@ -69,6 +63,7 @@
output_filename: Output filename where the transcriptions will be written
batch_size: batch size during inference
+ presort_manifest: sorts the provided manifest by audio length for faster inference (default: True)
cuda: Optional int to enable or disable execution of model on certain CUDA device.
allow_mps: Bool to allow using MPS (Apple Silicon M-series GPU) device if available
@@ -206,10 +201,6 @@ class TranscriptionConfig:
gt_text_attr_name: str = "text"
gt_lang_attr_name: str = "lang"

- # Use model's transcribe() function instead of transcribe_partial_audio() by default
- # Only use transcribe_partial_audio() when the audio is too long to fit in memory
- # Your manifest input should have `offset` field to use transcribe_partial_audio()
- allow_partial_transcribe: bool = False
extract_nbest: bool = False # Extract n-best hypotheses from the model

calculate_rtfx: bool = False
@@ -293,7 +284,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
elif isinstance(asr_model, EncDecHybridRNNTCTCModel):
if cfg.decoder_type and cfg.decoder_type not in ['ctc', 'rnnt']:
raise ValueError('Hybrid model only support ctc or rnnt decoding!')
- else: # rnnt model, there could be other models needs to be addressed.
+ elif isinstance(asr_model, EncDecRNNTModel):
if cfg.decoder_type and cfg.decoder_type != 'rnnt':
raise ValueError('RNNT model only support rnnt decoding!')

@@ -361,29 +352,11 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
else:
cfg.decoding = cfg.rnnt_decoding

- remove_path_after_done = None
- if isinstance(asr_model, EncDecMultiTaskModel):
- # Special case for EncDecMultiTaskModel, where the input manifest is directly passed into the model's transcribe() function
- partial_audio = False
- if cfg.audio_dir is not None and not cfg.append_pred:
- filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
- else:
- assert cfg.dataset_manifest is not None
- if cfg.presort_manifest:
- with NamedTemporaryFile("w", suffix=".json", delete=False) as f:
- for item in read_and_maybe_sort_manifest(cfg.dataset_manifest, try_sort=True):
- item["audio_filepath"] = get_full_path(item["audio_filepath"], cfg.dataset_manifest)
- print(json.dumps(item), file=f)
- cfg.dataset_manifest = f.name
- remove_path_after_done = f.name
- filepaths = cfg.dataset_manifest
- else:
- # prepare audio filepaths and decide wether it's partial audio
- filepaths, partial_audio = prepare_audio_data(cfg)
+ filepaths, sorted_manifest_path = prepare_audio_data(cfg)

+ remove_path_after_done = sorted_manifest_path if sorted_manifest_path is not None else None

- if not cfg.allow_partial_transcribe:
- # by defatul, use model's transcribe() function, unless partial audio is required
- partial_audio = False
+ filepaths = sorted_manifest_path if sorted_manifest_path is not None else filepaths

# setup AMP (optional)
if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
@@ -424,33 +397,22 @@ def autocast(dtype=None, enabled=True):
with torch.no_grad():
if cfg.calculate_rtfx:
start_time = time.time()
- if partial_audio:
- transcriptions = transcribe_partial_audio(
- asr_model=asr_model,
- path2manifest=cfg.dataset_manifest,
- batch_size=cfg.batch_size,
- num_workers=cfg.num_workers,
- return_hypotheses=cfg.return_hypotheses,
- channel_selector=cfg.channel_selector,
- augmentor=augmentor,
- decoder_type=cfg.decoder_type,
- )
- else:
- override_cfg = asr_model.get_transcribe_config()
- override_cfg.batch_size = cfg.batch_size
- override_cfg.num_workers = cfg.num_workers
- override_cfg.return_hypotheses = cfg.return_hypotheses
- override_cfg.channel_selector = cfg.channel_selector
- override_cfg.augmentor = augmentor
- override_cfg.text_field = cfg.gt_text_attr_name
- override_cfg.lang_field = cfg.gt_lang_attr_name
- if hasattr(override_cfg, "prompt"):
- override_cfg.prompt = parse_multitask_prompt(OmegaConf.to_container(cfg.prompt))

- transcriptions = asr_model.transcribe(
- audio=filepaths,
- override_config=override_cfg,
- )

+ override_cfg = asr_model.get_transcribe_config()
+ override_cfg.batch_size = cfg.batch_size
+ override_cfg.num_workers = cfg.num_workers
+ override_cfg.return_hypotheses = cfg.return_hypotheses
+ override_cfg.channel_selector = cfg.channel_selector
+ override_cfg.augmentor = augmentor
+ override_cfg.text_field = cfg.gt_text_attr_name
+ override_cfg.lang_field = cfg.gt_lang_attr_name
+ if hasattr(override_cfg, "prompt"):
+ override_cfg.prompt = parse_multitask_prompt(OmegaConf.to_container(cfg.prompt))

+ transcriptions = asr_model.transcribe(
+ audio=filepaths,
+ override_config=override_cfg,
+ )
if cfg.calculate_rtfx:
transcribe_time = time.time() - start_time

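For reference, the unified code path above reduces to a single override-and-transcribe pattern. A minimal sketch, assuming a pretrained CTC checkpoint and a local audio file (the model name and file path are placeholders, not from the diff):

import nemo.collections.asr as nemo_asr

# Load a pretrained model; "stt_en_conformer_ctc_small" is an arbitrary example.
asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained("stt_en_conformer_ctc_small")

# Fetch the model's default transcription config and override selected fields,
# mirroring what main() now does unconditionally.
override = asr_model.get_transcribe_config()
override.batch_size = 16
override.return_hypotheses = True

hyps = asr_model.transcribe(audio=["sample.wav"], override_config=override)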
5 changes: 4 additions & 1 deletion examples/llm/auto_configurator/auto_config.py
@@ -27,6 +27,7 @@ def get_args():
parser.add_argument("--run_number", type=int, help="Number of config to run")
parser.add_argument("--logs_dir", type=str, help="Path where to save training logs")
parser.add_argument("--data_path", type=str, help="Path to the dataset")
parser.add_argument("--tokenizer_path", type=str, help="Path to the tokenizer")
parser.add_argument("--get_results", action="store_true")

return parser.parse_args()
@@ -51,6 +52,8 @@ def train_config(args):
max_steps_per_run=25,
num_tokens_in_b=10,
vocab_size=51200,
+ tokenizer_type="autotokenizer",
+ tokenizer_path=args.tokenizer_path,
data_paths=args.data_path,
path_to_logs=args.logs_dir,
)
@@ -63,7 +66,7 @@

# Run pre-training
partial = partials[args.run_number - 1]
- partial.log.dir = os.path.join(args.logs_dir, names[args.run_number - 1])
+ partial.log.log_dir = os.path.join(args.logs_dir, names[args.run_number - 1])
pretrain = fdl.build(partial)
pretrain()
else:
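The CI job added above exercises this same entry point three times with --run_number and once with --get_results. Condensed to its core, the selection logic is the following sketch, using only names visible in the diff (partials and names come from the surrounding generator code):

import os
import fiddle as fdl

def run_candidate(run_number, logs_dir, partials, names):
    partial = partials[run_number - 1]  # pick the run_number-th generated config
    partial.log.log_dir = os.path.join(logs_dir, names[run_number - 1])
    pretrain = fdl.build(partial)  # materialize the fiddle partial into a callable
    pretrain()  # launch training for this candidate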
2 changes: 1 addition & 1 deletion examples/llm/megatron_gpt_pretraining.py
@@ -102,7 +102,7 @@ def get_args():
)

nemo_logger = NeMoLogger(
- dir=args.experiment_dir,
+ log_dir=args.experiment_dir,
)

train(
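The same dir → log_dir rename applied in auto_config.py appears here. A minimal sketch of the updated keyword, assuming the NeMoLogger import used by NeMo 2.0 example scripts (the path is a placeholder):

from nemo.lightning import NeMoLogger

# The output-directory keyword is now log_dir (formerly dir).
nemo_logger = NeMoLogger(log_dir="/results/my_experiment")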