Merge branch 'main' into xren/nsys_profiling

xrennvidia committed Sep 17, 2024
2 parents 87104c1 + 8ff8804 commit 1299eff
Showing 62 changed files with 558 additions and 390 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/cherry-pick-release-commit.yml
@@ -75,15 +75,15 @@ jobs:
-d '{"title":"Cherry-pick '$PR_ID' into '$RELEASE_BRANCH'","head":"cherry-pick-'$PR_ID'-'$RELEASE_BRANCH'","base":"'$RELEASE_BRANCH'"}'

else
- URL=https://github.com/NVIDIA/NeMo/pull/${{ github.event.number }}
+ URL="https://github.com/NVIDIA/NeMo/pull/$PR_ID"

MESSAGE='{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|#'${{ github.event.number }}'> failed"
"text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|#'$PR_ID'> failed"
}
}
]
46 changes: 43 additions & 3 deletions .github/workflows/cicd-main.yml
@@ -253,6 +253,7 @@ jobs:
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
+ IS_OPTIONAL: true

OPTIONAL_L0_Unit_Tests_CPU_Audio:
needs: [cicd-test-container-setup]
@@ -264,14 +265,16 @@ jobs:
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

- L0_Unit_Tests_CPU_Common:
+ OPTIONAL_L0_Unit_Tests_CPU_Common:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
- if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true'
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_CPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
+ IS_OPTIONAL: true

L0_Unit_Tests_CPU_LLM:
needs: [cicd-test-container-setup]
@@ -3463,6 +3466,42 @@ jobs:
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
+ OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124:
+ needs: [cicd-test-container-setup]
+ uses: ./.github/workflows/_test_template.yml
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124') || needs.cicd-test-container-setup.outputs.all == 'true'
+ with:
+ RUNNER: self-hosted-azure-gpus-1
+ SCRIPT: |
+ mkdir examples/llm/auto_configurator/auto_conf_logs
+ python examples/llm/auto_configurator/auto_config.py \
+ --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
+ --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \
+ --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \
+ --run_number=1
+ python examples/llm/auto_configurator/auto_config.py \
+ --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
+ --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \
+ --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \
+ --run_number=2
+ python examples/llm/auto_configurator/auto_config.py \
+ --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
+ --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \
+ --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \
+ --run_number=3
+ python examples/llm/auto_configurator/auto_config.py \
+ --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
+ --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \
+ --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \
+ --get_results
+ AFTER_SCRIPT: |
+ rm -rf examples/llm/auto_configurator/auto_conf_logs
+ IS_OPTIONAL: true

L2_Megatron_GPT_Finetuning_PP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -5139,7 +5178,7 @@ jobs:

#- OPTIONAL_L0_Unit_Tests_CPU_ASR
#- OPTIONAL_L0_Unit_Tests_CPU_Audio
- - L0_Unit_Tests_CPU_Common
+ #- OPTIONAL_L0_Unit_Tests_CPU_Common
- L0_Unit_Tests_CPU_LLM
- L0_Unit_Tests_CPU_Multimodal
#- OPTIONAL_L0_Unit_Tests_CPU_NLP
@@ -5214,6 +5253,7 @@ jobs:
- L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2
+ #- OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124
- L2_Megatron_GPT_Finetuning_PP2
- L2_Megatron_GPT_Finetuning_StarCoder_PP1
- L2_Megatron_GPT_Embedding
3 changes: 1 addition & 2 deletions Dockerfile.ci
@@ -38,8 +38,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMU_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.15.0

- ARG MCORE_TAG=01945b98d1ea3a2acb5e8301e181a328104f4856
+ ARG MCORE_TAG=76f9f48939ba5ecff0fed7bfbd4204df05d3e4da

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
2 changes: 1 addition & 1 deletion docs/source/features/parallelisms.rst
@@ -266,7 +266,7 @@ Implement Context Parallelism
NeMo Framework leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency.

Visit our source code for more insights into the implementation:
- - `Megatron Core wrappers for Transformer Engine <https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/custom_layers/transformer_engine.py>`_
+ - `Megatron Core wrappers for Transformer Engine <https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/extensions/transformer_engine.py>`_
- `Transformer Engine attention modules <https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py>`_


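To make the ring-topology KV exchange described in this docs change concrete, here is a minimal illustrative sketch (not part of the diff) using torch.distributed point-to-point ops. It assumes an initialized process group with one rank per sequence chunk; partial_attention is a hypothetical helper, and the log-sum-exp renormalization a real implementation applies when combining partial softmax outputs is omitted.

import torch
import torch.distributed as dist

def ring_kv_exchange(q, kv, rank, world_size):
    """Attend local queries against every KV chunk while holding only one
    chunk at a time, passing chunks around a ring of ranks."""
    out = torch.zeros_like(q)
    for _ in range(world_size):
        # Attend to the KV chunk this rank currently holds.
        out += partial_attention(q, kv)  # hypothetical helper; renormalization omitted
        # Send our chunk to the next rank, receive one from the previous rank.
        recv_buf = torch.empty_like(kv)
        ops = [
            dist.P2POp(dist.isend, kv, (rank + 1) % world_size),
            dist.P2POp(dist.irecv, recv_buf, (rank - 1) % world_size),
        ]
        for req in dist.batch_isend_irecv(ops):
            req.wait()
        kv = recv_buf
    return out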
@@ -99,8 +99,8 @@

def extract_transcriptions(hyps):
"""
- The transcribed_texts returned by CTC and RNNT models are different.
- This method would extract and return the text section of the hypothesis.
+ The transcribed_texts returned by CTC and RNNT models are different.
+ This method would extract and return the text section of the hypothesis.
"""
if isinstance(hyps[0], Hypothesis):
transcriptions = []
@@ -210,7 +210,10 @@ def perform_streaming(
def main():
parser = ArgumentParser()
parser.add_argument(
"--asr_model", type=str, required=True, help="Path to an ASR model .nemo file or name of a pretrained model.",
"--asr_model",
type=str,
required=True,
help="Path to an ASR model .nemo file or name of a pretrained model.",
)
parser.add_argument(
"--device", type=str, help="The device to load the model onto and perform the streaming", default="cuda"
@@ -431,7 +434,7 @@ def autocast():
"streaming_out_"
+ os.path.splitext(os.path.basename(args.asr_model))[0]
+ "_"
- + os.path.splitext(os.path.basename(args.test_manifest))[0]
+ + os.path.splitext(os.path.basename(args.manifest_file))[0]
+ ".json"
)

84 changes: 23 additions & 61 deletions examples/asr/transcribe_speech.py
@@ -13,37 +13,31 @@
# limitations under the License.

import contextlib
- import glob
- import json
import os
import time
from dataclasses import dataclass, field, is_dataclass
- from tempfile import NamedTemporaryFile
from typing import List, Optional, Union

import pytorch_lightning as pl
import torch
from omegaconf import OmegaConf, open_dict

- from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecMultiTaskModel
+ from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecRNNTModel
from nemo.collections.asr.models.aed_multitask_models import parse_multitask_prompt
from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig
from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig
from nemo.collections.asr.parts.submodules.multitask_decoding import MultiTaskDecoding, MultiTaskDecodingConfig
from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
from nemo.collections.asr.parts.submodules.rnnt_greedy_decoding import GreedyBatchedRNNTInferConfig
from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer
from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
from nemo.collections.asr.parts.utils.transcribe_utils import (
compute_output_filename,
prepare_audio_data,
read_and_maybe_sort_manifest,
restore_transcription_order,
setup_model,
- transcribe_partial_audio,
write_transcription,
)
- from nemo.collections.common.parts.preprocessing.manifest import get_full_path
from nemo.core.config import hydra_runner
from nemo.utils import logging

@@ -69,6 +63,7 @@
output_filename: Output filename where the transcriptions will be written
batch_size: batch size during inference
+ presort_manifest: sorts the provided manifest by audio length for faster inference (default: True)
cuda: Optional int to enable or disable execution of model on certain CUDA device.
allow_mps: Bool to allow using MPS (Apple Silicon M-series GPU) device if available
@@ -206,10 +201,6 @@ class TranscriptionConfig:
gt_text_attr_name: str = "text"
gt_lang_attr_name: str = "lang"

- # Use model's transcribe() function instead of transcribe_partial_audio() by default
- # Only use transcribe_partial_audio() when the audio is too long to fit in memory
- # Your manifest input should have `offset` field to use transcribe_partial_audio()
- allow_partial_transcribe: bool = False
extract_nbest: bool = False # Extract n-best hypotheses from the model

calculate_rtfx: bool = False
@@ -293,7 +284,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
elif isinstance(asr_model, EncDecHybridRNNTCTCModel):
if cfg.decoder_type and cfg.decoder_type not in ['ctc', 'rnnt']:
raise ValueError('Hybrid model only support ctc or rnnt decoding!')
- else: # rnnt model, there could be other models needs to be addressed.
+ elif isinstance(asr_model, EncDecRNNTModel):
if cfg.decoder_type and cfg.decoder_type != 'rnnt':
raise ValueError('RNNT model only support rnnt decoding!')

@@ -361,29 +352,11 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
else:
cfg.decoding = cfg.rnnt_decoding

- remove_path_after_done = None
- if isinstance(asr_model, EncDecMultiTaskModel):
- # Special case for EncDecMultiTaskModel, where the input manifest is directly passed into the model's transcribe() function
- partial_audio = False
- if cfg.audio_dir is not None and not cfg.append_pred:
- filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
- else:
- assert cfg.dataset_manifest is not None
- if cfg.presort_manifest:
- with NamedTemporaryFile("w", suffix=".json", delete=False) as f:
- for item in read_and_maybe_sort_manifest(cfg.dataset_manifest, try_sort=True):
- item["audio_filepath"] = get_full_path(item["audio_filepath"], cfg.dataset_manifest)
- print(json.dumps(item), file=f)
- cfg.dataset_manifest = f.name
- remove_path_after_done = f.name
- filepaths = cfg.dataset_manifest
- else:
- # prepare audio filepaths and decide wether it's partial audio
- filepaths, partial_audio = prepare_audio_data(cfg)
+ filepaths, sorted_manifest_path = prepare_audio_data(cfg)

+ remove_path_after_done = sorted_manifest_path if sorted_manifest_path is not None else None

- if not cfg.allow_partial_transcribe:
- # by defatul, use model's transcribe() function, unless partial audio is required
- partial_audio = False
+ filepaths = sorted_manifest_path if sorted_manifest_path is not None else filepaths

# setup AMP (optional)
if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
@@ -424,33 +397,22 @@ def autocast(dtype=None, enabled=True):
with torch.no_grad():
if cfg.calculate_rtfx:
start_time = time.time()
- if partial_audio:
- transcriptions = transcribe_partial_audio(
- asr_model=asr_model,
- path2manifest=cfg.dataset_manifest,
- batch_size=cfg.batch_size,
- num_workers=cfg.num_workers,
- return_hypotheses=cfg.return_hypotheses,
- channel_selector=cfg.channel_selector,
- augmentor=augmentor,
- decoder_type=cfg.decoder_type,
- )
- else:
- override_cfg = asr_model.get_transcribe_config()
- override_cfg.batch_size = cfg.batch_size
- override_cfg.num_workers = cfg.num_workers
- override_cfg.return_hypotheses = cfg.return_hypotheses
- override_cfg.channel_selector = cfg.channel_selector
- override_cfg.augmentor = augmentor
- override_cfg.text_field = cfg.gt_text_attr_name
- override_cfg.lang_field = cfg.gt_lang_attr_name
- if hasattr(override_cfg, "prompt"):
- override_cfg.prompt = parse_multitask_prompt(OmegaConf.to_container(cfg.prompt))

- transcriptions = asr_model.transcribe(
- audio=filepaths,
- override_config=override_cfg,
- )

+ override_cfg = asr_model.get_transcribe_config()
+ override_cfg.batch_size = cfg.batch_size
+ override_cfg.num_workers = cfg.num_workers
+ override_cfg.return_hypotheses = cfg.return_hypotheses
+ override_cfg.channel_selector = cfg.channel_selector
+ override_cfg.augmentor = augmentor
+ override_cfg.text_field = cfg.gt_text_attr_name
+ override_cfg.lang_field = cfg.gt_lang_attr_name
+ if hasattr(override_cfg, "prompt"):
+ override_cfg.prompt = parse_multitask_prompt(OmegaConf.to_container(cfg.prompt))

+ transcriptions = asr_model.transcribe(
+ audio=filepaths,
+ override_config=override_cfg,
+ )
if cfg.calculate_rtfx:
transcribe_time = time.time() - start_time

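For reference, the unified code path above reduces to a single override-and-transcribe pattern. A minimal sketch, assuming a pretrained CTC checkpoint and a local audio file (the model name and file path are placeholders, not from the diff):

import nemo.collections.asr as nemo_asr

# Load a pretrained model; "stt_en_conformer_ctc_small" is an arbitrary example.
asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained("stt_en_conformer_ctc_small")

# Fetch the model's default transcription config and override selected fields,
# mirroring what main() now does unconditionally.
override = asr_model.get_transcribe_config()
override.batch_size = 16
override.return_hypotheses = True

hyps = asr_model.transcribe(audio=["sample.wav"], override_config=override)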
5 changes: 4 additions & 1 deletion examples/llm/auto_configurator/auto_config.py
@@ -27,6 +27,7 @@ def get_args():
parser.add_argument("--run_number", type=int, help="Number of config to run")
parser.add_argument("--logs_dir", type=str, help="Path where to save training logs")
parser.add_argument("--data_path", type=str, help="Path to the dataset")
parser.add_argument("--tokenizer_path", type=str, help="Path to the tokenizer")
parser.add_argument("--get_results", action="store_true")

return parser.parse_args()
@@ -51,6 +52,8 @@ def train_config(args):
max_steps_per_run=25,
num_tokens_in_b=10,
vocab_size=51200,
+ tokenizer_type="autotokenizer",
+ tokenizer_path=args.tokenizer_path,
data_paths=args.data_path,
path_to_logs=args.logs_dir,
)
@@ -63,7 +66,7 @@

# Run pre-training
partial = partials[args.run_number - 1]
- partial.log.dir = os.path.join(args.logs_dir, names[args.run_number - 1])
+ partial.log.log_dir = os.path.join(args.logs_dir, names[args.run_number - 1])
pretrain = fdl.build(partial)
pretrain()
else:
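The CI job added above exercises this same entry point three times with --run_number and once with --get_results. Condensed to its core, the selection logic is the following sketch, using only names visible in the diff (partials and names come from the surrounding generator code):

import os
import fiddle as fdl

def run_candidate(run_number, logs_dir, partials, names):
    partial = partials[run_number - 1]  # pick the run_number-th generated config
    partial.log.log_dir = os.path.join(logs_dir, names[run_number - 1])
    pretrain = fdl.build(partial)  # materialize the fiddle partial into a callable
    pretrain()  # launch training for this candidate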
2 changes: 1 addition & 1 deletion examples/llm/megatron_gpt_pretraining.py
@@ -102,7 +102,7 @@ def get_args():
)

nemo_logger = NeMoLogger(
- dir=args.experiment_dir,
+ log_dir=args.experiment_dir,
)

train(
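The same dir → log_dir rename applied in auto_config.py appears here. A minimal sketch of the updated keyword, assuming the NeMoLogger import used by NeMo 2.0 example scripts (the path is a placeholder):

from nemo.lightning import NeMoLogger

# The output-directory keyword is now log_dir (formerly dir).
nemo_logger = NeMoLogger(log_dir="/results/my_experiment")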