diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 4ef6c5a9f9df..8c61c767b4f1 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -34,22 +34,27 @@ on: description: Last 2000 characters of the test step's log value: ${{ jobs.main.outputs.log }} jobs: + main: runs-on: ${{ inputs.RUNNER }} outputs: conclusion: ${{ steps.main.conclusion }} log: ${{ steps.main.outputs.log }} - permissions: - actions: write # Required for cancelling workflows steps: - name: Docker system cleanup run: | docker system prune -a --filter "until=48h" --force + - name: Docker pull image + run: | + docker pull nemoci.azurecr.io/nemo_container_${{ github.run_id }} + - id: main name: Run main script timeout-minutes: ${{ inputs.TIMEOUT }} run: | + mkdir -p ${{ github.run_id }} + cd ${{ github.run_id }}/ set +e ( set -e @@ -65,9 +70,7 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: failure() && inputs.IS_OPTIONAL == false - - name: after_script if: always() && inputs.AFTER_SCRIPT != ':' run: | - docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}' - \ No newline at end of file + docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}' \ No newline at end of file diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 253e114c78f3..2919be733ac3 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -93,8 +93,8 @@ jobs: exit 0 ' ### \'\' - - + + L0_Unit_Tests_GPU: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -105,14 +105,16 @@ jobs: NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads IS_OPTIONAL: true - L0_Unit_Tests_CPU: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-cpu - TIMEOUT: 60 - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + # # TODO refactor: Commenting this test out until it is fixed & works properly again (test passes again) + # OPTIONAL_L0_Unit_Tests_CPU: + # needs: [cicd-test-container-setup] + # uses: ./.github/workflows/_test_template.yml + # with: + # RUNNER: self-hosted-azure-cpu + # TIMEOUT: 60 + # SCRIPT: | + # CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + # IS_OPTIONAL: true L0_Setup_Test_Data_And_Models: needs: [cicd-test-container-setup] @@ -532,55 +534,45 @@ jobs: AFTER_SCRIPT: | rm -rf examples/asr/speech_to_text_results - - # L2_Speech_to_Text_AED: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure-gpus-1 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/asr/speech_multitask/speech_to_text_aed.py \ - 
# model.prompt_format=canary \ - # model.model_defaults.asr_enc_hidden=256 \ - # model.model_defaults.lm_dec_hidden=256 \ - # model.encoder.n_layers=12 \ - # model.transf_encoder.num_layers=0 \ - # model.transf_decoder.config_dict.num_layers=12 \ - # model.train_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_train.json \ - # ++model.train_ds.is_tarred=false \ - # model.train_ds.batch_duration=60 \ - # +model.train_ds.text_field="answer" \ - # +model.train_ds.lang_field="target_lang" \ - # model.validation_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - # +model.validation_ds.text_field="answer" \ - # +model.validation_ds.lang_field="target_lang" \ - # model.test_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - # +model.test_ds.text_field="answer" \ - # +model.test_ds.lang_field="target_lang" \ - # model.tokenizer.langs.spl_tokens.dir=/home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32 \ - # model.tokenizer.langs.spl_tokens.type="bpe" \ - # model.tokenizer.langs.en.dir=/home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4 \ - # model.tokenizer.langs.en.type=bpe \ - # ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \ - # ++model.tokenizer.langs.es.type=bpe \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.use_distributed_sampler=false \ - # +trainer.fast_dev_run=True \ - # exp_manager.exp_dir=examples/asr/speech_to_text_aed_results - # rm -rf examples/asr/speech_to_text_results - + L2_Speech_to_Text_AED: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/speech_multitask/speech_to_text_aed.py \ + model.prompt_format=canary \ + model.model_defaults.asr_enc_hidden=256 \ + model.model_defaults.lm_dec_hidden=256 \ + model.encoder.n_layers=12 \ + model.transf_encoder.num_layers=0 \ + model.transf_decoder.config_dict.num_layers=12 \ + model.train_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_train.json \ + model.train_ds.batch_duration=60 \ + model.train_ds.use_bucketing=false \ + model.train_ds.shuffle_buffer_size=100 \ + model.train_ds.num_workers=0 \ + +model.train_ds.text_field="answer" \ + +model.train_ds.lang_field="target_lang" \ + model.validation_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ + +model.validation_ds.text_field="answer" \ + +model.validation_ds.lang_field="target_lang" \ + model.validation_ds.num_workers=0 \ + model.test_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ + +model.test_ds.text_field="answer" \ + +model.test_ds.lang_field="target_lang" \ + model.test_ds.num_workers=0 \ + spl_tokens.model_dir=/home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32 \ + model.tokenizer.langs.en.dir=/home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4 \ + model.tokenizer.langs.en.type=bpe \ + ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \ + ++model.tokenizer.langs.es.type=bpe \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_aed_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_results # L2: Speaker dev run L2_Speaker_dev_run_Speaker_Recognition: @@ -817,6 +809,66 @@ jobs: AFTER_SCRIPT: | rm -rf stt_test_res.json + # L2: Speech 
Transcription + L2_Speech_Transcription_Canary_Transcribe_Full_Manifest: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/transcribe_speech.py \ + dataset_manifest=/home/TestData/asr/canary/dev-other-wav-10-canary-fields.json \ + output_filename=preds.json \ + batch_size=10 \ + pretrained_name=nvidia/canary-1b \ + num_workers=0 \ + amp=false \ + compute_dtype=bfloat16 \ + matmul_precision=medium + AFTER_SCRIPT: | + rm -rf preds.json transcribe.log + + L2_Speech_Transcription_Canary_Transcribe_With_Prompt: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/transcribe_speech.py \ + dataset_manifest=/home/TestData/asr/canary/dev-other-wav-10.json \ + output_filename=preds.json \ + batch_size=10 \ + pretrained_name=nvidia/canary-1b \ + num_workers=0 \ + amp=false \ + compute_dtype=bfloat16 \ + matmul_precision=medium \ + +prompt.source_lang="en" \ + +prompt.target_lang="en" \ + +prompt.task="asr" \ + +prompt.pnc="no" + AFTER_SCRIPT: | + rm -rf preds.json transcribe.log + + L2_Speech_Transcription_Canary_Transcribe_Audio_Dir: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/transcribe_speech.py \ + audio_dir=/home/TestData/asr/canary/dev-other-wav \ + output_filename=preds.json \ + batch_size=10 \ + pretrained_name=nvidia/canary-1b \ + num_workers=0 \ + amp=false \ + compute_dtype=bfloat16 \ + matmul_precision=medium + AFTER_SCRIPT: | + rm -rf preds.json + + # L2: Transducer alignment L2_Transducer_alignment_Running_pytest: needs: [cicd-test-container-setup] @@ -975,7 +1027,6 @@ jobs: data.test_ds.use_cache=false \ data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv - # L2: Intent and Slot Classification Tasks L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] @@ -1013,11 +1064,11 @@ jobs: rm -rf checkpoints2 # TODO: add when megatron-bert is supported again - # stage('L2: Model Parallel Size 2 Megatron Text Classification') { + # stage("L2: Model Parallel Size 2 Megatron Text Classification") { # when { # anyOf{ - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true @@ -1042,11 +1093,11 @@ jobs: # } # } - # stage('L2: Model Parallel Size 2 Megatron Autoresume') { + # stage("L2: Model Parallel Size 2 Megatron Autoresume") { # when { # anyOf{ - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true @@ -1073,11 +1124,11 @@ jobs: # } # } - # stage('L2: Model Parallel Size 2 Megatron Evaluation from .nemo') { + # stage("L2: Model Parallel Size 2 Megatron Evaluation from .nemo") { # when { # anyOf{ - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true @@ -1094,11 +1145,11 @@ jobs: # } # } - # stage('L2: Model Parallel Size 2 Megatron Train from .nemo') { + # stage("L2: Model Parallel Size 2 Megatron Train from .nemo") { # when { # anyOf{ - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true @@ -1500,18 +1551,18 @@ jobs: model.encoder.num_layers=4 \ model.encoder.hidden_size=64 \ model.encoder.num_attention_heads=8 \ - 
model.encoder.activation='swiglu' \ + model.encoder.activation="swiglu" \ model.encoder.masked_softmax_fusion=False \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=2 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ + model.decoder.activation="swiglu" \ model.decoder.masked_softmax_fusion=False \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ model.micro_batch_size=2 \ model.global_batch_size=4 \ @@ -1545,18 +1596,18 @@ jobs: model.encoder.num_layers=4 \ model.encoder.hidden_size=64 \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ + model.encoder.activation="swiglu" \ model.encoder.masked_softmax_fusion=False \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=2 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ + model.decoder.activation="swiglu" \ model.decoder.masked_softmax_fusion=False \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ model.micro_batch_size=2 \ model.global_batch_size=4 \ @@ -1598,24 +1649,24 @@ jobs: model.encoder.hidden_size=64 \ model.encoder.arch=perceiver \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ + model.encoder.activation="swiglu" \ model.encoder.masked_softmax_fusion=False \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=2 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ + model.decoder.activation="swiglu" \ model.decoder.masked_softmax_fusion=False \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ model.micro_batch_size=2 \ model.global_batch_size=4 \ model.data.data_impl=text_mmap \ model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string='"800,100,100"' \ + model.data.splits_string="\"800,100,100\"" \ model.data.whole_word_masking=False \ model.tokenizer.library=sentencepiece \ model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ @@ -1643,24 +1694,24 @@ jobs: model.encoder.hidden_size=64 \ model.encoder.arch=perceiver \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ + model.encoder.activation="swiglu" \ model.encoder.masked_softmax_fusion=False \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=2 \ 
model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ + model.decoder.activation="swiglu" \ model.decoder.masked_softmax_fusion=False \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ model.micro_batch_size=2 \ model.global_batch_size=4 \ model.data.data_impl=text_mmap \ model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string='"800,100,100"' \ + model.data.splits_string="\"800,100,100\"" \ model.data.whole_word_masking=False \ model.tokenizer.library=sentencepiece \ model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ @@ -1672,16 +1723,16 @@ jobs: AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/megatron_mim_results - # stage('L2: NMT Bottleneck Fallback') { + # stage("L2: NMT Bottleneck Fallback") { # when { # anyOf { - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true # parallel { - # stage('L2: seq2seq (no bottleneck)') { + # stage("L2: seq2seq (no bottleneck)") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1718,16 +1769,16 @@ jobs: # } # } # } - # stage('L2: NMT Bottleneck Architecture') { + # stage("L2: NMT Bottleneck Architecture") { # when { # anyOf { - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true # parallel { - # stage('Bridge Encoder (identity)') { + # stage("Bridge Encoder (identity)") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1762,7 +1813,7 @@ jobs: # exp_manager=null # } # } - # stage('Perceiver Encoder (params)') { + # stage("Perceiver Encoder (params)") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1799,16 +1850,16 @@ jobs: # } # } # } - # stage('L2: NMT Bottleneck LVM') { + # stage("L2: NMT Bottleneck LVM") { # when { # anyOf { - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true # parallel { - # stage('VAE') { + # stage("VAE") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1843,7 +1894,7 @@ jobs: # exp_manager=null # } # } - # stage('MIM') { + # stage("MIM") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -2050,7 +2101,7 @@ jobs: model.num_layers=8 \ model.hidden_size=256 \ model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ + model.activations_checkpoint_method="block" \ model.activations_checkpoint_num_layers=1 \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings @@ -2080,7 +2131,7 @@ jobs: model.num_layers=8 \ model.hidden_size=256 \ model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ + model.activations_checkpoint_method="block" \ model.activations_checkpoint_num_layers=1 \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ 
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings @@ -2099,7 +2150,7 @@ jobs: trainer.devices=2 \ trainer.precision=bf16 \ trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ + model.data.data_prefix=["none"] \ exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ model.mcore_gpt=True \ model.tensor_model_parallel_size=1 \ @@ -2114,7 +2165,7 @@ jobs: model.init_method_std=0.023 \ model.optim.lr=6.0e-4 \ model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ + model.data.splits_string="\"98,2,0\"" \ model.data.dataloader_type=cyclic \ trainer.max_steps=10 @@ -2123,7 +2174,7 @@ jobs: trainer.devices=2 \ trainer.precision=bf16 \ trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ + model.data.data_prefix=["none"] \ exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ model.mcore_gpt=True \ model.tensor_model_parallel_size=1 \ @@ -2138,7 +2189,7 @@ jobs: model.init_method_std=0.023 \ model.optim.lr=6.0e-4 \ model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ + model.data.splits_string="\"98,2,0\"" \ model.data.dataloader_type=cyclic \ trainer.max_steps=20 AFTER_SCRIPT: | @@ -2292,16 +2343,16 @@ jobs: # from pandas.testing import assert_frame_equal # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator # import torch - # if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): + # if not (torch.cuda.is_available() and "A100" in torch.cuda.get_device_name()): # import sys # sys.exit(0) - # event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] + # event_file = list(pathlib.Path("examples/nlp/language_modeling/retro_results/megatron_retro/smalltest").glob("events.out.tfevents*"))[0] # ea = EventAccumulator(str(event_file)).Reload() # vals = [] - # for i in ea.Scalars('reduced_train_loss'): + # for i in ea.Scalars("reduced_train_loss"): # vals.append(i.value) - # training_curve = pd.DataFrame({'loss': vals}) - # gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') + # training_curve = pd.DataFrame({"loss": vals}) + # gt_curve = pd.read_csv("/home/TestData/nlp/megatron_retro/expected_learning_curve.csv") # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" # rm -rf examples/nlp/language_modeling/retro_results @@ -2317,13 +2368,13 @@ jobs: python examples/nlp/rag/rag_indexing.py \ trainer.num_nodes=1 \ trainer.devices=1 \ - trainer.precision='bf16-mixed' \ - indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \ + trainer.precision="bf16-mixed" \ + indexing.embedder.model_path="/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo" \ indexing.embedder.embed_batch_size=128 \ - indexing.data.data_path='/home/TestData/nlp/rag_pipeline/testing_data/corpus_data/sample_data' \ + indexing.data.data_path="/home/TestData/nlp/rag_pipeline/testing_data/corpus_data/sample_data" \ indexing.data.chunk_size=256 \ indexing.data.chunk_overlap=10 \ - indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index' + indexing.index_path="/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index" L2_RAG_Pipeline_Generating: needs: [cicd-test-container-setup] @@ -2333,14 +2384,14 @@ jobs: SCRIPT: | python examples/nlp/rag/rag_generating.py \ trainer.devices=1 \ - trainer.precision='bf16-mixed' \ - 
indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \ - indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index' \ - generating.llm.model_path='/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo' \ + trainer.precision="bf16-mixed" \ + indexing.embedder.model_path="/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo" \ + indexing.index_path="/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index" \ + generating.llm.model_path="/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo" \ generating.inference.tokens_to_generate=50 \ generating.inference.greedy=False \ generating.inference.temperature=1.0 \ - generating.query='Which art schools did I applied to?' + generating.query="Which art schools did I applied to?" L2_BioMegatron_Bert_NER_Task: needs: [cicd-test-container-setup] @@ -3098,29 +3149,44 @@ jobs: L2_Megatron_GPT_Reranker: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - rm -rf /home/TestData/nlp/megatron_ir/working_dir + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir - python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \ - exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ - model.global_batch_size=4 \ - model.micro_batch_size=4 \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.max_epochs=null \ - trainer.max_steps=20 \ - trainer.val_check_interval=10 \ - model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ - model.peft.lora_tuning.adapter_dim=8 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \ - model.data.validation_ds.write_embeddings_to_file=True \ - model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] - AFTER_SCRIPT: | - rm -rf /home/TestData/nlp/megatron_ir/working_dir + python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \ + exp_manager.exp_dir="/home/TestData/nlp/megatron_ir/working_dir" \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.max_epochs=null \ + trainer.max_steps=20 \ + trainer.val_check_interval=10 \ + model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \ + model.data.validation_ds.write_embeddings_to_file=True \ + model.data.validation_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/val_embs" \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] + + + rm -rf /home/TestData/nlp/megatron_ir/working_dir + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] @@ -3131,7 +3197,7 @@ jobs: rm -rf 
/home/TestData/nlp/megatron_ir/working_dir python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ - exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ + exp_manager.exp_dir="/home/TestData/nlp/megatron_ir/working_dir" \ model.global_batch_size=4 \ model.micro_batch_size=4 \ trainer.devices=1 \ @@ -3139,25 +3205,25 @@ jobs: trainer.max_epochs=null \ trainer.max_steps=20 \ trainer.val_check_interval=10 \ - model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ model.peft.lora_tuning.adapter_dim=8 \ model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \ model.data.validation_ds.write_embeddings_to_file=True \ - model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ + model.data.validation_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/val_embs" \ model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ trainer.devices=1 \ trainer.num_nodes=1 \ - model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ - model.peft.restore_from_path='/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo' \ + model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ + model.peft.restore_from_path="/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo" \ model.global_batch_size=4 \ model.micro_batch_size=4 \ model.peft.lora_tuning.adapter_dim=8 \ model.data.test_ds.write_embeddings_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/test_embs' \ + model.data.test_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/test_embs" \ model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] AFTER_SCRIPT: | @@ -3203,15 +3269,15 @@ jobs: trainer.devices=2 \ model.megatron_amp_O2=True \ model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ + model.data.test_ds.names=["quarel4"] \ model.global_batch_size=2 \ model.micro_batch_size=1 \ model.data.test_ds.tokens_to_generate=10 \ model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_pp2/out' \ + model.data.test_ds.output_file_path_prefix="/home/TestData/nlp/lora_tuning_pp2/out" \ inference.greedy=True \ inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl' + inference.outfile_path="/home/TestData/nlp/lora_tuning_pp2/out.jsonl" AFTER_SCRIPT: | rm -rf /home/TestData/nlp/lora_tuning_pp2 @@ -3235,7 +3301,7 @@ jobs: model.pipeline_model_parallel_size=1 \ model.tensor_model_parallel_size=2 \ model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.peft.peft_scheme='lora' \ + model.peft.peft_scheme="lora" \ model.answer_only_loss=True \ model.micro_batch_size=1 \ model.global_batch_size=1 \ @@ -3252,15 +3318,15 @@ 
jobs: model.tensor_model_parallel_size=2 \ trainer.devices=2 \ model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ + model.data.test_ds.names=["quarel4"] \ model.global_batch_size=2 \ model.micro_batch_size=1 \ model.data.test_ds.tokens_to_generate=10 \ model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \ + model.data.test_ds.output_file_path_prefix="/home/TestData/nlp/lora_tuning_tp2/out" \ inference.greedy=True \ inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl' + inference.outfile_path="/home/TestData/nlp/lora_tuning_tp2/out.jsonl" AFTER_SCRIPT: | rm -rf /home/TestData/nlp/lora_tuning_tp2 @@ -3300,12 +3366,12 @@ jobs: +model.tp_comm_overlap_ag=False \ +model.tp_comm_overlap_rs=False \ +model.tp_comm_overlap_disable_qkv=True \ - model.peft.peft_scheme='lora' \ + model.peft.peft_scheme="lora" \ model.peft.lora_tuning.adapter_dim=16 \ model.peft.lora_tuning.alpha=32 \ model.peft.lora_tuning.column_init_method="kaiming" \ - +model.peft.lora_tuning.dropout_position='pre' \ - model.peft.lora_tuning.target_modules=['attention'] \ + +model.peft.lora_tuning.dropout_position="pre" \ + model.peft.lora_tuning.target_modules=["attention"] \ model.peft.lora_tuning.adapter_dropout=0.1 \ +model.peft.lora_tuning.a2a_experimental=1 \ model.answer_only_loss=True \ @@ -3328,7 +3394,7 @@ jobs: SCRIPT: | python examples/nlp/language_modeling/megatron_gpt_eval.py \ gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ - prompts=['How to fix GPU memory? A:'] \ + prompts=["How to fix GPU memory? A:"] \ tensor_model_parallel_size=1 \ inference.tokens_to_generate=32 \ trainer.precision=32 @@ -3555,8 +3621,8 @@ jobs: model.decoder.num_layers=4 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ + model.encoder.transformer_block_type="pre_ln" \ + model.decoder.transformer_block_type="pre_ln" \ model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ model.data.data_impl=text_mmap \ @@ -3588,8 +3654,8 @@ jobs: model.decoder.num_layers=4 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ + model.encoder.transformer_block_type="pre_ln" \ + model.decoder.transformer_block_type="pre_ln" \ model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ model.data.data_impl=text_mmap \ @@ -3984,7 +4050,7 @@ jobs: SCRIPT: | python examples/nlp/language_modeling/megatron_t5_eval.py \ --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt 'How do I fix my GPU memory issue? I am seeing out of memory.' \ + --prompt "How do I fix my GPU memory issue? I am seeing out of memory." 
\ --tensor_model_parallel_size 1 L2_Megatron_Core_T5_Eval: @@ -3995,7 +4061,7 @@ jobs: SCRIPT: | NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_t5_eval.py \ --model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \ - --prompt 'How do I fix my GPU memory issue? I am seeing out of memory.' \ + --prompt "How do I fix my GPU memory issue? I am seeing out of memory." \ --tensor_model_parallel_size 1 L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: @@ -4020,18 +4086,18 @@ jobs: model.encoder.num_layers=4 \ model.encoder.hidden_size=64 \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ + model.encoder.activation="reglu" \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=4 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ + model.decoder.activation="reglu" \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' + model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" python examples/nlp/language_modeling/megatron_bart_pretraining.py \ trainer.devices=2 \ @@ -4050,18 +4116,18 @@ jobs: model.encoder.num_layers=4 \ model.encoder.hidden_size=64 \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ + model.encoder.activation="reglu" \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=4 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ + model.decoder.activation="reglu" \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' + model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/bart_pretrain_results @@ -4482,7 +4548,7 @@ jobs: AFTER_SCRIPT: | rm -f 
examples/asr/evaluation_transcripts.json - L2_Stable_Diffusion_Training: + OPTIONAL_L2_Stable_Diffusion_Training: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml with: @@ -4530,6 +4596,7 @@ jobs: model.data.synthetic_data=True AFTER_SCRIPT: | rm -rf examples/multimodal/text_to_image/sd_train_results + IS_OPTIONAL: true L2_NeMo_2_GPT_Pretraining_no_transformer_engine: needs: [cicd-test-container-setup] @@ -4566,7 +4633,7 @@ jobs: - gpu-test - cicd-test-container-setup - L0_Unit_Tests_GPU - - L0_Unit_Tests_CPU + #- OPTIONAL_L0_Unit_Tests_CPU - L2_Community_LLM_Checkpoints_tests_Bert - L2_Community_LLM_Checkpoints_tests_Llama - L2_Community_LLM_Checkpoints_tests_StarCoder @@ -4664,7 +4731,7 @@ jobs: - L2_TTS_Fast_dev_runs_1_Mixer-TTS - L2_TTS_Fast_dev_runs_1_Hifigan - Speech_Checkpoints_tests - - L2_Stable_Diffusion_Training + #- OPTIONAL_L2_Stable_Diffusion_Training - L2_NeMo_2_GPT_Pretraining_no_transformer_engine if: always() runs-on: ubuntu-latest diff --git a/docs/source/performance/performance_summary.md b/docs/source/performance/performance_summary.md new file mode 100644 index 000000000000..c5bdda7b040d --- /dev/null +++ b/docs/source/performance/performance_summary.md @@ -0,0 +1,42 @@ + +# Performance Benchmarks + +## Large Language Models** + +### Pretraining + +- The results in the table below show pre-training performance for various tasks at FP8 precision. + - Container: [NeMo24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) + - System: DGX-H100 + +| Model | #-GPUs | GBS | MBS | Sequence Length| TP | PP | CP | VP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. time to train in days (10T tokens, 1K GPUs)*** | +| ----- | ------ | --- | --- | ---------------| -- | -- | -- | -- | ------------------ | ----------------------- | ------------------------------------------------------ | +| GPT3-5B | 64 | 2048 | 4 | 2048 | 1 | 1 | 1 | 1 | 22521 | 736 | ***5*** | +| GPT3-20B | 64 | 256 | 2 | 2048 | 2 | 1 | 1 | 1 | 5851 | 750 | ***19*** | +| GPT3-175B | 128 | 256 | 1 | 2048 | 4 | 8 | 1 | 6 | 726 | 782 | **156** | +| GPT3-175B | 512 | 2048 | 2 | 2048 | 4 | 8 | 1 | 6 | 782 | [842](https://mlcommons.org/benchmarks/training/) | **145** | +| LLAMA2-7B | 8 | 128 | 1 | 4096 | 1 | 1 | 1 | 1 | 16847 | 776 | ***7*** | +| LLAMA2-13B | 16 | 128 | 1 | 4096 | 1 | 4 | 1 | 10 | 8646 | 754 | ***13*** | +| LLAMA2-70B | 64 | 128 | 1 | 4096 | 4 | 4 | 1 | 20 | 1707 | 759 | ***66*** | +| Nemotron-8B | 64 | 256 | 4 | 4096 | 2 | 1 | 1 | 1 | 12701 | 653 | ***9*** | +| Nemotron-22B | 64 | 256 | 2 | 4096 | 2 | 4 | 1 | 10 | 4256 | 554 | ***27*** | +| Nemotron-340B | 128 | 32 | 1 | 4096 | 8 | 8 | 1 | 12 | 322 | 678 | ***351*** | +| LLAMA3-8B | 8 | 128 | 1 | 8192 | 1 | 1 | 2 | 1 | 12036 | 697 | ***9*** | +| LLAMA3-70B | 64 | 128 | 1 | 8192 | 4 | 4 | 2 | 5 | 1533 | 738 | ***74*** | + +### Finetuning + +- The results in the table below show finetuning performance of LLAMA2 models with SFT (supervised fine-tuning), and LoRA (Low-rank adaptors) at FP8 precision. + - Container: [NeMo24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) + - System: DGX-H100 +- For fine-tuning, we use `SQuAD-v1.1 `__ dataset, and the inputs are packed to 4096 tokens. + + +| Model | Task | #-GPUs | GBS | MBS | Packed Sequence Length | TP | PP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. 
time to finetune in mins (10M tokens)*** | +| ----- | ---- | --- | --- | --- | --------------- | -- | -- | ------------------ | ----------------------- | -------------------------------------------------- | +| LLAMA2-7B | SFT | 8 | 32 | 1 | 4096 | 1 | 1 | 17120 | 682 | ***1.2*** | +| LLAMA2-13B | SFT | 8 | 32 | 1 | 4096 | 1 | 4 | 9741 | 754 | ***2.1*** | +| LLAMA2-70B | SFT | 16 | 32 | 1 | 4096 | 4 | 4 | 1833 | 756 | ***5.7*** | +| LLAMA2-7B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 25206 | 673 | ***0.8*** | +| LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14161 | 733 | ***1.5*** | +| LLAMA2-70B | LoRA | 8 | 32 | 1 | 4096 | 2 | 4 | 2557 | 705 | ***8.1*** | diff --git a/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml b/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml index 77260e515a90..4c6aa643c19d 100644 --- a/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml +++ b/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml @@ -251,7 +251,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 16 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: bf16-mixed # Should be set to bf16-mixed/16-mixed for O1 and O2 to enable the AMP. log_every_n_steps: 100 # Interval of logging. enable_progress_bar: True num_sanity_val_steps: 2 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it @@ -259,6 +259,7 @@ trainer: sync_batchnorm: true enable_checkpointing: False # Provided by exp_manager logger: false # Provided by exp_manager + use_distributed_sampler: false # Lhotse has its own distributed sampler exp_manager: exp_dir: null diff --git a/nemo/README.md b/nemo/README.md index 869ce2f50031..a6025e77822a 100644 --- a/nemo/README.md +++ b/nemo/README.md @@ -10,3 +10,7 @@ NeMo (**Ne**ural **Mo**dules) is a toolkit for creating AI applications built ar * Vision - collection of modules and models for building computer vision networks * Multimodal - collection of modules and models for building multimodal networks * Audio - collection of modules and models for building audio processing networks + +**Performance** + +Performance benchmarks for pre-training and fine-tuning of various models can be found [here](../docs/source/performance/performance_summary.md) diff --git a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py index b1fc17bf8836..4c2afd8865d4 100644 --- a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py +++ b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py @@ -67,7 +67,9 @@ class PromptedAudioToTextLhotseDataset(torch.utils.data.Dataset): def __init__( self, tokenizer: TokenizerSpec, - prompt_format_fn: Callable[[CutSet, TokenizerWrapper], Sequence[Sequence[int]]], + prompt_format_fn: Callable[ + [CutSet, TokenizerWrapper], tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]] + ], ): super().__init__() self.tokenizer = TokenizerWrapper(tokenizer) @@ -95,7 +97,7 @@ def __getitem__(self, cuts: CutSet) -> PromptedAudioToTextMiniBatch: prompted_transcript_lens=prompts_with_answers_lens, ) - def _collate_tokens(self, tokens: list[list[int]]) -> tuple[torch.Tensor, torch.Tensor]: + def _collate_tokens(self, tokens: list[list[int] | torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: tokens = [torch.as_tensor(t) for t in tokens] token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) tokens = collate_vectors(tokens, 
padding_value=self.padding_value) @@ -196,19 +198,19 @@ def canary( }, ) ] - if text := ' '.join(s.text for s in cut.supervisions if s.text is not None): - # Create answer_ids only if there is some transcript in the data. - turns.extend( - dict( - role="assistant", - slots={ - "text": text, - formatter.PROMPT_LANGUAGE_SLOT: ifnone( - cut.supervisions[0].language, cut.custom.get("target_lang") - ), - }, - ), - ) + # If data has no transcript, create empty response with only. + text = ' '.join(s.text for s in cut.supervisions if s.text is not None) + turns.append( + dict( + role="assistant", + slots={ + "text": text, + formatter.PROMPT_LANGUAGE_SLOT: ifnone( + cut.supervisions[0].language, cut.custom.get("target_lang") + ), + }, + ), + ) encoded = formatter.encode_dialog(turns) prompts_with_answers.append(encoded["input_ids"]) prompts.append(encoded["context_ids"]) diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index 301b2b0ad026..ddc86257a5f1 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -1038,13 +1038,11 @@ def predict_step( signal = batch.audio signal_len = batch.audio_lens - transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + _, _, enc_states, enc_mask = self.forward( input_signal=signal, input_signal_length=signal_len, processed_signal=processed_signal, processed_signal_length=processed_signal_length, - transcript=batch.prompt, - transcript_length=batch.prompt_lens, ) text = self.decoding.decode_predictions_tensor( diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index bae2c9ffdc67..415096a0c9d5 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -21,6 +21,7 @@ from omegaconf import OmegaConf from torch.utils.data import DataLoader +from nemo.collections.asr.data.audio_to_text_lhotse_prompted import PromptedAudioToTextMiniBatch from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.parts.mixins.streaming import StreamingEncoder from nemo.collections.asr.parts.preprocessing.features import normalize_batch @@ -1643,7 +1644,16 @@ def _get_batch_preds(self, keep_logits=False): tokens = self.input_tokens.to(device).repeat(feat_signal.size(0), 1) tokens_len = torch.tensor([tokens.size(1)] * tokens.size(0), device=device).long() - batch_input = (feat_signal, feat_signal_len, None, None, tokens, tokens_len) + batch_input = PromptedAudioToTextMiniBatch( + audio=feat_signal, + audio_lens=feat_signal_len, + transcript=None, + transcript_lens=None, + prompt=tokens, + prompt_lens=tokens_len, + prompted_transcript=None, + prompted_transcript_lens=None, + ) predictions = self.asr_model.predict_step(batch_input, has_processed_signal=True) self.all_preds.extend(predictions) del predictions diff --git a/nemo/collections/common/prompts/canary.py b/nemo/collections/common/prompts/canary.py index f2b1e58c3bb2..e3f537957832 100644 --- a/nemo/collections/common/prompts/canary.py +++ b/nemo/collections/common/prompts/canary.py @@ -1,3 +1,5 @@ +from typing import Any + from nemo.collections.common.prompts.formatter import Modality, PromptFormatter from nemo.collections.common.tokenizers.canary_tokenizer import ( CANARY_BOS, @@ -33,6 +35,11 @@ class CanaryPromptFormatter(PromptFormatter): }, } + def _validate_slot_values(self, expected: dict[str, Modality], 
received: dict[str, Any]) -> None: + if "taskname" in received and "task" not in received: + received["task"] = received.pop("taskname") + return super()._validate_slot_values(expected=expected, received=received) + def encode_turn(self, prompt_template: str, expected_slots: dict, slot_values: dict) -> list[int]: # This method handles a level of indirection for Canary. # It maps values provided in trcfg to the actual special tokens diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 1be5c41e4919..33a21990e8f7 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -1,3 +1,4 @@ +import math from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING, List, Optional, Union @@ -32,6 +33,7 @@ class FineTuningDataModule(pl.LightningDataModule): num_workers (int, optional): The number of worker processes for data loading. Defaults to 8. pin_memory (bool, optional): Whether to pin memory during data loading for faster GPU training. Defaults to True. persistent_workers (bool, optional): Whether to keep data loading workers persistent across epochs. Defaults to False. + max_train_steps (int, optional): Maximum number of steps to train. Used to calculate samples mapping for the mmap dataset """ def __init__( @@ -60,18 +62,40 @@ def __init__( self.num_workers = num_workers self.pin_memory = pin_memory self.persistent_workers = persistent_workers + self.micro_batch_size = micro_batch_size + self.global_batch_size = global_batch_size + self.rampup_batch_size = rampup_batch_size + self.data_sampler = None + self.max_train_samples = None + + def setup(self, stage: str): self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, - micro_batch_size=micro_batch_size, - global_batch_size=global_batch_size, - rampup_batch_size=rampup_batch_size, + micro_batch_size=self.micro_batch_size, + global_batch_size=self.global_batch_size, + rampup_batch_size=self.rampup_batch_size, + dataloader_type="batch", ) + # Follows the calculation in nemo.collections.nlp.data.language_modeling.megatron. 
+ # base_dataset_utils.get_datasets_weights_and_num_samples + self.max_train_samples = int(math.ceil(self.global_batch_size * self.trainer.max_steps * 1.005)) + def train_dataloader(self) -> DataLoader: - return self._create_dataloader(self._create_dataset(str(self.train_path))) + return self._create_dataloader( + self._create_dataset( + str(self.train_path), + max_num_samples=self.max_train_samples, + ) + ) def val_dataloader(self) -> DataLoader: - return self._create_dataloader(self._create_dataset(str(self.validation_path))) + return self._create_dataloader( + self._create_dataset( + str(self.validation_path), + is_test=True, + ), + ) def test_dataloader(self) -> DataLoader: return self._create_dataloader( @@ -85,7 +109,12 @@ def test_dataloader(self) -> DataLoader: @lru_cache def _create_dataset(self, path, **kwargs): return create_sft_dataset( - path, tokenizer=self.tokenizer, seq_length=self.seq_length, memmap_workers=self.memmap_workers, **kwargs + path, + tokenizer=self.tokenizer, + seq_length=self.seq_length, + memmap_workers=self.memmap_workers, + seed=self.seed, + **kwargs, ) def _create_dataloader(self, dataset, **kwargs) -> DataLoader: diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index 7e4cf8b6c74e..e226678d4894 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -111,7 +111,7 @@ def make_vocab_size_divisible_by(mistral_vocab_size): num_layers=source.num_hidden_layers, hidden_size=source.hidden_size, ffn_hidden_size=source.intermediate_size, - kv_channels=source.get('head_dim', source.hidden_size // source.num_attention_heads), + kv_channels=getattr(source, 'head_dim', source.hidden_size // source.num_attention_heads), num_attention_heads=source.num_attention_heads, # max_position_embeddings=source.max_position_embeddings, init_method_std=source.initializer_range, diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index 868ad5b332b5..d8a265fd41d1 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -155,7 +155,7 @@ def config(self) -> MixtralConfig8x7B | MixtralConfig8x22B: num_layers=config.num_hidden_layers, hidden_size=config.hidden_size, ffn_hidden_size=config.intermediate_size, - kv_channels=config.get('head_dim', config.hidden_size // config.num_attention_heads), + kv_channels=getattr(config, 'head_dim', config.hidden_size // config.num_attention_heads), max_position_embeddings=config.max_position_embeddings, # TODO seq_length=config.max_position_embeddings, # RoPE diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index a26354fee8be..59acdec6f8e2 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -122,14 +122,45 @@ def add_megatron_sampler( global_batch_size: int, rampup_batch_size: Optional[List[int]] = None, consumed_samples: int = 0, - dataloader_type: Literal["single", "cyclic"] = "single", + dataloader_type: Literal["single", "cyclic", "batch"] = "single", drop_last: bool = True, pad_samples_to_global_batch_size: bool = False, # data_sharding: bool = False ) -> DataLoader: + """ + This function takes an existing PyTorch `DataLoader` and configures it to use a Megatron sampler. + The Megatron sampler is responsible for splitting the data into batches + during training with Megatron. + + Args: + dataloader (DataLoader): The original PyTorch DataLoader to wrap. + micro_batch_size (int): The size of each micro-batch. 
+ global_batch_size (int): The effective size of the training batch across all data parallel devices. + rampup_batch_size (Optional[List[int]]): A list of target batch sizes for a gradual + rampup schedule during training (optional). + consumed_samples (int, optional): The number of samples consumed before + starting this iteration (defaults to 0). + dataloader_type (Literal["single", "cyclic", "batch"], optional): The type of + Megatron sampler to use. Valid options are: + - "single": Uses `MegatronPretrainingSampler` for single pass data sampling. + - "cyclic": Uses `MegatronPretrainingRandomSampler` for cyclic data sampling. + - "batch": Uses `MegatronPretrainingBatchSampler` for batch sampling. This is the option to + use for fine-tuning workloads, where sequence lengths are variable between samples. + Sampling the entire global batch together ensures that sequences in a global batch are + padded to the same lengths. + Defaults to "single". + drop_last (bool, optional): Whether to drop the last incomplete batch + (defaults to True). + pad_samples_to_global_batch_size (bool, optional): Whether to pad the last incomplete + batch to the `global_batch_size` (defaults to False, only applies when + `drop_last` is False). + + Returns: + DataLoader: A new DataLoader instance with the configured Megatron sampler. + """ + from megatron.core import parallel_state - ## TODO: expose drop_last and pad_samples_to_global_batch_size args if dataloader_type == 'single': batch_sampler = MegatronPretrainingSampler( total_samples=len(dataloader.dataset), @@ -152,6 +183,21 @@ def add_megatron_sampler( drop_last=drop_last, # data_sharding=data_sharding ) + elif dataloader_type == 'batch': + from nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers import ( + MegatronPretrainingBatchSampler, + ) + + batch_sampler = MegatronPretrainingBatchSampler( + total_samples=len(dataloader.dataset), + consumed_samples=consumed_samples, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + data_parallel_rank=parallel_state.get_data_parallel_rank(), + data_parallel_size=parallel_state.get_data_parallel_world_size(), + drop_last=drop_last, + pad_samples_to_global_batch_size=not drop_last, + ) else: raise Exception(f'{dataloader_type} dataloader type is not supported.') diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index dfc78c30a929..741c1400474a 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -293,9 +293,12 @@ def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Pa """ connector = self._get_connector(path) ckpt_path: Path = connector.local_path(base_path=base_path) - ckpt_path = connector(ckpt_path, overwrite=overwrite) + # If already in multiproc environment (e.g. 
due to torchrun invocation) run only on RANK = 0 + from nemo.utils.get_rank import is_global_rank_zero - connector.on_import_ckpt(self) + if is_global_rank_zero(): + ckpt_path = connector(ckpt_path, overwrite=overwrite) + connector.on_import_ckpt(self) return ckpt_path diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 5b2ac7ff1adb..37978510972d 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -230,6 +230,20 @@ def forward( _num_microbatches: int = num_microbatches or self.infer_num_microbatches(data) pipeline = self.pipeline + + use_global_batch_sampler = self.trainer.datamodule.data_sampler.dataloader_type == 'batch' + if use_global_batch_sampler: + from nemo.collections.nlp.modules.common.megatron.utils import get_iterator_k_split + + # The current way of using a batch sampler + split to micro iterator results in + # extraneous padding, and is only implemented to ensure bit-exactness with NeMo 1. + # This part in NeMo 1 was written when megatron fwd_bwd_function did not support unequal + # sequence lengths, but it does now. Hence this part should be revisited in the future. + batch = next(data) + if isinstance(batch, tuple) and len(batch) == 3: + batch = batch[0] + data = get_iterator_k_split(batch, _num_microbatches, True) + data_iterator: List[Iterator[DataT]] = self.to_data_iterator_list(data) context = self._build_context({**locals()}) diff --git a/nemo/lightning/pytorch/callbacks/nsys.py b/nemo/lightning/pytorch/callbacks/nsys.py index 9848fdb2b8fd..70ebf943b333 100644 --- a/nemo/lightning/pytorch/callbacks/nsys.py +++ b/nemo/lightning/pytorch/callbacks/nsys.py @@ -35,15 +35,15 @@ def __init__( ranks: List[int] = [0], gen_shape: bool = False, ): - assert type(start_step) == int, f'Nsys start_step must be of type int. Found: {type(start_step)}' + assert type(start_step) is int, f'Nsys start_step must be of type int. Found: {type(start_step)}' self._nsys_profile_start_step = start_step - assert type(end_step) == int, f'Nsys end_step must be of type int. Found: {type(start_step)}' + assert type(end_step) is int, f'Nsys end_step must be of type int. 
Found: {type(start_step)}' self._nsys_profile_end_step = end_step assert ( self._nsys_profile_end_step >= self._nsys_profile_start_step - ), f'Nsys end_step must be greater than or equal to nsys start_step' + ), 'Nsys end_step must be greater than or equal to nsys start_step' self._nsys_profile_ranks = ranks self._nsys_profile_gen_shape = gen_shape @@ -60,8 +60,9 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx: int) -> Opt """ device = trainer.strategy.root_device + current_step = trainer.strategy.current_epoch_step if device.type == 'cuda': - if batch_idx == self._nsys_profile_start_step and get_rank() in self._nsys_profile_ranks: + if current_step == self._nsys_profile_start_step and get_rank() in self._nsys_profile_ranks: logging.info("====== Start nsys profiling ======") torch.cuda.cudart().cudaProfilerStart() if self._nsys_profile_gen_shape: @@ -76,8 +77,9 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int) """ device = trainer.strategy.root_device + current_step = trainer.strategy.current_epoch_step if device.type == 'cuda': - if batch_idx == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks: + if current_step == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks: logging.info("====== End nsys profiling ======") torch.cuda.cudart().cudaProfilerStop() torch.autograd.profiler.emit_nvtx().__exit__(None, None, None) diff --git a/nemo/lightning/pytorch/callbacks/progress.py b/nemo/lightning/pytorch/callbacks/progress.py index 9ccf871f820f..1b1276953a61 100644 --- a/nemo/lightning/pytorch/callbacks/progress.py +++ b/nemo/lightning/pytorch/callbacks/progress.py @@ -8,15 +8,6 @@ class MegatronProgressBar(TQDMProgressBar): for megatron models. """ - def get_current_epoch_step(self, trainer) -> int: - """ - Get the value of step within an epoch. - """ - return max( - trainer.fit_loop.epoch_loop.automatic_optimization.optim_progress.optimizer.step.current.completed, - trainer.fit_loop.epoch_loop.manual_optimization.optim_step_progress.current.completed, - ) - def init_train_tqdm(self): """ Override bar_format to not have 's/it'. @@ -41,7 +32,7 @@ def on_train_batch_end(self, trainer, pl_module, *_, **__): """ Override parent class on_train_batch_end to update progress bar per global batch instead of per microbatch. 
""" - n = self.get_current_epoch_step(trainer) + n = trainer.strategy.current_epoch_step if self._should_update(n, self.train_progress_bar.total): _update_n(self.train_progress_bar, n) self.train_progress_bar.set_postfix(self.get_metrics(trainer, pl_module)) diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 7796e4bb92de..3909c680cfc1 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -23,7 +23,7 @@ def __init__( micro_batch_size: int = 4, global_batch_size: int = 8, rampup_batch_size: Optional[List[int]] = None, - dataloader_type: Literal["single", "cyclic"] = "single", + dataloader_type: Literal["single", "cyclic", "batch"] = "single", init_consumed_samples: int = 0, ): self.seq_len = seq_len diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index f44906ef7b7b..99eee9ad6bd3 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -647,6 +647,16 @@ def checkpoint_io(self) -> CheckpointIO: def checkpoint_io(self, io: CheckpointIO) -> None: self._checkpoint_io = io + @property + def current_epoch_step(self) -> int: + """ + Get the value of step within an epoch. + """ + return max( + self.trainer.fit_loop.epoch_loop.automatic_optimization.optim_progress.optimizer.step.current.completed, + self.trainer.fit_loop.epoch_loop.manual_optimization.optim_step_progress.current.completed, + ) + def _get_data_step(self, step_type: str) -> Optional[_ModuleStepFunction]: for fn_name in [f"{step_type}_data_step", "data_step"]: if hasattr(self.lightning_module, fn_name): diff --git a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py index 5785db656217..425a6c696120 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py @@ -132,7 +132,7 @@ def load_config(mistral_config, tokenizer, config_path): # Tokenizer config if hasattr(tokenizer, 'vocab_file'): nemo_config.tokenizer.model = tokenizer.vocab_file - else: + elif os.path.exists(os.path.join(config_path, 'tekken.json')): # Load tekken.json, extract the 'vocab' field & write it to file. vocab_path = os.path.join(config_path, 'tekken.json') assert os.path.exists(vocab_path), f"Expected {vocab_path} to exist" @@ -156,6 +156,14 @@ def load_config(mistral_config, tokenizer, config_path): 'sentencepiece_legacy': False, } nemo_config.tokenizer = tokenizer_dict + else: + # Otherwise use HF + tokenizer_dict = { + 'library': 'huggingface', + 'type': args.input_name_or_path, + 'use_fast': True, + } + nemo_config.tokenizer = tokenizer_dict # TODO(@akoumparouli): rope_scaling. 
diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py
index f44906ef7b7b..99eee9ad6bd3 100644
--- a/nemo/lightning/pytorch/strategies.py
+++ b/nemo/lightning/pytorch/strategies.py
@@ -647,6 +647,16 @@ def checkpoint_io(self) -> CheckpointIO:
     def checkpoint_io(self, io: CheckpointIO) -> None:
         self._checkpoint_io = io
 
+    @property
+    def current_epoch_step(self) -> int:
+        """
+        Get the value of step within an epoch.
+        """
+        return max(
+            self.trainer.fit_loop.epoch_loop.automatic_optimization.optim_progress.optimizer.step.current.completed,
+            self.trainer.fit_loop.epoch_loop.manual_optimization.optim_step_progress.current.completed,
+        )
+
     def _get_data_step(self, step_type: str) -> Optional[_ModuleStepFunction]:
         for fn_name in [f"{step_type}_data_step", "data_step"]:
             if hasattr(self.lightning_module, fn_name):
diff --git a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py
index 5785db656217..425a6c696120 100644
--- a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py
+++ b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py
@@ -132,7 +132,7 @@ def load_config(mistral_config, tokenizer, config_path):
     # Tokenizer config
     if hasattr(tokenizer, 'vocab_file'):
         nemo_config.tokenizer.model = tokenizer.vocab_file
-    else:
+    elif os.path.exists(os.path.join(config_path, 'tekken.json')):
         # Load tekken.json, extract the 'vocab' field & write it to file.
         vocab_path = os.path.join(config_path, 'tekken.json')
         assert os.path.exists(vocab_path), f"Expected {vocab_path} to exist"
@@ -156,6 +156,14 @@ def load_config(mistral_config, tokenizer, config_path):
             'sentencepiece_legacy': False,
         }
         nemo_config.tokenizer = tokenizer_dict
+    else:
+        # Otherwise use HF
+        tokenizer_dict = {
+            'library': 'huggingface',
+            'type': args.input_name_or_path,
+            'use_fast': True,
+        }
+        nemo_config.tokenizer = tokenizer_dict
 
     # TODO(@akoumparouli): rope_scaling.
     nemo_config['rotary_base'] = mistral_config['rope_theta']
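Condensed, the tokenizer selection above now falls through three cases: SentencePiece vocab file, tekken.json, then the Hugging Face tokenizer itself. The helper below is purely illustrative (it is not part of the script) and returns only a label plus the key detail per branch:

    import os


    def pick_tokenizer_source(tokenizer, config_path, input_name_or_path):
        """Illustrative only: mirrors the branch order in load_config() above."""
        if hasattr(tokenizer, 'vocab_file'):
            # 1) SentencePiece vocab shipped with the checkpoint.
            return 'sentencepiece', tokenizer.vocab_file
        if os.path.exists(os.path.join(config_path, 'tekken.json')):
            # 2) Vocab extracted from tekken.json and written to a local file.
            return 'tekken', os.path.join(config_path, 'tekken.json')
        # 3) New fallback: delegate to the Hugging Face tokenizer named on the command line.
        return 'huggingface', {'library': 'huggingface', 'type': input_name_or_path, 'use_fast': True}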
diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py
index 99d1795aea9c..c50267ef6b42 100644
--- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py
@@ -40,7 +40,7 @@ def get_args():
     )
     parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output HF checkpoint.")
     parser.add_argument('--hf_model_name', type=str, default="mistralai/Mistral-7B-v0.1", help="Name of HF checkpoint")
-    parser.add_argument("--precision", type=str, default="32", help="Model precision")
+    parser.add_argument("--precision", type=str, default="bf16", help="Model precision")
     args = parser.parse_args()
     return args
 
@@ -48,7 +48,8 @@ def get_args():
 def load_config(hf_model_name, nemo_config):
     hf_config = AutoConfig.from_pretrained(hf_model_name)
     # SWA; nemo_config.window_size is list [left-bound, right-bound]
-    hf_config.sliding_window = nemo_config.window_size[0]
+    if hasattr(nemo_config, 'window_size'):
+        hf_config.sliding_window = nemo_config.window_size[0]
     hf_config.max_position_embeddings = nemo_config.encoder_seq_length
     hf_config.num_hidden_layers = nemo_config.num_layers
     hf_config.hidden_size = nemo_config.hidden_size
@@ -83,6 +84,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None:
         model_config.use_cpu_initialization = True
     else:
         map_location = None
+    model_config.perform_initialization = False
 
     if cpu_only:
         logging.info("******** Loading model on CPU. This will take a significant amount of time.")
@@ -129,7 +131,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None:
     num_layers = model.cfg.num_layers
     num_query_groups = model.cfg.get("num_query_groups", head_num)  # different num_query_groups for 70B
 
-    head_size = hidden_size // head_num
+    head_size = model.cfg.get('kv_channels', hidden_size // head_num)
     heads_per_group = head_num // num_query_groups
     qkv_total_dim = head_num + 2 * num_query_groups
 
@@ -215,6 +217,14 @@ def convert(in_file, precision=None, cpu_only=True) -> None:
 
 
 if __name__ == '__main__':
+    import transformers
+
+    type(transformers.__version__)
+    from packaging.version import Version
+
+    if Version(transformers.__version__) < Version('4.44.0'):
+        logging.warning("You need to use transformers >= v4.44.0")
+
     args = get_args()
     hf_state_dict, nemo_config, dtype = convert(args.input_name_or_path, args.precision)
diff --git a/tests/collections/asr/test_asr_multitask_model_bpe.py b/tests/collections/asr/test_asr_multitask_model_bpe.py
index 4e805c8f34de..b6267ad7d1cf 100644
--- a/tests/collections/asr/test_asr_multitask_model_bpe.py
+++ b/tests/collections/asr/test_asr_multitask_model_bpe.py
@@ -17,18 +17,29 @@
 
 import pytest
 import torch
+from lhotse import CutSet
+from lhotse.testing.dummies import DummyManifest
 from omegaconf import DictConfig
 
+from nemo.collections.asr.data.audio_to_text_lhotse_prompted import (
+    PromptedAudioToTextLhotseDataset,
+    PromptedAudioToTextMiniBatch,
+    canary,
+)
 from nemo.collections.asr.models.aed_multitask_models import EncDecMultiTaskModel
 from nemo.collections.asr.parts.submodules import multitask_beam_decoding as beam_decode
 from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
+from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
 from nemo.collections.common.prompts.canary import CanaryPromptFormatter
 from nemo.collections.common.tokenizers import CanaryTokenizer
 
 
 @pytest.fixture()
 def asr_model(test_data_dir):
-    preprocessor = {'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'params': dict({})}
+    preprocessor = {
+        'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor',
+        'params': {"window_size": 0.02, "window_stride": 0.01, "features": 64},
+    }
 
     model_defaults = {'asr_enc_hidden': 128, 'lm_enc_hidden': 64, 'lm_dec_hidden': 64}
 
@@ -384,3 +395,113 @@ def test_build_tokenizer(self, asr_model, test_data_dir):
 
         for i, j in zip(ids1, ids2):
             assert i == j
+
+    @pytest.mark.unit
+    def test_predict_step(self, asr_model, test_data_dir):
+        cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True)
+        c = cuts[0]
+        c.supervisions[0].language = "en"
+        c.source_lang = "en"
+        c.target_lang = "en"
+        c.task = "asr"
+        c.pnc = "no"
+        dataset = PromptedAudioToTextLhotseDataset(tokenizer=asr_model.tokenizer, prompt_format_fn=canary)
+        batch = dataset[cuts]
+
+        # Numpy array test
+        outputs = asr_model.predict_step(batch)
+        print(outputs)
+        assert len(outputs) == 1
+        assert isinstance(outputs[0], str)
+
+    @pytest.mark.unit
+    def test_FrameBatchMultiTaskAED(self, asr_model, test_data_dir):
+        model = FrameBatchMultiTaskAED(asr_model, batch_size=1)
+
+        audio_file = os.path.join(test_data_dir, "asr", "train", "an4", "wav", "an46-mmap-b.wav")
+        meta = {
+            'audio_filepath': audio_file,
+            'duration': 100000,
+            'source_lang': 'en',
+            'taskname': 'asr',
+            'target_lang': 'en',
+            'pnc': 'yes',
+            'answer': 'nothing',
+        }
+        model.read_audio_file(audio_file, delay=0.0, model_stride_in_secs=40.0, meta_data=meta)
+        outputs = model.transcribe()
+        assert isinstance(outputs, str)
+
+
+@pytest.mark.unit
+def test_prompted_dataset(asr_model):
+    dataset = PromptedAudioToTextLhotseDataset(tokenizer=asr_model.tokenizer, prompt_format_fn=canary)
+
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=3, with_data=True)
+
+    c = cuts[0]
+    c.supervisions[0].language = "en"
+    c.source_lang = "en"
+    c.target_lang = "en"
+    c.task = "asr"
+    c.pnc = "no"
+
+    c = cuts[1]
+    c.supervisions[0].language = "de"
+    c.supervisions[0].text = "unerheblich"
+    c.source_lang = "en"
+    c.target_lang = "de"
+    c.taskname = "ast"  # note: testing for "taskname" as we support it together with "task"
+    c.pnc = "yes"
+
+    c = cuts[2]
+    c.supervisions[0].language = "en"
+    c.supervisions[0].text = ""
+    c.source_lang = "en"
+    c.target_lang = "en"
+    c.task = "asr"
+    c.pnc = "yes"
+
+    batch = dataset[cuts]
+
+    assert isinstance(batch, PromptedAudioToTextMiniBatch)
+    assert batch.audio.shape == (3, 16000)
+    assert batch.audio_lens.tolist() == [16000, 16000, 16000]
+
+    # Test example 0 (transcription)
+    i = 0
+    assert (
+        asr_model.tokenizer.ids_to_text(batch.prompt[i]) == '<|startoftranscript|><|en|><|transcribe|><|en|><|nopnc|>'
+    )
+    assert batch.prompt_lens[i] == 5
+    assert asr_model.tokenizer.ids_to_text(batch.transcript[i]) == 'i##r##r##el##e##v##a##nt'
+    assert batch.transcript_lens[i] == 8
+    assert (
+        asr_model.tokenizer.ids_to_text(batch.prompted_transcript[i])
+        == '<|startoftranscript|><|en|><|transcribe|><|en|><|nopnc|>i##r##r##el##e##v##a##nt<|endoftext|>'
+    )
+    assert batch.prompted_transcript_lens[i] == 14
+
+    # Test example 1 (translation)
+    i = 1
+    assert asr_model.tokenizer.ids_to_text(batch.prompt[i]) == '<|startoftranscript|><|en|><|translate|><|de|><|pnc|>'
+    assert batch.prompt_lens[i] == 5
+    assert asr_model.tokenizer.ids_to_text(batch.transcript[i]) == 'u##ne##r##h##e##b##l##i##c##h'
+    assert batch.transcript_lens[i] == 10
+    assert (
+        asr_model.tokenizer.ids_to_text(batch.prompted_transcript[i])
+        == '<|startoftranscript|><|en|><|translate|><|de|><|pnc|>u##ne##r##h##e##b##l##i##c##h<|endoftext|>'
+    )
+    assert batch.prompted_transcript_lens[i] == 16
+
+    # Test example 2 (no transcript, e.g. noise)
+    i = 2
+    assert asr_model.tokenizer.ids_to_text(batch.prompt[i]) == '<|startoftranscript|><|en|><|transcribe|><|en|><|pnc|>'
+    assert batch.prompt_lens[i] == 5
+    assert asr_model.tokenizer.ids_to_text(batch.transcript[i]) == '' * 10
+    assert batch.transcript_lens[i] == 0
+    assert (
+        asr_model.tokenizer.ids_to_text(batch.prompted_transcript[i])
+        == '<|startoftranscript|><|en|><|transcribe|><|en|><|pnc|><|endoftext|>' + '' * 10
+    )
+    assert batch.prompted_transcript_lens[i] == 6
diff --git a/tests/lightning/pytorch/callbacks/test_nsys.py b/tests/lightning/pytorch/callbacks/test_nsys.py
index e8734ad1c1ac..d87da58b8ad0 100644
--- a/tests/lightning/pytorch/callbacks/test_nsys.py
+++ b/tests/lightning/pytorch/callbacks/test_nsys.py
@@ -68,6 +68,7 @@ def test_on_train_batch_start_profiling(
         mock_get_rank.return_value = 0
 
         callback = NsysCallback(start_step=10, end_step=20, ranks=[0], gen_shape=True)
+        mock_trainer.strategy.current_epoch_step = 10
         callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10)
 
         mock_cudart().cudaProfilerStart.assert_called_once()
@@ -80,6 +81,7 @@ def test_on_train_batch_start_no_profiling(self, mock_cudart, mock_get_rank, moc
         mock_get_rank.return_value = 0
 
         callback = NsysCallback(start_step=10, end_step=20, ranks=[0])
+        mock_trainer.strategy.current_epoch_step = 9
         callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 9)
 
         mock_cudart().cudaProfilerStart.assert_not_called()
@@ -94,6 +96,7 @@ def test_on_train_batch_end_profiling(
         mock_get_rank.return_value = 0
 
         callback = NsysCallback(start_step=10, end_step=20, ranks=[0])
+        mock_trainer.strategy.current_epoch_step = 20
         callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 20)
 
         mock_cudart().cudaProfilerStop.assert_called_once()
@@ -163,6 +166,7 @@ def test_profiling_range(
         mock_get_rank.return_value = 0
 
         callback = NsysCallback(start_step=start_step, end_step=end_step, ranks=[0])
+        mock_trainer.strategy.current_epoch_step = batch_idx
         callback.on_train_batch_start(mock_trainer, mock_pl_module, None, batch_idx)
 
         if expected_call:
@@ -183,13 +187,16 @@ def test_single_profile_range(self, mock_cudart, mock_get_rank, mock_trainer, mo
         mock_trainer.strategy.root_device.type = 'cuda'
 
         # Start of range
+        mock_trainer.strategy.current_epoch_step = 10
         callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10)
         assert mock_cudart().cudaProfilerStart.call_count == 1, "cudaProfilerStart was not called"
 
         # Middle of range
+        mock_trainer.strategy.current_epoch_step = 25
         callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 25)
         assert mock_cudart().cudaProfilerStart.call_count == 1, "cudaProfilerStart was called again"
 
         # End of range
+        mock_trainer.strategy.current_epoch_step = 40
         callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 40)
         assert mock_cudart().cudaProfilerStop.call_count == 1, "cudaProfilerStop was not called"
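The tests above pin trainer.strategy.current_epoch_step directly because the callback no longer reads batch_idx. The new strategy property counts completed optimizer steps within the epoch, which (per the MegatronProgressBar docstring above) advance once per global batch rather than once per micro-batch. A toy illustration of the difference, with made-up numbers and no claim about the exact loop structure in NeMo:

    # Toy numbers only: assume 4 micro-batches make up one global batch / optimizer step.
    microbatches_per_global_batch = 4

    for batch_idx in range(8):
        completed_optimizer_steps = (batch_idx + 1) // microbatches_per_global_batch
        # With start_step=1, keying on batch_idx would start profiling at the 2nd micro-batch,
        # while keying on current_epoch_step waits until one full optimizer step has completed.
        print(f"batch_idx={batch_idx}  approx current_epoch_step={completed_optimizer_steps}")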