diff --git a/.github/workflows/changelog-build.yml b/.github/workflows/changelog-build.yml
index 7e16c344acb8..b6d836f71cec 100644
--- a/.github/workflows/changelog-build.yml
+++ b/.github/workflows/changelog-build.yml
@@ -1,13 +1,13 @@
name: 'Changelog Build (Release)'
on:
+ workflow_dispatch:
push:
tags:
- '*'
-
+
jobs:
changelog:
- if: startsWith(github.ref, 'refs/tags/')
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
@@ -39,7 +39,7 @@ jobs:
ignorePreReleases: "false"
failOnError: "false"
fromTag: ${{ steps.previous_tag.outputs.tag_name }}
- toTag: ${{ github.ref_name }}
+ toTag: ${{ github.ref_name || github.sha }}
- name: Print Changelog
run: |
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 2919be733ac3..ffe6e00e2ad3 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -365,6 +365,34 @@ jobs:
# rm -rf llama2_qat_results
+ L2_Distill_Llama2:
+ needs: [cicd-test-container-setup]
+ uses: ./.github/workflows/_test_template.yml
+ with:
+ RUNNER: self-hosted-azure
+ SCRIPT: |
+ python examples/nlp/language_modeling/megatron_gpt_distillation.py \
+ trainer.devices=2 \
+ trainer.num_nodes=1 \
+ trainer.precision=bf16 \
+ trainer.max_steps=5 \
+ trainer.log_every_n_steps=5 \
+ trainer.val_check_interval=5 \
+ trainer.limit_val_batches=2 \
+ model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+ model.kd_teacher_restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+ model.tensor_model_parallel_size=2 \
+ model.pipeline_model_parallel_size=1 \
+ model.micro_batch_size=1 \
+ model.global_batch_size=4 \
+ model.optim.name=distributed_fused_adam \
+ model.optim.sched.warmup_steps=1 \
+ model.data.data_prefix=[1.0,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
+ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \
+ exp_manager.exp_dir=examples/nlp/megatron_llama_distill
+ AFTER_SCRIPT: |
+ rm -rf examples/nlp/megatron_llama_distill
+
# L2: ASR dev run
ASR_dev_run_Speech_to_Text:
needs: [cicd-test-container-setup]
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 3f4c4f3c19de..af09fa241c59 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,112 +1,60 @@
name: "NeMo Code release"
on:
- issue_comment:
- types: [created]
-
+ workflow_dispatch:
+ inputs:
+ branch:
+ description: Branch to release
+ required: true
+ type: string
jobs:
main:
- if: >
- github.event_name == 'issue_comment' &&
- github.event.issue.pull_request &&
- startsWith(github.event.comment.body, '/release-please') &&
- contains(fromJSON('["ko3n1g"]'), github.actor)
+ if: contains(fromJSON('["ko3n1g"]'), github.actor)
runs-on: ubuntu-latest
environment:
name: main
steps:
- - name: Update PR issue comment
- shell: bash
- env:
- message: ${{ github.event.comment.body }}
- run: |
- message="$message
-
- ---
-
- Releasebot 🤖: Release processes started...
- "
- message="${message//$'\n'/
}"
-
- curl -L \
- -X PATCH \
- -H "Accept: application/vnd.github+json" \
- -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
- -H "X-GitHub-Api-Version: 2022-11-28" \
- https://api.github.com/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }} \
- -d '{"body":"'"$message"'"}'
-
- - name: Get PR number
- shell: bash
- id: get-pr-num
- run: |
- PR_URL="${{ github.event.issue.pull_request.url }}"
- PR_NUM=${PR_URL##*/}
- echo "pr_number=$PR_NUM" >> $GITHUB_OUTPUT
-
- - name: Get Pull Request Information
- uses: actions/github-script@v6
- id: get-pr-branch
- with:
- result-encoding: string
- script: |
- const pr = await github.rest.pulls.get({
- owner: context.repo.owner,
- repo: context.repo.repo,
- pull_number: ${{ steps.get-pr-num.outputs.pr_number }}
- });
- console.log('Pull Request Information:', pr.data);
- return pr.data.head.ref;
- name: Checkout repository
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}
- ref: ${{ steps.get-pr-branch.outputs.result }}
+ ref: ${{ inputs.branch }}
- - name: Get version number
+ - name: Create release
id: version-number
run: |
cd ${{ github.run_id }}
VERSION=$(python -c "import nemo; print(nemo.__version__)")
- echo "VERSION=$VERSION" >> "$GITHUB_OUTPUT"
-
- - name: Extract changelog
- id: extract-changelog
- uses: peter-evans/find-comment@v3
- with:
- issue-number: ${{ steps.get-pr-num.outputs.pr_number }}
- body-includes: '# Detailed Changelogs'
-
- - name: Extract summary
- id: extract-summary
- uses: peter-evans/find-comment@v3
- with:
- issue-number: ${{ steps.get-pr-num.outputs.pr_number }}
- body-includes: '# Highlights'
-
- - name: Create Release doc
- id: create-release-doc
- env:
- SUMMARY: ${{ steps.extract-summary.outputs.comment-body }}
- CHANGELOG: ${{ steps.extract-changelog.outputs.comment-body }}
- run: |
- echo "TITLE<> $GITHUB_ENV
- echo "NVIDIA Neural Modules ${{ steps.version-number.outputs.VERSION }}" >> $GITHUB_ENV
- echo "EOF" >> $GITHUB_ENV
+ NAME="NVIDIA Neural Modules ${VERSION}"
+ CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md)
+ CHANGELOG=$(echo "$CHANGELOG" | sed '/./,$!d' | sed ':a;N;$!ba;s/\n$//')
+
+ PAYLOAD=$(jq \
+ -n \
+ -c \
+ --arg CI_COMMIT_BRANCH "${{ inputs.branch }}" \
+ --arg NAME "$NAME" \
+ --arg BODY "$CHANGELOG" \
+ '{
+ "tag_name": $CI_COMMIT_BRANCH,
+ "target_commitish": $CI_COMMIT_BRANCH,
+ "name": $NAME,
+ "body": $BODY,
+ "draft": false,
+ "prerelease": false,
+ "generate_release_notes": false
+ }'
+ )
- echo "BODY<> $GITHUB_ENV
- echo "$SUMMARY" >> $GITHUB_ENV
- echo "$CHANGELOG" >> $GITHUB_ENV
- echo "EOF" >> $GITHUB_ENV
-
- - name: Create Release
- uses: softprops/action-gh-release@v2
- with:
- name: ${{ env.TITLE }}
- tag_name: ${{ steps.version-number.outputs.VERSION }}
- body: ${{ env.BODY }}
+ curl -L \
+ -X POST \
+ -H "Accept: application/vnd.github+json" \
+ -H "Authorization: Bearer ${{ secrets.PAT }}" \
+ -H "X-GitHub-Api-Version: 2022-11-28" \
+ https://api.github.com/repos/NVIDIA/NeMo/releases \
+ -d "$PAYLOAD"
- name: Build, test, and release wheel
env:
@@ -114,6 +62,8 @@ jobs:
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
run: |
cd ${{ github.run_id }}
+ EXPECTED_VERSION=$(python -c 'import nemo; print(nemo.__version__)')
+
python3 -m pip install --upgrade build
python3 -m build
@@ -122,7 +72,6 @@ jobs:
cd ../
INSTALLED_VERSION=$(python -c 'import nemo; print(nemo.__version__)')
- EXPECTED_VERSION=${{ steps.version-number.outputs.VERSION }}
if [[ "$INSTALLED_VERSION" != "$EXPECTED_VERSION" ]]; then
echo 'Wheel has an outdated version, mission abort immediately!'
@@ -134,34 +83,6 @@ jobs:
python3 -m pip install --upgrade twine
python3 -m twine upload --repository pypi dist/*
- - name: Update PR issue comment
- shell: bash
- env:
- message: ${{ github.event.comment.body }}
- run: |
- message="$message
-
- ---
-
- Releasebot 🤖: Release done 🎉
- "
- message="${message//$'\n'/
}"
-
- curl -L \
- -X PATCH \
- -H "Accept: application/vnd.github+json" \
- -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
- -H "X-GitHub-Api-Version: 2022-11-28" \
- https://api.github.com/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }} \
- -d '{"body":"'"$message"'"}'
-
- - name: Close Pull
- run: |
- cd ${{ github.run_id }}
- gh pr close --comment "Releasebot 🤖: Closing PR" "${{ steps.get-pr-num.outputs.pr_number }}"
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
- name: notify
run: |
MESSAGE='{
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 000000000000..7fd5cd00b352
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,921 @@
+# Changelog
+
+## NVIDIA Neural Modules 2.0.0rc1
+
+### Highlights
+
+#### Large language models
+
+- PEFT: QLoRA support, LoRA/QLora for Mixture-of-Experts (MoE) dense layer
+- State Space Models & Hybrid Architecture support (Mamba2 and NV-Mamba2-hybrid)
+- Support Nemotron, Minitron, Gemma2, Qwen, RAG
+- Custom Tokenizer training in NeMo
+- Update the Auto-Configurator for EP, CP and FSDP
+
+#### Multimodal
+
+- NeVA: Add SOTA LLM backbone support (Mixtral/LLaMA3) and suite of model parallelism support (PP/EP)
+- Support Language Instructed Temporal-Localization Assistant (LITA) on top of video NeVA
+
+#### ASR
+
+- SpeechLM and SALM
+- Adapters for Canary Customization
+- Pytorch allocator in PyTorch 2.2 improves training speed up to 30% for all ASR models
+- Cuda Graphs for Transducer Inference
+- Replaced webdataset with Lhotse - gives up to 2x speedup
+- Transcription Improvements - Speedup and QoL Changes
+- ASR Prompt Formatter for multimodal Canary
+
+#### Export & Deploy
+
+- In framework PyTriton deployment with backends:
+  - PyTorch
+  - vLLM
+  - TRT-LLM update to 0.10
+- TRT-LLM C++ runtime
+
+### Detailed Changelogs
+
+#### ASR
+
+Changelog
+
+- Support dataloader as input to `audio` for transcription by @titu1994 :: PR: #9201
+- Clean up dev docs collection section by @yaoyu-33 :: PR: #9205
+- Fix Online_Offline_Microphone_VAD_Demo.ipynb by @stevehuang52 :: PR: #9251
+- Remove .nemo instead of renaming by @mikolajblaz :: PR: #9281
+- Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. by @galv :: PR: #9347
+- Revert "Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer." by @titu1994 :: PR: #9351
+- Prompt formatter API and canary transcribe tensor input support by @pzelasko :: PR: #9206
+- Fix prompt formatter's defaults=None case in multi-task model by @pzelasko :: PR: #9366
+- move AED chunked infer script by @stevehuang52 :: PR: #9367
+- Use model-cast-to-bfloat16 rather than AMP-to-bfloat16 for inference. by @galv :: PR: #9198
+- ci: Fix `L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_C… by @ko3n1g :: PR: #9399
+- Fix logging message for ASR by @titu1994 :: PR: #9469
+- Add support to change Multi task model prompt by @titu1994 :: PR: #9542
+- Enable encoder adapters for Canary and MultiTaskAED models by @titu1994 :: PR: #9409
+- Audio model collection by @anteju :: PR: #9263
+- TitaNet Batch Verify Speaker by @monica-sekoyan :: PR: #9337
+- Fix the arguments of forward_for_export function in msdd_models by @tango4j :: PR: #9624
+- chore: Pin branch in notebooks by @ko3n1g :: PR: #9697
+- refactor: notebook branch release by @ko3n1g :: PR: #9711
+- Canary Adapters tutorial (#9670) by @nithinraok :: PR: #9777
+- typos and branch name update to r2.0.0rc1 by @nithinraok :: PR: #9846
+- Fix RNNT alignments test by @artbataev :: PR: #9770
+- By default trust remote code from HF Datasets by @nithinraok :: PR: #9886
+- Temporarily disable cuda graph based RNN-T greedy inference for r2.0.0rc1 by @galv :: PR: #9904
+- Enable CUDA graphs by default, but require CUDA 12.6 for full graphs by @artbataev :: PR: #9919
+- update branch name for script by @nithinraok :: PR: #9936
+- updte branch by @nithinraok :: PR: #9942
+
+
+#### TTS
+
+Changelog
+
+- Clean up dev docs collection section by @yaoyu-33 :: PR: #9205
+- Add mel codec checkpoints by @anteju :: PR: #9228
+- GPU unit tests: Mark flaky tests to be fixed by @pablo-garay :: PR: #9559
+- chore: Pin branch in notebooks by @ko3n1g :: PR: #9697
+- refactor: notebook branch release by @ko3n1g :: PR: #9711
+
+
+
+#### LLM/Multimodal
+
+Changelog
+
+- Update nemo.export module for quantized models by @janekl :: PR: #9218
+- Add save option to the TRT-LLM export test script by @oyilmaz-nvidia :: PR: #9221
+- Checkpoint resuming compatible for 2403 container by @suiyoubi :: PR: #9199
+- Clean up dev docs collection section by @yaoyu-33 :: PR: #9205
+- use get with fallback when reading checkpoint_callback_params by @akoumpa :: PR: #9223
+- Revert rope fusion defaults by @cuichenx :: PR: #9237
+- fix import by @akoumpa :: PR: #9240
+- Add TRT-LLM params like max_num_tokens and opt_num_tokens by @oyilmaz-nvidia :: PR: #9210
+- sum-reduce grad_norm in DP+CP domain by @erhoo82 :: PR: #9262
+- Alit/bert convert fix by @JRD971000 :: PR: #9285
+- conv1d stable version by @JRD971000 :: PR: #9330
+- Fix trainer builder when exp_manager is not in config by @yaoyu-33 :: PR: #9293
+- Fix Peft Weights Loading in NeVA by @yaoyu-33 :: PR: #9341
+- Skip sequence_parallel allreduce when using Mcore DistOpt by @akoumpa :: PR: #9344
+- Fix FSDP gradient calculation with orig params by @janEbert :: PR: #9335
+- TRT-LLM Export Code Cleanup by @oyilmaz-nvidia :: PR: #9270
+- support null/None truncation field by @arendu :: PR: #9355
+- NeVa token fusion by @paul-gibbons :: PR: #9245
+- bugfix if using mcore distOpt with sft by @akoumpa :: PR: #9356
+- Re-org export code by @oyilmaz-nvidia :: PR: #9353
+- QLoRA by @cuichenx :: PR: #9340
+- PeFT fix for distOpt by @akoumpa :: PR: #9392
+- [NeMo-UX] Integrating mcore's DistributedDataParallel into MegatronStrategy by @marcromeyn :: PR: #9387
+- cherry pick of #9266 by @dimapihtar :: PR: #9411
+- Enable specifying alpha for PTQ INT8 SmoothQuant method by @janekl :: PR: #9423
+- add support for new mcore ds features by @dimapihtar :: PR: #9388
+- LoRA for MoE Layer by @cuichenx :: PR: #9396
+- Mistral-7B: apply user's precision to output checkpoint by @akoumpa :: PR: #9222
+- Add option to merge distributed optimizer buckets by @timmoon10 :: PR: #9414
+- TRT-LLM 0.10 Update by @oyilmaz-nvidia :: PR: #9402
+- In-framework deployment by @oyilmaz-nvidia :: PR: #9438
+- Bugfix missing variables and argument changes to MegatronPretrainingRandomSampler by @jstjohn :: PR: #9458
+- Hyena Operator by @guyjacob :: PR: #9264
+- Refactor Quantizer for reusing in QAT by @kevalmorabia97 :: PR: #9276
+- move load state dict after initialize parallel state in nlp_model by @ryxli :: PR: #9382
+- Enable user to optionally upgrade Megatron by @jstjohn :: PR: #9478
+- Fix unwrap model by @cuichenx :: PR: #9480
+- fix operator precedence by @akoumpa :: PR: #9403
+- [NeMo-UX] Adding context- & expert-parallelism to MegatronStrategy by @marcromeyn :: PR: #9525
+- update mcoreddp call by @akoumpa :: PR: #9345
+- mcore distOpt restore fix by @akoumpa :: PR: #9421
+- vLLM Export Support by @apanteleev :: PR: #9381
+- PL: Delete precision if using plugin. TODO switch to MegatronTrainerB… by @akoumpa :: PR: #9535
+- extend get_gpt_layer_modelopt_spec to support MoE by @akoumpa :: PR: #9532
+- fix mock data generation for legacy dataset by @dimapihtar :: PR: #9530
+- add reset learning rate functionality by @dimapihtar :: PR: #9372
+- Use closed-formula to round by multiple by @akoumpa :: PR: #9307
+- GPU unit tests: Mark flaky tests to be fixed by @pablo-garay :: PR: #9559
+- Consolidate gpt continue training script into pretraining script by @yaoyu-33 :: PR: #9413
+- Enable encoder adapters for Canary and MultiTaskAED models by @titu1994 :: PR: #9409
+- PTQ refinements by @janekl :: PR: #9574
+- Add ModelOpt QAT example for Llama2 SFT model by @kevalmorabia97 :: PR: #9326
+- Multimodal projection layer adapter fix for PP>1 by @paul-gibbons :: PR: #9445
+- Add offline quantization script for QLoRA deployment by @cuichenx :: PR: #9455
+- Make QLoRA more model-agnostic by @cuichenx :: PR: #9488
+- Set n_gpu to None in nemo export by @oyilmaz-nvidia :: PR: #9593
+- [NeMo-UX] Fix Megatron-optimizer by @marcromeyn :: PR: #9599
+- Chat template support for megatron_gpt_eval.py by @akoumpa :: PR: #9354
+- [NeMo-UX] Add PEFT by @cuichenx :: PR: #9490
+- Alit/mamba tmp by @JRD971000 :: PR: #9612
+- Enable MCore checkpointing optimizations by @mikolajblaz :: PR: #9505
+- Change mixtral moe key name for trt-llm by @oyilmaz-nvidia :: PR: #9620
+- fix ckpt load bug by @dimapihtar :: PR: #9621
+- Alit/mamba by @JRD971000 :: PR: #9575
+- Unwrap ckpt_io for model opt (async save) by @mikolajblaz :: PR: #9622
+- MCore T5 support for NeMo - Training by @huvunvidia :: PR: #9432
+- [Nemo-UX] Expose transformer_layer_spec inside GPTConfig by @marcromeyn :: PR: #9592
+- Update NeMo Clip to Use MCore Modules by @yaoyu-33 :: PR: #9594
+- Mistral + Mixtral Support for NeVa by @paul-gibbons :: PR: #9459
+- Adding support for mcore generate by @shanmugamr1992 :: PR: #9566
+- Improve error messaging during trt-llm export by @oyilmaz-nvidia :: PR: #9638
+- [Cherrypick] support lora when kv_channel != hidden_size / num_heads by @cuichenx :: PR: #9644
+- Parametrize FPS group by @mikolajblaz :: PR: #9648
+- Cherry-pick megatron export fix from main by @borisfom :: PR: #9643
+- add documentation for reset_lr feature by @dimapihtar
+- chore: Pin branch in notebooks by @ko3n1g :: PR: #9697
+- Cherry pick: LITA Integration by @Slyne :: PR: #9684
+- SDXL improvements (and support for Draft+) by @rohitrango :: PR: #9654
+- Gemma 2 by @cuichenx :: PR: #9672
+- Allows non-strict load with distributed checkpoints by @mikolajblaz :: PR: #9613
+- refactor: notebook branch release by @ko3n1g :: PR: #9711
+- [NeMo-UX] Make TE and Apex dependencies optional by @ashors1 :: PR: #9550
+- Alit/r2.0.0 by @JRD971000 :: PR: #9718
+- Manually cherry-pick from PR 9679 (PR to main - Support SFT/Eval/PEFT for mcore T5) by @huvunvidia :: PR: #9737
+- In framework export by @oyilmaz-nvidia :: PR: #9658
+- T5 changes based on mcore changes by @pablo-garay :: PR: #9829
+- [NeMo-UX] Use single instance of loss reductions in GPTModel by @hemildesai :: PR: #9801
+- deprecate NeMo NLP tutorial by @dimapihtar :: PR: #9864
+- Disable nvFuser setup with PyTorch 23.11 and later by @athitten :: PR: #9837
+- make torch_dist ckpt strategy as default by @dimapihtar :: PR: #9852
+- add rampup bs documentation by @dimapihtar :: PR: #9884
+- copy of #9576 by @dimapihtar :: PR: #9986
+- Support Nvidia Torch and Arch versions by @thomasdhc :: PR: #9897
+- Bug fix for pooler causing dist checkpointing exception by @shanmugamr1992 :: PR: #10008
+
+
+
+#### Export
+
+Changelog
+
+- Update nemo.export module for quantized models by @janekl :: PR: #9218
+- Add save option to the TRT-LLM export test script by @oyilmaz-nvidia :: PR: #9221
+- Add TRT-LLM params like max_num_tokens and opt_num_tokens by @oyilmaz-nvidia :: PR: #9210
+- TRT-LLM Export Code Cleanup by @oyilmaz-nvidia :: PR: #9270
+- Re-org export code by @oyilmaz-nvidia :: PR: #9353
+- Use TensorRT-LLM native parameter names in nemo.export module by @janekl :: PR: #9424
+- TRT-LLM 0.10 Update by @oyilmaz-nvidia :: PR: #9402
+- vLLM Export Support by @apanteleev :: PR: #9381
+- Add page context fmha option in TensorRTLLM export by @meatybobby :: PR: #9526
+- Test C++ runtime on demand in nemo_export.py to avoid possible OOMs by @janekl :: PR: #9544
+- Fix nemo export test by @oyilmaz-nvidia :: PR: #9547
+- Add tps and pps params to the export script by @oyilmaz-nvidia :: PR: #9558
+- Add Multimodal Exporter by @meatybobby :: PR: #9256
+- Set n_gpu to None in nemo export by @oyilmaz-nvidia :: PR: #9593
+- Inflight nemo model export support by @JimmyZhang12 :: PR: #9527
+- vLLM Export Improvements by @apanteleev :: PR: #9596
+- Akoumparouli/nemo ux mixtral export by @akoumpa :: PR: #9603
+- Change mixtral moe key name for trt-llm by @oyilmaz-nvidia :: PR: #9620
+- Fix the arguments of forward_for_export function in msdd_models by @tango4j :: PR: #9624
+- Improve error messaging during trt-llm export by @oyilmaz-nvidia :: PR: #9638
+- Cherry-pick megatron export fix from main by @borisfom :: PR: #9643
+- In framework export by @oyilmaz-nvidia :: PR: #9658
+- Add missing imports for torch dist ckpt in export by @oyilmaz-nvidia :: PR: #9826
+
+
+
+
+
+
+#### Bugfixes
+
+Changelog
+
+- use get with fallback when reading checkpoint_callback_params by @akoumpa :: PR: #9223
+- fix import by @akoumpa :: PR: #9240
+- Remove .nemo instead of renaming by @mikolajblaz :: PR: #9281
+- call set_expert_model_parallel_world_size instead of set_cpu_expert_m… by @akoumpa :: PR: #9275
+- Fix typos in Mixtral NeMo->HF and Starcoder2 NeMo->HF conversion scripts by @evellasques :: PR: #9325
+- Skip sequence_parallel allreduce when using Mcore DistOpt by @akoumpa :: PR: #9344
+- Add OpenAI format response to r2.0.0rc1 by @athitten :: PR: #9796
+- [NeMo UX] Support generating datasets using different train/valid/test distributions by @ashors1 :: PR: #9771
+- Add missing imports for torch dist ckpt in export by @oyilmaz-nvidia :: PR: #9826
+
+
+
+#### General Improvements
+
+Changelog
+
+- [Nemo CICD] run_cicd_for_release_branches_also by @pablo-garay :: PR: #9213
+- rename paths2audiofiles to audio by @github-actions[bot] :: PR: #9220
+- Fix ASR_Context_Biasing.ipynb contains FileNotFoundError by @github-actions[bot] :: PR: #9234
+- ci: Remove duplicated job by @ko3n1g :: PR: #9258
+- Fix document links by @yaoyu-33 :: PR: #9260
+- Pin transformers by @github-actions[bot] :: PR: #9273
+- Fix loading github raw images on notebook by @github-actions[bot] :: PR: #9283
+- Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward by @github-actions[bot] :: PR: #9278
+- Refactor Sequence Packing Script by @cuichenx :: PR: #9271
+- [Nemo-UX] Move code to collections + fix some small bugs by @marcromeyn :: PR: #9277
+- Fix typo in HF tutorial by @github-actions[bot] :: PR: #9304
+- Expand documentation for data parallelism and distributed optimizer by @timmoon10 :: PR: #9227
+- Install alerting by @ko3n1g :: PR: #9311
+- typos by @github-actions[bot] :: PR: #9315
+- FP8 feature documentation by @ksivaman :: PR: #9265
+- [Nemo CICD] Comment out flaky tests by @pablo-garay :: PR: #9333
+- Fixed typos in README.rst by @gdevakumar :: PR: #9322
+- Update README.rst to clarify installation via Conda by @SimonCW :: PR: #9323
+- [Nemo CICD] update flaky test by @pablo-garay :: PR: #9339
+- fix lora and ptuning and isort/black by @github-actions[bot] :: PR: #9295
+- Fix P-tuning for Llama based models by @github-actions[bot] :: PR: #9300
+- add large model stable training fix and contrastive loss update for variable seq by @github-actions[bot] :: PR: #9348
+- Guard cuda memory allocator update by @github-actions[bot] :: PR: #9313
+- [Nemo CICD] Remove unnecessary commented out code by @pablo-garay :: PR: #9364
+- Update Gemma conversion script by @yaoyu-33 :: PR: #9365
+- Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. (#9347) by @github-actions[bot] :: PR: #9371
+- Re-enable cuda graphs in training modes. by @github-actions[bot] :: PR: #9343
+- fix typo infer_seq_lenght -> infer_seq_length by @akoumpa :: PR: #9370
+- Make a backward compatibility for old MSDD configs in label models by @github-actions[bot] :: PR: #9378
+- Dgalvez/fix greedy batch strategy name r2.0.0rc0 by @github-actions[bot] :: PR: #9253
+- Update README.rst by @jgerh :: PR: #9393
+- Force diarizer to use CUDA if cuda is available and if device=None. by @github-actions[bot] :: PR: #9390
+- ci: Properly catch failed tests by introduction of workflow templates by @ko3n1g :: PR: #9324
+- Fix T5 G2P Input and Output Types by @github-actions[bot] :: PR: #9269
+- Huvu/rag pipeline citest by @huvunvidia :: PR: #9384
+- Fix circular import for MM dataprep notebook by @github-actions[bot] :: PR: #9292
+- add check if num layers is divisible by pp size by @github-actions[bot] :: PR: #9298
+- [Nemo CICD] timeouts fix by @pablo-garay :: PR: #9407
+- [NeMo-UX] Removing un-used ModelConfig class by @marcromeyn :: PR: #9389
+- Add tutorial for Llama-3-8B lora training and deployment by @shashank3959 :: PR: #9359
+- [NeMo-UX] Removing default_path from ModelConnector by @marcromeyn :: PR: #9401
+- Fix README by @ericharper :: PR: #9415
+- [SD] Fix SD CUDA Graph Failure by @alpha0422 :: PR: #9319
+- [NeMo-UX] Adding file-lock to Connector by @marcromeyn :: PR: #9400
+- Add Dev Container Bug Report by @pablo-garay :: PR: #9430
+- Akoumparouli/profiling docs by @akoumpa :: PR: #9420
+- ci: Enrich notifications by @ko3n1g :: PR: #9412
+- Fix failing RIR unit test with lhotse 1.24+ by @pzelasko :: PR: #9444
+- [NeMo-UX] Adding support for mcore distributed optimizer by @marcromeyn :: PR: #9435
+- Use ModelOpt build_tensorrt_llm for building engines for qnemo checkpoints by @janekl :: PR: #9452
+- ci(notifications): Fix extraction of last 2K chars by @ko3n1g :: PR: #9450
+- Update readme with mlperf news by @ericharper :: PR: #9457
+- [NeMo-UX] Add nsys callback by @ashors1 :: PR: #9461
+- [NeMo UX] Introducing optimizer module by @marcromeyn :: PR: #9454
+- Fix minor import bug in deploy module by @oyilmaz-nvidia :: PR: #9463
+- ci(notifications): Fetch all jobs by @ko3n1g :: PR: #9465
+- Update build_dataset.py by @stevehuang52 :: PR: #9467
+- bionemo: bn2/add pipelineparallel dtype by @skothenhill-nv :: PR: #9475
+- [NeMo-UX] Integrate experiment manager features with NeMo-UX APIs by @ashors1 :: PR: #9460
+- Add python_requires by @galv :: PR: #9431
+- [NeMo-UX] Fixing imports of NeMoLogging, AutoResume & ModelCheckpoint by @marcromeyn :: PR: #9476
+- Modelopt Refactor for SDXL Quantization by @suiyoubi :: PR: #9279
+- [NeMo-UX] Fixing defaults in llm.train & Mistral7BModel by @marcromeyn :: PR: #9486
+- In framework deploy using deploy script by @oyilmaz-nvidia :: PR: #9468
+- [NeMo-UX] Integrate tokenizer import into model.import_ckpt by @marcromeyn :: PR: #9485
+- append to file by @malay-nagda :: PR: #9483
+- [NeMo-UX] Fix bug in import_ckpt by @marcromeyn :: PR: #9492
+- Add nemotron news by @ericharper :: PR: #9510
+- Add CICD test for Stable Diffusion by @michal2409 :: PR: #9464
+- Akoumparouli/nemo ux mixtral by @akoumpa :: PR: #9446
+- [NeMo-UX] Llama and Gemma by @cuichenx :: PR: #9528
+- [NeMo-UX] minor logging bug fixes by @ashors1 :: PR: #9529
+- Update neva conversion script from and to HF by @yaoyu-33 :: PR: #9296
+- [Nemo-UX] IO fixes by @marcromeyn :: PR: #9512
+- Fix lhotse tests for v1.24.2 by @pzelasko :: PR: #9546
+- [Nemo CICD] Make GPU Unit Tests non-optional by @pablo-garay :: PR: #9551
+- Add Python AIStore SDK to container and bump min Lhotse version by @pzelasko :: PR: #9537
+- [NeMo-UX] Fix tokenizer IO by @marcromeyn :: PR: #9555
+- [NeMo UX] Move mistral_7b.py to mistral.py by @akoumpa :: PR: #9545
+- ci: Do not attempt to send slack on fork by @ko3n1g :: PR: #9556
+- Fix SDXL incorrect name in Docs by @suiyoubi :: PR: #9534
+- Bump PTL version by @athitten :: PR: #9557
+- [Resiliency] Straggler detection by @jbieniusiewi :: PR: #9473
+- [NeMo-UX] Switch to torch_dist as default distributed checkpointing backend by @ashors1 :: PR: #9541
+- [NeMo-UX] Checkpointing bug fixes by @ashors1 :: PR: #9562
+- Expose MCore path_to_cache option by @maanug-nv :: PR: #9570
+- [NeMo-UX] Fix Trainer serialization by @marcromeyn :: PR: #9571
+- Update click version requirement by @thomasdhc :: PR: #9580
+- [Fault tolerance] Heartbeat detection by @maanug-nv :: PR: #9352
+- [Nemo-UX] Add fabric-API for manual forward-pass by @marcromeyn :: PR: #9577
+- [Nemo-UX] Add SDK-factories to llm-collection by @marcromeyn :: PR: #9589
+- [NeMo-UX] Some improvements to NeMoLogger by @marcromeyn :: PR: #9591
+- Set no_sync_func & grad_sync_fucn by @akoumpa :: PR: #9601
+- [NeMo-UX] Fix nemo logger when trainer has no loggers by @ashors1 :: PR: #9607
+- Fix the dictionary format returned by the `scheduler` method by @sararb :: PR: #9609
+- [NeMo-UX] Dataloading enhancements and bug fixes by @ashors1 :: PR: #9595
+- Fix serialization of AutoResume by @sararb :: PR: #9616
+- Jsonl support by @adityavavre :: PR: #9611
+- Akoumparouli/mistral import instruct chat template fix by @akoumpa :: PR: #9567
+- Remove .cuda calls, use device isntead by @akoumpa :: PR: #9602
+- fix converter defautl args by @akoumpa :: PR: #9565
+- fix: remove non_blocking from PTL's .cuda call by @akoumpa :: PR: #9618
+- NeVA Minor Fixes by @yaoyu-33 :: PR: #9608
+- [NeMo-UX] fix pretrianing data sizes and weights by @cuichenx :: PR: #9627
+- [NeMo-UX] async checkpointing support by @ashors1 :: PR: #9466
+- Change default parallel_save to False by @mikolajblaz :: PR: #9632
+- Add REST API to deploy module by @athitten :: PR: #9539
+- ci: Timeout per step, not job by @ko3n1g :: PR: #9635
+- [NeMo-UX] Fix when optimizers are setup for PEFT by @marcromeyn :: PR: #9619
+- [NeMo-UX] Fix pipeline parallel bug by @ashors1 :: PR: #9637
+- Fixing import error fior llama-index (RAG pipeline) by @pablo-garay :: PR: #9662
+- llama CI fix by @rohitrango :: PR: #9663
+- [NeMo-UX] Make 'load_directly_on_device' configurable by @ashors1 :: PR: #9657
+- [Nemo-UX] Including all trainable-params in a PEFT-checkpoint by @marcromeyn :: PR: #9650
+- [NeMo-UX] Fix imports so local configuration of runs works again by @marcromeyn :: PR: #9690
+- Set TE flag in legacy -> mcore conversion script by @terrykong :: PR: #9722
+- Update starthere docs text by @erastorgueva-nv :: PR: #9724
+- TorchAudio installation workaround for incorrect `PYTORCH_VERSION` variable by @artbataev :: PR: #9736
+- [NeMo-UX] Match nemo 1's default behavior for drop_last and pad_samples_to_global_batch_size by @ashors1 :: PR: #9707
+- add a bit more for timeout (#9702) by @pablo-garay :: PR: #9754
+- Fix missing parallelisms by @maanug-nv :: PR: #9725
+- update branch by @nithinraok :: PR: #9764
+- Fix data preprocessing script by @cuichenx :: PR: #9759
+- vLLM 0.5.1 update by @apanteleev :: PR: #9779
+- upper bound hf-hub by @akoumpa :: PR: #9805
+- Fix few issues and docs for neva and clip in r2.0.0rc1 by @yaoyu-33 :: PR: #9681
+- add dummy vision and text transformer config (assumed mcore to be false) by @rohitrango :: PR: #9699
+- fix lita bugs by @Slyne :: PR: #9810
+- [NeMo-UX] Log `val_loss` by @ashors1 :: PR: #9814
+- [NeMo-UX] Fix some dataloading bugs by @ashors1 :: PR: #9807
+- [NeMo-UX] Adding recipes by @marcromeyn :: PR: #9720
+- [NeMo-UX] Set async_save from strategy rather than ModelCheckpoint by @ashors1 :: PR: #9800
+- Fix hf hub for 0.24+ by @titu1994 :: PR: #9806
+- [NeMo-UX] Fix a minor bug with async checkpointing by @ashors1 :: PR: #9856
+- [NeMo-UX] make progress bar easier to parse by @ashors1 :: PR: #9877
+- Docs: add "Nemo Fundamentals" page by @erastorgueva-nv :: PR: #9835
+- Create __init__.py by @stevehuang52 :: PR: #9892
+- [NeMo-UX] Fixes to make PreemptionCallback work by @hemildesai :: PR: #9830
+- Fix Docker build. Make Dockerfile consistent with CI by @artbataev :: PR: #9784
+- Multimodal data prep notebook fix by @cuichenx :: PR: #9910
+- [NeMo-UX] Add distributed checkpointing unit tests by @ashors1 :: PR: #9794
+- r2.0.0rc1 fix for dist checkpoint loading by @yaoyu-33 :: PR: #9854
+- [NeMo-UX] Rename sdk references to NeMo Run by @hemildesai :: PR: #9872
+- [NeMo-UX] Fix some serialization bugs by @ashors1 :: PR: #9868
+- add mixtral neva tutorial (moe + token fusion + siglip) by @paul-gibbons :: PR: #9926
+- [NeMo-UX] Add more NeMo Logger tests by @ashors1 :: PR: #9795
+- Akoumparouli/mixtral fixes for r2.0.0rc1 by @akoumpa :: PR: #9911
+- R2.0.0rc1 clip fix by @Slyne :: PR: #9871
+- [NeMo-UX] Add missing docstrings and update some defaults by @ashors1 :: PR: #9895
+- Add REST service requirements.txt by @oyilmaz-nvidia :: PR: #9923
+- add bert latest fix by @JRD971000 :: PR: #9921
+- remove empy reconfigure_limit_batches by @akoumpa :: PR: #9934
+- fix mem by @terrykong :: PR: #9964
+- Run a sample query for a quantized model conditionally by @janekl :: PR: #9965
+- Add pydantic-settings by @oyilmaz-nvidia :: PR: #9961
+- Resiliency features update by @jbieniusiewi :: PR: #9714
+- [NeMo-UX] Wrap task config save in a try/except by @ashors1 :: PR: #9956
+- [NeMo-UX] Update default PTL logging `save_dir` by @ashors1 :: PR: #9954
+- Fix lita tutorial by @Slyne :: PR: #9980
+- Add deploy and REST API support to NeMo 2.0 by @athitten :: PR: #9834
+- ci: Allow changelog manual (#10156) by @ko3n1g :: PR: #10157
+- docs: Add changelog by @ko3n1g :: PR: #10155
+- add manifest file by @ko3n1g :: PR: #10161
+
+
+
+## NVIDIA Neural Modules 2.0.0rc0
+
+### Highlights
+
+#### LLM and MM
+
+##### Models
+
+- Megatron Core RETRO
+ - Pre-training
+ - Zero-shot Evaluation
+
+- Pretraining, conversion, evaluation, SFT, and PEFT for:
+ - Mixtral 8X22B
+ - Llama 3
+ - SpaceGemma
+
+- Embedding Models Fine Tuning
+ - Mistral
+ - BERT
+
+- BERT models
+ - Context Parallel
+ - Distributed checkpoint
+
+- Video capabilities with NeVa
+
+##### Performance
+
+- Distributed Checkpointing
+ - Torch native backend
+ - Parallel read/write
+ - Async write
+
+- Multimodal LLM (LLAVA/NeVA)
+ - Pipeline Parallelism support
+ - Sequence packing support
+
+##### Export
+
+- Integration of Export & Deploy Modules into NeMo Framework container
+ - Upgrade to TRT-LLM 0.9
+
+#### Speech (ASR & TTS)
+
+##### Models
+
+- AED Multi Task Models (Canary) - Multi-Task Multi-Lingual Speech Recognition / Speech Translation model
+- Multimodal Domain - Speech LLM supporting SALM Model
+- Parakeet-tdt_ctc-1.1b Model - RTFx of > 1500 (can transcribe 1500 seconds of audio in 1 second)
+- Audio Codec 16kHz Small - NeMo Neural Audio Codec for discretizing speech for use in LLMs
+ - mel_codec_22khz_medium
+ - mel_codec_44khz_medium
+
+##### Perf Improvements
+
+- Transcribe() upgrade - Enables one line transcribe with files, tensors, data loaders
+- Frame looping algorithm for RNNT faster decoding - Improves Real Time Factor (RTF) by 2-3x
+- Cuda Graphs + Label-Looping algorithm for RNN-T and TDT Decoding - Transducer Greedy decoding at over 1500x RTFx, on par with CTC Non-Autoregressive models
+- Semi Sorted Batching support - External User contribution that speeds up training by 15-30%.
+
+##### Customization
+
+- Context biasing for CTC word stamping - Improve accuracy for custom vocabulary and pronunciation
+ - Longform Inference
+ - Longform inference support for AED models
+- Transcription of multi-channel audio for AED models
+
+##### Misc
+
+- Upgraded webdataset - Speech and LLM / Multimodal unified container
+
+### Detailed Changelogs
+
+#### ASR
+
+Changelog
+
+- Enable using hybrid asr models in CTC Segmentation tool by @erastorgueva-nv :: PR: #8828
+- TDT confidence fix by @GNroy :: PR: #8982
+- Fix union type annotations for autodoc+mock-import rendering by @pzelasko :: PR: #8956
+- NeMo dev doc restructure by @yaoyu-33 :: PR: #8896
+- Improved random seed configuration for Lhotse dataloaders with docs by @pzelasko :: PR: #9001
+- Fix #8948, allow preprocessor to be stream captured to a cuda graph when doing per_feature normalization by @galv :: PR: #8964
+- [ASR] Support for transcription of multi-channel audio for AED models by @anteju :: PR: #9007
+- Add ASR latest news by @titu1994 :: PR: #9073
+- Fix docs errors and most warnings by @erastorgueva-nv :: PR: #9006
+- PyTorch CUDA allocator optimization for dynamic batch shape dataloading in ASR by @pzelasko :: PR: #9061
+- RNN-T and TDT inference: use CUDA graphs by default by @artbataev :: PR: #8972
+- Fix #8891 by supported GPU-side batched CTC Greedy Decoding by @galv :: PR: #9100
+- Update branch for notebooks and ci in release by @ericharper :: PR: #9189
+- Enable CUDA graphs by default only for transcription by @artbataev :: PR: #9196
+- rename paths2audiofiles to audio by @nithinraok :: PR: #9209
+- Fix ASR_Context_Biasing.ipynb contains FileNotFoundError by @andrusenkoau :: PR: #9233
+- Cherrypick: Support dataloader as input to `audio` for transcription (#9201) by @titu1994 :: PR: #9235
+- Update Online_Offline_Microphone_VAD_Demo.ipynb by @stevehuang52 :: PR: #9252
+- Dgalvez/fix greedy batch strategy name r2.0.0rc0 by @galv :: PR: #9243
+- Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward by @galv :: PR: #9246
+- Fix loading github raw images on notebook by @nithinraok :: PR: #9282
+- typos by @nithinraok :: PR: #9314
+- Re-enable cuda graphs in training modes. by @galv :: PR: #9338
+- add large model stable training fix and contrastive loss update for variable seq by @nithinraok :: PR: #9259
+- Fix conv1d package in r2.0.0rc0 by @pablo-garay :: PR: #9369
+- Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. (#9347) by @titu1994 :: PR: #9350
+- Make a backward compatibility for old MSDD configs in label models by @tango4j :: PR: #9377
+- Force diarizer to use CUDA if cuda is available and if device=None. by @tango4j :: PR: #9380
+
+
+
+#### TTS
+
+Changelog
+
+- [TTS] Add tutorial for training audio codecs by @rlangman :: PR: #8723
+- Update radtts.py by @blisc :: PR: #9097
+- [Nemo CICD] RADTTS test optional by @pablo-garay :: PR: #9112
+- Remove Radtts CI test by @blisc :: PR: #9144
+- Fix T5 G2P Input and Output Types by @blisc :: PR: #9224
+
+
+
+#### LLM and MM
+
+Changelog
+
+- Rachitg/dpa by @rachitgarg91 :: PR: #8911
+- Remove precision args in trainer due to PTL update by @yaoyu-33 :: PR: #8908
+- Huvu/mcore retro by @huvunvidia :: PR: #8861
+- fsdp tp > 1 bug fix by @dimapihtar :: PR: #8947
+- Fix memory leak at loss func by @minitu :: PR: #8868
+- change the condition for get qkv tensor from linear_qkv output in mcoremixin by @HuiyingLi :: PR: #8965
+- Add safety checks for 'data' key in MegatronGPTModel cfg by @HuiyingLi :: PR: #8991
+- [NeMo-UX] Adding MegatronParallel by @cuichenx :: PR: #8987
+- Skip top_p computations when set to 1.0 by @odelalleau :: PR: #8905
+- Gemma bug by @cuichenx :: PR: #8962
+- [NeMo-UX] Adding megatron strategy by @marcromeyn :: PR: #8995
+- Quantized checkpoint support in export and deploy modules by @janekl :: PR: #8859
+- add geglu to mlp swap by @JRD971000 :: PR: #8999
+- add timeout for new_group by @acphile :: PR: #8998
+- Zero-shot evaluation pipeline for mcore RETRO by @huvunvidia :: PR: #8941
+- Added fusion for squared relu by @sanandaraj5597 :: PR: #8963
+- Developer Documents for mcore RETRO by @huvunvidia :: PR: #9026
+- [NeMo-UX] Adding GPTModel & MockDataModule by @marcromeyn :: PR: #9011
+- Adding unit test for mcore RETRO model by @huvunvidia :: PR: #9022
+- docs and simplification of cmd args by @arendu :: PR: #8979
+- [NeMo-UX] Add checkpoint-io to MegatronStrategy by @marcromeyn :: PR: #9057
+- Enable Sequence Packing and Pipeline Parallel in NeVA by @yaoyu-33 :: PR: #8957
+- Mingyuanm/add back fp8 support to sd by @Victor49152 :: PR: #9070
+- unfused lora by @arendu :: PR: #9004
+- Handle case where num_query_groups is set to null for LoRA config setup by @vysarge :: PR: #9075
+- Alit/griffin by @JRD971000 :: PR: #9021
+- Implement DistributedCheckpointIO by @mikolajblaz :: PR: #9016
+- Video Neva Pretraining + Inference Implementation by @paul-gibbons :: PR: #9095
+- HF to .nemo for Mixtral-8x22B-instruct by @akoumpa :: PR: #9060
+- mcore ds updates by @dimapihtar :: PR: #8951
+- Alit/griffin perf by @JRD971000 :: PR: #9107
+- Add assert for max_steps to be positive in MegatronGPTSFTModel by @athitten :: PR: #9110
+- Extend sequence length padding for GPT SFT to account for context parallel by @vysarge :: PR: #8869
+- Update gpt dataset config parameter for mock by @thomasdhc :: PR: #9118
+- Add Mcore DistributedDataParallel and distributed optimizer into Nemo by @gdengk :: PR: #9034
+- Revert "Add assert for max_steps to be positive in MegatronGPTSFTMode… by @pablo-garay :: PR: #9128
+- scripts to convert HF lora to nemo by @arendu :: PR: #9102
+- Prevent duplicated checkpoints by @mikolajblaz :: PR: #9015
+- add TN/ITN link in speech tools list by @erastorgueva-nv :: PR: #9142
+- Cleanup deprecated files and temporary changes by @cuichenx :: PR: #9088
+- Use DP+CP groups as the FSDP sharding domain by @erhoo82 :: PR: #9145
+- CUDA memory profile by @erhoo82 :: PR: #9096
+- Fix missing func for T5 model by @gdengk :: PR: #9141
+- Add knob for load_directly_on_device by @mikolajblaz :: PR: #9125
+- Revert rope fusion defaults by @cuichenx :: PR: #9238
+- Update nemo.export module for quantized models by @janekl :: PR: #9250
+- Fix circular import for MM dataprep notebook by @cuichenx :: PR: #9287
+- neva media_type + text generation default fix by @paul-gibbons :: PR: #9257
+- fix lora and ptuning and isort/black by @oyilmaz-nvidia :: PR: #9290
+- add check if num layers is divisible by pp size by @dimapihtar :: PR: #9208
+- Fix P-tuning for Llama based models by @apanteleev :: PR: #9297
+- add deprecation warnings by @pablo-garay :: PR: #9266
+- move pooler under post_process by @dimapihtar :: PR: #9328
+- add deprecation note for nmt by @dimapihtar :: PR: #9342
+- Fix incorrect checkpoint removal logic (#9192) by @mikolajblaz :: PR: #9204
+- fix fp16 precision issue by @dimapihtar :: PR: #9376
+- Fix module.training for Neva in FusedAttn backward which causes nan by @yaoyu-33 :: PR: #8877
+
+
+
+#### Export
+
+Changelog
+
+- Updates for TRT-LLM 0.9 by @oyilmaz-nvidia :: PR: #8873
+- Mingyuanm/sdxl export by @Victor49152 :: PR: #8926
+- Avoid unpacking NeMo checkpoints before exporting to TRT-LLM by @apanteleev :: PR: #8866
+- Update gemma for trt-llm 0.9 by @oyilmaz-nvidia :: PR: #8974
+- TRT-LLM export P-tuning related fixes by @apanteleev :: PR: #8863
+
+
+
+#### General Improvements
+
+Changelog
+
+- Update package info by @ericharper :: PR: #8793
+- [Nemo CICD] Update mcore 4.13.24 by @pablo-garay :: PR: #8917
+- Akoumparouli/low mem mixtral ckpt converter by @akoumpa :: PR: #8895
+- Adding RETRO tests to Action Tests (cicd-main.yml) by @huvunvidia :: PR: #8942
+- Akoumparouli/fix sd train 2 by @akoumpa :: PR: #8883
+- Update te install for jenkins by @ericharper :: PR: #8954
+- [Nemo CICD] Add last job depending on others for blocking check by @pablo-garay :: PR: #8959
+- Minor quantization pipeline updates by @janekl :: PR: #8924
+- Fix External CLIP Converter by @yaoyu-33 :: PR: #8960
+- PP support in LoRA merge script by @cuichenx :: PR: #8934
+- Update PR template by @ericharper :: PR: #8978
+- Update Latest News by @shashank3959 :: PR: #8837
+- Fix incorrect link to latest news in README by @shashank3959 :: PR: #8985
+- Update dependency install for LLM and MM by @ericharper :: PR: #8990
+- Temporarily remove mcore dep by @ericharper :: PR: #9010
+- [Nemo CICD] further specialize runners for more parallelism by @pablo-garay :: PR: #9036
+- Update mm dataprep notebook based on feedback by @cuichenx :: PR: #9029
+- Fix import in lora merge script by @cuichenx :: PR: #9032
+- [Nemo CICD] Run when labeled:Run CICD by @pablo-garay :: PR: #9044
+- [Nemo CICD] Add tag/label for 1-gpu runner by @pablo-garay :: PR: #9046
+- [Nemo CICD] checkout v4 by @pablo-garay :: PR: #9048
+- [Nemo CICD] Remove temp test change by @pablo-garay :: PR: #9049
+- remove in-place addition for dreambooth train with text encoder by @Victor49152 :: PR: #8825
+- Mingyuanm/sdxl quantization notebook by @Victor49152 :: PR: #9042
+- [Nemo CICD] Trigger on comment issued by @pablo-garay :: PR: #9062
+- zarr ckpt to torch_dist ckpt converter by @dimapihtar :: PR: #8842
+- Restore PTQ tests for Llama2 (reopened) by @janekl :: PR: #9064
+- add clip H config by @JRD971000 :: PR: #9082
+- [NeMo-UX] Add mixed-precision plugin by @marcromeyn :: PR: #9065
+- Comment baichuan test and update pr template by @ericharper :: PR: #9085
+- Add safe extraction of nemo tar files by @athitten :: PR: #8976
+- Improved `shard_id` parsing in `LazyNemoTarredIterator`, enables AIS dataloading by @pzelasko :: PR: #9077
+- [NeMo-UX] Add mistral-7b model by @marcromeyn :: PR: #9066
+- Llama3 Conversion Script Update by @suiyoubi :: PR: #9089
+- dehardcode test string by @JimmyZhang12 :: PR: #8865
+- [Nemo CICD] Try trigger cicd run on comment by @pablo-garay :: PR: #9111
+- Lhotse dataloading: RIR augmentation and nemo/tarred input support for RIR and noise aug by @pzelasko :: PR: #9109
+- mixtral evaluation PR by @Slyne :: PR: #8989
+- [Nemo CICD] Revert: run GHA cicd on comment by @pablo-garay :: PR: #9119
+- [Nemo CICD] Comment out flaky test: running too long by @pablo-garay :: PR: #9123
+- [Nemo CICD] Add timeout to unit tests by @pablo-garay :: PR: #9132
+- [Nemo CICD] Indicate optional test in name (prefix) by @pablo-garay :: PR: #9139
+- video neva null image+video folder path fix by @paul-gibbons :: PR: #9116
+- [NeMo-UX] Add data module by @cuichenx :: PR: #9133
+- NeMo Inference Requirements by @oyilmaz-nvidia :: PR: #9093
+- Remove debug print by @maanug-nv :: PR: #9074
+- Remove legacy CI by @pablo-garay :: PR: #9149
+- Update support for push_to_hf_hub() by @titu1994 :: PR: #9159
+- [Nemo CICD] comment out flaky PTQ tests by @pablo-garay :: PR: #9160
+- Update branch by @ericharper :: PR: #9211
+- dist adam transpose fix by @dimapihtar :: PR: #9239
+- [Nemo CICD] Increase time limit for Speech_Checkpoints_tests (#9186) by @pablo-garay :: PR: #9247
+- Pin transformers by @ericharper :: PR: #9261
+- Fix typo in HF tutorial by @titu1994 :: PR: #9302
+
+
+
+## NVIDIA Neural Modules 1.23.0
+
+### Highlights
+
+#### Models
+
+##### Nvidia Starcoder 2 - 15B
+
+- Announcement - https://developer.nvidia.com/blog/unlock-your-llm-coding-potential-with-starcoder2/
+- AI Foundation Model Inference - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-foundation/models/starcoder2-15b
+- https://huggingface.co/bigcode/starcoder2-15b
+
+##### NeMo Canary
+Announcement - https://nvidia.github.io/NeMo/blogs/2024/2024-02-canary/
+
+- https://huggingface.co/nvidia/canary-1b
+
+#### NeMo LLM
+
+- Falcon
+- Code Llama
+- StarCoder
+- GPT perf improvements
+- Context parallelism
+- Mistral
+- Mixtral (without expert parallelism)
+- Mcore GPT Dataset integration
+
+#### NeMo MM
+- CLIP
+- Stable Diffusion (supporting LoRA)
+- Imagen
+- ControlNet (for SD)
+- Instruct pix2pix (for SD)
+- LLAVA
+- NeVA
+- DreamFusion++
+- NSFW filtering
+
+#### NeMo ASR
+
+- Lhotse Dataloading support #7880
+- Canary: Multi task multi lingual ASR #8242
+- LongForm Audio for Diarization #7737
+- Faster algorithm for RNN-T Greedy #7926
+- Cache-Aware streaming notebook #8296
+
+#### NeMo TTS
+
+#### NeMo Vision
+
+#### Known Issues
+
+##### ASR
+
+###### RNNT WER calculation when fused batch size > 1 during validation / test step()
+
+Previously, the RNNT metric was stateful while the CTC one was not ([r1.22.0](https://github.com/NVIDIA/NeMo/blob/r1.22.0/nemo/collections/asr/metrics/rnnt_wer_bpe.py#L419-L420), [r1.23.0](https://github.com/NVIDIA/NeMo/blob/r1.23.0/nemo/collections/asr/metrics/wer.py#L333))
+
+Therefore, this calculation in the RNNT joint for the fused operation worked properly. However, with the unification of metrics in r1.23.0, a bug was introduced where only the last sub-batch computes the scores and the metrics are not accumulated across sub-batches. This is patched via https://github.com/NVIDIA/NeMo/pull/8587 and will be fixed in the next release.
+
+**Workaround**: Explicitly disable fused batch size during inference using the following snippet:
+
+```python
+from omegaconf import open_dict
+model = ...
+decoding_cfg = model.cfg.decoding
+with open_dict(decoding_cfg):
+ decoding_cfg.fused_batch_size = -1
+model.change_decoding_strategy(decoding_cfg)
+```
+
+Note: This bug does not affect scores calculated via model.transcribe() (since it does not calculate metrics during inference, just text), or using the `transcribe_speech.py` or `speech_to_text_eval.py` in `examples/asr`.
+
+###### Two failing unit tests due to a change in expected results, caused by lhotse version update
+
+#### Container
+
+For additional information regarding NeMo containers, please visit: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo
+
+`docker pull nvcr.io/nvidia/nemo:24.01.speech`
+
+#### ASR
+
+Changelog
+
+- Update link to yaml file in ASR_with_Transducers.ipynb by @Faith-Nchifor :: PR: #8014
+- Use convert_hf_dataset_to_nemo by @karpnv :: PR: #8017
+- Update asr_language_modeling.rst: Add a missing word by @martin0258 :: PR: #8007
+- spelling mistake by @orena1 :: PR: #7903
+- update asr eval by @stevehuang52 :: PR: #8045
+- fix noise aug by @stevehuang52 :: PR: #8057
+- Various fixes for typos and urls by @titu1994 :: PR: #8066
+- [Fix] Increase length check tolerance to prevent test failing by @anteju :: PR: #8067
+- Add text metrics to asr eval by @stevehuang52 :: PR: #8087
+- fix device setting to allow using accelerator cpu by @orena1 :: PR: #8084
+- .ctm in data simulator annotator compliant with RT-09 specification by @popcornell :: PR: #8004
+- Fix AST eval by @stevehuang52 :: PR: #8112
+- fix: numba.*_num_threads resets torch num_threads #8141 by @itzsimpl :: PR: #8145
+- Update dependencies by @titu1994 :: PR: #8156
+- NeMo + Lhotse integration by @pzelasko :: PR: #7880
+- Speedup RNN-T greedy decoding by @artbataev :: PR: #7926
+- [docker] Install k2 before NeMo for faster image rebuilding by @pzelasko :: PR: #8204
+- [docs] Add --force_codec to tarred dataset creation examples by @pzelasko :: PR: #8227
+- Temporarily use the previous RNN-T decoding algorithm as default by @artbataev :: PR: #8226
+- Make TDT inference not require duration params by @hainan-xv :: PR: #8207
+- Cache Aware Streaming tutorial notebook by @erastorgueva-nv :: PR: #8296
+- fix path location and branch by @nithinraok :: PR: #8304
+- Attention encoder-decoder models for multiple speech-to-text tasks … by @titu1994 :: PR: #8324
+- Remove asr webapp by @titu1994 :: PR: #8347
+- remove _target_ at model level in aed model config [ASR] by @krishnacpuvvada :: PR: #8351
+- Add change_vocabulary and save_tokenizers() support to Multitask ASR models by @titu1994 :: PR: #8357
+- Change default beam size by @titu1994 :: PR: #8371
+- adding jenkins test for speech_to_text_aed model by @krishnacpuvvada :: PR: #8368
+- Add Finetuning tutorial with HF Datasets by @nithinraok :: PR: #8356
+- wer fix by @tbartley94 :: PR: #8404
+- add ensemble decoding fix by @nithinraok :: PR: #8427
+- Update k2 by @artbataev :: PR: #8492
+
+
+
+#### TTS
+
+Changelog
+
+- [TTS] Scale sampler steps by number of devices by @rlangman :: PR: #7947
+- Add All Multimodal Source Code Part 2: Text to image, x to nerf by @yaoyu-33 :: PR: #7970
+- [TTS] Add period discriminator and feature matching loss to codec recipe by @rlangman :: PR: #7884
+- Added VectorQuantizer base class by @anteju :: PR: #8011
+
+
+
+#### LLMS
+
+Changelog
+
+- Add interface to set NCCL options of each process group by @erhoo82 :: PR: #7923
+- Support O2 training of PEFT and SFT by @cuichenx :: PR: #7971
+- [NLP] Access scaler only in FP16 case by @janekl :: PR: #7916
+- [NLP] Minor improvements in Llama conversion script by @janekl :: PR: #7978
+- [NLP] Use helpers from utils_funcs.py in Llama conversion by @janekl :: PR: #7979
+- [NLP] Remove replace_sampler_ddp (deprecated in Trainer) by @janekl :: PR: #7981
+- Reworked MegatronPretrainingRandomBatchSampler to correctly handle epochs > 1 by @trias702 :: PR: #7920
+- Remove deprecated arguments from TE's TransformerLayer by @jbaczek :: PR: #7917
+- Add All Multimodal Source Code by @yaoyu-33 :: PR: #7791
+- First draft of mcore bert model in NeMo by @shanmugamr1992 :: PR: #7814
+- Support Falcon Variants (7B/40B/180B) in Mcore NeMo by @xuanzic :: PR: #7666
+- FSDP + Tensor Parallelism by @erhoo82 :: PR: #7897
+- Packed Sequence by @cuichenx :: PR: #7945
+- Adding method back that was removed accidentally by @ericharper :: PR: #8038
+- [NLP] ArtifactItem with init=True to make it debuggable by @janekl :: PR: #7980
+- SFT patch: (1) enable sequence parallelism and (2) enable profile by @erhoo82 :: PR: #7963
+- migration to PTL 2.0 for spellmapper model by @bene-ges :: PR: #7924
+- Change the megatron config lr scheduler default and fix to change partitions script by @shan18 :: PR: #8094
+- (1) Add SHARP interface to M-CORE, (2) use send/recv to send train loss to the first rank instead of b-cast by @erhoo82 :: PR: #7793
+- Reconfigure limit_val_batches only for int by @athitten :: PR: #8099
+- Fixing wrapper and moving it to base class by @shanmugamr1992 :: PR: #8055
+- fix gated_linear_unit bug by @Agoniii :: PR: #8042
+- Fix Adapter for MCore models by @cuichenx :: PR: #8124
+- add war fix for sync issues by @gshennvm :: PR: #8130
+- Improve PEFT UX by @cuichenx :: PR: #8131
+- Enhance flexibility by passing callbacks as method argument by @michal2409 :: PR: #8015
+- context parallelism by @xrennvidia :: PR: #7739
+- Make pipelined TP comm overlap available with mcore by @erhoo82 :: PR: #8005
+- remove deprecated scripts by @arendu :: PR: #8138
+- adding OnlineSampleMapping by @arendu :: PR: #8137
+- Add distopt support for FP8 params and BF16 optimizer state by @timmoon10 :: PR: #7909
+- Revert adding OnlineSampleMapping by @pablo-garay :: PR: #8164
+- Token count and sequence length logging for MegatronGPTSFTModel by @vysarge :: PR: #8136
+- Use latest apex internal API by @jbaczek :: PR: #8129
+- tune specific params in the base model by @arendu :: PR: #7745
+- Virtual pipeline parallel support for MegatronGPTSFTModel by @vysarge :: PR: #7964
+- removed deprecated peft model by @arendu :: PR: #8183
+- remove more deprecated files by @arendu :: PR: #8169
+- Pre-generate cu_seqlens argmin and max_seqlen to remove host-to-device sync by @erhoo82 :: PR: #8108
+- Add the interface to use SHARP to FSDP strategy by @erhoo82 :: PR: #8202
+- Multimodal required NLP base model changes by @yaoyu-33 :: PR: #8188
+- [NLP] Improve and unify loading state_dict for community models by @janekl :: PR: #7977
+- Rename Finetuning Scripts by @cuichenx :: PR: #8201
+- Final multimodal PR with our recent developments on MM side by @yaoyu-33 :: PR: #8127
+- Add include_text parameter to SFT dataloaders by @Kipok :: PR: #8198
+- Add random_seed argument to generate by @Kipok :: PR: #8162
+- Added support for neptune logger by @harishankar-gopalan :: PR: #8210
+- Pre-compute max_seqlen and cu_seqlens_argmin in all model-parallel cases by @erhoo82 :: PR: #8222
+- Use PackedSeqParams in accordance with changes in Megatron-LM by @cuichenx :: PR: #8205
+- Fix to peft & virtual pipeline parallel unsupported check by @vysarge :: PR: #8216
+- Fixed the tp overlap switch by @sanandaraj5597 :: PR: #8195
+- add knobs for rope/swiglu fusion by @lhb8125 :: PR: #8184
+- Added sample cpu_offloading switch to YAML by @sanandaraj5597 :: PR: #8148
+- Syncing random seed between ranks in generate by @Kipok :: PR: #8230
+- add first_val_step to mcore scheduler by @JimmyZhang12 :: PR: #8150
+- Correct padding for SFT input data to account for sequence parallel + TE's fp8 op dimension requirements by @vysarge :: PR: #8240
+- Mistral 7b conversion script by @akoumpa :: PR: #8052
+- switch to mcore dataset [with FIM support] by @dimapihtar :: PR: #8149
+- Mixtral to NeMo conversion script. by @akoumpa :: PR: #8155
+- fixes to accomendate mcore changes by @HuiyingLi :: PR: #8261
+- Allow MegatronPretrainingRandomSampler to do multi-epoch training by @trias702 :: PR: #8239
+- Add dist ckpt support for regular optimizers by @mikolajblaz :: PR: #7749
+- add deallocate pipeline output optimization by @JimmyZhang12 :: PR: #8279
+- Fix memory leak caused by context parallelism hanging references by omegaconf by @JimmyZhang12 :: PR: #8299
+- distributed fused adam + rampup bs support by @dimapihtar :: PR: #8302
+- Update PEFT Doc by @cuichenx :: PR: #8262
+- Converter script fixes for mixtral/mistral by @akoumpa :: PR: #8272
+- Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP>1 by @erhoo82 :: PR: #8334
+- Enable megatron core loggers for GPT pretraining by @ashbhandare :: PR: #8354
+- mcore ds fix by @dimapihtar :: PR: #8283
+- release updates by @dimapihtar :: PR: #8378
+- Mcore customization doc by @HuiyingLi :: PR: #8298
+- updated link to pubmed by @nithinraok :: PR: #8402
+- mcore customization doc minor fix by @HuiyingLi :: PR: #8421
+- Fixing mcore bert for TP, PP and SP by @shanmugamr1992 :: PR: #8336
+- Add settings to suppress bf16 compile errors in CI on V100 by @athitten :: PR: #8481
+- MoE parameter passing by @akoumpa :: PR: #8255
+- Add fp8 support for SD/Update notebook paths by @Victor49152 :: PR: #8489
+
+
+
+#### NeMo Tools
+
+Changelog
+
+- SDE bugfix log by @Jorjeous :: PR: #8430
+
+
+
+#### General Improvements
+
+Changelog
+
+- Add news section to README by @ericharper :: PR: #7984
+- Fixing conversion script to work for code llama by @shanmugamr1992 :: PR: #7997
+- Fix crash when converting to mcore a model using rotary embeddings by @odelalleau :: PR: #7998
+- Added a procedure for Windows users, README by @Jorjeous :: PR: #7942
+- Update manifest.py to speedup loading tarred datasets by @stevehuang52 :: PR: #7900
+- [Fix] Fixed name of a test by @anteju :: PR: #7986
+- Fix lora merge script by @cuichenx :: PR: #8113
+- Support transcoding audio formats when saving tarred datasets (FLAC, OPUS) by @pzelasko :: PR: #8102
+- README edit to change Apple Silicon install instructions (to fix a break introduced by pytorch 2) by @stephenmcconnachie :: PR: #8122
+- Fixes NVIDIA/apex installation to not erroneously install the pkg by @terrykong :: PR: #8126
+- Graphviz fix by @GNroy :: PR: #7843
+- Update README.rst by @fayejf :: PR: #8154
+- Fix TP>1 issue for conversion script by @cuichenx :: PR: #8144
+- Support torch jit script by @artbataev :: PR: #8027
+- NeMo Multimodal Docs and Tests Initial PR by @yaoyu-33 :: PR: #8028
+- Remove left-over prints in NeMo+Lhotse code by @pzelasko :: PR: #8180
+- Upgrade to DLFW PyTorch 23.12 by @ericharper :: PR: #8163
+- Add Lhotse support for key in NeMo manifests by @pzelasko :: PR: #8197
+- Fix CPU Initialization and TP>1 for LoRA Merge Script by @cuichenx :: PR: #8199
+- Add support in Neural Typecheck to disable semantic checks by @titu1994 :: PR: #8212
+- Pin lhotse=1.19.2 in r1.23.0 by @pzelasko :: PR: #8303
+- Multimodal r1.23.0 bug fix by @yaoyu-33 :: PR: #8315
+- MCore dataset compatibility for tokenizers by @vysarge :: PR: #8390
+- Update NFA video download link by @erastorgueva-nv :: PR: #8406
+- Update MM Dataprep Tutorial by @cuichenx :: PR: #8410
+- Fix dreambooth data sampler issue by @yaoyu-33 :: PR: #8400
+- Fix a bug in CTM line processing function for multi-speaker data simulations by @tango4j :: PR: #8416
+- Akoumparouli/mistral bugfix by @akoumpa :: PR: #8353
+- pin to 0.5.0 by @ericharper :: PR: #8465
+- Update NeMo Multimodal Requirements by @yaoyu-33 :: PR: #8515
+- Fix link in multimodal dataprep tutorial by @cuichenx :: PR: #8517
+
+
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 2f75014b86d1..d015796096fc 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -13,6 +13,7 @@ NVIDIA NeMo Framework is an end-to-end, cloud-native framework designed to build
- Activation Recomputation
- Positional Embeddings and Positional Interpolation
- Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) with `TensorRT Model Optimizer `_
+- Knowledge Distillation-based training with `TensorRT Model Optimizer `_
- Sequence Packing
`NVIDIA NeMo Framework `_ has separate collections for:
diff --git a/docs/source/nlp/distillation.rst b/docs/source/nlp/distillation.rst
new file mode 100644
index 000000000000..01e5d599cb8c
--- /dev/null
+++ b/docs/source/nlp/distillation.rst
@@ -0,0 +1,58 @@
+.. _megatron_distillation:
+
+Distillation
+==========================
+
+Knowledge Distillation (KD)
+--------------------------------
+
+KD involves using information from an existing trained model to train a second (usually smaller, faster) model, thereby "distilling" knowledge from one to the other.
+
+Distillation has two primary benefits: faster convergence and higher end accuracy than traditional training.
+
+In NeMo, distillation is enabled by the `NVIDIA TensorRT Model Optimizer (ModelOpt) `_ library, a library for optimizing deep-learning models for inference on GPUs.
+
+The logits-distillation process consists of the following steps:
+
+1. Loading both the student and teacher model checkpoints (both must use the same parallelism strategy, if any).
+2. Training until convergence: forward passes are run on both models (the backward pass only on the student), with a distillation loss computed between the two models' logits (a minimal sketch of this loss follows this list).
+3. Saving the final student model.
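+
+The snippet below is a simplified, standalone sketch of the per-token logit loss used in step 2. It mirrors the default (non-reverse) branch of the ``LogitsKLLoss`` criterion added in ``megatron_gpt_distillation.py``, omitting temperature scaling and the tensor-parallel reductions; the helper name is illustrative only.
+
+.. code-block:: python
+
+    import torch
+    import torch.nn.functional as F
+
+    def logits_distillation_loss(student_logits: torch.Tensor, teacher_logits: torch.Tensor) -> torch.Tensor:
+        # Both inputs have shape [seq, batch, vocab]; the result has shape [batch, seq]
+        # so it can be masked and reduced like the regular LM loss.
+        loss = torch.sum(
+            F.kl_div(
+                F.log_softmax(student_logits.float(), dim=-1),
+                F.softmax(teacher_logits.float(), dim=-1),
+                reduction="none",
+            ),
+            dim=-1,  # sum the pointwise KL terms over the vocabulary dimension
+        )
+        return loss.transpose(0, 1).contiguous()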
+
+
+Example
+^^^^^^^
+The example below shows how to run the distillation script for Llama models.
+
+The script must be launched with the number of processes equal to the tensor-parallel size. This is achieved with the ``torchrun`` command below:
+
+.. code-block:: bash
+
+STUDENT_CKPT="path/to/student.nemo" # can also be None (will use default architecture found in examples/nlp/language_modeling/conf/megatron_llama_distill.yaml)
+TEACHER_CKPT="path/to/teacher.nemo"
+TOKENIZER="path/to/tokenizer.model"
+DATA_PATHS="[1.0,path/to/documents]"
+FINAL_SAVE_FILE="final_checkpoint.nemo"
+TP=4
+
+NPROC=$TP
+launch_config="torchrun --nproc_per_node=$NPROC"
+
+${launch_config} examples/nlp/language_modeling/megatron_gpt_distillation.py \
+ model.restore_from_path=$STUDENT_CKPT \
+ model.kd_teacher_restore_from_path=$TEACHER_CKPT \
+ model.tensor_model_parallel_size=${TP} \
+ model.tokenizer.model=$TOKENIZER \
+ model.data.data_prefix=$DATA_PATHS \
+ model.nemo_path=$FINAL_SAVE_FILE \
+ trainer.precision=bf16 \
+ trainer.devices=$NPROC
+
+For large models, the command can be used in a multi-node setting, for example with the `NeMo Framework Launcher `_ using Slurm; a plain ``torchrun`` sketch for two nodes follows.
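+
+The exact launch mechanics depend on your scheduler. The sketch below is illustrative only: the node and GPU counts, the tensor-parallel size, ``NODE_RANK``, ``MASTER_ADDR``, and the port are placeholders to adapt to your cluster, and the remaining variables are the ones defined in the single-node example above.
+
+.. code-block:: bash
+
+    # 2 nodes x 4 GPUs each = 8 processes in total; TP=4 within each node, data parallelism across nodes.
+    # Run once per node (e.g. via srun), with NODE_RANK set to 0 or 1 on the respective node.
+    torchrun --nnodes=2 --nproc_per_node=4 --node_rank=$NODE_RANK \
+        --master_addr=$MASTER_ADDR --master_port=29500 \
+        examples/nlp/language_modeling/megatron_gpt_distillation.py \
+        model.restore_from_path=$STUDENT_CKPT \
+        model.kd_teacher_restore_from_path=$TEACHER_CKPT \
+        model.tensor_model_parallel_size=4 \
+        model.tokenizer.model=$TOKENIZER \
+        model.data.data_prefix=$DATA_PATHS \
+        model.nemo_path=$FINAL_SAVE_FILE \
+        trainer.precision=bf16 \
+        trainer.num_nodes=2 \
+        trainer.devices=4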
+
+
+Limitations
+^^^^^^^^^^^
+* Only Megatron Core-based GPT models are supported
+* Only logit-pair distillation is supported for now
+* Pipeline parallelism not yet supported
+* FSDP strategy not yet supported
diff --git a/docs/source/starthere/intro.rst b/docs/source/starthere/intro.rst
index 6060726d5ba8..435e3e24350e 100644
--- a/docs/source/starthere/intro.rst
+++ b/docs/source/starthere/intro.rst
@@ -102,7 +102,7 @@ This final step involves installing the TensorRT Model Optimizer package.
.. code-block:: bash
- pip install nvidia-modelopt[torch]~=0.13.0 --extra-index-url https://pypi.nvidia.com
+ pip install nvidia-modelopt[torch]~=0.15.0 --extra-index-url https://pypi.nvidia.com
.. code-block:: bash
diff --git a/examples/asr/speech_to_text_eval.py b/examples/asr/speech_to_text_eval.py
index 7b59ffe3fbfc..0024783d2362 100644
--- a/examples/asr/speech_to_text_eval.py
+++ b/examples/asr/speech_to_text_eval.py
@@ -64,7 +64,7 @@
import json
import os
-from dataclasses import dataclass, is_dataclass
+from dataclasses import dataclass, field, is_dataclass
from typing import Optional
import torch
@@ -99,8 +99,13 @@ class EvaluationConfig(transcribe_speech.TranscriptionConfig):
only_score_manifest: bool = False
scores_per_sample: bool = False
- text_processing: Optional[TextProcessingConfig] = TextProcessingConfig(
- punctuation_marks=".,?", separate_punctuation=False, do_lowercase=False, rm_punctuation=False,
+ text_processing: Optional[TextProcessingConfig] = field(
+ default_factory=lambda: TextProcessingConfig(
+ punctuation_marks=".,?",
+ separate_punctuation=False,
+ do_lowercase=False,
+ rm_punctuation=False,
+ )
)
diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py
index d54ee34c18cd..7f5bc0659150 100644
--- a/examples/asr/transcribe_speech.py
+++ b/examples/asr/transcribe_speech.py
@@ -116,7 +116,7 @@
class ModelChangeConfig:
# Sub-config for changes specific to the Conformer Encoder
- conformer: ConformerChangeConfig = ConformerChangeConfig()
+ conformer: ConformerChangeConfig = field(default_factory=ConformerChangeConfig)
@dataclass
@@ -164,14 +164,14 @@ class TranscriptionConfig:
overwrite_transcripts: bool = True
# Decoding strategy for CTC models
- ctc_decoding: CTCDecodingConfig = CTCDecodingConfig()
+ ctc_decoding: CTCDecodingConfig = field(default_factory=CTCDecodingConfig)
# Decoding strategy for RNNT models
# enable CUDA graphs for transcription
- rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1)
+ rnnt_decoding: RNNTDecodingConfig = field(default_factory=lambda: RNNTDecodingConfig(fused_batch_size=-1))
# Decoding strategy for AED models
- multitask_decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig()
+ multitask_decoding: MultiTaskDecodingConfig = field(default_factory=MultiTaskDecodingConfig)
# Prompt slots for prompted models, e.g. Canary-1B. Examples of acceptable prompt inputs:
# Implicit single-turn assuming default role='user' (works with Canary-1B)
# +prompt.source_lang=en +prompt.target_lang=es +prompt.task=asr +prompt.pnc=yes
@@ -187,7 +187,7 @@ class TranscriptionConfig:
att_context_size: Optional[list] = None
# Use this for model-specific changes before transcription
- model_change: ModelChangeConfig = ModelChangeConfig()
+ model_change: ModelChangeConfig = field(default_factory=ModelChangeConfig)
# Config for word / character error rate calculation
calculate_wer: bool = True
diff --git a/examples/llm/megatron_gpt_pretraining.py b/examples/llm/megatron_gpt_pretraining.py
index d3d049e4296e..73e96a23bf81 100644
--- a/examples/llm/megatron_gpt_pretraining.py
+++ b/examples/llm/megatron_gpt_pretraining.py
@@ -92,7 +92,7 @@ def get_args():
callbacks=callbacks,
log_every_n_steps=1,
limit_val_batches=2,
- plugins=nl.MegatronMixedPrecision(precision="bf16-mixed", amp_O2=False),
+ plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
)
nemo_logger = NeMoLogger(
diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index 85609c2dd9b0..95cb1dcf48ec 100755
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -117,6 +117,7 @@ model:
batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595.
num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
+ scale_positional_embedding: False # Apply scaling for RoPE frequencies
## Reset learning rate schedule.
# 1. reset_lr=True, reset_lr_steps=False. When pre-training an existing checkpoint "from scratch" on a different dataset.
diff --git a/examples/nlp/language_modeling/conf/megatron_llama_distill.yaml b/examples/nlp/language_modeling/conf/megatron_llama_distill.yaml
new file mode 100644
index 000000000000..052de2ff7d48
--- /dev/null
+++ b/examples/nlp/language_modeling/conf/megatron_llama_distill.yaml
@@ -0,0 +1,222 @@
+name: megatron_llama_distill
+
+trainer:
+ devices: 1
+ num_nodes: 1
+ accelerator: gpu
+ precision: 32
+ logger: False # logger provided by exp_manager
+ enable_checkpointing: False
+ use_distributed_sampler: False
+ max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
+ max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+ log_every_n_steps: 10
+ val_check_interval: 100
+ limit_val_batches: 50
+ limit_test_batches: 500
+ accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
+ gradient_clip_val: 1.0
+ benchmark: False
+ enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually
+
+exp_manager:
+ explicit_log_dir: null
+ exp_dir: null
+ name: ${name}
+ create_wandb_logger: False
+ wandb_logger_kwargs:
+ project: null
+ name: null
+ resume_if_exists: True
+ resume_ignore_no_checkpoint: True
+ create_checkpoint_callback: True
+ checkpoint_callback_params:
+ monitor: val_loss
+ save_top_k: 10
+ mode: min
+ always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
+ save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
+ filename: "megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}"
+ model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
+
+model:
+ restore_from_path: null # overwrite if starting from pretrained
+ restore_from_ckpt: null # overwrite if resuming training
+ kd_teacher_restore_from_path: null # overwrite always! (path to teacher weights)
+ nemo_path: null # overwrite always! (path to save final model)
+
+ mcore_gpt: True
+ # specify micro_batch_size, global_batch_size, and model parallelism
+ # gradient accumulation will be done automatically based on data_parallel_size
+ micro_batch_size: 1 # limited by GPU memory
+ global_batch_size: 8 # will use more micro batches to reach global batch size
+ tensor_model_parallel_size: 1 # intra-layer model parallelism
+ pipeline_model_parallel_size: 1 # inter-layer model parallelism
+ virtual_pipeline_model_parallel_size: null # interleaved pipeline
+
+ # model architecture
+ encoder_seq_length: 4096
+ max_position_embeddings: ${.encoder_seq_length}
+ num_layers: 8 # 7b: 32 | 13b: 40 | 70b: 80
+ hidden_size: 4096 # 7b: 4096 | 13b: 5120 | 70b: 8192
+ ffn_hidden_size: 11008 # Transformer FFN hidden size. Usually 4 * hidden_size. | 7b: 11008 | 13b: 13824 | 70b: 28672
+ num_attention_heads: 32 # 7b: 32 | 13b: 40 | 70b: 64
+ init_method_std: 0.02 # Standard deviation of the zero-mean normal distribution used for weight initialization.
+ use_scaled_init_method: True # use scaled residuals initialization
+ hidden_dropout: 0.0 # Dropout probability for hidden state transformer.
+ attention_dropout: 0.0 # Dropout probability for attention
+ ffn_dropout: 0.0 # Dropout probability in the feed-forward layer.
+ kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
+ apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
+ normalization: "rmsnorm" # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
+ layernorm_epsilon: 1e-5
+ do_layer_norm_weight_decay: False # True means weight decay on all params
+ make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
+ pre_process: True # add embedding
+ post_process: True # add pooler
+ persist_layer_norm: True # Use of persistent fused layer norm kernel.
+ bias: False # Whether to use bias terms in all weight matrices.
+ activation: "fast-swiglu" # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu']
+ headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
+ transformer_block_type: "pre_ln" # Options ['pre_ln', 'post_ln', 'normformer']
+ openai_gelu: False # Use OpenAI's GELU instead of the default GeLU
+ normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
+ position_embedding_type: "rope" # Position embedding type. Options ['learned_absolute', 'rope']
+ rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this.
+ attention_type: "multihead" # Attention type. Options ['multihead']
+ share_embeddings_and_output_weights: False # Share embedding and output layer weights.
+ overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+ batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+ num_query_groups: 32 # Number of query groups for group query attention. If None, normal attention is used. | 7b: 32 | 13b: 40 | 70b: 8
+
+ tokenizer:
+ library: "sentencepiece"
+ type: null
+ model: ??? # /path/to/tokenizer.model
+ vocab_file: null
+ merge_file: null
+ delimiter: null # only used for tabular tokenizer
+ sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers.
+
+ # Mixed precision
+ native_amp_init_scale: 4294967296 # 2 ** 32
+ native_amp_growth_interval: 1000
+ hysteresis: 2 # Gradient scale hysteresis
+ fp32_residual_connection: False # Move residual connections to fp32
+ fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16
+
+ # Megatron O2-style half-precision
+ megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
+ grad_allreduce_chunk_size_mb: 125
+
+ # Fusion
+ grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.
+ gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2.
+ bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
+ bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
+ masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
+ get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+ apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope
+
+ # Miscellaneous
+ seed: 1234
+ resume_from_checkpoint: null # manually set the checkpoint file to load from
+ use_cpu_initialization: False # Init weights on the CPU (slow for large models)
+ onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
+ apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
+ gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
+ sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages
+
+ ## Activation Checkpointing
+ # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed.
+ # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+).
+ # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+ # 'full' will checkpoint the entire transformer layer.
+ activations_checkpoint_granularity: null # 'selective' or 'full'
+ activations_checkpoint_method: null # 'uniform', 'block'
+ # 'uniform' divides the total number of transformer layers and checkpoints the input activation
+ # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model.
+ # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+ activations_checkpoint_num_layers: null
+ # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory.
+ # when using 'block' this will checkpoint the first activations_checkpoint_num_layers per pipeline stage.
+ num_micro_batches_with_partial_activation_checkpoints: null
+ # This feature is valid only when used with pipeline-model-parallelism.
+ # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed
+ # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is
+ # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint
+ # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'.
+ # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage.
+ activations_checkpoint_layers_per_pipeline: null
+ # This feature is valid only when used with pipeline-model-parallelism.
+ # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later
+ # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 checkpoint 3 layers fewer than
+ # stage 0 and stage 2 checkpoint 6 layers fewer than stage 0, and so on. This is possible because a later pipeline stage
+ # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints',
+ # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path.
+
+ ## Sequence Parallelism
+ # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
+ # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+ sequence_parallel: False
+
+ ## Transformer Engine
+ transformer_engine: True
+ fp8: False # enables fp8 in TransformerLayer forward
+ fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
+ fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID
+ fp8_margin: 0 # scaling margin
+ fp8_interval: 1 # scaling update interval
+ fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
+ fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
+ reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration
+ use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False.
+
+ data:
+ # Path to data must be specified by the user.
+ # Supports List, String and Dictionary
+ # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]",
+ # Or see example below:
+ # data_prefix:
+ # - .5
+ # - /raid/data/pile/my-gpt3_00_text_document
+ # - .5
+ # - /raid/data/pile/my-gpt3_01_text_document
+ # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test}
+ # Or see example below:
+ # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}"
+ data_prefix: ???
+ index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
+ data_impl: mmap
+ splits_string: 98,1,1
+ seq_length: 4096
+ skip_warmup: True
+ num_workers: 2
+ dataloader_type: single # cyclic
+ reset_position_ids: False # Reset position ids after end-of-document token
+ reset_attention_mask: False # Reset attention mask after end-of-document token
+ eod_mask_loss: False # Mask loss for the end of document tokens
+ validation_drop_last: True # Set to False if the last partial validation batch is to be consumed
+ no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token
+ pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size
+ shuffle_documents: True # Set to False to disable document shuffling. The sample index will still be shuffled
+
+ # Nsys profiling options
+ nsys_profile:
+ enabled: False
+ start_step: 10 # Global batch to start profiling
+ end_step: 10 # Global batch to end profiling
+ ranks: [0] # Global rank IDs to profile
+ gen_shape: False # Generate model and kernel details including input shapes
+
+ optim:
+ name: fused_adam
+ lr: 1e-4
+ weight_decay: 0.1
+ betas:
+ - 0.9
+ - 0.95
+ sched:
+ name: CosineAnnealing
+ warmup_steps: 50
+ min_lr: 1e-5
diff --git a/examples/nlp/language_modeling/megatron_gpt_distillation.py b/examples/nlp/language_modeling/megatron_gpt_distillation.py
new file mode 100644
index 000000000000..b3ecdcfc5522
--- /dev/null
+++ b/examples/nlp/language_modeling/megatron_gpt_distillation.py
@@ -0,0 +1,576 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+from pathlib import Path
+from typing import Any, Dict
+
+import modelopt.torch.distill as mtd
+import modelopt.torch.opt as mto
+import torch.multiprocessing as mp
+from omegaconf import DictConfig, OmegaConf, open_dict
+from pytorch_lightning.trainer.trainer import Trainer
+
+try:
+ from megatron.core import parallel_state, tensor_parallel
+ from megatron.core.models.gpt import GPTModel as MCoreGPTModel
+ from megatron.core.transformer.module import Float16Module as MCoreFloat16Module
+except ImportError:
+ raise AssertionError("ModelOpt only supports Megatron-Core.")
+import types
+from abc import ABCMeta
+from importlib.metadata import version
+from typing import Tuple
+
+import torch
+import torch.nn.functional as F
+from megatron.core.dist_checkpointing.mapping import ShardedStateDict
+from megatron.core.parallel_state import get_tensor_model_parallel_group
+from megatron.core.transformer import TransformerConfig
+from pkg_resources import packaging
+from torch import Tensor
+from torch.nn.modules.loss import _Loss
+
+from nemo.collections.common.parts.utils import extend_instance
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import (
+ EmbeddingScalingMixin,
+ MegatronGPTModel,
+ get_specs,
+)
+from nemo.collections.nlp.modules.common.megatron.module import Float16Module
+from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group
+from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder
+from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+from nemo.utils.exp_manager import exp_manager
+from nemo.utils.model_utils import load_config, unwrap_model
+
+mp.set_start_method("spawn", force=True)
+
+# Model config fields which affect the structure of the model.
+# These will be the values taken from the teacher's config file,
+# while the rest remain the same as student's.
+MODEL_ARCHITECHTURE_KEYS = [
+ "encoder_seq_length",
+ "max_position_embeddings",
+ "num_layers",
+ "hidden_size",
+ "ffn_hidden_size",
+ "num_attention_heads",
+ "init_method_std",
+ "use_scaled_init_method",
+ "hidden_dropout",
+ "attention_dropout",
+ "ffn_dropout",
+ "kv_channels",
+ "apply_query_key_layer_scaling",
+ "normalization",
+ "layernorm_epsilon",
+ "do_layer_norm_weight_decay",
+ "make_vocab_size_divisible_by",
+ "pre_process",
+ "post_process",
+ "persist_layer_norm",
+ "bias",
+ "activation",
+ "headscale",
+ "transformer_block_type",
+ "openai_gelu",
+ "normalize_attention_scores",
+ "position_embedding_type",
+ "rotary_percentage",
+ "attention_type",
+ "share_embeddings_and_output_weights",
+ "overlap_p2p_comm",
+ "batch_p2p_comm",
+ "num_query_groups",
+ "seq_len_interpolation_factor",
+ "rotary_base",
+ "scale_positional_embedding",
+]
+
+
+class DistillationMegatronGPTModel(MegatronGPTModel):
+ """ModelOpt Distillation-enabled subclass of `MegatronGPTModel`."""
+
+ def __init__(self, cfg: DictConfig, trainer: Trainer):
+ """
+ Constructor.
+
+ Args:
+ cfg: Model configuration.
+ trainer: NeMo trainer instance.
+ """
+ logging.info("Distillation: Enabled.")
+ assert cfg.kd_teacher_restore_from_path is not None, "Path to teacher weights must be provided."
+ assert cfg.pipeline_model_parallel_size == 1, "Distillation mode does not yet support Pipeline Parallel."
+
+ super().__init__(cfg, trainer)
+
+ logging.info("\n\n************** Final model configuration ***********")
+ logging.info(f'\n{OmegaConf.to_yaml(cfg)}')
+
+ def model_provider_func(self, pre_process, post_process):
+ """Model depends on pipeline paralellism."""
+ if not self.mcore_gpt:
+ raise AssertionError("ModelOpt Distillation only supports MCore model edition.")
+
+ model = MCoreGPTModel(
+ config=self.transformer_config,
+ transformer_layer_spec=get_specs(
+ self.spec_name,
+ self.transformer_config,
+ self.transformer_engine,
+ self.cfg.get('hyena', None),
+ ),
+ vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size),
+ max_sequence_length=self.cfg.get('encoder_seq_length', 512),
+ pre_process=pre_process,
+ post_process=post_process,
+ parallel_output=True,
+ share_embeddings_and_output_weights=self.cfg.get('share_embeddings_and_output_weights', True),
+ position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'),
+ rotary_percent=self.cfg.get('rotary_percentage', 1.0),
+ seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None),
+ rotary_base=self.cfg.get('rotary_base', 10000),
+ )
+ if self.cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage():
+ extend_instance(model.embedding, EmbeddingScalingMixin)
+
+ # [ModelOpt] Distillation mode.
+ distill_cfg = load_distillation_config(self.transformer_config)
+ # Initialize DistillationModel.
+ kd_config = {
+ "teacher_model": (_teacher_provider, [self.cfg, copy.deepcopy(self.trainer)], {}),
+ "criterion": distill_cfg["criterion"],
+ "loss_balancer": distill_cfg["loss_balancer"],
+ }
+ model = mtd.convert(model, mode=[("kd_loss", kd_config)])
+
+ # Additional tweaks needed for MCore/Nemo.
+ adjust_distillation_model_for_mcore(model, distill_cfg)
+
+ return model
+
+ def get_forward_output_and_loss_func(self, validation_step=False, tuning=False):
+ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None):
+
+ # Get data batch
+ batch = self.get_batch(dataloader_iter, tuning)
+
+ # Transfer needed data to GPU
+ required_keys = set()
+ max_seqlen = batch['max_seqlen'].squeeze() if 'max_seqlen' in batch else None
+ cu_seqlens_argmin = batch['cu_seqlens_argmin'] if 'cu_seqlens_argmin' in batch else None
+ if parallel_state.get_pipeline_model_parallel_world_size() == 1:
+ required_keys.update(batch.keys())
+ else:
+ required_keys.add('attention_mask')
+ if 'cu_seqlens' in batch:
+ required_keys.add('cu_seqlens')
+ if parallel_state.is_pipeline_first_stage():
+ required_keys.update(('tokens', 'position_ids'))
+ if parallel_state.is_pipeline_last_stage():
+ required_keys.update(('labels', 'loss_mask'))
+ if self.get_attention_mask_from_fusion and 'attention_mask' in required_keys:
+ required_keys.remove('attention_mask')
+ batch = {
+ key: val.cuda(non_blocking=True) if key in required_keys and isinstance(val, torch.Tensor) else None
+ for key, val in batch.items()
+ }
+
+ # slice batch along sequence dimension for context parallelism
+ batch = self.get_batch_on_this_context_parallel_rank(batch)
+
+ # Model forward pass
+ forward_args = {
+ 'input_ids': batch['tokens'],
+ 'position_ids': batch['position_ids'],
+ 'attention_mask': None if self.get_attention_mask_from_fusion else batch['attention_mask'],
+ 'labels': batch['labels'] if 'labels' in batch else None,
+ 'loss_mask': batch['loss_mask'],
+ }
+
+ # TODO: @eharper can we add this to mcore?
+ forward_args.pop('loss_mask')
+
+ if 'cu_seqlens' in batch: # packed sequence from GPTSFTPackedDataset
+ # these args are passed eventually into TEDotProductAttention.forward()
+ cu_seqlens = batch['cu_seqlens'].squeeze() # remove batch size dimension (mbs=1)
+ # remove -1 "paddings" added in collate_fn
+ if cu_seqlens_argmin is not None:
+ cu_seqlens = cu_seqlens[: cu_seqlens_argmin.item()]
+ else:
+ cu_seqlens = cu_seqlens[: torch.argmin(cu_seqlens)]
+
+ try:
+ from megatron.core.packed_seq_params import PackedSeqParams
+ except (ImportError, ModuleNotFoundError) as e:
+ mcore_version = packaging.version.Version(version('megatron-core'))
+ logging.error(
+ f"megatron-core v{mcore_version} does not support training with packed sequence. "
+ "Please use megatron-core >= 0.5.0, or set model.data.train_ds.packed_sequence=False"
+ )
+ raise e
+
+ forward_args['packed_seq_params'] = PackedSeqParams(
+ cu_seqlens_q=cu_seqlens,
+ cu_seqlens_kv=cu_seqlens,
+ max_seqlen_q=max_seqlen,
+ max_seqlen_kv=max_seqlen,
+ qkv_format='thd',
+ )
+
+ output_tensor = model(**forward_args)
+
+ def loss_func(output_tensor):
+ if validation_step:
+ loss_for_ub = self.loss_func(
+ batch['loss_mask'], batch['num_valid_tokens_in_ub'], output_tensor, validation_step=True
+ )
+ else:
+ # [ModelOpt] KD Loss for a micro-batch (ub)
+ unwrapped_model = unwrap_model(model, (Float16Module, MCoreFloat16Module))
+ loss_for_ub = unwrapped_model.compute_kd_loss(
+ loss_reduction_fn=lambda x: self.loss_func(
+ batch['loss_mask'], batch['num_valid_tokens_in_ub'], x
+ )
+ )
+ cp_size = parallel_state.get_context_parallel_world_size()
+ if validation_step and not self.validation_drop_last:
+ num_valid_tokens_in_ub = batch['num_valid_tokens_in_ub']
+ if loss_for_ub.isnan():
+ assert batch['loss_mask'].count_nonzero() == 0, 'Got NaN loss with non-empty input'
+ loss_sum_for_ub = torch.zeros_like(loss_for_ub)
+ num_valid_tokens_in_ub = 0
+ else:
+ if self.sample_weight == 'constant':
+ num_valid_tokens_in_ub = 1
+ loss_sum_for_ub = num_valid_tokens_in_ub * loss_for_ub
+
+ loss_sum_and_ub_size_all_gpu = torch.cat(
+ [
+ loss_sum_for_ub.clone().detach().view(1),
+ torch.tensor([num_valid_tokens_in_ub]).cuda().clone().detach(),
+ ]
+ )
+ # Could potentially reduce num_valid_samples_in_microbatch and use that to aggregate instead of len(self._validation_ds)
+ torch.distributed.all_reduce(
+ loss_sum_and_ub_size_all_gpu, group=parallel_state.get_data_parallel_group()
+ )
+ return loss_for_ub * cp_size, {'loss_sum_and_ub_size': loss_sum_and_ub_size_all_gpu}
+ else:
+ reduced_loss = average_losses_across_data_parallel_group([loss_for_ub])
+ return loss_for_ub * cp_size, {'avg': reduced_loss}
+
+ return output_tensor, loss_func
+
+ return fwd_output_and_loss_func
+
+ def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor, validation_step=False):
+ loss = super().loss_func(loss_mask, num_valid_tokens_in_ub, output_tensor)
+ if not validation_step and self.cfg.tensor_model_parallel_size > 1:
+ # [ModelOpt] KD loss requires extra all-reduce to ensure same values across MP-TP partitions.
+ loss = torch.sum(tensor_parallel.gather_from_tensor_model_parallel_region(loss.reshape(1)))
+ return loss
+
+ def configure_optimizers(self):
+ with self.model.hide_teacher_model():
+ return super().configure_optimizers()
+
+
+########################################################
+
+
+def load_distillation_config(student_cfg: TransformerConfig) -> Dict[str, Any]:
+ """Create a default distillation config for MCore GPT Models.
+
+ Args:
+ student_cfg: Model config for student model.
+ """
+ logit_pair = ("output_layer", "output_layer") # logit module names for MCoreGPTModel
+ tp_enabled = student_cfg.tensor_model_parallel_size > 1
+
+ cfg = {
+ "criterion": {tuple(logit_pair): LogitsKLLoss(tensor_parallel=tp_enabled)},
+ "loss_balancer": None,
+ "skip_lm_loss": True,
+ }
+ return cfg
+
+
+class BaseLoss(_Loss, metaclass=ABCMeta):
+ """Abstract base class for Megatron distillation losses."""
+
+ def __init__(self, tensor_parallel: bool = False):
+ """Constructor.
+
+ Args:
+ tensor_parallel: Whether tensor parallelism is enabled or not.
+ """
+ super().__init__()
+ self._tensor_parallel = tensor_parallel
+
+ def pre_forward(self, predictions: Tensor, targets: Tensor) -> Tuple[Tensor, Tensor]:
+ """Performs projection of student tensor to match teacher's size if necessary."""
+ if isinstance(predictions, tuple):
+ # `ColumnParallelLinear` returns bias too
+ predictions, targets = predictions[0], targets[0]
+
+ targets = targets.detach()
+
+ return predictions, targets
+
+ def post_forward(self, loss: Tensor) -> Tensor:
+ """Reshapes tensor from [s, b] to [b, s] for upcoming loss masking."""
+ loss = loss.transpose(0, 1).contiguous()
+ return loss
+
+
+class LogitsKLLoss(BaseLoss):
+ """Calculates KL-Divergence loss between two logits tensors without reducing the sequence dim."""
+
+ def __init__(
+ self,
+ tensor_parallel: bool = False,
+ temperature: float = 1.0,
+ reverse: bool = False,
+ ):
+ """
+ Constructor.
+
+ Args:
+ tensor_parallel: Whether tensor parallelism is enabled or not.
+ temperature: Divide tensors by this value prior to calculating loss.
+ reverse: Whether to reverse the loss as KLD(teacher, student) instead of KLD(student, teacher)
+ """
+ super().__init__(tensor_parallel)
+ self._temperature = temperature
+ self._reverse = reverse
+
+ def forward(self, predictions: Tensor, targets: Tensor) -> Tensor:
+ """
+ Forward function.
+
+ Args:
+ predictions: Student model tensors (size [s, b, h])
+ targets: Teacher model tensors (size [s, b, h])
+
+ Returns:
+ KLD loss of tensors (size [b, s])
+ """
+ predictions, targets = self.pre_forward(predictions, targets)
+
+ # Division by temp should happen prior to finding max for both student and teacher.
+ # Currently we don't use temperature in any of our runs (temp=1.0)
+ output_teacher = targets.float() / self._temperature
+ output_student = predictions.float() / self._temperature
+
+ # Compute the local softmax, then reweight it to obtain the global softmax.
+ if self._tensor_parallel:
+
+ # Maximum value along vocab dimension across all GPUs.
+ teacher_logits_max, _ = torch.max(output_teacher, dim=-1)
+ torch.distributed.all_reduce(
+ teacher_logits_max,
+ op=torch.distributed.ReduceOp.MAX,
+ group=get_tensor_model_parallel_group(),
+ )
+ output_teacher = output_teacher - teacher_logits_max.unsqueeze(dim=-1)
+
+ denom_teacher = torch.sum(torch.exp(output_teacher), dim=-1)
+ # We can't use `gather_from_tensor_model_parallel_region` here since it discards
+ # gradients from other ranks - we need to all_reduce the gradients as well.
+ denom_teacher = all_reduce_autograd(denom_teacher, group=get_tensor_model_parallel_group())
+
+ # Maximum value along vocab dimension across all GPUs.
+ student_logits_max, _ = torch.max(output_student, dim=-1)
+ torch.distributed.all_reduce(
+ student_logits_max,
+ op=torch.distributed.ReduceOp.MAX,
+ group=get_tensor_model_parallel_group(),
+ )
+ output_student = output_student - student_logits_max.unsqueeze(dim=-1).detach()
+
+ denom_student = torch.sum(torch.exp(output_student), dim=-1)
+ denom_student = all_reduce_autograd(denom_student, group=get_tensor_model_parallel_group())
+
+ slen, bsz, sharded_vocab_size = output_student.shape
+ student_log_prob = output_student - torch.log(denom_student).view(slen, bsz, 1).expand(
+ slen, bsz, sharded_vocab_size
+ )
+ teacher_log_prob = output_teacher - torch.log(denom_teacher).view(slen, bsz, 1).expand(
+ slen, bsz, sharded_vocab_size
+ )
+
+ if self._reverse:
+ loss = torch.sum(
+ F.kl_div(teacher_log_prob, student_log_prob, reduction="none", log_target=True),
+ dim=-1,
+ )
+ else:
+ loss = torch.sum(
+ F.kl_div(student_log_prob, teacher_log_prob, reduction="none", log_target=True),
+ dim=-1,
+ )
+
+ else:
+ if self._reverse:
+ loss = torch.sum(
+ F.kl_div(
+ F.log_softmax(output_teacher, dim=-1), F.softmax(output_student, dim=-1), reduction="none"
+ ),
+ dim=-1,
+ )
+ else:
+ loss = torch.sum(
+ F.kl_div(
+ F.log_softmax(output_student, dim=-1), F.softmax(output_teacher, dim=-1), reduction="none"
+ ),
+ dim=-1,
+ )
+
+ return self.post_forward(loss)
+
+
+class _AllReduce(torch.autograd.Function):
+ """Implementation from old PyTorch `torch.distributed.nn.parallel`."""
+
+ @staticmethod
+ def forward(ctx, op, group, tensor):
+ ctx.group, ctx.op = group, op
+ tensor = tensor.clone()
+ torch.distributed.all_reduce(tensor, op=op, group=group)
+ return tensor
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ return (None, None, _AllReduce.apply(ctx.op, ctx.group, grad_output))
+
+
+def all_reduce_autograd(tensor, op=torch.distributed.ReduceOp.SUM, group=torch.distributed.group.WORLD):
+ return _AllReduce.apply(op, group, tensor)
+
+
+def adjust_distillation_model_for_mcore(model: mtd.DistillationModel, distill_cfg: Dict[str, Any]):
+ """Extra modifcations to ``mtd.DistillationModel`` requried for Megatron-Core."""
+
+ # HACK: Get rid of ModelOpt Distillation state
+ mto.ModeloptStateManager(model)._state.pop()
+
+ # HACK: Hide teacher during `sharded_state_dict` method.
+ def _sharded_state_dict(self, *args, **kwargs) -> ShardedStateDict:
+ with self.hide_teacher_model():
+ return self._sharded_state_dict(*args, **kwargs)
+
+ model._sharded_state_dict = model.sharded_state_dict
+ model.sharded_state_dict = types.MethodType(_sharded_state_dict, model)
+
+ # HACK: Skip `lm_loss` computation during training when it is not needed for backprop.
+ def _compute_language_model_loss(self, labels, logits) -> Tensor:
+ if self.training:
+ return torch.zeros_like(labels)
+ return self._compute_language_model_loss(labels, logits)
+
+ if distill_cfg["skip_lm_loss"]:
+ model._compute_language_model_loss = model.compute_language_model_loss
+ model.compute_language_model_loss = types.MethodType(_compute_language_model_loss, model)
+
+
+########################################################
+
+
+def _teacher_provider(cfg: DictConfig, trainer: Trainer) -> MCoreGPTModel:
+ """Teacher model factory (must be a non-local function to pickle)."""
+ logging.info("Distillation: Loading teacher weights...")
+ teacher_model_cfg = _merge_model_arch_fields(cfg, cfg.kd_teacher_restore_from_path)
+
+ model = MegatronGPTModel.restore_from(
+ cfg.kd_teacher_restore_from_path,
+ override_config_path=teacher_model_cfg,
+ trainer=trainer,
+ )
+ teacher_model_module_list = model.get_model_module_list()
+ logging.info("Distillation: ... teacher weights loaded.")
+ return teacher_model_module_list[0]
+
+
+def _merge_model_arch_fields(cfg: DictConfig, model_load_path: str) -> DictConfig:
+ """Overwrite model-architecture fields of a config with a checkpoint's."""
+ model_cfg = load_config(model_load_path)
+ model_arch_keys = [k for k in MODEL_ARCHITECHTURE_KEYS if k in model_cfg]
+ model_arch_cfg = OmegaConf.masked_copy(model_cfg, model_arch_keys)
+ with open_dict(cfg):
+ cfg = OmegaConf.merge(cfg, model_arch_cfg)
+ # Add tokenizer from model if not provided
+ if OmegaConf.is_missing(cfg.tokenizer, "model"):
+ cfg.tokenizer = model_cfg.tokenizer
+ return cfg
+
+
+########################################################
+
+
+@hydra_runner(config_path="conf", config_name="megatron_llama_distill")
+def main(cfg) -> None:
+ logging.info("\n\n************** Experiment configuration ***********")
+ logging.info(f'\n{OmegaConf.to_yaml(cfg)}')
+
+ trainer = MegatronTrainerBuilder(cfg).create_trainer()
+ exp_manager(trainer, cfg.exp_manager)
+
+ with open_dict(cfg):
+ cfg.model.name = "modelopt" # Convert TE layernorm spec to unfused format
+ # HACK: Checkpoint-loading process hangs/loops if this isn't present here for some reason.
+ cfg.model.target = "nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel"
+
+ # Continual training
+ if cfg.model.get("restore_from_path") is not None:
+ # Option 1: Restore only the model weights from a .nemo file
+ logging.info(f"Continual training: loading weights from {cfg.model.restore_from_path}")
+
+ # Merge model config's architecture fields with the one from the checkpoint
+ cfg.model = _merge_model_arch_fields(cfg.model, cfg.model.restore_from_path)
+
+ model = DistillationMegatronGPTModel.restore_from(
+ restore_path=cfg.model.restore_from_path,
+ override_config_path=cfg.model,
+ trainer=trainer,
+ save_restore_connector=NLPSaveRestoreConnector(),
+ )
+ logging.info("... weights loaded.")
+ elif cfg.model.get("restore_from_ckpt") is not None:
+ # Option 2: Restore both model weights and optimizer states from a PTL checkpoint
+ logging.info(f"Continual training: loading weights and optimizer states from {cfg.model.restore_from_ckpt}")
+ trainer.ckpt_path = Path(cfg.model.restore_from_ckpt)
+ model = DistillationMegatronGPTModel(cfg.model, trainer)
+ logging.info("... weights and optimizer states loaded.")
+
+ # Start new pretraining or resume from a checkpoint if it exists
+ else:
+ logging.info("Instantiating new model ...")
+ model = DistillationMegatronGPTModel(cfg.model, trainer)
+ logging.info("... model instantiated.")
+
+ trainer.fit(model)
+
+ if cfg.model.nemo_path:
+ model.save_to(cfg.model.nemo_path)
+ else:
+ logging.warning("Skipping saving final model as no `model.nemo_path` provided.")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/nemo/collections/asr/data/data_simulation.py b/nemo/collections/asr/data/data_simulation.py
index 5ee2ad19b951..369a95d9ee9a 100644
--- a/nemo/collections/asr/data/data_simulation.py
+++ b/nemo/collections/asr/data/data_simulation.py
@@ -1122,7 +1122,7 @@ def _generate_session(
alignments=self._alignments,
session_name=filename,
speaker_id=speaker_ids[speaker_turn],
- start=int(start / self._params.data_simulator.sr),
+ start=float(start / self._params.data_simulator.sr),
)
self.annotator.annote_lists['ctm'].extend(new_ctm_entries)
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index 55663a04a5a0..c4c88091cb28 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -30,7 +30,7 @@
from lhotse.dataset.dataloading import resolve_seed
from lhotse.lazy import LazyIteratorChain, LazyJsonlIterator
from lhotse.serialization import open_best
-from lhotse.utils import compute_num_samples
+from lhotse.utils import compute_num_samples, ifnone
from nemo.collections.common.parts.preprocessing.manifest import get_full_path
@@ -332,14 +332,16 @@ def __iter__(self) -> Generator[Cut, None, None]:
# Handle NeMo tarred manifests with offsets.
# They have multiple JSONL entries where audio paths end with '-sub1', '-sub2', etc. for each offset.
- offset_pattern = re.compile(r'^.+(-sub\d+)$')
+ offset_pattern = re.compile(r'^(?P<stem>.+)(?P<sub>-sub\d+)(?P<ext>\.\w+)?$')
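+ # e.g. both "audio-sub1.flac" and "audio-sub2.flac" map back to the same base name "audio.flac" via `basename` below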
for sid in shard_ids:
manifest_path = self.paths[sid] if len(self.paths) > 1 else self.paths[0]
def basename(d: dict) -> str:
return (
- k[: -len(m.group(1))] if (m := offset_pattern.match(k := d["audio_filepath"])) is not None else k
+ m.group("stem") + ifnone(m.group("ext"), "")
+ if (m := offset_pattern.match(k := d["audio_filepath"])) is not None
+ else k
)
shard_manifest: dict[str, list[dict]] = groupby(basename, self.shard_id_to_manifest[sid])
diff --git a/nemo/collections/common/parts/utils.py b/nemo/collections/common/parts/utils.py
index e8eb1b999292..75783815548a 100644
--- a/nemo/collections/common/parts/utils.py
+++ b/nemo/collections/common/parts/utils.py
@@ -12,10 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import logging
import math
import os
from typing import Iterable, List
+logger = logging.getLogger(__name__)
+
import einops
import torch
import torch.nn as nn
@@ -109,6 +112,31 @@ def extend_instance(obj, mixin):
) # mixin needs to go first for our forward() logic to work
+def apply_rope_scaling(freqs):
+ # Apply scaling for RoPE frequencies
+ logger.info("apply rope scaling ...")
+ # Values obtained from grid search
+ scale_factor = 8
+ low_freq_factor = 1
+ high_freq_factor = 4
+ old_context_len = 8192 # original llama3 length
+
+ low_freq_wavelen = old_context_len / low_freq_factor
+ high_freq_wavelen = old_context_len / high_freq_factor
+ new_freqs = []
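+ # Three regimes: keep high-frequency components unchanged, divide low-frequency components
+ # by `scale_factor`, and smoothly interpolate between the two in the transition band.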
+ for freq in freqs:
+ wavelen = 2 * math.pi / freq
+ if wavelen < high_freq_wavelen:
+ new_freqs.append(freq)
+ elif wavelen > low_freq_wavelen:
+ new_freqs.append(freq / scale_factor)
+ else:
+ assert low_freq_wavelen != high_freq_wavelen
+ smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+ new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
+ return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)
+
+
def mask_sequence_tensor(tensor: torch.Tensor, lengths: torch.Tensor):
"""
For tensors containing sequences, zero out out-of-bound elements given lengths of every element in the batch.
diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py
index 6622054c4f98..b2d9b5ba8cca 100644
--- a/nemo/collections/llm/gpt/data/pre_training.py
+++ b/nemo/collections/llm/gpt/data/pre_training.py
@@ -190,6 +190,7 @@ def test_dataloader(self) -> EVAL_DATALOADERS:
def _create_dataloader(self, dataset, mode, **kwargs) -> WrappedDataLoader:
self.init_global_step = self.trainer.global_step
+ self.data_sampler.init_global_step = self.init_global_step
dataloader = WrappedDataLoader(
mode=mode,
dataset=dataset,
diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py
index 913144d1bf5f..71b60d5df59f 100644
--- a/nemo/collections/llm/peft/lora.py
+++ b/nemo/collections/llm/peft/lora.py
@@ -17,12 +17,26 @@ class to provide a specific implementation of the forward method.
"""
def forward(self, x):
- linear_output, bias = self.to_wrap(x)
- if isinstance(linear_output, tuple) and len(linear_output) == 2:
- linear_output, layernorm_output = linear_output
- adapter_output = self.adapter(layernorm_output)
- else:
- adapter_output = self.adapter(x)
+ linear_output = self.to_wrap(x)
+ assert isinstance(
+ linear_output, tuple
+ ), f"{self.to_wrap} should return a tuple but instead returns {linear_output}"
+ """ Four cases for the wrapped module's return values
+ 1. nothing: (out, None)
+ 2. return_bias: (out, bias)
+ 3. return_layernorm_output: ((out, ln_out), None)
+ 4. both: (out, bias, ln_out)
+ """
+ if len(linear_output) == 2:
+ linear_output, bias = linear_output
+ if isinstance(linear_output, tuple) and len(linear_output) == 2:
+ linear_output, layernorm_output = linear_output
+ x = layernorm_output
+ elif len(linear_output) == 3:
+ linear_output, bias, layernorm_output = linear_output
+ x = layernorm_output
+
+ adapter_output = self.adapter(x)
return linear_output + adapter_output, bias
@@ -72,6 +86,8 @@ class LoRA(PEFT):
alpha: int = 32
dropout: float = 0.0
dropout_position: Literal['pre', 'post'] = 'post'
+ lora_A_init_method: str = "xavier"
+ lora_B_init_method: str = "zero"
def transform(self, m: nn.Module, name=None, prefix=None):
"""
@@ -96,6 +112,11 @@ def transform(self, m: nn.Module, name=None, prefix=None):
input_is_parallel = False
in_features = m.in_features
out_features = m.out_features * tp_size
+ # LoRA is applied after layernorm, so layernorm output must be returned
+ m.return_layernorm_output = True
+ # perf optimization for LoRA + SP
+ if m.config.sequence_parallel and not m.ub_overlap_ag:
+ m.return_layernorm_output_gathered = True
else: # name in ['linear_proj', 'linear_fc2']
# Row Parallel Linear
input_is_parallel = True
@@ -110,8 +131,8 @@ def transform(self, m: nn.Module, name=None, prefix=None):
activation='identity',
norm_position=None,
norm_type=None,
- column_init_method="normal",
- row_init_method="zero",
+ column_init_method=self.lora_A_init_method,
+ row_init_method=self.lora_B_init_method,
gather_output=False,
input_is_parallel=input_is_parallel,
dropout=self.dropout,
diff --git a/nemo/collections/llm/tokenizer.py b/nemo/collections/llm/tokenizer.py
index 3943e24ba799..808642758ebe 100644
--- a/nemo/collections/llm/tokenizer.py
+++ b/nemo/collections/llm/tokenizer.py
@@ -9,8 +9,8 @@
track_io(
AutoTokenizer,
artifacts=[
- FileArtifact("vocab_file"),
- FileArtifact("merges_file"),
+ FileArtifact("vocab_file", required=False),
+ FileArtifact("merges_file", required=False),
],
)
__all__.append("AutoTokenizer")
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 22ee37ec361b..0198c07c82d2 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -30,7 +30,7 @@
from pytorch_lightning.loops.fetchers import _DataFetcherWrapper
from pytorch_lightning.trainer.trainer import Trainer
-from nemo.collections.common.parts.utils import extend_instance
+from nemo.collections.common.parts.utils import apply_rope_scaling, extend_instance
from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import (
MegatronCorePretrainingSampler,
MegatronPretrainingRandomSampler,
@@ -77,6 +77,7 @@
from nemo.utils.te_utils import is_float8tensor
try:
+ import megatron.core as core
from megatron.core import InferenceParams, parallel_state, tensor_parallel
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset
@@ -429,6 +430,7 @@ def get_inference_config(self):
def model_provider_func(self, pre_process, post_process):
"""Model depends on pipeline paralellism."""
if self.mcore_gpt:
+
model = MCoreGPTModel(
config=self.transformer_config,
transformer_layer_spec=get_specs(
@@ -448,6 +450,10 @@ def model_provider_func(self, pre_process, post_process):
seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None),
rotary_base=self.cfg.get('rotary_base', 10000),
)
+
+ if self.cfg.get('scale_positional_embedding', False):
+ model.rotary_pos_emb.inv_freq = apply_rope_scaling(model.rotary_pos_emb.inv_freq)
+
if self.cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage():
extend_instance(model.embedding, EmbeddingScalingMixin)
else:
diff --git a/nemo/export/multimodal/build.py b/nemo/export/multimodal/build.py
index 03afec176325..8ee3fa1c05e7 100644
--- a/nemo/export/multimodal/build.py
+++ b/nemo/export/multimodal/build.py
@@ -208,7 +208,10 @@ def forward(self, images):
return vision_x
encoder = AutoModel.from_pretrained(
- vision_config["from_pretrained"], torch_dtype=torch.bfloat16, trust_remote_code=True
+ vision_config["from_pretrained"],
+ torch_dtype=torch.bfloat16,
+ trust_remote_code=True,
+ attn_implementation='eager',
)
vision_encoder = encoder.vision_model
hf_config = encoder.config
@@ -326,7 +329,10 @@ def forward(self, images):
return vision_x
encoder = AutoModel.from_pretrained(
- vision_config["from_pretrained"], torch_dtype=torch.bfloat16, trust_remote_code=True
+ vision_config["from_pretrained"],
+ torch_dtype=torch.bfloat16,
+ trust_remote_code=True,
+ attn_implementation='eager',
)
vision_encoder = encoder.vision_model
hf_config = encoder.config
diff --git a/nemo/lightning/io/artifact/base.py b/nemo/lightning/io/artifact/base.py
index 4025634ebe28..9119b2474b17 100644
--- a/nemo/lightning/io/artifact/base.py
+++ b/nemo/lightning/io/artifact/base.py
@@ -6,8 +6,9 @@
class Artifact(ABC, Generic[ValueT]):
- def __init__(self, attr: str):
+ def __init__(self, attr: str, required: bool = True):
self.attr = attr
+ self.required = required
@abstractmethod
def dump(self, value: ValueT, path: Path) -> ValueT:
diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py
index 1cd45648ea20..69368599682e 100644
--- a/nemo/lightning/io/connector.py
+++ b/nemo/lightning/io/connector.py
@@ -91,6 +91,14 @@ def __call__(self, output_path: Optional[Path] = None, overwrite: bool = False)
logging.error(f"An error occurred: {e}")
raise
+ finally:
+ # Delete the lock file if it exists
+ if lock_path.exists():
+ try:
+ os.remove(lock_path)
+ except OSError as e:
+ logging.warning(f"Failed to remove lock file {lock_path}: {e}")
+
return _output_path
def local_path(self, base_path: Optional[Path] = None) -> Path:
@@ -152,19 +160,26 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] =
return _trainer
- def nemo_save(self, output_path: Path, trainer: pl.Trainer) -> None:
+ def nemo_save(self, output_path: Path, trainer: pl.Trainer, dump_io: bool = True) -> None:
"""
Saves the model's state to the specified path using the trainer's current strategy.
Args:
output_path (Path): The path where the model checkpoint will be saved.
trainer (pl.Trainer): The trainer with the strategy to save the model.
+ dump_io (bool): If True, the IO configuration will be saved to the output path.
"""
trainer.strategy._setup_optimizers = False
trainer.strategy._init_model_parallel = False
trainer.strategy.setup(trainer)
trainer.save_checkpoint(output_path)
+ from nemo.lightning.io.pl import TrainerContext
+ from nemo.utils.get_rank import is_global_rank_zero
+
+ if is_global_rank_zero() and dump_io:
+ TrainerContext.from_trainer(trainer).io_dump(output_path)
+
def nemo_load(
self, path: Path, trainer: Optional[pl.Trainer] = None, cpu: bool = True
) -> Tuple[pl.LightningModule, pl.Trainer]:
diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py
index 741c1400474a..d0d4d0243ff7 100644
--- a/nemo/lightning/io/mixin.py
+++ b/nemo/lightning/io/mixin.py
@@ -432,13 +432,24 @@ def _io_init(self, **kwargs) -> fdl.Config[Self]:
-------
fdl.Config[Self]: The initialized configuration object.
"""
- return fdl.Config(type(self), **kwargs)
+ try:
+ return fdl.Config(type(self), **kwargs)
+ except Exception as e:
+ error_msg = (
+ f"Error creating fdl.Config for {type(self).__name__}: {str(e)}\n"
+ f"Arguments that caused the error: {kwargs}\n"
+ f"This may be due to unsupported argument types or nested configurations."
+ )
+ raise RuntimeError(error_msg) from e
def _io_wrap_init(cls):
"""Wraps the __init__ method of a class to add IO functionality."""
original_init = cls.__init__
+ if getattr(cls, "__wrapped_init__", False):
+ return cls
+
@functools.wraps(original_init)
def wrapped_init(self, *args, **kwargs):
if hasattr(self, "io_transform_args"):
@@ -453,6 +464,7 @@ def wrapped_init(self, *args, **kwargs):
original_init(self, *args, **kwargs)
cls.__init__ = wrapped_init
+ cls.__wrapped_init__ = True
return cls
@@ -502,6 +514,10 @@ def _io_path_elements_fn(x):
def _artifact_transform(cfg: fdl.Config, output_path: Path):
for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []):
current_val = getattr(cfg, artifact.attr)
+ if current_val is None:
+ if artifact.required:
+ raise ValueError(f"Artifact '{artifact.attr}' is required but not provided")
+ continue
new_val = artifact.dump(current_val, output_path)
setattr(cfg, artifact.attr, new_val)
diff --git a/nemo/lightning/pytorch/callbacks/model_transform.py b/nemo/lightning/pytorch/callbacks/model_transform.py
index 5d48851843fc..8a07566f92c3 100644
--- a/nemo/lightning/pytorch/callbacks/model_transform.py
+++ b/nemo/lightning/pytorch/callbacks/model_transform.py
@@ -71,6 +71,11 @@ def _maybe_apply_transform(self, trainer):
def apply_transform(self, trainer):
self.model_transform(trainer.model)
+ from pytorch_lightning.utilities import model_summary
+
+ logging.info(
+ f"After applying model_transform:\n" f"{model_summary.summarize(trainer.lightning_module, max_depth=1)}"
+ )
@property
def _needs_to_call(self) -> bool:
diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py
index 3909c680cfc1..08a56f854e4a 100644
--- a/nemo/lightning/pytorch/plugins/data_sampler.py
+++ b/nemo/lightning/pytorch/plugins/data_sampler.py
@@ -25,6 +25,7 @@ def __init__(
rampup_batch_size: Optional[List[int]] = None,
dataloader_type: Literal["single", "cyclic", "batch"] = "single",
init_consumed_samples: int = 0,
+ init_global_step: int = 0,
):
self.seq_len = seq_len
self.micro_batch_size = micro_batch_size
@@ -35,6 +36,7 @@ def __init__(
self.prev_consumed_samples = self.init_consumed_samples
self.if_first_step = 0
self.prev_global_batch_size = None
+ self.init_global_step = init_global_step
def setup(self, global_rank: int) -> None:
from nemo.lightning.data import setup_microbatch_calculator
@@ -92,9 +94,7 @@ def on_megatron_step_end(self, trainer: pl.Trainer, pl_module: pl.LightningModul
self.prev_global_batch_size = self.current_global_batch_size
- # TODO: Add consumed samples
- consumed_samples = self.compute_consumed_samples(trainer.global_step + 1 - self.init_consumed_samples)
-
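+ # Subtract init_global_step so only steps taken since this run started are counted.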
+ consumed_samples = self.compute_consumed_samples(trainer.global_step + 1 - self.init_global_step)
pl_module.log(
'consumed_samples',
consumed_samples,
diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py
index 5e43e09c0420..65b7c6292249 100644
--- a/nemo/lightning/pytorch/plugins/mixed_precision.py
+++ b/nemo/lightning/pytorch/plugins/mixed_precision.py
@@ -26,11 +26,17 @@
AnyT = TypeVar("AnyT")
+def get_optim_config(optimizer: Optimizer):
+ try:
+ return optimizer.mcore_optimizer.config
+ except AttributeError as e:
+ raise ValueError("Failed to extract optimizer config from optimizer.") from e
+
+
class MegatronMixedPrecision(MixedPrecision):
def __init__(
self,
precision: Literal["16-mixed", "bf16-mixed"],
- amp_O2: bool = False,
device="cuda",
) -> None:
if precision == "bf16-mixed":
@@ -39,21 +45,6 @@ def __init__(
scaler = GradScaler(init_scale=2**32, growth_interval=1000, hysteresis=2)
super().__init__(precision, device, scaler)
- self.amp_O2 = amp_O2
-
- def connect(
- self, model: Module, optimizers: List[Optimizer], lr_schedulers: List[Any]
- ) -> Tuple[Module, List[Optimizer], List[Any]]:
- """Connects this plugin to the accelerator and the training process."""
- from nemo.core.optim import MainParamsOptimizerWrapper
-
- if not optimizers or not self.amp_O2 or isinstance(optimizers[0], MainParamsOptimizerWrapper):
- return model, optimizers, lr_schedulers
-
- _optimizers = [*optimizers]
- _optimizers[0] = self.convert_optimizer(_optimizers[0])
-
- return model, _optimizers, lr_schedulers
def convert_module(self, module: Module) -> Module:
"""Convert the module parameters to the precision type this plugin handles.
@@ -68,11 +59,11 @@ def convert_module(self, module: Module) -> Module:
config = get_model_config(module.module)
config.fp16 = self.precision == "16-mixed"
config.bf16 = self.precision == "bf16-mixed"
- if isinstance(module.module, Float16Module):
- new_float16_module = Float16Module(config, module.module.module)
- module.module = new_float16_module
- else:
+ config.autocast = False
+ if hasattr(module, 'module'):
module.module = Float16Module(config, module.module)
+ else:
+ module = Float16Module(config, module)
return module
@@ -82,16 +73,10 @@ def convert_optimizer(self, optimizer: Optimizer) -> Optimizer:
This is optional and depends on the precision limitations during optimization.
"""
- from nemo.core.optim import MainParamsOptimizerWrapper
-
- if isinstance(optimizer, MainParamsOptimizerWrapper) or not self.amp_O2:
- return optimizer
-
- return MainParamsOptimizerWrapper(
- optimizer,
- fp32_grad_accum=True,
- contiguous_grad_bucket=True,
- )
+ optim_config = get_optim_config(optimizer)
+ assert optim_config.bf16 == (self.precision == "bf16-mixed"), "BF16 enabled on model but not on optimizer"
+ assert optim_config.fp16 == (self.precision == "16-mixed"), "FP16 enabled on model but not on optimizer"
+ return optimizer
def convert_input(self, data: AnyT) -> AnyT:
"""Convert model inputs (forward) to the floating point precision type of this plugin.
@@ -120,7 +105,7 @@ def optimizer_step(
) -> None:
from nemo.core.optim import MainParamsOptimizerWrapper
- if not self.amp_O2 and not isinstance(optimizer, MainParamsOptimizerWrapper):
+ if not isinstance(optimizer, MainParamsOptimizerWrapper):
return super().optimizer_step(optimizer, model, closure, **kwargs)
if self.scaler is None:
diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py
index 99eee9ad6bd3..f22ed5b40a20 100644
--- a/nemo/lightning/pytorch/strategies.py
+++ b/nemo/lightning/pytorch/strategies.py
@@ -4,7 +4,7 @@
import os
import shutil
from collections import OrderedDict
-from contextlib import ExitStack
+from contextlib import ExitStack, contextmanager
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Dict, List, Literal, Mapping, Optional, TypeVar, Union, cast
@@ -621,6 +621,8 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr
assert self.megatron_parallel is not None
_strategy_lib.load_model_state_dict(self.megatron_parallel, checkpoint, strict=strict)
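+ # Reload the optimizer's main parameters from the freshly loaded model weights.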
+ for opt in self.optimizers:
+ opt.reload_model_params()
@property
@override
@@ -731,6 +733,14 @@ def parallelism(self) -> ParallelismConfig:
pipeline_dtype=self.pipeline_dtype,
)
+ @contextmanager
+ @override
+ def tensor_init_context(self, empty_init: Optional[bool] = None):
+ # Materialization happens in `setup()`
+ # @akoumparouli: using Parent's tensor_init_context causes mcore
+ # parameters to be initialized on GPU instead of (assumed) CPU.
+ yield
+
def ckpt_to_dir(filepath: Union[str, Path]) -> Path:
"""PTL considers checkpoints as .ckpt files.
diff --git a/nemo/lightning/pytorch/trainer.py b/nemo/lightning/pytorch/trainer.py
index da1a77c3c731..81f4d12bd3fb 100644
--- a/nemo/lightning/pytorch/trainer.py
+++ b/nemo/lightning/pytorch/trainer.py
@@ -10,18 +10,24 @@
class Trainer(pl.Trainer, IOMixin):
+
+ def add_io(self, obj):
+ """Recurse to the leaves of a container and add io functionality to non-serializable leaves"""
+ if isinstance(obj, (dict, list)):
+ if isinstance(obj, dict):
+ obj = obj.values()
+ for item in obj:
+ self.add_io(item)
+ else:
+ if not serialization.find_node_traverser(type(obj)):
+ track_io(type(obj))
+ return
+
def io_init(self, **kwargs) -> fdl.Config[Self]:
# Each argument of the trainer can be stateful so we copy them
cfg_kwargs = {k: deepcopy(v) for k, v in kwargs.items()}
- for val in cfg_kwargs.values():
- if not serialization.find_node_traverser(type(val)):
- track_io(type(val))
- elif isinstance(val, list):
- for v in val:
- if not serialization.find_node_traverser(type(v)):
- track_io(type(v))
-
+ self.add_io(cfg_kwargs)
return fdl.Config(type(self), **cfg_kwargs)
def to_fabric(self, callbacks=None, loggers=None) -> Fabric:
diff --git a/nemo/utils/callbacks/cuda_graph.py b/nemo/utils/callbacks/cuda_graph.py
index ec0650a90e7d..e18a2a3d3b6c 100644
--- a/nemo/utils/callbacks/cuda_graph.py
+++ b/nemo/utils/callbacks/cuda_graph.py
@@ -139,16 +139,6 @@ def to_tensor(self, value, name):
return value
-def register_key(self, key, meta, value):
- # PyTorch Lightning creates all metrics on GPU, but creating the metric on
- # its input device is prefered.
- # Refer to: https://github.com/Lightning-AI/pytorch-lightning/blob/2.0.7/src/lightning/pytorch/trainer/connectors/logger_connector/result.py#L409
- metric = _ResultMetric(meta, isinstance(value, torch.Tensor))
- device = value.device if isinstance(value, torch.Tensor) else self.device
- metric = metric.to(device)
- self[key] = metric
-
-
def update_metrics(self, key, value, batch_size):
# PyTorch Lightning always move all metrics to GPU, but moving the metric to
# its input device is prefered.
@@ -374,8 +364,6 @@ def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule")
# Use smart metrics to avoid syncs
LightningModule.__orig_to_tensor__ = LightningModule._LightningModule__to_tensor
LightningModule._LightningModule__to_tensor = to_tensor
- _ResultCollection.__orig_register_key__ = _ResultCollection.register_key
- _ResultCollection.register_key = register_key
_ResultCollection.__orig_update_metrics__ = _ResultCollection.update_metrics
_ResultCollection.update_metrics = update_metrics
@@ -409,8 +397,6 @@ def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -
LightningModule._LightningModule__to_tensor = LightningModule.__orig_to_tensor__
del LightningModule.__orig_to_tensor__
- _ResultCollection.register_key = _ResultCollection.__orig_register_key__
- del _ResultCollection.__orig_register_key__
_ResultCollection.update_metrics = _ResultCollection.__orig_update_metrics__
del _ResultCollection.__orig_update_metrics__
diff --git a/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py b/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py
index 1f8c69b5b240..35039f8d02e9 100644
--- a/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py
+++ b/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py
@@ -62,7 +62,7 @@ def get_args():
help="Path to output mcore weights file (ends in .nemo).",
)
parser.add_argument(
- "--cpu-only",
+ "--cpu_only",
action="store_true",
help="Load model in cpu only. Useful if the model cannot fit in GPU memory, "
"but this option makes the conversion script significantly slower.",
@@ -73,7 +73,7 @@ def get_args():
help="Run conversion again and overwrite output file when the output file already exists",
)
parser.add_argument(
- "--ignore-if-missing",
+ "--ignore_if_missing",
default="rotary_pos_emb.inv_freq",
help="comma-separated list of state_dict keys that are known to be missing in mcore and can be safely ignored",
)
@@ -158,8 +158,8 @@ def build_key_mapping(nemo_cfg):
for wb in ('weight', 'bias') if has_layernorm_bias else ('weight',):
mcore_to_nemo_mapping.update(
{
- f"{mcore_prefix}.{i}.self_attention.linear_qkv.layer_norm_{wb}": f"{nemo_prefix}.{i}.input_layernorm.{wb}",
- f"{mcore_prefix}.{i}.mlp.linear_fc1.layer_norm_{wb}": f"{nemo_prefix}.{i}.post_attention_layernorm.{wb}",
+ f"{mcore_prefix}.{i}.input_layernorm.{wb}": f"{nemo_prefix}.{i}.input_layernorm.{wb}",
+ f"{mcore_prefix}.{i}.pre_mlp_layernorm.{wb}": f"{nemo_prefix}.{i}.post_attention_layernorm.{wb}",
}
)
diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py
index 0837e0e6ccf2..4eb8cb6330ca 100644
--- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py
+++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py
@@ -18,6 +18,8 @@
python convert_llama_hf_to_nemo.py \
--input_name_or_path \
- --output_path
+ --output_path \
+ --precision bf16 \
+ --llama31 True
"""
import os
@@ -60,6 +62,13 @@ def get_args():
required=False,
help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml",
)
+ parser.add_argument(
+ "--llama31",
+ type=bool,
+ default=True,
+ required=False,
+ help="Whether the model is from LLaMa 3.1 family. LLaMa 3.1 enables scaling for RoPE frequencies.",
+ )
parser.add_argument("--precision", type=str, default="16", help="Model precision")
args = parser.parse_args()
return args
@@ -110,6 +119,7 @@ def load_config(args, llama_config):
while llama_config['vocab_size'] % base != 0:
base //= 2
nemo_config.make_vocab_size_divisible_by = base
+ nemo_config.scale_positional_embedding = args.llama31
return nemo_config
@@ -161,6 +171,7 @@ def convert(args):
plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
nemo_config.precision = precision
+ nemo_config.micro_batch_size = 1
print(f"nemo_config: {nemo_config}")
# Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together.
@@ -298,12 +309,22 @@ def convert(args):
# We make sure that the tokenizer can be instantiated later regardless of args.input_name_or_path
if 'tokenizer_model' not in hf_config:
- if hf_config['num_hidden_layers'] == 32:
- model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B')
- elif hf_config['num_hidden_layers'] == 80:
- model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B')
+ if args.llama31:
+ if hf_config['num_hidden_layers'] == 32:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B')
+ elif hf_config['num_hidden_layers'] == 80:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-70B')
+ elif hf_config['num_hidden_layers'] == 126:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B') # 405B tokenizer is the same as 8B
+ else:
+ logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.")
else:
- logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.")
+ if hf_config['num_hidden_layers'] == 32:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B')
+ elif hf_config['num_hidden_layers'] == 80:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B')
+ else:
+ logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.")
# cast to target precision and disable cpu init
dtype = torch_dtype_from_precision(precision)
diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py
new file mode 100644
index 000000000000..f395e34765d0
--- /dev/null
+++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py
@@ -0,0 +1,286 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""
+Conversion script to build a NeMo checkpoint from a Huggingface LLaMA checkpoint together with the
+per-tensor state dict produced by convert_llama_hf_to_nemo_save_dict.py.
+ Example to run this conversion script:
+ python convert_llama_hf_to_nemo_load.py \
+ --input_name_or_path \
+ --input_state_dict \
+ --output_path \
+ --precision bf16 \
+ --llama31 True
+"""
+
+import os
+from argparse import ArgumentParser
+from collections import OrderedDict
+
+import torch
+from omegaconf import OmegaConf
+from pytorch_lightning.trainer.trainer import Trainer
+from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer
+
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.parts.nlp_overrides import (
+ GradScaler,
+ MegatronHalfPrecisionPlugin,
+ NLPDDPStrategy,
+ NLPSaveRestoreConnector,
+ PipelineMixedPrecisionPlugin,
+)
+from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper, torch_dtype_from_precision
+from nemo.utils import logging
+
+
+def get_args():
+ parser = ArgumentParser()
+ parser.add_argument(
+ "--input_name_or_path",
+ type=str,
+ default=None,
+ required=True,
+ help="Path to Huggingface LLaMA checkpoints",
+ )
+ parser.add_argument(
+ "--input_state_dict",
+ type=str,
+ default=None,
+ required=True,
+ help="Path to Huggingface LLaMA checkpoints",
+ )
+
+ parser.add_argument(
+ "--llama31",
+ type=bool,
+ default=True,
+ required=False,
+ help="Apply scaling for RoPE frequencies",
+ )
+
+ parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.")
+ parser.add_argument(
+ "--hparams_file",
+ type=str,
+ default=os.path.join(
+ os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_llama_config.yaml'
+ ),
+ required=False,
+ help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml",
+ )
+ parser.add_argument("--precision", type=str, default="16", help="Model precision")
+ args = parser.parse_args()
+ return args
+
+
+def load_config(args, llama_config):
+ nemo_config = OmegaConf.load(args.hparams_file).model
+
+ if llama_config.get('rope_theta', None):
+ nemo_config['rotary_base'] = llama_config['rope_theta']
+ nemo_config.encoder_seq_length = llama_config['max_position_embeddings']
+ nemo_config.num_layers = int(llama_config['num_hidden_layers'])
+ nemo_config.hidden_size = llama_config['hidden_size']
+ nemo_config.ffn_hidden_size = llama_config['intermediate_size']
+ nemo_config.num_attention_heads = llama_config['num_attention_heads']
+ nemo_config.max_position_embeddings = llama_config['max_position_embeddings']
+ nemo_config.init_method_std = llama_config['initializer_range']
+ nemo_config.layernorm_epsilon = llama_config['rms_norm_eps']
+ if 'num_key_value_heads' in llama_config:
+ nemo_config.num_query_groups = llama_config['num_key_value_heads']
+ nemo_config.use_cpu_initialization = True
+ nemo_config.activation = 'fast-swiglu'
+ nemo_config.megatron_amp_O2 = True
+ nemo_config.scale_positional_embedding = args.llama31
+
+ # Tokenizer config
+ if 'tokenizer_model' in llama_config:
+ nemo_config.tokenizer.model = llama_config['tokenizer_model']
+ else:
+ # Llama3 uses converted TikToken Tokenizer
+ tokenizer_dict = {
+ 'library': 'huggingface',
+ 'type': args.input_name_or_path,
+ 'use_fast': True,
+ }
+ nemo_config.tokenizer = tokenizer_dict
+
+ if llama_config['rope_scaling'] is not None:
+ if llama_config['rope_scaling']['type'] == 'linear':
+ nemo_config['seq_len_interpolation_factor'] = llama_config['rope_scaling']['factor']
+ else:
+ raise ValueError("Only linear rope scaling type is supported now")
+ if llama_config['rope_theta'] is not None:
+ nemo_config['rotary_base'] = llama_config['rope_theta']
+
+ base = 128
+ while llama_config['vocab_size'] % base != 0:
+ base //= 2
+ nemo_config.make_vocab_size_divisible_by = base
+
+ return nemo_config
+
+
+def convert(args):
+ logging.info(f"loading checkpoint {args.input_name_or_path}")
+ import torch
+
+ model = LlamaForCausalLM.from_pretrained(
+ args.input_name_or_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
+ )
+ hf_config = vars(model.config)
+ if os.path.exists(f'{args.input_name_or_path}/tokenizer.model'):
+ tokenizer = LlamaTokenizer.from_pretrained(args.input_name_or_path)
+ hf_config['tokenizer_model'] = str(tokenizer.vocab_file)
+ else:
+ tokenizer = AutoTokenizer.from_pretrained(args.input_name_or_path)
+ print(f"hf_config: {hf_config}")
+ print("named parameters:")
+ for name, param in model.named_parameters():
+ print(f"- {name}")
+
+ nemo_config = load_config(args, hf_config)
+
+ if args.precision in ["32", "16"]:
+ precision = int(float(args.precision))
+ elif args.precision in ["bf16", "bf16-mixed"]:
+ if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+ precision = args.precision
+ else:
+ logging.warning("BF16 is not supported on this device. Using FP16 instead.")
+ precision = args.precision[2:] # prune bf in string
+ else:
+ precision = args.precision
+
+ plugins = []
+ if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']:
+ scaler = None
+ if precision in [16, '16', '16-mixed']:
+ scaler = GradScaler(
+ init_scale=nemo_config.get('native_amp_init_scale', 2**32),
+ growth_interval=nemo_config.get('native_amp_growth_interval', 1000),
+ hysteresis=nemo_config.get('hysteresis', 2),
+ )
+ # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed
+ plugin_precision = '16-mixed'
+ else:
+ plugin_precision = 'bf16-mixed'
+
+ if nemo_config.get('megatron_amp_O2', False):
+ print('HALF PRECISION')
+ plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
+ else:
+ plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
+
+ nemo_config.precision = precision
+ nemo_config.micro_batch_size = 1
+ print(f"nemo_config: {nemo_config}")
+
+ # Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together.
+ trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy())
+
+ hidden_size = hf_config["hidden_size"]
+ head_num = hf_config["num_attention_heads"]
+ head_size = hidden_size // head_num
+ num_layers = hf_config["num_hidden_layers"]
+
+ mcore_gpt = nemo_config.mcore_gpt
+
+ assert mcore_gpt == nemo_config.get(
+ 'transformer_engine', False
+ ), "mcore_gpt transformer_engine must be enabled (or disabled) together."
+
+ param_to_weights = lambda param: param.float()
+
+ print('start init model')
+ del model
+ import time
+
+ st = time.perf_counter()
+ model = MegatronGPTModel(nemo_config, trainer)
+ print(f'Model init took {time.perf_counter() - st} sec')
+ from functools import reduce
+ from glob import glob
+
+ weights = glob(f'{args.input_state_dict}/*.pt')
+ st = time.perf_counter()
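+ # Files written by convert_llama_hf_to_nemo_save_dict.py are named after the parameter's module
+ # path (e.g. decoder.layers.0.self_attention.linear_qkv.weight.pt), so splitting the file name
+ # on '.' recovers the attribute chain and the weight type.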
+ for weight_file in sorted(weights):
+ filename = os.path.basename(weight_file)
+ str_list = filename.split('.')
+ weight_type = str_list[-2]
+ str_name = '.'.join(str_list[1:-1])
+ print(f'-- Assign weight_type={weight_type} to {str_name}')
+ if nemo_config.get('megatron_amp_O2', False):
+ current = reduce(getattr, [model, 'model', 'module'] + str_list[:-2])
+ else:
+ current = reduce(getattr, [model, 'model'] + str_list[:-2])
+ load = torch.load(weight_file)
+ if nemo_config.get('megatron_amp_O2', False):
+ if precision == 'bf16':
+ target_precision = torch.bfloat16
+ elif precision == 16:
+ target_precision = torch.float16
+ else:
+ # Fall back to the shared precision-to-dtype helper for any other precision value.
+ target_precision = torch_dtype_from_precision(precision)
+ load = load.to(target_precision)
+
+ if weight_type == 'weight':
+ assert current.weight.shape == load.shape
+ assert current.weight.dtype == load.dtype
+ current.weight = torch.nn.Parameter(load)
+ assert current.weight.norm() == load.norm()
+ elif weight_type == 'layer_norm_weight':
+ assert current.layer_norm_weight.dtype == load.dtype
+ assert current.layer_norm_weight.shape == load.shape
+ current.layer_norm_weight = torch.nn.Parameter(load)
+ assert current.layer_norm_weight.norm() == load.norm()
+ else:
+ raise ValueError(f'Unsupported weight type = {weight_type}')
+ del load
+
+ print(f'Finished loading weights in {time.perf_counter() - st} sec. Starting to save model')
+ st = time.perf_counter()
+
+ model._save_restore_connector = NLPSaveRestoreConnector()
+
+ # We make sure that the tokenizer can be instantiated later regardless of args.input_name_or_path
+ if 'tokenizer_model' not in hf_config:
+ if args.llama31:
+ if hf_config['num_hidden_layers'] == 32:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B')
+ elif hf_config['num_hidden_layers'] == 80:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-70B')
+ elif hf_config['num_hidden_layers'] == 126:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B') # 405B tokenizer is the same as 8B
+ else:
+ logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.")
+ else:
+ if hf_config['num_hidden_layers'] == 32:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B')
+ elif hf_config['num_hidden_layers'] == 80:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B')
+ else:
+ logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.")
+
+ # cast to target precision and disable cpu init
+ dtype = torch_dtype_from_precision(precision)
+ model = model.to(dtype=dtype)
+ model.cfg.use_cpu_initialization = False
+
+ model.save_to(args.output_path)
+ print(f'Model save took {time.perf_counter() - st} sec.')
+ logging.info(f'NeMo model saved to: {args.output_path}')
+
+
+if __name__ == '__main__':
+ args = get_args()
+ convert(args)
diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py
new file mode 100644
index 000000000000..940a9df5f9a8
--- /dev/null
+++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py
@@ -0,0 +1,321 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""
+Conversion script to dump a Huggingface LLaMA checkpoint into a directory of per-tensor .pt files
+(one file per converted NeMo/Megatron parameter), later consumed by convert_llama_hf_to_nemo_load.py.
+ Example to run this conversion script:
+ python convert_llama_hf_to_nemo_save_dict.py \
+ --input_name_or_path \
+ --output_path \
+ --precision bf16 \
+ --apply_rope_scaling True
+"""
+
+import os
+from argparse import ArgumentParser
+from collections import OrderedDict
+
+import torch
+from omegaconf import OmegaConf
+from pytorch_lightning.trainer.trainer import Trainer
+from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer
+
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.parts.nlp_overrides import (
+ GradScaler,
+ MegatronHalfPrecisionPlugin,
+ NLPDDPStrategy,
+ NLPSaveRestoreConnector,
+ PipelineMixedPrecisionPlugin,
+)
+from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper, torch_dtype_from_precision
+from nemo.utils import logging
+
+
+def get_args():
+ parser = ArgumentParser()
+ parser.add_argument(
+ "--input_name_or_path",
+ type=str,
+ default=None,
+ required=True,
+ help="Path to Huggingface LLaMA checkpoints",
+ )
+ parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output to dict dir")
+ parser.add_argument(
+ "--hparams_file",
+ type=str,
+ default=os.path.join(
+ os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_llama_config.yaml'
+ ),
+ required=False,
+ help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml",
+ )
+ parser.add_argument(
+ "--apply_rope_scaling",
+ type=bool,
+ default=True,
+ required=False,
+ help="Apply scaling for RoPE frequencies",
+ )
+ parser.add_argument("--precision", type=str, default="16", help="Model precision")
+ args = parser.parse_args()
+ return args
+
+
+def load_config(args, llama_config):
+ nemo_config = OmegaConf.load(args.hparams_file).model
+
+ if llama_config.get('rope_theta', None):
+ nemo_config['rotary_base'] = llama_config['rope_theta']
+ nemo_config.encoder_seq_length = llama_config['max_position_embeddings']
+ nemo_config.num_layers = int(llama_config['num_hidden_layers'])
+ nemo_config.hidden_size = llama_config['hidden_size']
+ nemo_config.ffn_hidden_size = llama_config['intermediate_size']
+ nemo_config.num_attention_heads = llama_config['num_attention_heads']
+ nemo_config.max_position_embeddings = llama_config['max_position_embeddings']
+ nemo_config.init_method_std = llama_config['initializer_range']
+ nemo_config.layernorm_epsilon = llama_config['rms_norm_eps']
+ if 'num_key_value_heads' in llama_config:
+ nemo_config.num_query_groups = llama_config['num_key_value_heads']
+ nemo_config.use_cpu_initialization = True
+ nemo_config.activation = 'fast-swiglu'
+ nemo_config.megatron_amp_O2 = True
+
+ # Tokenizer config
+ if 'tokenizer_model' in llama_config:
+ nemo_config.tokenizer.model = llama_config['tokenizer_model']
+ else:
+ # Llama3 uses converted TikToken Tokenizer
+ tokenizer_dict = {
+ 'library': 'huggingface',
+ 'type': args.input_name_or_path,
+ 'use_fast': True,
+ }
+ nemo_config.tokenizer = tokenizer_dict
+
+ if llama_config['rope_scaling'] is not None:
+ if llama_config['rope_scaling']['type'] == 'linear':
+ nemo_config['seq_len_interpolation_factor'] = llama_config['rope_scaling']['factor']
+ else:
+ raise ValueError("Only linear rope scaling type is supported now")
+ if llama_config['rope_theta'] is not None:
+ nemo_config['rotary_base'] = llama_config['rope_theta']
+
+ base = 128
+ while llama_config['vocab_size'] % base != 0:
+ base //= 2
+ nemo_config.make_vocab_size_divisible_by = base
+
+ return nemo_config
+
+
+def convert(args):
+ logging.info(f"loading checkpoint {args.input_name_or_path}")
+ import torch
+
+ model = LlamaForCausalLM.from_pretrained(
+ args.input_name_or_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
+ )
+ hf_config = vars(model.config)
+ if os.path.exists(f'{args.input_name_or_path}/tokenizer.model'):
+ tokenizer = LlamaTokenizer.from_pretrained(args.input_name_or_path)
+ hf_config['tokenizer_model'] = str(tokenizer.vocab_file)
+ else:
+ tokenizer = AutoTokenizer.from_pretrained(args.input_name_or_path)
+
+ print("named parameters:")
+ for name, param in model.named_parameters():
+ print(f"- {name}")
+
+ nemo_config = load_config(args, hf_config)
+ nemo_config.scale_positional_embedding = args.apply_rope_scaling
+
+ if args.precision in ["32", "16"]:
+ precision = int(float(args.precision))
+ elif args.precision in ["bf16", "bf16-mixed"]:
+ if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+ precision = args.precision
+ else:
+ logging.warning("BF16 is not supported on this device. Using FP16 instead.")
+ precision = args.precision[2:] # prune bf in string
+ else:
+ precision = args.precision
+
+ plugins = []
+ if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']:
+ scaler = None
+ if precision in [16, '16', '16-mixed']:
+ scaler = GradScaler(
+ init_scale=nemo_config.get('native_amp_init_scale', 2**32),
+ growth_interval=nemo_config.get('native_amp_growth_interval', 1000),
+ hysteresis=nemo_config.get('hysteresis', 2),
+ )
+ # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed
+ plugin_precision = '16-mixed'
+ else:
+ plugin_precision = 'bf16-mixed'
+
+ if nemo_config.get('megatron_amp_O2', False):
+ print('HALF PRECISION')
+ plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
+ else:
+ plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
+
+ nemo_config.precision = precision
+ print(f"nemo_config: {nemo_config}")
+
+ # Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together.
+ trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy())
+
+ hidden_size = hf_config["hidden_size"]
+ head_num = hf_config["num_attention_heads"]
+ head_size = hidden_size // head_num
+ num_layers = hf_config["num_hidden_layers"]
+
+ mcore_gpt = nemo_config.mcore_gpt
+
+ assert mcore_gpt == nemo_config.get(
+ 'transformer_engine', False
+ ), "mcore_gpt transformer_engine must be enabled (or disabled) together."
+
+ param_to_weights = lambda param: param.float()
+
+ checkpoint = OrderedDict()
+ checkpoint['state_dict'] = OrderedDict()
+
+ embed_weight = model.state_dict()[f'model.embed_tokens.weight']
+ if mcore_gpt:
+ embed_weights_base_name = f'model.embedding.word_embeddings.weight'
+ else:
+ embed_weights_base_name = f'model.language_model.embedding.word_embeddings.weight'
+ checkpoint['state_dict'][embed_weights_base_name] = param_to_weights(embed_weight)
+
+ # in hf, this is defined as register_buffer(..., persistent=False) so it won't be in the state dict
+ if f'model.layers.0.self_attn.rotary_emb.inv_freq' in model.state_dict():
+ rotary_embed_weight = model.state_dict()[f'model.layers.0.self_attn.rotary_emb.inv_freq']
+ if mcore_gpt:
+ rotary_embed_weight_base_name = f'model.rotary_pos_emb.inv_freq'
+ else:
+ rotary_embed_weight_base_name = f'model.language_model.rotary_pos_emb.inv_freq'
+ checkpoint['state_dict'][rotary_embed_weight_base_name] = param_to_weights(rotary_embed_weight)
+
+ if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num:
+ num_query_groups = head_num
+ else:
+ num_query_groups = nemo_config.num_query_groups
+ assert head_num % num_query_groups == 0, 'head_num must be divisible by num_query_groups'
+ if mcore_gpt:
+ assert nemo_config.activation.startswith('fast-'), 'mcore only supports fast version of gated linear unit.'
+
+ for l in range(int(num_layers)):
+ print(f"converting layer {l}")
+ old_tensor_shape = model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight'].size()
+ new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:]
+ new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:]
+ q = model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight'].view(*new_q_tensor_shape)
+ k = model.state_dict()[f'model.layers.{l}.self_attn.k_proj.weight'].view(*new_kv_tensor_shape)
+ v = model.state_dict()[f'model.layers.{l}.self_attn.v_proj.weight'].view(*new_kv_tensor_shape)
+ qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:])
+ heads_per_group = head_num // num_query_groups
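+ # Build Megatron's fused QKV layout: for each query group, that group's query heads are
+ # followed by its single key head and value head.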
+ for i in range(num_query_groups):
+ qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :]))
+ qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :]))
+ qkv_weights = torch.cat((qkv_weights, v[i : i + 1, :, :]))
+ qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size])
+ if mcore_gpt:
+ qkv_weights_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.weight'
+ else:
+ qkv_weights_base_name = f'model.language_model.encoder.layers.{l}.self_attention.query_key_value.weight'
+ checkpoint['state_dict'][qkv_weights_base_name] = param_to_weights(qkv_weights)
+
+ # attention dense
+ o_weight = model.state_dict()[f'model.layers.{l}.self_attn.o_proj.weight']
+ if mcore_gpt:
+ o_weight_base_name = f'model.decoder.layers.{l}.self_attention.linear_proj.weight'
+ else:
+ o_weight_base_name = f'model.language_model.encoder.layers.{l}.self_attention.dense.weight'
+ checkpoint['state_dict'][o_weight_base_name] = param_to_weights(o_weight)
+
+ # MLP
+ mlp_down_weight = model.state_dict()[f'model.layers.{l}.mlp.gate_proj.weight']
+ mlp_gate_weight = model.state_dict()[f'model.layers.{l}.mlp.up_proj.weight']
+ if mcore_gpt:
+ mlp_down_base_name = f'model.decoder.layers.{l}.mlp.linear_fc1.weight'
+ else:
+ mlp_down_base_name = f'model.language_model.encoder.layers.{l}.mlp.dense_h_to_4h.weight'
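+ # gate_proj and up_proj are concatenated into one fused weight for the gated (swiglu) MLP fc1.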
+ mlp_down_weight = torch.cat((mlp_down_weight, mlp_gate_weight), axis=0)
+ checkpoint['state_dict'][mlp_down_base_name] = param_to_weights(mlp_down_weight)
+
+ mlp_up_weight = model.state_dict()[f'model.layers.{l}.mlp.down_proj.weight']
+ if mcore_gpt:
+ mlp_up_base_name = f'model.decoder.layers.{l}.mlp.linear_fc2.weight'
+ else:
+ mlp_up_base_name = f'model.language_model.encoder.layers.{l}.mlp.dense_4h_to_h.weight'
+ checkpoint['state_dict'][mlp_up_base_name] = param_to_weights(mlp_up_weight)
+
+ # LayerNorm
+ input_ln_weight = model.state_dict()[f'model.layers.{l}.input_layernorm.weight']
+ if mcore_gpt:
+ input_ln_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_weight'
+ else:
+ input_ln_base_name = f'model.language_model.encoder.layers.{l}.input_layernorm.weight'
+ checkpoint['state_dict'][input_ln_base_name] = param_to_weights(input_ln_weight)
+
+ post_attn_ln_weight = model.state_dict()[f'model.layers.{l}.post_attention_layernorm.weight']
+ if mcore_gpt:
+ post_attn_ln_base_name = f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm_weight'
+ else:
+ post_attn_ln_base_name = f'model.language_model.encoder.layers.{l}.post_attention_layernorm.weight'
+ checkpoint['state_dict'][post_attn_ln_base_name] = param_to_weights(post_attn_ln_weight)
+
+ print(f"done layer {l}")
+
+ final_ln_weight = model.state_dict()[f'model.norm.weight']
+ if mcore_gpt:
+ final_ln_base_name = f'model.decoder.final_layernorm.weight'
+ else:
+ final_ln_base_name = f'model.language_model.encoder.final_layernorm.weight'
+ checkpoint['state_dict'][final_ln_base_name] = param_to_weights(final_ln_weight)
+
+ output_layer_weight = model.state_dict()[f'lm_head.weight']
+ if mcore_gpt:
+ output_layer_base_name = f'model.output_layer.weight'
+ else:
+ output_layer_base_name = f'model.language_model.output_layer.weight'
+ checkpoint['state_dict'][output_layer_base_name] = param_to_weights(output_layer_weight)
+
+ checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config
+
+ del model
+ import gc
+
+ gc.collect()
+
+ if nemo_config.get('megatron_amp_O2', False):
+ keys = list(checkpoint['state_dict'].keys())
+ print('convert to O2')
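+ # megatron_amp_O2 wraps the model in Float16Module, so parameters live under 'model.module.*'.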
+ for key in keys:
+ checkpoint['state_dict'][key.replace('model.', 'model.module.', 1)] = checkpoint['state_dict'].pop(key)
+
+ os.makedirs(args.output_path, exist_ok=True)
+ for key in checkpoint['state_dict']:
+ print(f'Saving {key} in {checkpoint["state_dict"][key].dtype}..')
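+ # key[13:] drops the leading 'model.module.' prefix (megatron_amp_O2 is forced on above).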
+ save_location = f'{args.output_path}/{key[13:]}.pt'
+ torch.save(checkpoint['state_dict'][key], save_location)
+
+
+if __name__ == '__main__':
+ args = get_args()
+ convert(args)
diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py
index a8c8621c8d31..45765e6d1997 100644
--- a/tests/collections/common/test_lhotse_dataloading.py
+++ b/tests/collections/common/test_lhotse_dataloading.py
@@ -1735,10 +1735,14 @@ def nemo_tarred_manifest_path_with_offset(tmp_path_factory) -> Tuple[str, str]:
TarWriter(f"{root}/audios_0.tar", shard_size=None) as tar_writer,
SequentialJsonlWriter(root / "tarred_audio_filepaths.jsonl") as mft_writer,
):
- tar_writer.write(recording.id, BytesIO(recording.sources[0].source))
+
+ def audio_path(n: int = None):
+ return recording.id + ("" if n is None else f"-sub{n}") + ".wav"
+
+ tar_writer.write(audio_path(), BytesIO(recording.sources[0].source))
mft_writer.write(
{ # segment 0-3s
- "audio_filepath": recording.id,
+ "audio_filepath": audio_path(),
"offset": 0.0,
"duration": 3.0,
"text": "irrelevant",
@@ -1748,7 +1752,7 @@ def nemo_tarred_manifest_path_with_offset(tmp_path_factory) -> Tuple[str, str]:
)
mft_writer.write(
{ # segment 4-9s
- "audio_filepath": recording.id + "-sub1",
+ "audio_filepath": audio_path(1),
"offset": 4.0,
"duration": 5.0,
"text": "irrelevant-2",
@@ -1758,7 +1762,7 @@ def nemo_tarred_manifest_path_with_offset(tmp_path_factory) -> Tuple[str, str]:
)
mft_writer.write(
{ # full recording - for reference
- "audio_filepath": recording.id + "-sub2",
+ "audio_filepath": audio_path(2),
"offset": 0.0,
"duration": 10.0,
"text": "irrelevant irrelevant-2",
@@ -1794,7 +1798,7 @@ def test_dataloader_from_tarred_nemo_manifest_with_offset(nemo_tarred_manifest_p
assert len(batch) == 3
# Validate example containing full 10s recording.
- full_cut = batch[-1]
+ full_cut = batch[1]
assert full_cut.start == 0.0
assert full_cut.duration == 10.0
assert full_cut.supervisions[0].text == "irrelevant irrelevant-2"
@@ -1803,7 +1807,7 @@ def test_dataloader_from_tarred_nemo_manifest_with_offset(nemo_tarred_manifest_p
assert full_audio.shape[1] == full_cut.num_samples == 160000 # 10s * 16kHz
# Validate segment 0-3s.
- cut = batch[0]
+ cut = batch[2]
assert cut.start == 0.0
assert cut.duration == 3.0
assert cut.supervisions[0].text == "irrelevant"
@@ -1817,7 +1821,7 @@ def test_dataloader_from_tarred_nemo_manifest_with_offset(nemo_tarred_manifest_p
# Note: LazyNeMoTarredIterator removes the offset information, as it creates a new recording
# that's a "subset" of the original recording as a memory saving optimization.
# Hence, we will not see cut.start == 4.0.
- cut = batch[1]
+ cut = batch[0]
assert cut.start == 0.0
assert cut.duration == 5.0
assert cut.supervisions[0].text == "irrelevant-2"
diff --git a/tests/lightning/io/test_mixin.py b/tests/lightning/io/test_mixin.py
index 824608db6bf0..3a520b8e74ae 100644
--- a/tests/lightning/io/test_mixin.py
+++ b/tests/lightning/io/test_mixin.py
@@ -14,3 +14,10 @@ def test_reinit(self):
assert copied is not dummy
assert copied.a == dummy.a
assert copied.b == dummy.b
+
+ def test_init(self):
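+ # Constructing many tracked objects must stay cheap and must not re-wrap __init__ (see __wrapped_init__ guard in io/mixin.py).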
+ outputs = []
+ for i in range(1001):
+ outputs.append(DummyClass(i, i))
+
+ assert len(outputs) == 1001