diff --git a/.github/workflows/changelog-build.yml b/.github/workflows/changelog-build.yml index 7e16c344acb8..b6d836f71cec 100644 --- a/.github/workflows/changelog-build.yml +++ b/.github/workflows/changelog-build.yml @@ -1,13 +1,13 @@ name: 'Changelog Build (Release)' on: + workflow_dispatch: push: tags: - '*' - + jobs: changelog: - if: startsWith(github.ref, 'refs/tags/') runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 @@ -39,7 +39,7 @@ jobs: ignorePreReleases: "false" failOnError: "false" fromTag: ${{ steps.previous_tag.outputs.tag_name }} - toTag: ${{ github.ref_name }} + toTag: ${{ github.ref_name || github.sha }} - name: Print Changelog run: | diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 2919be733ac3..ffe6e00e2ad3 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -365,6 +365,34 @@ jobs: # rm -rf llama2_qat_results + L2_Distill_Llama2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_distillation.py \ + trainer.devices=2 \ + trainer.num_nodes=1 \ + trainer.precision=bf16 \ + trainer.max_steps=5 \ + trainer.log_every_n_steps=5 \ + trainer.val_check_interval=5 \ + trainer.limit_val_batches=2 \ + model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + model.kd_teacher_restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + model.tensor_model_parallel_size=2 \ + model.pipeline_model_parallel_size=1 \ + model.micro_batch_size=1 \ + model.global_batch_size=4 \ + model.optim.name=distributed_fused_adam \ + model.optim.sched.warmup_steps=1 \ + model.data.data_prefix=[1.0,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ + exp_manager.exp_dir=examples/nlp/megatron_llama_distill + AFTER_SCRIPT: | + rm -rf examples/nlp/megatron_llama_distill + # L2: ASR dev run ASR_dev_run_Speech_to_Text: needs: [cicd-test-container-setup] diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3f4c4f3c19de..af09fa241c59 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,112 +1,60 @@ name: "NeMo Code release" on: - issue_comment: - types: [created] - + workflow_dispatch: + inputs: + branch: + description: Branch to release + required: true + type: string jobs: main: - if: > - github.event_name == 'issue_comment' && - github.event.issue.pull_request && - startsWith(github.event.comment.body, '/release-please') && - contains(fromJSON('["ko3n1g"]'), github.actor) + if: contains(fromJSON('["ko3n1g"]'), github.actor) runs-on: ubuntu-latest environment: name: main steps: - - name: Update PR issue comment - shell: bash - env: - message: ${{ github.event.comment.body }} - run: | - message="$message - - --- - - Releasebot 🤖: Release processes started... - " - message="${message//$'\n'/
}" - - curl -L \ - -X PATCH \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }} \ - -d '{"body":"'"$message"'"}' - - - name: Get PR number - shell: bash - id: get-pr-num - run: | - PR_URL="${{ github.event.issue.pull_request.url }}" - PR_NUM=${PR_URL##*/} - echo "pr_number=$PR_NUM" >> $GITHUB_OUTPUT - - - name: Get Pull Request Information - uses: actions/github-script@v6 - id: get-pr-branch - with: - result-encoding: string - script: | - const pr = await github.rest.pulls.get({ - owner: context.repo.owner, - repo: context.repo.repo, - pull_number: ${{ steps.get-pr-num.outputs.pr_number }} - }); - console.log('Pull Request Information:', pr.data); - return pr.data.head.ref; - name: Checkout repository uses: actions/checkout@v4 with: path: ${{ github.run_id }} - ref: ${{ steps.get-pr-branch.outputs.result }} + ref: ${{ inputs.branch }} - - name: Get version number + - name: Create release id: version-number run: | cd ${{ github.run_id }} VERSION=$(python -c "import nemo; print(nemo.__version__)") - echo "VERSION=$VERSION" >> "$GITHUB_OUTPUT" - - - name: Extract changelog - id: extract-changelog - uses: peter-evans/find-comment@v3 - with: - issue-number: ${{ steps.get-pr-num.outputs.pr_number }} - body-includes: '# Detailed Changelogs' - - - name: Extract summary - id: extract-summary - uses: peter-evans/find-comment@v3 - with: - issue-number: ${{ steps.get-pr-num.outputs.pr_number }} - body-includes: '# Highlights' - - - name: Create Release doc - id: create-release-doc - env: - SUMMARY: ${{ steps.extract-summary.outputs.comment-body }} - CHANGELOG: ${{ steps.extract-changelog.outputs.comment-body }} - run: | - echo "TITLE<> $GITHUB_ENV - echo "NVIDIA Neural Modules ${{ steps.version-number.outputs.VERSION }}" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV + NAME="NVIDIA Neural Modules ${VERSION}" + CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md) + CHANGELOG=$(echo "$CHANGELOG" | sed '/./,$!d' | sed ':a;N;$!ba;s/\n$//') + + PAYLOAD=$(jq \ + -n \ + -c \ + --arg CI_COMMIT_BRANCH "${{ inputs.branch }}" \ + --arg NAME "$NAME" \ + --arg BODY "$CHANGELOG" \ + '{ + "tag_name": $CI_COMMIT_BRANCH, + "target_commitish": $CI_COMMIT_BRANCH, + "name": $NAME, + "body": $BODY, + "draft": false, + "prerelease": false, + "generate_release_notes": false + }' + ) - echo "BODY<> $GITHUB_ENV - echo "$SUMMARY" >> $GITHUB_ENV - echo "$CHANGELOG" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - - name: Create Release - uses: softprops/action-gh-release@v2 - with: - name: ${{ env.TITLE }} - tag_name: ${{ steps.version-number.outputs.VERSION }} - body: ${{ env.BODY }} + curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.PAT }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/NVIDIA/NeMo/releases \ + -d "$PAYLOAD" - name: Build, test, and release wheel env: @@ -114,6 +62,8 @@ jobs: TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} run: | cd ${{ github.run_id }} + EXPECTED_VERSION=$(python -c 'import nemo; print(nemo.__version__)') + python3 -m pip install --upgrade build python3 -m build @@ -122,7 +72,6 @@ jobs: cd ../ INSTALLED_VERSION=$(python -c 'import nemo; print(nemo.__version__)') - EXPECTED_VERSION=${{ steps.version-number.outputs.VERSION }} if [[ "$INSTALLED_VERSION" != "$EXPECTED_VERSION" ]]; then echo 
'Wheel has an outdated version, mission abort immediately!' @@ -134,34 +83,6 @@ jobs: python3 -m pip install --upgrade twine python3 -m twine upload --repository pypi dist/* - - name: Update PR issue comment - shell: bash - env: - message: ${{ github.event.comment.body }} - run: | - message="$message - - --- - - Releasebot 🤖: Release done 🎉 - " - message="${message//$'\n'/
}" - - curl -L \ - -X PATCH \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }} \ - -d '{"body":"'"$message"'"}' - - - name: Close Pull - run: | - cd ${{ github.run_id }} - gh pr close --comment "Releasebot 🤖: Closing PR" "${{ steps.get-pr-num.outputs.pr_number }}" - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: notify run: | MESSAGE='{ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000000..7fd5cd00b352 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,921 @@ +# Changelog + +## NVIDIA Neural Modules 2.0.0rc1 + +### Highlights + +#### Large language models + +- PEFT: QLoRA support, LoRA/QLora for Mixture-of-Experts (MoE) dense layer +- State Space Models & Hybrid Architecture support (Mamba2 and NV-Mamba2-hybrid) +- Support Nemotron, Minitron, Gemma2, Qwen, RAG +- Custom Tokenizer training in NeMo +- Update the Auto-Configurator for EP, CP and FSDP + +#### Multimodal + +- NeVA: Add SOTA LLM backbone support (Mixtral/LLaMA3) and suite of model parallelism support (PP/EP) +- Support Language Instructed Temporal-Localization Assistant (LITA) on top of video NeVA + +#### ASR + +- SpeechLM and SALM +- Adapters for Canary Customization +- Pytorch allocator in PyTorch 2.2 improves training speed up to 30% for all ASR models +- Cuda Graphs for Transducer Inference +- Replaced webdataset with Lhotse - gives up to 2x speedup +- Transcription Improvements - Speedup and QoL Changes +- ASR Prompt Formatter for multimodal Canary + +#### Export & Deploy + +- In framework PyTriton deployment with backends: - PyTorch - vLLM - TRT-LLM update to 0.10 +- TRT-LLM C++ runtime + +### Detailed Changelogs + +#### ASR + +
Changelog + +- Support dataloader as input to `audio` for transcription by @titu1994 :: PR: #9201 +- Clean up dev docs collection section by @yaoyu-33 :: PR: #9205 +- Fix Online_Offline_Microphone_VAD_Demo.ipynb by @stevehuang52 :: PR: #9251 +- Remove .nemo instead of renaming by @mikolajblaz :: PR: #9281 +- Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. by @galv :: PR: #9347 +- Revert "Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer." by @titu1994 :: PR: #9351 +- Prompt formatter API and canary transcribe tensor input support by @pzelasko :: PR: #9206 +- Fix prompt formatter's defaults=None case in multi-task model by @pzelasko :: PR: #9366 +- move AED chunked infer script by @stevehuang52 :: PR: #9367 +- Use model-cast-to-bfloat16 rather than AMP-to-bfloat16 for inference. by @galv :: PR: #9198 +- ci: Fix `L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_C… by @ko3n1g :: PR: #9399 +- Fix logging message for ASR by @titu1994 :: PR: #9469 +- Add support to change Multi task model prompt by @titu1994 :: PR: #9542 +- Enable encoder adapters for Canary and MultiTaskAED models by @titu1994 :: PR: #9409 +- Audio model collection by @anteju :: PR: #9263 +- TitaNet Batch Verify Speaker by @monica-sekoyan :: PR: #9337 +- Fix the arguments of forward_for_export function in msdd_models by @tango4j :: PR: #9624 +- chore: Pin branch in notebooks by @ko3n1g :: PR: #9697 +- refactor: notebook branch release by @ko3n1g :: PR: #9711 +- Canary Adapters tutorial (#9670) by @nithinraok :: PR: #9777 +- typos and branch name update to r2.0.0rc1 by @nithinraok :: PR: #9846 +- Fix RNNT alignments test by @artbataev :: PR: #9770 +- By default trust remote code from HF Datasets by @nithinraok :: PR: #9886 +- Temporarily disable cuda graph based RNN-T greedy inference for r2.0.0rc1 by @galv :: PR: #9904 +- Enable CUDA graphs by default, but require CUDA 12.6 for full graphs by @artbataev :: PR: #9919 +- update branch name for script by @nithinraok :: PR: #9936 +- updte branch by @nithinraok :: PR: #9942 +
+ +#### TTS + +
Changelog + +- Clean up dev docs collection section by @yaoyu-33 :: PR: #9205 +- Add mel codec checkpoints by @anteju :: PR: #9228 +- GPU unit tests: Mark flaky tests to be fixed by @pablo-garay :: PR: #9559 +- chore: Pin branch in notebooks by @ko3n1g :: PR: #9697 +- refactor: notebook branch release by @ko3n1g :: PR: #9711 + +
+ +#### LLM/Multimodal + +
Changelog + +- Update nemo.export module for quantized models by @janekl :: PR: #9218 +- Add save option to the TRT-LLM export test script by @oyilmaz-nvidia :: PR: #9221 +- Checkpoint resuming compatible for 2403 container by @suiyoubi :: PR: #9199 +- Clean up dev docs collection section by @yaoyu-33 :: PR: #9205 +- use get with fallback when reading checkpoint_callback_params by @akoumpa :: PR: #9223 +- Revert rope fusion defaults by @cuichenx :: PR: #9237 +- fix import by @akoumpa :: PR: #9240 +- Add TRT-LLM params like max_num_tokens and opt_num_tokens by @oyilmaz-nvidia :: PR: #9210 +- sum-reduce grad_norm in DP+CP domain by @erhoo82 :: PR: #9262 +- Alit/bert convert fix by @JRD971000 :: PR: #9285 +- conv1d stable version by @JRD971000 :: PR: #9330 +- Fix trainer builder when exp_manager is not in config by @yaoyu-33 :: PR: #9293 +- Fix Peft Weights Loading in NeVA by @yaoyu-33 :: PR: #9341 +- Skip sequence_parallel allreduce when using Mcore DistOpt by @akoumpa :: PR: #9344 +- Fix FSDP gradient calculation with orig params by @janEbert :: PR: #9335 +- TRT-LLM Export Code Cleanup by @oyilmaz-nvidia :: PR: #9270 +- support null/None truncation field by @arendu :: PR: #9355 +- NeVa token fusion by @paul-gibbons :: PR: #9245 +- bugfix if using mcore distOpt with sft by @akoumpa :: PR: #9356 +- Re-org export code by @oyilmaz-nvidia :: PR: #9353 +- QLoRA by @cuichenx :: PR: #9340 +- PeFT fix for distOpt by @akoumpa :: PR: #9392 +- [NeMo-UX] Integrating mcore's DistributedDataParallel into MegatronStrategy by @marcromeyn :: PR: #9387 +- cherry pick of #9266 by @dimapihtar :: PR: #9411 +- Enable specifying alpha for PTQ INT8 SmoothQuant method by @janekl :: PR: #9423 +- add support for new mcore ds features by @dimapihtar :: PR: #9388 +- LoRA for MoE Layer by @cuichenx :: PR: #9396 +- Mistral-7B: apply user's precision to output checkpoint by @akoumpa :: PR: #9222 +- Add option to merge distributed optimizer buckets by @timmoon10 :: PR: #9414 +- TRT-LLM 0.10 Update by @oyilmaz-nvidia :: PR: #9402 +- In-framework deployment by @oyilmaz-nvidia :: PR: #9438 +- Bugfix missing variables and argument changes to MegatronPretrainingRandomSampler by @jstjohn :: PR: #9458 +- Hyena Operator by @guyjacob :: PR: #9264 +- Refactor Quantizer for reusing in QAT by @kevalmorabia97 :: PR: #9276 +- move load state dict after initialize parallel state in nlp_model by @ryxli :: PR: #9382 +- Enable user to optionally upgrade Megatron by @jstjohn :: PR: #9478 +- Fix unwrap model by @cuichenx :: PR: #9480 +- fix operator precedence by @akoumpa :: PR: #9403 +- [NeMo-UX] Adding context- & expert-parallelism to MegatronStrategy by @marcromeyn :: PR: #9525 +- update mcoreddp call by @akoumpa :: PR: #9345 +- mcore distOpt restore fix by @akoumpa :: PR: #9421 +- vLLM Export Support by @apanteleev :: PR: #9381 +- PL: Delete precision if using plugin. 
TODO switch to MegatronTrainerB… by @akoumpa :: PR: #9535 +- extend get_gpt_layer_modelopt_spec to support MoE by @akoumpa :: PR: #9532 +- fix mock data generation for legacy dataset by @dimapihtar :: PR: #9530 +- add reset learning rate functionality by @dimapihtar :: PR: #9372 +- Use closed-formula to round by multiple by @akoumpa :: PR: #9307 +- GPU unit tests: Mark flaky tests to be fixed by @pablo-garay :: PR: #9559 +- Consolidate gpt continue training script into pretraining script by @yaoyu-33 :: PR: #9413 +- Enable encoder adapters for Canary and MultiTaskAED models by @titu1994 :: PR: #9409 +- PTQ refinements by @janekl :: PR: #9574 +- Add ModelOpt QAT example for Llama2 SFT model by @kevalmorabia97 :: PR: #9326 +- Multimodal projection layer adapter fix for PP>1 by @paul-gibbons :: PR: #9445 +- Add offline quantization script for QLoRA deployment by @cuichenx :: PR: #9455 +- Make QLoRA more model-agnostic by @cuichenx :: PR: #9488 +- Set n_gpu to None in nemo export by @oyilmaz-nvidia :: PR: #9593 +- [NeMo-UX] Fix Megatron-optimizer by @marcromeyn :: PR: #9599 +- Chat template support for megatron_gpt_eval.py by @akoumpa :: PR: #9354 +- [NeMo-UX] Add PEFT by @cuichenx :: PR: #9490 +- Alit/mamba tmp by @JRD971000 :: PR: #9612 +- Enable MCore checkpointing optimizations by @mikolajblaz :: PR: #9505 +- Change mixtral moe key name for trt-llm by @oyilmaz-nvidia :: PR: #9620 +- fix ckpt load bug by @dimapihtar :: PR: #9621 +- Alit/mamba by @JRD971000 :: PR: #9575 +- Unwrap ckpt_io for model opt (async save) by @mikolajblaz :: PR: #9622 +- MCore T5 support for NeMo - Training by @huvunvidia :: PR: #9432 +- [Nemo-UX] Expose transformer_layer_spec inside GPTConfig by @marcromeyn :: PR: #9592 +- Update NeMo Clip to Use MCore Modules by @yaoyu-33 :: PR: #9594 +- Mistral + Mixtral Support for NeVa by @paul-gibbons :: PR: #9459 +- Adding support for mcore generate by @shanmugamr1992 :: PR: #9566 +- Improve error messaging during trt-llm export by @oyilmaz-nvidia :: PR: #9638 +- [Cherrypick] support lora when kv_channel != hidden_size / num_heads by @cuichenx :: PR: #9644 +- Parametrize FPS group by @mikolajblaz :: PR: #9648 +- Cherry-pick megatron export fix from main by @borisfom :: PR: #9643 +- add documentation for reset_lr feature by @dimapihta +- chore: Pin branch in notebooks by @ko3n1g :: PR: #9697 +- Cherry pick: LITA Integration by @Slyne :: PR: #9684 +- SDXL improvements (and support for Draft+) by @rohitrango :: PR: #9654 +- Gemma 2 by @cuichenx :: PR: #9672 +- Allows non-strict load with distributed checkpoints by @mikolajblaz :: PR: #9613 +- refactor: notebook branch release by @ko3n1g :: PR: #9711 +- [NeMo-UX] Make TE and Apex dependencies optional by @ashors1 :: PR: #9550 +- Alit/r2.0.0 by @JRD971000 :: PR: #9718 +- Manually cherry-pick from PR 9679 (PR to main - Support SFT/Eval/PEFT for mcore T5) by @huvunvidia :: PR: #9737 +- In framework export by @oyilmaz-nvidia :: PR: #9658 +- T5 changes based on mcore changes by @pablo-garay :: PR: #9829 +- [NeMo-UX] Use single instance of loss reductions in GPTModel by @hemildesai :: PR: #9801 +- deprecate NeMo NLP tutorial by @dimapihtar :: PR: #9864 +- Disable nvFuser setup with PyTorch 23.11 and later by @athitten :: PR: #9837 +- make torch_dist ckpt strategy as default by @dimapihtar :: PR: #9852 +- add rampup bs documentation by @dimapihtar :: PR: #9884 +- copy of #9576 by @dimapihtar :: PR: #9986 +- Support Nvidia Torch and Arch versions by @thomasdhc :: PR: #9897 +- Bug fix for pooler causing dist checkpointing exception by 
@shanmugamr1992 :: PR: #10008 + +
+ +#### Export + +
Changelog + +- Update nemo.export module for quantized models by @janekl :: PR: #9218 +- Add save option to the TRT-LLM export test script by @oyilmaz-nvidia :: PR: #9221 +- Add TRT-LLM params like max_num_tokens and opt_num_tokens by @oyilmaz-nvidia :: PR: #9210 +- TRT-LLM Export Code Cleanup by @oyilmaz-nvidia :: PR: #9270 +- Re-org export code by @oyilmaz-nvidia :: PR: #9353 +- Use TensorRT-LLM native parameter names in nemo.export module by @janekl :: PR: #9424 +- TRT-LLM 0.10 Update by @oyilmaz-nvidia :: PR: #9402 +- vLLM Export Support by @apanteleev :: PR: #9381 +- Add page context fmha option in TensorRTLLM export by @meatybobby :: PR: #9526 +- Test C++ runtime on demand in nemo_export.py to avoid possible OOMs by @janekl :: PR: #9544 +- Fix nemo export test by @oyilmaz-nvidia :: PR: #9547 +- Add tps and pps params to the export script by @oyilmaz-nvidia :: PR: #9558 +- Add Multimodal Exporter by @meatybobby :: PR: #9256 +- Set n_gpu to None in nemo export by @oyilmaz-nvidia :: PR: #9593 +- Inflight nemo model export support by @JimmyZhang12 :: PR: #9527 +- vLLM Export Improvements by @apanteleev :: PR: #9596 +- Akoumparouli/nemo ux mixtral export by @akoumpa :: PR: #9603 +- Change mixtral moe key name for trt-llm by @oyilmaz-nvidia :: PR: #9620 +- Fix the arguments of forward_for_export function in msdd_models by @tango4j :: PR: #9624 +- Improve error messaging during trt-llm export by @oyilmaz-nvidia :: PR: #9638 +- Cherry-pick megatron export fix from main by @borisfom :: PR: #9643 +- In framework export by @oyilmaz-nvidia :: PR: #9658 +- Add missing imports for torch dist ckpt in export by @oyilmaz-nvidia :: PR: #9826~ + +
+ + + + +#### Bugfixes + +
Changelog + +- use get with fallback when reading checkpoint_callback_params by @akoumpa :: PR: #9223 +- fix import by @akoumpa :: PR: #9240 +- Remove .nemo instead of renaming by @mikolajblaz :: PR: #9281 +- call set_expert_model_parallel_world_size instead of set_cpu_expert_m… by @akoumpa :: PR: #9275 +- Fix typos in Mixtral NeMo->HF and Starcoder2 NeMo->HF conversion scripts by @evellasques :: PR: #9325 +- Skip sequence_parallel allreduce when using Mcore DistOpt by @akoumpa :: PR: #9344 +- Add OpenAI format response to r2.0.0rc1 by @athitten :: PR: #9796 +- [NeMo UX] Support generating datasets using different train/valid/test distributions by @ashors1 :: PR: #9771 +- Add missing imports for torch dist ckpt in export by @oyilmaz-nvidia :: PR: #9826 + +
+ +#### General Improvements + +
Changelog + +- [Nemo CICD] run_cicd_for_release_branches_also by @pablo-garay :: PR: #9213 +- rename paths2audiofiles to audio by @github-actions[bot] :: PR: #9220 +- Fix ASR_Context_Biasing.ipynb contains FileNotFoundError by @github-actions[bot] :: PR: #9234 +- ci: Remove duplicated job by @ko3n1g :: PR: #9258 +- Fix document links by @yaoyu-33 :: PR: #9260 +- Pin transformers by @github-actions[bot] :: PR: #9273 +- Fix loading github raw images on notebook by @github-actions[bot] :: PR: #9283 +- Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward by @github-actions[bot] :: PR: #9278 +- Refactor Sequence Packing Script by @cuichenx :: PR: #9271 +- [Nemo-UX] Move code to collections + fix some small bugs by @marcromeyn :: PR: #9277 +- Fix typo in HF tutorial by @github-actions[bot] :: PR: #9304 +- Expand documentation for data parallelism and distributed optimizer by @timmoon10 :: PR: #9227 +- Install alerting by @ko3n1g :: PR: #9311 +- typos by @github-actions[bot] :: PR: #9315 +- FP8 feature documentation by @ksivaman :: PR: #9265 +- [Nemo CICD] Comment out flaky tests by @pablo-garay :: PR: #9333 +- Fixed typos in README.rst by @gdevakumar :: PR: #9322 +- Update README.rst to clarify installation via Conda by @SimonCW :: PR: #9323 +- [Nemo CICD] update flaky test by @pablo-garay :: PR: #9339 +- fix lora and ptuning and isort/black by @github-actions[bot] :: PR: #9295 +- Fix P-tuning for Llama based models by @github-actions[bot] :: PR: #9300 +- add large model stable training fix and contrastive loss update for variable seq by @github-actions[bot] :: PR: #9348 +- Guard cuda memory allocator update by @github-actions[bot] :: PR: #9313 +- [Nemo CICD] Remove unnecessary commented out code by @pablo-garay :: PR: #9364 +- Update Gemma conversion script by @yaoyu-33 :: PR: #9365 +- Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. (#9347) by @github-actions[bot] :: PR: #9371 +- Re-enable cuda graphs in training modes. by @github-actions[bot] :: PR: #9343 +- fix typo infer_seq_lenght -> infer_seq_length by @akoumpa :: PR: #9370 +- Make a backward compatibility for old MSDD configs in label models by @github-actions[bot] :: PR: #9378 +- Dgalvez/fix greedy batch strategy name r2.0.0rc0 by @github-actions[bot] :: PR: #9253 +- Update README.rst by @jgerh :: PR: #9393 +- Force diarizer to use CUDA if cuda is available and if device=None. 
by @github-actions[bot] :: PR: #9390 +- ci: Properly catch failed tests by introduction of workflow templates by @ko3n1g :: PR: #9324 +- Fix T5 G2P Input and Output Types by @github-actions[bot] :: PR: #9269 +- Huvu/rag pipeline citest by @huvunvidia :: PR: #9384 +- Fix circular import for MM dataprep notebook by @github-actions[bot] :: PR: #9292 +- add check if num layers is divisible by pp size by @github-actions[bot] :: PR: #9298 +- [Nemo CICD] timeouts fix by @pablo-garay :: PR: #9407 +- [NeMo-UX] Removing un-used ModelConfig class by @marcromeyn :: PR: #9389 +- Add tutorial for Llama-3-8B lora training and deployment by @shashank3959 :: PR: #9359 +- [NeMo-UX] Removing default_path from ModelConnector by @marcromeyn :: PR: #9401 +- Fix README by @ericharper :: PR: #9415 +- [SD] Fix SD CUDA Graph Failure by @alpha0422 :: PR: #9319 +- [NeMo-UX] Adding file-lock to Connector by @marcromeyn :: PR: #9400 +- Add Dev Container Bug Report by @pablo-garay :: PR: #9430 +- Akoumparouli/profiling docs by @akoumpa :: PR: #9420 +- ci: Enrich notifications by @ko3n1g :: PR: #9412 +- Fix failing RIR unit test with lhotse 1.24+ by @pzelasko :: PR: #9444 +- [NeMo-UX] Adding support for mcore distributed optimizer by @marcromeyn :: PR: #9435 +- Use ModelOpt build_tensorrt_llm for building engines for qnemo checkpoints by @janekl :: PR: #9452 +- ci(notifications): Fix extraction of last 2K chars by @ko3n1g :: PR: #9450 +- Update readme with mlperf news by @ericharper :: PR: #9457 +- [NeMo-UX] Add nsys callback by @ashors1 :: PR: #9461 +- [NeMo UX] Introducing optimizer module by @marcromeyn :: PR: #9454 +- Fix minor import bug in deploy module by @oyilmaz-nvidia :: PR: #9463 +- ci(notifications): Fetch all jobs by @ko3n1g :: PR: #9465 +- Update build_dataset.py by @stevehuang52 :: PR: #9467 +- bionemo: bn2/add pipelineparallel dtype by @skothenhill-nv :: PR: #9475 +- [NeMo-UX] Integrate experiment manager features with NeMo-UX APIs by @ashors1 :: PR: #9460 +- Add python_requires by @galv :: PR: #9431 +- [NeMo-UX] Fixing imports of NeMoLogging, AutoResume & ModelCheckpoint by @marcromeyn :: PR: #9476 +- Modelopt Refactor for SDXL Quantization by @suiyoubi :: PR: #9279 +- [NeMo-UX] Fixing defaults in llm.train & Mistral7BModel by @marcromeyn :: PR: #9486 +- In framework deploy using deploy script by @oyilmaz-nvidia :: PR: #9468 +- [NeMo-UX] Integrate tokenizer import into model.import_ckpt by @marcromeyn :: PR: #9485 +- append to file by @malay-nagda :: PR: #9483 +- [NeMo-UX] Fix bug in import_ckpt by @marcromeyn :: PR: #9492 +- Add nemotron news by @ericharper :: PR: #9510 +- Add CICD test for Stable Diffusion by @michal2409 :: PR: #9464 +- Akoumparouli/nemo ux mixtral by @akoumpa :: PR: #9446 +- [NeMo-UX] Llama and Gemma by @cuichenx :: PR: #9528 +- [NeMo-UX] minor logging bug fixes by @ashors1 :: PR: #9529 +- Update neva conversion script from and to HF by @yaoyu-33 :: PR: #9296 +- [Nemo-UX] IO fixes by @marcromeyn :: PR: #9512 +- Fix lhotse tests for v1.24.2 by @pzelasko :: PR: #9546 +- [Nemo CICD] Make GPU Unit Tests non-optional by @pablo-garay :: PR: #9551 +- Add Python AIStore SDK to container and bump min Lhotse version by @pzelasko :: PR: #9537 +- [NeMo-UX] Fix tokenizer IO by @marcromeyn :: PR: #9555 +- [NeMo UX] Move mistral_7b.py to mistral.py by @akoumpa :: PR: #9545 +- ci: Do not attempt to send slack on fork by @ko3n1g :: PR: #9556 +- Fix SDXL incorrect name in Docs by @suiyoubi :: PR: #9534 +- Bump PTL version by @athitten :: PR: #9557 +- [Resiliency] Straggler detection by @jbieniusiewi :: 
PR: #9473 +- [NeMo-UX] Switch to torch_dist as default distributed checkpointing backend by @ashors1 :: PR: #9541 +- [NeMo-UX] Checkpointing bug fixes by @ashors1 :: PR: #9562 +- Expose MCore path_to_cache option by @maanug-nv :: PR: #9570 +- [NeMo-UX] Fix Trainer serialization by @marcromeyn :: PR: #9571 +- Update click version requirement by @thomasdhc :: PR: #9580 +- [Fault tolerance] Heartbeat detection by @maanug-nv :: PR: #9352 +- [Nemo-UX] Add fabric-API for manual forward-pass by @marcromeyn :: PR: #9577 +- [Nemo-UX] Add SDK-factories to llm-collection by @marcromeyn :: PR: #9589 +- [NeMo-UX] Some improvements to NeMoLogger by @marcromeyn :: PR: #9591 +- Set no_sync_func & grad_sync_fucn by @akoumpa :: PR: #9601 +- [NeMo-UX] Fix nemo logger when trainer has no loggers by @ashors1 :: PR: #9607 +- Fix the dictionary format returned by the `scheduler` method by @sararb :: PR: #9609 +- [NeMo-UX] Dataloading enhancements and bug fixes by @ashors1 :: PR: #9595 +- Fix serialization of AutoResume by @sararb :: PR: #9616 +- Jsonl support by @adityavavre :: PR: #9611 +- Akoumparouli/mistral import instruct chat template fix by @akoumpa :: PR: #9567 +- Remove .cuda calls, use device isntead by @akoumpa :: PR: #9602 +- fix converter defautl args by @akoumpa :: PR: #9565 +- fix: remove non_blocking from PTL's .cuda call by @akoumpa :: PR: #9618 +- NeVA Minor Fixes by @yaoyu-33 :: PR: #9608 +- [NeMo-UX] fix pretrianing data sizes and weights by @cuichenx :: PR: #9627 +- [NeMo-UX] async checkpointing support by @ashors1 :: PR: #9466 +- Change default parallel_save to False by @mikolajblaz :: PR: #9632 +- Add REST API to deploy module by @athitten :: PR: #9539 +- ci: Timeout per step, not job by @ko3n1g :: PR: #9635 +- [NeMo-UX] Fix when optimizers are setup for PEFT by @marcromeyn :: PR: #9619 +- [NeMo-UX] Fix pipeline parallel bug by @ashors1 :: PR: #9637 +- Fixing import error fior llama-index (RAG pipeline) by @pablo-garay :: PR: #9662 +- llama CI fix by @rohitrango :: PR: #9663 +- [NeMo-UX] Make 'load_directly_on_device' configurable by @ashors1 :: PR: #9657 +- [Nemo-UX] Including all trainable-params in a PEFT-checkpoint by @marcromeyn :: PR: #9650 +- [NeMo-UX] Fix imports so local configuration of runs works again by @marcromeyn :: PR: #9690 +- Set TE flag in legacy -> mcore conversion script by @terrykong :: PR: #9722 +- Update starthere docs text by @erastorgueva-nv :: PR: #9724 +- TorchAudio installation workaround for incorrect `PYTORCH_VERSION` variable by @artbataev :: PR: #9736 +- [NeMo-UX] Match nemo 1's default behavior for drop_last and pad_samples_to_global_batch_size by @ashors1 :: PR: #9707 +- add a bit more for timeout (#9702) by @pablo-garay :: PR: #9754 +- Fix missing parallelisms by @maanug-nv :: PR: #9725 +- update branch by @nithinraok :: PR: #9764 +- Fix data preprocessing script by @cuichenx :: PR: #9759 +- vLLM 0.5.1 update by @apanteleev :: PR: #9779 +- upper bound hf-hub by @akoumpa :: PR: #9805 +- Fix few issues and docs for neva and clip in r2.0.0rc1 by @yaoyu-33 :: PR: #9681 +- add dummy vision and text transformer config (assumed mcore to be false) by @rohitrango :: PR: #9699 +- fix lita bugs by @Slyne :: PR: #9810 +- [NeMo-UX] Log `val_loss` by @ashors1 :: PR: #9814 +- [NeMo-UX] Fix some dataloading bugs by @ashors1 :: PR: #9807 +- [NeMo-UX] Adding recipes by @marcromeyn :: PR: #9720 +- [NeMo-UX] Set async_save from strategy rather than ModelCheckpoint by @ashors1 :: PR: #9800 +- Fix hf hub for 0.24+ by @titu1994 :: PR: #9806 +- [NeMo-UX] Fix a minor bug with 
async checkpointing by @ashors1 :: PR: #9856 +- [NeMo-UX] make progress bar easier to parse by @ashors1 :: PR: #9877 +- Docs: add "Nemo Fundamentals" page by @erastorgueva-nv :: PR: #9835 +- Create __init__.py by @stevehuang52 :: PR: #9892 +- [NeMo-UX] Fixes to make PreemptionCallback work by @hemildesai :: PR: #9830 +- Fix Docker build. Make Dockerfile consistent with CI by @artbataev :: PR: #9784 +- Multimodal data prep notebook fix by @cuichenx :: PR: #9910 +- [NeMo-UX] Add distributed checkpointing unit tests by @ashors1 :: PR: #9794 +- r2.0.0rc1 fix for dist checkpoint loading by @yaoyu-33 :: PR: #9854 +- [NeMo-UX] Rename sdk references to NeMo Run by @hemildesai :: PR: #9872 +- [NeMo-UX] Fix some serialization bugs by @ashors1 :: PR: #9868 +- add mixtral neva tutorial (moe + token fusion + siglip) by @paul-gibbons :: PR: #9926 +- [NeMo-UX] Add more NeMo Logger tests by @ashors1 :: PR: #9795 +- Akoumparouli/mixtral fixes for r2.0.0rc1 by @akoumpa :: PR: #9911 +- R2.0.0rc1 clip fix by @Slyne :: PR: #9871 +- [NeMo-UX] Add missing docstrings and update some defaults by @ashors1 :: PR: #9895 +- Add REST service requirements.txt by @oyilmaz-nvidia :: PR: #9923 +- add bert latest fix by @JRD971000 :: PR: #9921 +- remove empy reconfigure_limit_batches by @akoumpa :: PR: #9934 +- fix mem by @terrykong :: PR: #9964 +- Run a sample query for a quantized model conditionally by @janekl :: PR: #9965 +- Add pydantic-settings by @oyilmaz-nvidia :: PR: #9961 +- Resiliency features update by @jbieniusiewi :: PR: #9714 +- [NeMo-UX] Wrap task config save in a try/except by @ashors1 :: PR: #9956 +- [NeMo-UX] Update default PTL logging `save_dir` by @ashors1 :: PR: #9954 +- Fix lita tutorial by @Slyne :: PR: #9980 +- Add deploy and REST API support to NeMo 2.0 by @athitten :: PR: #9834 +- ci: Allow changelog manual (#10156) by @ko3n1g :: PR: #10157 +- docs: Add changelog by @ko3n1g :: PR: #10155 +- add manifest file by @ko3n1g :: PR: #10161 + +
+ +## NVIDIA Neural Modules 2.0.0rc0 + +### Highlights + +#### LLM and MM + +##### Models + +- Megatron Core RETRO + - Pre-training + - Zero-shot Evaluation + +- Pretraining, conversion, evaluation, SFT, and PEFT for: + - Mixtral 8X22B + - Llama 3 + - SpaceGemma + +- Embedding Models Fine Tuning + - Mistral + - BERT + +- BERT models + - Context Parallel + - Distributed checkpoint + +- Video capabilities with NeVa + +##### Performance + +- Distributed Checkpointing + - Torch native backend + - Parallel read/write + - Async write + +- Multimodal LLM (LLAVA/NeVA) + - Pipeline Parallelism support + - Sequence packing support + +##### Export + +- Integration of Export & Deploy Modules into NeMo Framework container + - Upgrade to TRT-LLM 0.9 + +#### Speech (ASR & TTS) + +##### Models + +- AED Multi Task Models (Canary) - Multi-Task Multi-Lingual Speech Recognition / Speech Translation model +- Multimodal Domain - Speech LLM supporting SALM Model +- Parakeet-tdt_ctc-1.1b Model - RTFx of > 1500 (can transcribe 1500 seconds of audio in 1 second) +- Audio Codec 16kHz Small - NeMo Neural Audio Codec for discretizing speech for use in LLMs + - mel_codec_22khz_medium + - mel_codec_44khz_medium + +##### Perf Improvements + +- Transcribe() upgrade - Enables one line transcribe with files, tensors, data loaders +- Frame looping algorithm for RNNT faster decoding - Improves Real Time Factor (RTF) by 2-3x +- Cuda Graphs + Label-Looping algorithm for RNN-T and TDT Decoding - Transducer Greedy decoding at over 1500x RTFx, on par with CTC Non-Autoregressive models +- Semi Sorted Batching support - External User contribution that speeds up training by 15-30%. + +##### Customization + +- Context biasing for CTC word stamping - Improve accuracy for custom vocabulary and pronunciation + - Longform Inference + - Longform inference support for AED models +- Transcription of multi-channel audio for AED models + +##### Misc + +- Upgraded webdataset - Speech and LLM / Multimodal unified container + +### Detailed Changelogs + +#### ASR + +
Changelog + +- Enable using hybrid asr models in CTC Segmentation tool by @erastorgueva-nv :: PR: #8828 +- TDT confidence fix by @GNroy :: PR: #8982 +- Fix union type annotations for autodoc+mock-import rendering by @pzelasko :: PR: #8956 +- NeMo dev doc restructure by @yaoyu-33 :: PR: #8896 +- Improved random seed configuration for Lhotse dataloaders with docs by @pzelasko :: PR: #9001 +- Fix #8948, allow preprocessor to be stream captured to a cuda graph when doing per_feature normalization by @galv :: PR: #8964 +- [ASR] Support for transcription of multi-channel audio for AED models by @anteju :: PR: #9007 +- Add ASR latest news by @titu1994 :: PR: #9073 +- Fix docs errors and most warnings by @erastorgueva-nv :: PR: #9006 +- PyTorch CUDA allocator optimization for dynamic batch shape dataloading in ASR by @pzelasko :: PR: #9061 +- RNN-T and TDT inference: use CUDA graphs by default by @artbataev :: PR: #8972 +- Fix #8891 by supported GPU-side batched CTC Greedy Decoding by @galv :: PR: #9100 +- Update branch for notebooks and ci in release by @ericharper :: PR: #9189 +- Enable CUDA graphs by default only for transcription by @artbataev :: PR: #9196 +- rename paths2audiofiles to audio by @nithinraok :: PR: #9209 +- Fix ASR_Context_Biasing.ipynb contains FileNotFoundError by @andrusenkoau :: PR: #9233 +- Cherrypick: Support dataloader as input to `audio` for transcription (#9201) by @titu1994 :: PR: #9235 +- Update Online_Offline_Microphone_VAD_Demo.ipynb by @stevehuang52 :: PR: #9252 +- Dgalvez/fix greedy batch strategy name r2.0.0rc0 by @galv :: PR: #9243 +- Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward by @galv :: PR: #9246 +- Fix loading github raw images on notebook by @nithinraok :: PR: #9282 +- typos by @nithinraok :: PR: #9314 +- Re-enable cuda graphs in training modes. by @galv :: PR: #9338 +- add large model stable training fix and contrastive loss update for variable seq by @nithinraok :: PR: #9259 +- Fix conv1d package in r2.0.0rc0 by @pablo-garay :: PR: #9369 +- Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. (#9347) by @titu1994 :: PR: #9350 +- Make a backward compatibility for old MSDD configs in label models by @tango4j :: PR: #9377 +- Force diarizer to use CUDA if cuda is available and if device=None. by @tango4j :: PR: #9380 + +
+ +#### TTS + +
Changelog + +- [TTS] Add tutorial for training audio codecs by @rlangman :: PR: #8723 +- Update radtts.py by @blisc :: PR: #9097 +- [Nemo CICD] RADTTS test optional by @pablo-garay :: PR: #9112 +- Remove Radtts CI test by @blisc :: PR: #9144 +- Fix T5 G2P Input and Output Types by @blisc :: PR: #9224 + +
+ +#### LLM and MM + +
Changelog + +- Rachitg/dpa by @rachitgarg91 :: PR: #8911 +- Remove precision args in trainer due to PTL update by @yaoyu-33 :: PR: #8908 +- Huvu/mcore retro by @huvunvidia :: PR: #8861 +- fsdp tp > 1 bug fix by @dimapihtar :: PR: #8947 +- Fix memory leak at loss func by @minitu :: PR: #8868 +- change the condition for get qkv tensor from linear_qkv output in mcoremixin by @HuiyingLi :: PR: #8965 +- Add safety checks for 'data' key in MegatronGPTModel cfg by @HuiyingLi :: PR: #8991 +- [NeMo-UX] Adding MegatronParallel by @cuichenx :: PR: #8987 +- Skip top_p computations when set to 1.0 by @odelalleau :: PR: #8905 +- Gemma bug by @cuichenx :: PR: #8962 +- [NeMo-UX] Adding megatron strategy by @marcromeyn :: PR: #8995 +- Quantized checkpoint support in export and deploy modules by @janekl :: PR: #8859 +- add geglu to mlp swap by @JRD971000 :: PR: #8999 +- add timeout for new_group by @acphile :: PR: #8998 +- Zero-shot evaluation pipeline for mcore RETRO by @huvunvidia :: PR: #8941 +- Added fusion for squared relu by @sanandaraj5597 :: PR: #8963 +- Developer Documents for mcore RETRO by @huvunvidia :: PR: #9026 +- [NeMo-UX] Adding GPTModel & MockDataModule by @marcromeyn :: PR: #9011 +- Adding unit test for mcore RETRO model by @huvunvidia :: PR: #9022 +- docs and simplification of cmd args by @arendu :: PR: #8979 +- [NeMo-UX] Add checkpoint-io to MegatronStrategy by @marcromeyn :: PR: #9057 +- Enable Sequence Packing and Pipeline Parallel in NeVA by @yaoyu-33 :: PR: #8957 +- Mingyuanm/add back fp8 support to sd by @Victor49152 :: PR: #9070 +- unfused lora by @arendu :: PR: #9004 +- Handle case where num_query_groups is set to null for LoRA config setup by @vysarge :: PR: #9075 +- Alit/griffin by @JRD971000 :: PR: #9021 +- Implement DistributedCheckpointIO by @mikolajblaz :: PR: #9016 +- Video Neva Pretraining + Inference Implementation by @paul-gibbons :: PR: #9095 +- HF to .nemo for Mixtral-8x22B-instruct by @akoumpa :: PR: #9060 +- mcore ds updates by @dimapihtar :: PR: #8951 +- Alit/griffin perf by @JRD971000 :: PR: #9107 +- Add assert for max_steps to be positive in MegatronGPTSFTModel by @athitten :: PR: #9110 +- Extend sequence length padding for GPT SFT to account for context parallel by @vysarge :: PR: #8869 +- Update gpt dataset config parameter for mock by @thomasdhc :: PR: #9118 +- Add Mcore DistributedDataParallel and distributed optimizer into Nemo by @gdengk :: PR: #9034 +- Revert "Add assert for max_steps to be positive in MegatronGPTSFTMode… by @pablo-garay :: PR: #9128 +- scripts to convert HF lora to nemo by @arendu :: PR: #9102 +- Prevent duplicated checkpoints by @mikolajblaz :: PR: #9015 +- add TN/ITN link in speech tools list by @erastorgueva-nv :: PR: #9142 +- Cleanup deprecated files and temporary changes by @cuichenx :: PR: #9088 +- Use DP+CP groups as the FSDP sharding domain by @erhoo82 :: PR: #9145 +- CUDA memory profile by @erhoo82 :: PR: #9096 +- Fix missing func for T5 model by @gdengk :: PR: #9141 +- Add knob for load_directly_on_device by @mikolajblaz :: PR: #9125 +- Revert rope fusion defaults by @cuichenx :: PR: #9238 +- Update nemo.export module for quantized models by @janekl :: PR: #9250 +- Fix circular import for MM dataprep notebook by @cuichenx :: PR: #9287 +- neva media_type + text generation default fix by @paul-gibbons :: PR: #9257 +- fix lora and ptuning and isort/black by @oyilmaz-nvidia :: PR: #9290 +- add check if num layers is divisible by pp size by @dimapihtar :: PR: #9208 +- Fix P-tuning for Llama based models by @apanteleev :: PR: #9297 +- 
add deprecation warnings by @pablo-garay :: PR: #9266 +- move pooler under post_process by @dimapihtar :: PR: #9328 +- add deprecation note for nmt by @dimapihtar :: PR: #9342 +- Fix incorrect checkpoint removal logic (#9192) by @mikolajblaz :: PR: #9204 +- fix fp16 precision issue by @dimapihtar :: PR: #9376 +- Fix module.training for Neva in FusedAttn backward which causes nan by @yaoyu-33 :: PR: #8877 + +
+ +#### Export + +
Changelog + +- Updates for TRT-LLM 0.9 by @oyilmaz-nvidia :: PR: #8873 +- Mingyuanm/sdxl export by @Victor49152 :: PR: #8926 +- Avoid unpacking NeMo checkpoints before exporting to TRT-LLM by @apanteleev :: PR: #8866 +- Update gemma for trt-llm 0.9 by @oyilmaz-nvidia :: PR: #8974 +- TRT-LLM export P-tuning related fixes by @apanteleev :: PR: #8863 + +
+ +#### General Improvements + +
Changelog + +- Update package info by @ericharper :: PR: #8793 +- [Nemo CICD] Update mcore 4.13.24 by @pablo-garay :: PR: #8917 +- Akoumparouli/low mem mixtral ckpt converter by @akoumpa :: PR: #8895 +- Adding RETRO tests to Action Tests (cicd-main.yml) by @huvunvidia :: PR: #8942 +- Akoumparouli/fix sd train 2 by @akoumpa :: PR: #8883 +- Update te install for jenkins by @ericharper :: PR: #8954 +- [Nemo CICD] Add last job depending on others for blocking check by @pablo-garay :: PR: #8959 +- Minor quantization pipeline updates by @janekl :: PR: #8924 +- Fix External CLIP Converter by @yaoyu-33 :: PR: #8960 +- PP support in LoRA merge script by @cuichenx :: PR: #8934 +- Update PR template by @ericharper :: PR: #8978 +- Update Latest News by @shashank3959 :: PR: #8837 +- Fix incorrect link to latest news in README by @shashank3959 :: PR: #8985 +- Update dependency install for LLM and MM by @ericharper :: PR: #8990 +- Temporarily remove mcore dep by @ericharper :: PR: #9010 +- [Nemo CICD] further specialize runners for more parallelism by @pablo-garay :: PR: #9036 +- Update mm dataprep notebook based on feedback by @cuichenx :: PR: #9029 +- Fix import in lora merge script by @cuichenx :: PR: #9032 +- [Nemo CICD] Run when labeled:Run CICD by @pablo-garay :: PR: #9044 +- [Nemo CICD] Add tag/label for 1-gpu runner by @pablo-garay :: PR: #9046 +- [Nemo CICD] checkout v4 by @pablo-garay :: PR: #9048 +- [Nemo CICD] Remove temp test change by @pablo-garay :: PR: #9049 +- remove in-place addition for dreambooth train with text encoder by @Victor49152 :: PR: #8825 +- Mingyuanm/sdxl quantization notebook by @Victor49152 :: PR: #9042 +- [Nemo CICD] Trigger on comment issued by @pablo-garay :: PR: #9062 +- zarr ckpt to torch_dist ckpt converter by @dimapihtar :: PR: #8842 +- Restore PTQ tests for Llama2 (reopened) by @janekl :: PR: #9064 +- add clip H config by @JRD971000 :: PR: #9082 +- [NeMo-UX] Add mixed-precision plugin by @marcromeyn :: PR: #9065 +- Comment baichuan test and update pr template by @ericharper :: PR: #9085 +- Add safe extraction of nemo tar files by @athitten :: PR: #8976 +- Improved `shard_id` parsing in `LazyNemoTarredIterator`, enables AIS dataloading by @pzelasko :: PR: #9077 +- [NeMo-UX] Add mistral-7b model by @marcromeyn :: PR: #9066 +- Llama3 Conversion Script Update by @suiyoubi :: PR: #9089 +- dehardcode test string by @JimmyZhang12 :: PR: #8865 +- [Nemo CICD] Try trigger cicd run on comment by @pablo-garay :: PR: #9111 +- Lhotse dataloading: RIR augmentation and nemo/tarred input support for RIR and noise aug by @pzelasko :: PR: #9109 +- mixtral evaluation PR by @Slyne :: PR: #8989 +- [Nemo CICD] Revert: run GHA cicd on comment by @pablo-garay :: PR: #9119 +- [Nemo CICD] Comment out flaky test: running too long by @pablo-garay :: PR: #9123 +- [Nemo CICD] Add timeout to unit tests by @pablo-garay :: PR: #9132 +- [Nemo CICD] Indicate optional test in name (prefix) by @pablo-garay :: PR: #9139 +- video neva null image+video folder path fix by @paul-gibbons :: PR: #9116 +- [NeMo-UX] Add data module by @cuichenx :: PR: #9133 +- NeMo Inference Requirements by @oyilmaz-nvidia :: PR: #9093 +- Remove debug print by @maanug-nv :: PR: #9074 +- Remove legacy CI by @pablo-garay :: PR: #9149 +- Update support for push_to_hf_hub() by @titu1994 :: PR: #9159 +- [Nemo CICD] comment out flaky PTQ tests by @pablo-garay :: PR: #9160 +- Update branch by @ericharper :: PR: #9211 +- dist adam transpose fix by @dimapihtar :: PR: #9239 +- [Nemo CICD] Increase time limit for Speech_Checkpoints_tests 
(#9186) by @pablo-garay :: PR: #9247 +- Pin transformers by @ericharper :: PR: #9261 +- Fix typo in HF tutorial by @titu1994 :: PR: #9302 + +
+ +## NVIDIA Neural Modules 1.23.0 + +### Highlights + +#### Models + +##### Nvidia Starcoder 2 - 15B + +- Announcement - https://developer.nvidia.com/blog/unlock-your-llm-coding-potential-with-starcoder2/ +- AI Foundation Model Inference - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-foundation/models/starcoder2-15b +- https://huggingface.co/bigcode/starcoder2-15b + +##### NeMo Canary +Announcement - https://nvidia.github.io/NeMo/blogs/2024/2024-02-canary/ + +- https://huggingface.co/nvidia/canary-1b + +#### NeMo LLM + +- Falcon +- Code Llama +- StarCoder +- GPT perf improvements +- Context parallelism +- Mistral +- Mixtral (without expert parallelism) +- Mcore GPT Dataset integration + +#### NeMo MM +- CLIP +- Stable Diffusion (supporting LoRA) +- Imagen +- ControlNet (for SD) +- Instruct pix2pix (for SD) +- LLAVA +- NeVA +- DreamFusion++ +- NSFW filtering + +#### NeMo ASR + +- Lhotse Dataloading support #7880 +- Canary: Multi task multi lingual ASR #8242 +- LongForm Audio for Diarization #7737 +- Faster algorithm for RNN-T Greedy #7926 +- Cache-Aware streaming notebook #8296 + +#### NeMo TTS + +#### NeMo Vision + +#### Known Issues + +##### ASR + +###### RNNT WER calculation when fused batch size > 1 during validation / test step() + +Previously, the RNNT metric was stateful while the CTC one was not ([r1.22.0](https://github.com/NVIDIA/NeMo/blob/r1.22.0/nemo/collections/asr/metrics/rnnt_wer_bpe.py#L419-L420), [r1.23.0](https://github.com/NVIDIA/NeMo/blob/r1.23.0/nemo/collections/asr/metrics/wer.py#L333)) + +Therefore this calculation in the RNNT joint for fused operation worked properly. However with the unification of metrics in r1.23.0, a bug was introduced where only the last sub-batch of metrics calculates the scores and does not accumulate. This is patched via https://github.com/NVIDIA/NeMo/pull/8587 and will be fixed in the next release. + +**Workaround**: Explicitly disable fused batch size during inference using the following command + +```python +from omegaconf import open_dict +model = ... +decoding_cfg = model.cfg.decoding +with open_dict(decoding_cfg): + decoding_cfg.fused_batch_size = -1 +model.change_decoding_strategy(decoding_cfg) +``` + +Note: This bug does not affect scores calculated via model.transcribe() (since it does not calculate metrics during inference, just text), or using the `transcribe_speech.py` or `speech_to_text_eval.py` in `examples/asr`. + +###### Two failing unit tests due to a change in expected results, caused by lhotse version update + +#### Container + +For additional information regarding NeMo containers, please visit: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo + +`docker pull nvcr.io/nvidia/nemo:24.01.speech` + +#### ASR + +
Changelog + +- Update link to yaml file in ASR_with_Transducers.ipynb by @Faith-Nchifor :: PR: #8014 +- Use convert_hf_dataset_to_nemo by @karpnv :: PR: #8017 +- Update asr_language_modeling.rst: Add a missing word by @martin0258 :: PR: #8007 +- spelling mistake by @orena1 :: PR: #7903 +- update asr eval by @stevehuang52 :: PR: #8045 +- fix noise aug by @stevehuang52 :: PR: #8057 +- Various fixes for typos and urls by @titu1994 :: PR: #8066 +- [Fix] Increase length check tolerance to prevent test failing by @anteju :: PR: #8067 +- Add text metrics to asr eval by @stevehuang52 :: PR: #8087 +- fix device setting to allow using accelerator cpu by @orena1 :: PR: #8084 +- .ctm in data simulator annotator compliant with RT-09 specification by @popcornell :: PR: #8004 +- Fix AST eval by @stevehuang52 :: PR: #8112 +- fix: numba.*_num_threads resets torch num_threads #8141 by @itzsimpl :: PR: #8145 +- Update dependencies by @titu1994 :: PR: #8156 +- NeMo + Lhotse integration by @pzelasko :: PR: #7880 +- Speedup RNN-T greedy decoding by @artbataev :: PR: #7926 +- [docker] Install k2 before NeMo for faster image rebuilding by @pzelasko :: PR: #8204 +- [docs] Add --force_codec to tarred dataset creation examples by @pzelasko :: PR: #8227 +- Temporarily use the previous RNN-T decoding algorithm as default by @artbataev :: PR: #8226 +- Make TDT inference not require duration params by @hainan-xv :: PR: #8207 +- Cache Aware Streaming tutorial notebook by @erastorgueva-nv :: PR: #8296 +- fix path location and branch by @nithinraok :: PR: #8304 +- Attention encoder-decoder models for multiple speech-to-text tasks … by @titu1994 :: PR: #8324 +- Remove asr webapp by @titu1994 :: PR: #8347 +- remove _target_ at model level in aed model config [ASR] by @krishnacpuvvada :: PR: #8351 +- Add change_vocabulary and save_tokenizers() support to Multitask ASR models by @titu1994 :: PR: #8357 +- Change default beam size by @titu1994 :: PR: #8371 +- adding jenkins test for speech_to_text_aed model by @krishnacpuvvada :: PR: #8368 +- Add Finetuning tutorial with HF Datasets by @nithinraok :: PR: #8356 +- wer fix by @tbartley94 :: PR: #8404 +- add ensemble decoding fix by @nithinraok :: PR: #8427 +- Update k2 by @artbataev :: PR: #8492 + +
+ +#### TTS + +
Changelog + +- [TTS] Scale sampler steps by number of devices by @rlangman :: PR: #7947 +- Add All Multimodal Source Code Part 2: Text to image, x to nerf by @yaoyu-33 :: PR: #7970 +- [TTS] Add period discriminator and feature matching loss to codec recipe by @rlangman :: PR: #7884 +- Added VectorQuantizer base class by @anteju :: PR: #8011 + +
+ +#### LLMS + +
Changelog + +- Add interface to set NCCL options of each process group by @erhoo82 :: PR: #7923 +- Support O2 training of PEFT and SFT by @cuichenx :: PR: #7971 +- [NLP] Access scaler only in FP16 case by @janekl :: PR: #7916 +- [NLP] Minor improvements in Llama conversion script by @janekl :: PR: #7978 +- [NLP] Use helpers from utils_funcs.py in Llama conversion by @janekl :: PR: #7979 +- [NLP] Remove replace_sampler_ddp (deprecated in Trainer) by @janekl :: PR: #7981 +- Reworked MegatronPretrainingRandomBatchSampler to correctly handle epochs > 1 by @trias702 :: PR: #7920 +- Remove deprecated arguments from TE's TransformerLayer by @jbaczek :: PR: #7917 +- Add All Multimodal Source Code by @yaoyu-33 :: PR: #7791 +- First draft of mcore bert model in NeMo by @shanmugamr1992 :: PR: #7814 +- Support Falcon Variants (7B/40B/180B) in Mcore NeMo by @xuanzic :: PR: #7666 +- FSDP + Tensor Parallelism by @erhoo82 :: PR: #7897 +- Packed Sequence by @cuichenx :: PR: #7945 +- Adding method back that was removed accidentally by @ericharper :: PR: #8038 +- [NLP] ArtifactItem with init=True to make it debuggable by @janekl :: PR: #7980 +- SFT patch: (1) enable sequence parallelism and (2) enable profile by @erhoo82 :: PR: #7963 +- migration to PTL 2.0 for spellmapper model by @bene-ges :: PR: #7924 +- Change the megatron config lr scheduler default and fix to change partitions script by @shan18 :: PR: #8094 +- (1) Add SHARP interface to M-CORE, (2) use send/recv to send train loss to the first rank instead of b-cast by @erhoo82 :: PR: #7793 +- Reconfigure limit_val_batches only for int by @athitten :: PR: #8099 +- Fixing wrapper and moving it to base class by @shanmugamr1992 :: PR: #8055 +- fix gated_linear_unit bug by @Agoniii :: PR: #8042 +- Fix Adapter for MCore models by @cuichenx :: PR: #8124 +- add war fix for sync issues by @gshennvm :: PR: #8130 +- Improve PEFT UX by @cuichenx :: PR: #8131 +- Enhance flexibility by passing callbacks as method argument by @michal2409 :: PR: #8015 +- context parallelism by @xrennvidia :: PR: #7739 +- Make pipelined TP comm overlap available with mcore by @erhoo82 :: PR: #8005 +- remove deprecated scripts by @arendu :: PR: #8138 +- adding OnlineSampleMapping by @arendu :: PR: #8137 +- Add distopt support for FP8 params and BF16 optimizer state by @timmoon10 :: PR: #7909 +- Revert adding OnlineSampleMapping by @pablo-garay :: PR: #8164 +- Token count and sequence length logging for MegatronGPTSFTModel by @vysarge :: PR: #8136 +- Use latest apex internal API by @jbaczek :: PR: #8129 +- tune specific params in the base model by @arendu :: PR: #7745 +- Virtual pipeline parallel support for MegatronGPTSFTModel by @vysarge :: PR: #7964 +- removed deprecated peft model by @arendu :: PR: #8183 +- remove more deprecated files by @arendu :: PR: #8169 +- Pre-generate cu_seqlens argmin and max_seqlen to remove host-to-device sync by @erhoo82 :: PR: #8108 +- Add the interface to use SHARP to FSDP strategy by @erhoo82 :: PR: #8202 +- Multimodal required NLP base model changes by @yaoyu-33 :: PR: #8188 +- [NLP] Improve and unify loading state_dict for community models by @janekl :: PR: #7977 +- Rename Finetuning Scripts by @cuichenx :: PR: #8201 +- Final multimodal PR with our recent developments on MM side by @yaoyu-33 :: PR: #8127 +- Add include_text parameter to SFT dataloaders by @Kipok :: PR: #8198 +- Add random_seed argument to generate by @Kipok :: PR: #8162 +- Added support for neptune logger by @harishankar-gopalan :: PR: #8210 +- Pre-compute max_seqlen and 
cu_seqlens_argmin in all model-parallel cases by @erhoo82 :: PR: #8222 +- Use PackedSeqParams in accordance with changes in Megatron-LM by @cuichenx :: PR: #8205 +- Fix to peft & virtual pipeline parallel unsupported check by @vysarge :: PR: #8216 +- Fixed the tp overlap switch by @sanandaraj5597 :: PR: #8195 +- add knobs for rope/swiglu fusion by @lhb8125 :: PR: #8184 +- Added sample cpu_offloading switch to YAML by @sanandaraj5597 :: PR: #8148 +- Syncing random seed between ranks in generate by @Kipok :: PR: #8230 +- add first_val_step to mcore scheduler by @JimmyZhang12 :: PR: #8150 +- Correct padding for SFT input data to account for sequence parallel + TE's fp8 op dimension requirements by @vysarge :: PR: #8240 +- Mistral 7b conversion script by @akoumpa :: PR: #8052 +- switch to mcore dataset [with FIM support] by @dimapihtar :: PR: #8149 +- Mixtral to NeMo conversion script. by @akoumpa :: PR: #8155 +- fixes to accomendate mcore changes by @HuiyingLi :: PR: #8261 +- Allow MegatronPretrainingRandomSampler to do multi-epoch training by @trias702 :: PR: #8239 +- Add dist ckpt support for regular optimizers by @mikolajblaz :: PR: #7749 +- add deallocate pipeline output optimization by @JimmyZhang12 :: PR: #8279 +- Fix memory leak caused by context parallelism hanging references by omegaconf by @JimmyZhang12 :: PR: #8299 +- distributed fused adam + rampup bs support by @dimapihtar :: PR: #8302 +- Update PEFT Doc by @cuichenx :: PR: #8262 +- Converter script fixes for mixtral/mistral by @akoumpa :: PR: #8272 +- Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP>1 by @erhoo82 :: PR: #8334 +- Enable megatron core loggers for GPT pretraining by @ashbhandare :: PR: #8354 +- mcore ds fix by @dimapihtar :: PR: #8283 +- release updates by @dimapihtar :: PR: #8378 +- Mcore customization doc by @HuiyingLi :: PR: #8298 +- updated link to pubmed by @nithinraok :: PR: #8402 +- mcore customization doc minor fix by @HuiyingLi :: PR: #8421 +- Fixing mcore bert for TP, PP and SP by @shanmugamr1992 :: PR: #8336 +- Add settings to suppress bf16 compile errors in CI on V100 by @athitten :: PR: #8481 +- MoE parameter passing by @akoumpa :: PR: #8255 +- Add fp8 support for SD/Update notebook paths by @Victor49152 :: PR: #8489 + +
+ +#### NeMo Tools + +
Changelog + +- SDE bugfix log by @Jorjeous :: PR: #8430 + +
+ +#### General Improvements + +
Changelog + +- Add news section to README by @ericharper :: PR: #7984 +- Fixing conversion script to work for code llama by @shanmugamr1992 :: PR: #7997 +- Fix crash when converting to mcore a model using rotary embeddings by @odelalleau :: PR: #7998 +- Added a procedure for Windows users, README by @Jorjeous :: PR: #7942 +- Update manifest.py to speedup loading tarred datasets by @stevehuang52 :: PR: #7900 +- [Fix] Fixed name of a test by @anteju :: PR: #7986 +- Fix lora merge script by @cuichenx :: PR: #8113 +- Support transcoding audio formats when saving tarred datasets (FLAC, OPUS) by @pzelasko :: PR: #8102 +- README edit to change Apple Silicon install instructions (to fix a break introduced by pytorch 2) by @stephenmcconnachie :: PR: #8122 +- Fixes NVIDIA/apex installation to not erroneously install the pkg by @terrykong :: PR: #8126 +- Graphviz fix by @GNroy :: PR: #7843 +- Update README.rst by @fayejf :: PR: #8154 +- Fix TP>1 issue for conversion script by @cuichenx :: PR: #8144 +- Support torch jit script by @artbataev :: PR: #8027 +- NeMo Multimodal Docs and Tests Initial PR by @yaoyu-33 :: PR: #8028 +- Remove left-over prints in NeMo+Lhotse code by @pzelasko :: PR: #8180 +- Upgrade to DLFW PyTorch 23.12 by @ericharper :: PR: #8163 +- Add Lhotse support for key in NeMo manifests by @pzelasko :: PR: #8197 +- Fix CPU Initialization and TP>1 for LoRA Merge Script by @cuichenx :: PR: #8199 +- Add support in Neural Typecheck to disable semantic checks by @titu1994 :: PR: #8212 +- Pin lhotse=1.19.2 in r1.23.0 by @pzelasko :: PR: #8303 +- Multimodal r1.23.0 bug fix by @yaoyu-33 :: PR: #8315 +- MCore dataset compatibility for tokenizers by @vysarge :: PR: #8390 +- Update NFA video download link by @erastorgueva-nv :: PR: #8406 +- Update MM Dataprep Tutorial by @cuichenx :: PR: #8410 +- Fix dreambooth data sampler issue by @yaoyu-33 :: PR: #8400 +- Fix a bug in CTM line processing function for multi-speaker data simulations by @tango4j :: PR: #8416 +- Akoumparouli/mistral bugfix by @akoumpa :: PR: #8353 +- pin to 0.5.0 by @ericharper :: PR: #8465 +- Update NeMo Multimodal Requirements by @yaoyu-33 :: PR: #8515 +- Fix link in multimodal dataprep tutorial by @cuichenx :: PR: #8517 + +
diff --git a/docs/source/index.rst b/docs/source/index.rst index 2f75014b86d1..d015796096fc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -13,6 +13,7 @@ NVIDIA NeMo Framework is an end-to-end, cloud-native framework designed to build - Activation Recomputation - Positional Embeddings and Positional Interpolation - Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) with `TensorRT Model Optimizer `_ +- Knowledge Distillation-based training with `TensorRT Model Optimizer `_ - Sequence Packing `NVIDIA NeMo Framework `_ has separate collections for: diff --git a/docs/source/nlp/distillation.rst b/docs/source/nlp/distillation.rst new file mode 100644 index 000000000000..01e5d599cb8c --- /dev/null +++ b/docs/source/nlp/distillation.rst @@ -0,0 +1,58 @@ +.. _megatron_distillation: + +Distillation +========================== + +Knowledge Distillation (KD) +-------------------------------- + +KD involves using information from an existing trained model to train a second (usually smaller, faster) model, thereby "distilling" knowledge from one to the other. + +Distillation has two primary benefits: faster convergence and higher end accuracy than traditional training. + +In NeMo, distillation is enabled by the `NVIDIA TensorRT Model Optimizer (ModelOpt) `_ library -- a library to optimize deep-learning models for inference on GPUs. + +The logits-distillation process consists of the following steps: + +1. Loading both student and teacher model checkpoints (must support same parallelism strategy, if any) +2. Training until convergence, where forward passes are run on both models (and backward only on student), performing a specific loss function between the logits. +3. Saving the final student model. + + +Example +^^^^^^^ +The example below shows how to run the distillation script for LLama models. + +The script must be launched correctly with the number of processes equal to tensor parallelism. This is achieved with the ``torchrun`` command below: + +.. code-block:: bash + +STUDENT_CKPT="path/to/student.nemo" # can also be None (will use default architecture found in examples/nlp/language_modeling/conf/megatron_llama_distill.yaml) +TEACHER_CKPT="path/to/teacher.nemo" +TOKENIZER="path/to/tokenizer.model" +DATA_PATHS="[1.0,path/to/documents]" +FINAL_SAVE_FILE="final_checkpoint.nemo" +TP=4 + +NPROC=$TP +launch_config="torchrun --nproc_per_node=$NPROC" + +${launch_config} examples/nlp/language_modeling/megatron_gpt_distillation.py \ + model.restore_from_path=$STUDENT_CKPT \ + model.kd_teacher_restore_from_path=$TEACHER_CKPT \ + model.tensor_model_parallel_size=${TP} \ + model.tokenizer.model=$TOKENIZER \ + model.data.data_prefix=$DATA_PATHS \ + model.nemo_path=$FINAL_SAVE_FILE \ + trainer.precision=bf16 \ + trainer.devices=$NPROC + +For large models, the command can be used in multi-node setting. For example, this can be done with `NeMo Framework Launcher `_ using Slurm. + + +Limitations +^^^^^^^^^^^ +* Only Megatron Core-based GPT models are supported +* Only logit-pair distillation is supported for now +* Pipeline parallelism not yet supported +* FSDP strategy not yet supported diff --git a/docs/source/starthere/intro.rst b/docs/source/starthere/intro.rst index 6060726d5ba8..435e3e24350e 100644 --- a/docs/source/starthere/intro.rst +++ b/docs/source/starthere/intro.rst @@ -102,7 +102,7 @@ This final step involves installing the TensorRT Model Optimizer package. .. 
code-block:: bash - pip install nvidia-modelopt[torch]~=0.13.0 --extra-index-url https://pypi.nvidia.com + pip install nvidia-modelopt[torch]~=0.15.0 --extra-index-url https://pypi.nvidia.com .. code-block:: bash diff --git a/examples/asr/speech_to_text_eval.py b/examples/asr/speech_to_text_eval.py index 7b59ffe3fbfc..0024783d2362 100644 --- a/examples/asr/speech_to_text_eval.py +++ b/examples/asr/speech_to_text_eval.py @@ -64,7 +64,7 @@ import json import os -from dataclasses import dataclass, is_dataclass +from dataclasses import dataclass, field, is_dataclass from typing import Optional import torch @@ -99,8 +99,13 @@ class EvaluationConfig(transcribe_speech.TranscriptionConfig): only_score_manifest: bool = False scores_per_sample: bool = False - text_processing: Optional[TextProcessingConfig] = TextProcessingConfig( - punctuation_marks=".,?", separate_punctuation=False, do_lowercase=False, rm_punctuation=False, + text_processing: Optional[TextProcessingConfig] = field( + default_factory=lambda: TextProcessingConfig( + punctuation_marks=".,?", + separate_punctuation=False, + do_lowercase=False, + rm_punctuation=False, + ) ) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index d54ee34c18cd..7f5bc0659150 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -116,7 +116,7 @@ class ModelChangeConfig: # Sub-config for changes specific to the Conformer Encoder - conformer: ConformerChangeConfig = ConformerChangeConfig() + conformer: ConformerChangeConfig = field(default_factory=ConformerChangeConfig) @dataclass @@ -164,14 +164,14 @@ class TranscriptionConfig: overwrite_transcripts: bool = True # Decoding strategy for CTC models - ctc_decoding: CTCDecodingConfig = CTCDecodingConfig() + ctc_decoding: CTCDecodingConfig = field(default_factory=CTCDecodingConfig) # Decoding strategy for RNNT models # enable CUDA graphs for transcription - rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1) + rnnt_decoding: RNNTDecodingConfig = field(default_factory=lambda: RNNTDecodingConfig(fused_batch_size=-1)) # Decoding strategy for AED models - multitask_decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig() + multitask_decoding: MultiTaskDecodingConfig = field(default_factory=MultiTaskDecodingConfig) # Prompt slots for prompted models, e.g. Canary-1B. 
Examples of acceptable prompt inputs: # Implicit single-turn assuming default role='user' (works with Canary-1B) # +prompt.source_lang=en +prompt.target_lang=es +prompt.task=asr +prompt.pnc=yes @@ -187,7 +187,7 @@ class TranscriptionConfig: att_context_size: Optional[list] = None # Use this for model-specific changes before transcription - model_change: ModelChangeConfig = ModelChangeConfig() + model_change: ModelChangeConfig = field(default_factory=ModelChangeConfig) # Config for word / character error rate calculation calculate_wer: bool = True diff --git a/examples/llm/megatron_gpt_pretraining.py b/examples/llm/megatron_gpt_pretraining.py index d3d049e4296e..73e96a23bf81 100644 --- a/examples/llm/megatron_gpt_pretraining.py +++ b/examples/llm/megatron_gpt_pretraining.py @@ -92,7 +92,7 @@ def get_args(): callbacks=callbacks, log_every_n_steps=1, limit_val_batches=2, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed", amp_O2=False), + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), ) nemo_logger = NeMoLogger( diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 85609c2dd9b0..95cb1dcf48ec 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -117,6 +117,7 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + scale_positional_embedding: False # Apply scaling for RoPE frequencies ## Reset learning rate schedule. # 1. reset_lr=True, reset_lr_steps=False. When pre-training an existing checkpoint "from scratch" on a different dataset. diff --git a/examples/nlp/language_modeling/conf/megatron_llama_distill.yaml b/examples/nlp/language_modeling/conf/megatron_llama_distill.yaml new file mode 100644 index 000000000000..052de2ff7d48 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_llama_distill.yaml @@ -0,0 +1,222 @@ +name: megatron_llama_distill + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: 32 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
+ max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: "megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}" + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + +model: + restore_from_path: null # overwrite if starting from pretrained + restore_from_ckpt: null # overwrite if resuming training + kd_teacher_restore_from_path: null # overwrite always! (path to teacher weights) + nemo_path: null # overwrite always! (path to save final model) + + mcore_gpt: True + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 1 # limited by GPU memory + global_batch_size: 8 # will use more micro batches to reach global batch size + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + # model architecture + encoder_seq_length: 4096 + max_position_embeddings: ${.encoder_seq_length} + num_layers: 8 # 7b: 32 | 13b: 40 | 70b: 80 + hidden_size: 4096 # 7b: 4096 | 13b: 5120 | 70b: 8192 + ffn_hidden_size: 11008 # Transformer FFN hidden size. Usually 4 * hidden_size. | 7b: 11008 | 13b: 13824 | 70b: 28672 + num_attention_heads: 32 # 7b: 32 | 13b: 40 | 70b: 64 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.0 # Dropout probability for hidden state transformer. + attention_dropout: 0.0 # Dropout probability for attention + ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: "rmsnorm" # Normalization layer to use. Options are 'layernorm', 'rmsnorm' + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: False # Whether to use bias terms in all weight matrices. 
+ activation: "fast-swiglu" # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. + transformer_block_type: "pre_ln" # Options ['pre_ln', 'post_ln', 'normformer'] + openai_gelu: False # Use OpenAI's GELU instead of the default GeLU + normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. + position_embedding_type: "rope" # Position embedding type. Options ['learned_absolute', 'rope'] + rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. + attention_type: "multihead" # Attention type. Options ['multihead'] + share_embeddings_and_output_weights: False # Share embedding and output layer weights. + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: 32 # Number of query groups for group query attention. If None, normal attention is used. | 7b: 32 | 13b: 40 | 70b: 8 + + tokenizer: + library: "sentencepiece" + type: null + model: ??? # /path/to/tokenizer.model + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. + + # Mixed precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope + + # Miscellaneous + seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. 
+ apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
+ sequence_parallel: False + + ## Transformer Engine + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + + data: + # Path to data must be specified by the user. + # Supports List, String and Dictionary + # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} + # Or see example below: + # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" + data_prefix: ??? + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: mmap + splits_string: 98,1,1 + seq_length: 4096 + skip_warmup: True + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token + pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size + shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 1e-5 diff --git a/examples/nlp/language_modeling/megatron_gpt_distillation.py b/examples/nlp/language_modeling/megatron_gpt_distillation.py new file mode 100644 index 000000000000..b3ecdcfc5522 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_gpt_distillation.py @@ -0,0 +1,576 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from pathlib import Path +from typing import Any, Dict + +import modelopt.torch.distill as mtd +import modelopt.torch.opt as mto +import torch.multiprocessing as mp +from omegaconf import DictConfig, OmegaConf, open_dict +from pytorch_lightning.trainer.trainer import Trainer + +try: + from megatron.core import parallel_state, tensor_parallel + from megatron.core.models.gpt import GPTModel as MCoreGPTModel + from megatron.core.transformer.module import Float16Module as MCoreFloat16Module +except ImportError: + raise AssertionError("ModelOpt only supports Megatron-Core.") +import types +from abc import ABCMeta +from importlib.metadata import version +from typing import Tuple + +import torch +import torch.nn.functional as F +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.parallel_state import get_tensor_model_parallel_group +from megatron.core.transformer import TransformerConfig +from pkg_resources import packaging +from torch import Tensor +from torch.nn.modules.loss import _Loss + +from nemo.collections.common.parts.utils import extend_instance +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import ( + EmbeddingScalingMixin, + MegatronGPTModel, + get_specs, +) +from nemo.collections.nlp.modules.common.megatron.module import Float16Module +from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager +from nemo.utils.model_utils import load_config, unwrap_model + +mp.set_start_method("spawn", force=True) + +# Model config fields which affect the structure of the model. +# These will be the values taken from the teacher's config file, +# while the rest remain the same as student's. +MODEL_ARCHITECHTURE_KEYS = [ + "encoder_seq_length", + "max_position_embeddings", + "num_layers", + "hidden_size", + "ffn_hidden_size", + "num_attention_heads", + "init_method_std", + "use_scaled_init_method", + "hidden_dropout", + "attention_dropout", + "ffn_dropout", + "kv_channels", + "apply_query_key_layer_scaling", + "normalization", + "layernorm_epsilon", + "do_layer_norm_weight_decay", + "make_vocab_size_divisible_by", + "pre_process", + "post_process", + "persist_layer_norm", + "bias", + "activation", + "headscale", + "transformer_block_type", + "openai_gelu", + "normalize_attention_scores", + "position_embedding_type", + "rotary_percentage", + "attention_type", + "share_embeddings_and_output_weights", + "overlap_p2p_comm", + "batch_p2p_comm", + "num_query_groups", + "seq_len_interpolation_factor", + "rotary_base", + "scale_positional_embedding", +] + + +class DistillationMegatronGPTModel(MegatronGPTModel): + """ModelOpt Distillation-enabled subclass of `MegatronGPTModel`.""" + + def __init__(self, cfg: DictConfig, trainer: Trainer): + """ + Constructor. + + Args: + cfg: Model configuration. 
+ trainer: Nemo trainer instance. + """ + logging.info("Distillation: Enabled.") + assert cfg.kd_teacher_restore_from_path is not None, "Path to teacher weights must be provided." + assert cfg.pipeline_model_parallel_size == 1, "Distillation mode does not yet support Pipeline Parallel." + + super().__init__(cfg, trainer) + + logging.info("\n\n************** Final model configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + def model_provider_func(self, pre_process, post_process): + """Model depends on pipeline paralellism.""" + if not self.mcore_gpt: + raise AssertionError("ModelOpt Distillation only supports MCore model edition.") + + model = MCoreGPTModel( + config=self.transformer_config, + transformer_layer_spec=get_specs( + self.spec_name, + self.transformer_config, + self.transformer_engine, + self.cfg.get('hyena', None), + ), + vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), + max_sequence_length=self.cfg.get('encoder_seq_length', 512), + pre_process=pre_process, + post_process=post_process, + parallel_output=True, + share_embeddings_and_output_weights=self.cfg.get('share_embeddings_and_output_weights', True), + position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'), + rotary_percent=self.cfg.get('rotary_percentage', 1.0), + seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), + rotary_base=self.cfg.get('rotary_base', 10000), + ) + if self.cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage(): + extend_instance(model.embedding, EmbeddingScalingMixin) + + # [ModelOpt] Distillation mode. + distill_cfg = load_distillation_config(self.transformer_config) + # Intialize DistillationModel. + kd_config = { + "teacher_model": (_teacher_provider, [self.cfg, copy.deepcopy(self.trainer)], {}), + "criterion": distill_cfg["criterion"], + "loss_balancer": distill_cfg["loss_balancer"], + } + model = mtd.convert(model, mode=[("kd_loss", kd_config)]) + + # Additional tweaks needed for MCore/Nemo. 
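        # A rough summary of the wiring above (editorial aside, based only on names used
        # in this file): `mtd.convert(model, mode=[("kd_loss", kd_config)])` returns an
        # `mtd.DistillationModel` that wraps the student MCoreGPTModel, builds the teacher
        # from `(_teacher_provider, [self.cfg, copy of trainer], {})`, and pairs the two
        # models' "output_layer" logits through the `LogitsKLLoss` criterion set up in
        # `load_distillation_config`. The wrapper exposes `compute_kd_loss()` and
        # `hide_teacher_model()`, which the training step and `configure_optimizers`
        # below rely on; the call that follows patches it for Megatron-Core specifics.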
+ adjust_distillation_model_for_mcore(model, distill_cfg) + + return model + + def get_forward_output_and_loss_func(self, validation_step=False, tuning=False): + def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None): + + # Get data batch + batch = self.get_batch(dataloader_iter, tuning) + + # Transfer needed data to GPU + required_keys = set() + max_seqlen = batch['max_seqlen'].squeeze() if 'max_seqlen' in batch else None + cu_seqlens_argmin = batch['cu_seqlens_argmin'] if 'cu_seqlens_argmin' in batch else None + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + required_keys.update(batch.keys()) + else: + required_keys.add('attention_mask') + if 'cu_seqlens' in batch: + required_keys.add('cu_seqlens') + if parallel_state.is_pipeline_first_stage(): + required_keys.update(('tokens', 'position_ids')) + if parallel_state.is_pipeline_last_stage(): + required_keys.update(('labels', 'loss_mask')) + if self.get_attention_mask_from_fusion and 'attention_mask' in required_keys: + required_keys.remove('attention_mask') + batch = { + key: val.cuda(non_blocking=True) if key in required_keys and isinstance(val, torch.Tensor) else None + for key, val in batch.items() + } + + # slice batch along sequence dimension for context parallelism + batch = self.get_batch_on_this_context_parallel_rank(batch) + + # Model forward pass + forward_args = { + 'input_ids': batch['tokens'], + 'position_ids': batch['position_ids'], + 'attention_mask': None if self.get_attention_mask_from_fusion else batch['attention_mask'], + 'labels': batch['labels'] if 'labels' in batch else None, + 'loss_mask': batch['loss_mask'], + } + + # TODO: @eharper can we add this to mcore? + forward_args.pop('loss_mask') + + if 'cu_seqlens' in batch: # packed sequence from GPTSFTPackedDataset + # these args are passed eventually into TEDotProductAttention.forward() + cu_seqlens = batch['cu_seqlens'].squeeze() # remove batch size dimension (mbs=1) + # remove -1 "paddings" added in collate_fn + if cu_seqlens_argmin is not None: + cu_seqlens = cu_seqlens[: cu_seqlens_argmin.item()] + else: + cu_seqlens = cu_seqlens[: torch.argmin(cu_seqlens)] + + try: + from megatron.core.packed_seq_params import PackedSeqParams + except (ImportError, ModuleNotFoundError) as e: + mcore_version = packaging.version.Version(version('megatron-core')) + logging.error( + f"megatron-core v{mcore_version} does not support training with packed sequence. 
" + "Please use megatron-core >= 0.5.0, or set model.data.train_ds.packed_sequence=False" + ) + raise e + + forward_args['packed_seq_params'] = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_kv=max_seqlen, + qkv_format='thd', + ) + + output_tensor = model(**forward_args) + + def loss_func(output_tensor): + if validation_step: + loss_for_ub = self.loss_func( + batch['loss_mask'], batch['num_valid_tokens_in_ub'], output_tensor, validation_step=True + ) + else: + # [ModelOpt] KD Loss for a micro-batch (ub) + unwrapped_model = unwrap_model(model, (Float16Module, MCoreFloat16Module)) + loss_for_ub = unwrapped_model.compute_kd_loss( + loss_reduction_fn=lambda x: self.loss_func( + batch['loss_mask'], batch['num_valid_tokens_in_ub'], x + ) + ) + cp_size = parallel_state.get_context_parallel_world_size() + if validation_step and not self.validation_drop_last: + num_valid_tokens_in_ub = batch['num_valid_tokens_in_ub'] + if loss_for_ub.isnan(): + assert batch['loss_mask'].count_nonzero() == 0, 'Got NaN loss with non-empty input' + loss_sum_for_ub = torch.zeros_like(loss_for_ub) + num_valid_tokens_in_ub = 0 + else: + if self.sample_weight == 'constant': + num_valid_tokens_in_ub = 1 + loss_sum_for_ub = num_valid_tokens_in_ub * loss_for_ub + + loss_sum_and_ub_size_all_gpu = torch.cat( + [ + loss_sum_for_ub.clone().detach().view(1), + torch.tensor([num_valid_tokens_in_ub]).cuda().clone().detach(), + ] + ) + # Could potentially reduce num_valid_samples_in_microbatch and use that to aggregate instead of len(self._validation_ds) + torch.distributed.all_reduce( + loss_sum_and_ub_size_all_gpu, group=parallel_state.get_data_parallel_group() + ) + return loss_for_ub * cp_size, {'loss_sum_and_ub_size': loss_sum_and_ub_size_all_gpu} + else: + reduced_loss = average_losses_across_data_parallel_group([loss_for_ub]) + return loss_for_ub * cp_size, {'avg': reduced_loss} + + return output_tensor, loss_func + + return fwd_output_and_loss_func + + def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor, validation_step=False): + loss = super().loss_func(loss_mask, num_valid_tokens_in_ub, output_tensor) + if not validation_step and self.cfg.tensor_model_parallel_size > 1: + # [ModelOpt] KD loss requires extra all-reduce to ensure same values across MP-TP partitions. + loss = torch.sum(tensor_parallel.gather_from_tensor_model_parallel_region(loss.reshape(1))) + return loss + + def configure_optimizers(self): + with self.model.hide_teacher_model(): + return super().configure_optimizers() + + +######################################################## + + +def load_distillation_config(student_cfg: TransformerConfig) -> Dict[str, Any]: + """Create a default distillation config for MCore GPT Models. + + Args: + student_cfg: Model config for student model. + """ + logit_pair = ("output_layer", "output_layer") # logit module names for MCoreGPTModel + tp_enabled = student_cfg.tensor_model_parallel_size > 1 + + cfg = { + "criterion": {tuple(logit_pair): LogitsKLLoss(tensor_parallel=tp_enabled)}, + "loss_balancer": None, + "skip_lm_loss": True, + } + return cfg + + +class BaseLoss(_Loss, metaclass=ABCMeta): + """Abstract base class for Megatron distillation losses.""" + + def __init__(self, tensor_parallel: bool = False): + """Constructor. + + Args: + tensor_parallel: Whether tensor parallelism is enabled or not. 
+ """ + super().__init__() + self._tensor_parallel = tensor_parallel + + def pre_forward(self, predictions: Tensor, targets: Tensor) -> Tuple[Tensor, Tensor]: + """Performs projection of student tensor to match teacher's size if necessary.""" + if isinstance(predictions, tuple): + # `ColumnParallelLinear` returns bias too + predictions, targets = predictions[0], targets[0] + + targets = targets.detach() + + return predictions, targets + + def post_forward(self, loss: Tensor) -> Tensor: + """Reshapes tensor from [s, b] to [b, s] for upcoming loss masking.""" + loss = loss.transpose(0, 1).contiguous() + return loss + + +class LogitsKLLoss(BaseLoss): + """Calculates KL-Divergence loss between two logits tensors without reducing the sequence dim.""" + + def __init__( + self, + tensor_parallel: bool = False, + temperature: float = 1.0, + reverse: bool = False, + ): + """ + Constructor. + + Args: + tensor_parallel: Whether tensor parallelism is enabled or not. + temperature: Divide tensors by this value prior to calculating loss. + reverse: Whether to reverse the loss as KLD(teacher, student) instead of KLD(student, teacher) + """ + super().__init__(tensor_parallel) + self._temperature = temperature + self._reverse = reverse + + def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: + """ + Forward function. + + Args: + predictions: Student model tensors (size [s, b, h]) + targets: Teacher model tensors (size [s, b, h]) + + Returns: + KLD loss of tensors (size [b, s]) + """ + predictions, targets = self.pre_forward(predictions, targets) + + # Division by temp should happen prior to finding max for both student and teacher. + # Currently we don't use temperature in any of ours runs (temp=1.0) + output_teacher = targets.float() / self._temperature + output_student = predictions.float() / self._temperature + + # Compute local softmax, and the reweight to compute global softmax. + if self._tensor_parallel: + + # Maximum value along vocab dimension across all GPUs. + teacher_logits_max, _ = torch.max(output_teacher, dim=-1) + torch.distributed.all_reduce( + teacher_logits_max, + op=torch.distributed.ReduceOp.MAX, + group=get_tensor_model_parallel_group(), + ) + output_teacher = output_teacher - teacher_logits_max.unsqueeze(dim=-1) + + denom_teacher = torch.sum(torch.exp(output_teacher), dim=-1) + # We can't use `gather_from_tensor_model_parallel_region` here since it discards + # gradients from other ranks - we need to all_reduce the gradients as well. + denom_teacher = all_reduce_autograd(denom_teacher, group=get_tensor_model_parallel_group()) + + # Maximum value along vocab dimension across all GPUs. 
+ student_logits_max, _ = torch.max(output_student, dim=-1) + torch.distributed.all_reduce( + student_logits_max, + op=torch.distributed.ReduceOp.MAX, + group=get_tensor_model_parallel_group(), + ) + output_student = output_student - student_logits_max.unsqueeze(dim=-1).detach() + + denom_student = torch.sum(torch.exp(output_student), dim=-1) + denom_student = all_reduce_autograd(denom_student, group=get_tensor_model_parallel_group()) + + slen, bsz, sharded_vocab_size = output_student.shape + student_log_prob = output_student - torch.log(denom_student).view(slen, bsz, 1).expand( + slen, bsz, sharded_vocab_size + ) + teacher_log_prob = output_teacher - torch.log(denom_teacher).view(slen, bsz, 1).expand( + slen, bsz, sharded_vocab_size + ) + + if self._reverse: + loss = torch.sum( + F.kl_div(teacher_log_prob, student_log_prob, reduction="none", log_target=True), + dim=-1, + ) + else: + loss = torch.sum( + F.kl_div(student_log_prob, teacher_log_prob, reduction="none", log_target=True), + dim=-1, + ) + + else: + if self._reverse: + loss = torch.sum( + F.kl_div( + F.log_softmax(output_teacher, dim=-1), F.softmax(output_student, dim=-1), reduction="none" + ), + dim=-1, + ) + else: + loss = torch.sum( + F.kl_div( + F.log_softmax(output_student, dim=-1), F.softmax(output_teacher, dim=-1), reduction="none" + ), + dim=-1, + ) + + return self.post_forward(loss) + + +class _AllReduce(torch.autograd.Function): + """Implementation from old PyTorch `torch.distributed.nn.parallel`.""" + + @staticmethod + def forward(ctx, op, group, tensor): + ctx.group, ctx.op = group, op + tensor = tensor.clone() + torch.distributed.all_reduce(tensor, op=op, group=group) + return tensor + + @staticmethod + def backward(ctx, grad_output): + return (None, None, _AllReduce.apply(ctx.op, ctx.group, grad_output)) + + +def all_reduce_autograd(tensor, op=torch.distributed.ReduceOp.SUM, group=torch.distributed.group.WORLD): + return _AllReduce.apply(op, group, tensor) + + +def adjust_distillation_model_for_mcore(model: mtd.DistillationModel, distill_cfg: Dict[str, Any]): + """Extra modifcations to ``mtd.DistillationModel`` requried for Megatron-Core.""" + + # HACK: Get rid of ModelOpt Distillation state + mto.ModeloptStateManager(model)._state.pop() + + # HACK: Hide teacher during `sharded_state_dict` method. + def _sharded_state_dict(self, *args, **kwargs) -> ShardedStateDict: + with self.hide_teacher_model(): + return self._sharded_state_dict(*args, **kwargs) + + model._sharded_state_dict = model.sharded_state_dict + model.sharded_state_dict = types.MethodType(_sharded_state_dict, model) + + # HACK: Skip `lm_loss` bypassing it when training if not needed for backprop. 
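For reference, a minimal self-contained sketch of the simpler, non-tensor-parallel branch of ``LogitsKLLoss`` above (toy shapes and random logits; illustrative only, not part of the diff):

.. code-block:: python

    import torch
    import torch.nn.functional as F

    # Toy logits in Megatron layout: [sequence, batch, vocab]
    student_logits = torch.randn(4, 2, 16)
    teacher_logits = torch.randn(4, 2, 16)

    # Default (non-reverse) branch: F.kl_div with the student's log-probabilities as
    # input and the teacher's probabilities as target, keeping per-token values.
    per_token_loss = torch.sum(
        F.kl_div(
            F.log_softmax(student_logits, dim=-1),
            F.softmax(teacher_logits, dim=-1),
            reduction="none",
        ),
        dim=-1,
    )  # shape [sequence, batch]

    # post_forward() then moves to [batch, sequence] so the usual loss mask applies.
    per_token_loss = per_token_loss.transpose(0, 1).contiguous()

The skip-``lm_loss`` hack mentioned in the comment above is implemented by the override that follows, which returns zeros during training so only the distillation loss drives backpropagation.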
+ def _compute_language_model_loss(self, labels, logits) -> Tensor: + if self.training: + return torch.zeros_like(labels) + return self._compute_language_model_loss(labels, logits) + + if distill_cfg["skip_lm_loss"]: + model._compute_language_model_loss = model.compute_language_model_loss + model.compute_language_model_loss = types.MethodType(_compute_language_model_loss, model) + + +######################################################## + + +def _teacher_provider(cfg: DictConfig, trainer: Trainer) -> MCoreGPTModel: + """Teacher model factory (must be a non-local function to pickle).""" + logging.info("Distillation: Loading teacher weights...") + teacher_model_cfg = _merge_model_arch_fields(cfg, cfg.kd_teacher_restore_from_path) + + model = MegatronGPTModel.restore_from( + cfg.kd_teacher_restore_from_path, + override_config_path=teacher_model_cfg, + trainer=trainer, + ) + teacher_model_module_list = model.get_model_module_list() + logging.info("Distillation: ... teacher weights loaded.") + return teacher_model_module_list[0] + + +def _merge_model_arch_fields(cfg: DictConfig, model_load_path: str) -> DictConfig: + """Overwrite model-architecture fields of a config with a checkpoint's.""" + model_cfg = load_config(model_load_path) + model_arch_keys = [k for k in MODEL_ARCHITECHTURE_KEYS if k in model_cfg] + model_arch_cfg = OmegaConf.masked_copy(model_cfg, model_arch_keys) + with open_dict(cfg): + cfg = OmegaConf.merge(cfg, model_arch_cfg) + # Add tokenizer from model if not provided + if OmegaConf.is_missing(cfg.tokenizer, "model"): + cfg.tokenizer = model_cfg.tokenizer + return cfg + + +######################################################## + + +@hydra_runner(config_path="conf", config_name="megatron_llama_distill") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + trainer = MegatronTrainerBuilder(cfg).create_trainer() + exp_manager(trainer, cfg.exp_manager) + + with open_dict(cfg): + cfg.model.name = "modelopt" # Convert TE layernorm spec to unfused format + # HACK: Checkpoint-loading process hangs/loops if this isn't present here for some reason. + cfg.model.target = "nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel" + + # Continual training + if cfg.model.get("restore_from_path") is not None: + # Option 1: Restore only the model weights from a .nemo file + logging.info(f"Continual training: loading weights from {cfg.model.restore_from_path}") + + # Merge model config's architecture fields with the one from the checkpoint + cfg.model = _merge_model_arch_fields(cfg.model, cfg.model.restore_from_path) + + model = DistillationMegatronGPTModel.restore_from( + restore_path=cfg.model.restore_from_path, + override_config_path=cfg.model, + trainer=trainer, + save_restore_connector=NLPSaveRestoreConnector(), + ) + logging.info("... weights loaded.") + elif cfg.model.get("restore_from_ckpt") is not None: + # Option 2: Restore both model weights and optimizer states from a PTL checkpoint + logging.info(f"Continual training: loading weights and optimizer states from {cfg.model.restore_from_ckpt}") + trainer.ckpt_path = Path(cfg.model.restore_from_ckpt) + model = DistillationMegatronGPTModel(cfg.model, trainer) + logging.info("... weights and optimizer states loaded.") + + # Start new pretraining or resume from a checkpoint if it exists + else: + logging.info("Instantiating new model ...") + model = DistillationMegatronGPTModel(cfg.model, trainer) + logging.info("... 
model instantiated.") + + trainer.fit(model) + + if cfg.model.nemo_path: + model.save_to(cfg.model.nemo_path) + else: + logging.warning("Skipping saving final model as no `model.nemo_path` provided.") + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/asr/data/data_simulation.py b/nemo/collections/asr/data/data_simulation.py index 5ee2ad19b951..369a95d9ee9a 100644 --- a/nemo/collections/asr/data/data_simulation.py +++ b/nemo/collections/asr/data/data_simulation.py @@ -1122,7 +1122,7 @@ def _generate_session( alignments=self._alignments, session_name=filename, speaker_id=speaker_ids[speaker_turn], - start=int(start / self._params.data_simulator.sr), + start=float(start / self._params.data_simulator.sr), ) self.annotator.annote_lists['ctm'].extend(new_ctm_entries) diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index 55663a04a5a0..c4c88091cb28 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -30,7 +30,7 @@ from lhotse.dataset.dataloading import resolve_seed from lhotse.lazy import LazyIteratorChain, LazyJsonlIterator from lhotse.serialization import open_best -from lhotse.utils import compute_num_samples +from lhotse.utils import compute_num_samples, ifnone from nemo.collections.common.parts.preprocessing.manifest import get_full_path @@ -332,14 +332,16 @@ def __iter__(self) -> Generator[Cut, None, None]: # Handle NeMo tarred manifests with offsets. # They have multiple JSONL entries where audio paths end with '-sub1', '-sub2', etc. for each offset. - offset_pattern = re.compile(r'^.+(-sub\d+)$') + offset_pattern = re.compile(r'^(?P.+)(?P-sub\d+)(?P\.\w+)?$') for sid in shard_ids: manifest_path = self.paths[sid] if len(self.paths) > 1 else self.paths[0] def basename(d: dict) -> str: return ( - k[: -len(m.group(1))] if (m := offset_pattern.match(k := d["audio_filepath"])) is not None else k + m.group("stem") + ifnone(m.group("ext"), "") + if (m := offset_pattern.match(k := d["audio_filepath"])) is not None + else k ) shard_manifest: dict[str, list[dict]] = groupby(basename, self.shard_id_to_manifest[sid]) diff --git a/nemo/collections/common/parts/utils.py b/nemo/collections/common/parts/utils.py index e8eb1b999292..75783815548a 100644 --- a/nemo/collections/common/parts/utils.py +++ b/nemo/collections/common/parts/utils.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import logging import math import os from typing import Iterable, List +logger = logging.getLogger(__name__) + import einops import torch import torch.nn as nn @@ -109,6 +112,31 @@ def extend_instance(obj, mixin): ) # mixin needs to go first for our forward() logic to work +def apply_rope_scaling(freqs): + # Apply scaling for RoPE frequencies + logger.info("apply rope scaling ...") + # Values obtained from grid search + scale_factor = 8 + low_freq_factor = 1 + high_freq_factor = 4 + old_context_len = 8192 # original llama3 length + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + new_freqs = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / scale_factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq) + return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) + + def mask_sequence_tensor(tensor: torch.Tensor, lengths: torch.Tensor): """ For tensors containing sequences, zero out out-of-bound elements given lengths of every element in the batch. diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py index 6622054c4f98..b2d9b5ba8cca 100644 --- a/nemo/collections/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -190,6 +190,7 @@ def test_dataloader(self) -> EVAL_DATALOADERS: def _create_dataloader(self, dataset, mode, **kwargs) -> WrappedDataLoader: self.init_global_step = self.trainer.global_step + self.data_sampler.init_global_step = self.init_global_step dataloader = WrappedDataLoader( mode=mode, dataset=dataset, diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index 913144d1bf5f..71b60d5df59f 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -17,12 +17,26 @@ class to provide a specific implementation of the forward method. """ def forward(self, x): - linear_output, bias = self.to_wrap(x) - if isinstance(linear_output, tuple) and len(linear_output) == 2: - linear_output, layernorm_output = linear_output - adapter_output = self.adapter(layernorm_output) - else: - adapter_output = self.adapter(x) + linear_output = self.to_wrap(x) + assert isinstance( + linear_output, tuple + ), f"{self.to_wrap} should return a tuple but instead returns {linear_output}" + """ Four cases for the wrapped module's return values + 1. nothing: (out, None) + 2. return_bias: (out, bias) + 2. return_layernorm_output: ((out, ln_out), None) + 3. 
both: (out, bias, ln_out) + """ + if len(linear_output) == 2: + linear_output, bias = linear_output + if isinstance(linear_output, tuple) and len(linear_output) == 2: + linear_output, layernorm_output = linear_output + x = layernorm_output + elif len(linear_output) == 3: + linear_output, bias, layernorm_output = linear_output + x = layernorm_output + + adapter_output = self.adapter(x) return linear_output + adapter_output, bias @@ -72,6 +86,8 @@ class LoRA(PEFT): alpha: int = 32 dropout: float = 0.0 dropout_position: Literal['pre', 'post'] = 'post' + lora_A_init_method: str = "xavier" + lora_B_init_method: str = "zero" def transform(self, m: nn.Module, name=None, prefix=None): """ @@ -96,6 +112,11 @@ def transform(self, m: nn.Module, name=None, prefix=None): input_is_parallel = False in_features = m.in_features out_features = m.out_features * tp_size + # LoRA is applied after layernorm, so layernorm output must be returned + m.return_layernorm_output = True + # perf optimization for LoRA + SP + if m.config.sequence_parallel and not m.ub_overlap_ag: + m.return_layernorm_output_gathered = True else: # name in ['linear_proj', 'linear_fc2'] # Row Parallel Linear input_is_parallel = True @@ -110,8 +131,8 @@ def transform(self, m: nn.Module, name=None, prefix=None): activation='identity', norm_position=None, norm_type=None, - column_init_method="normal", - row_init_method="zero", + column_init_method=self.lora_A_init_method, + row_init_method=self.lora_B_init_method, gather_output=False, input_is_parallel=input_is_parallel, dropout=self.dropout, diff --git a/nemo/collections/llm/tokenizer.py b/nemo/collections/llm/tokenizer.py index 3943e24ba799..808642758ebe 100644 --- a/nemo/collections/llm/tokenizer.py +++ b/nemo/collections/llm/tokenizer.py @@ -9,8 +9,8 @@ track_io( AutoTokenizer, artifacts=[ - FileArtifact("vocab_file"), - FileArtifact("merges_file"), + FileArtifact("vocab_file", required=False), + FileArtifact("merges_file", required=False), ], ) __all__.append("AutoTokenizer") diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 22ee37ec361b..0198c07c82d2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -30,7 +30,7 @@ from pytorch_lightning.loops.fetchers import _DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer -from nemo.collections.common.parts.utils import extend_instance +from nemo.collections.common.parts.utils import apply_rope_scaling, extend_instance from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( MegatronCorePretrainingSampler, MegatronPretrainingRandomSampler, @@ -77,6 +77,7 @@ from nemo.utils.te_utils import is_float8tensor try: + import megatron.core as core from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset @@ -429,6 +430,7 @@ def get_inference_config(self): def model_provider_func(self, pre_process, post_process): """Model depends on pipeline paralellism.""" if self.mcore_gpt: + model = MCoreGPTModel( config=self.transformer_config, transformer_layer_spec=get_specs( @@ -448,6 +450,10 @@ def model_provider_func(self, pre_process, post_process): 
seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), rotary_base=self.cfg.get('rotary_base', 10000), ) + + if self.cfg.get('scale_positional_embedding', False): + model.rotary_pos_emb.inv_freq = apply_rope_scaling(model.rotary_pos_emb.inv_freq) + if self.cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage(): extend_instance(model.embedding, EmbeddingScalingMixin) else: diff --git a/nemo/export/multimodal/build.py b/nemo/export/multimodal/build.py index 03afec176325..8ee3fa1c05e7 100644 --- a/nemo/export/multimodal/build.py +++ b/nemo/export/multimodal/build.py @@ -208,7 +208,10 @@ def forward(self, images): return vision_x encoder = AutoModel.from_pretrained( - vision_config["from_pretrained"], torch_dtype=torch.bfloat16, trust_remote_code=True + vision_config["from_pretrained"], + torch_dtype=torch.bfloat16, + trust_remote_code=True, + attn_implementation='eager', ) vision_encoder = encoder.vision_model hf_config = encoder.config @@ -326,7 +329,10 @@ def forward(self, images): return vision_x encoder = AutoModel.from_pretrained( - vision_config["from_pretrained"], torch_dtype=torch.bfloat16, trust_remote_code=True + vision_config["from_pretrained"], + torch_dtype=torch.bfloat16, + trust_remote_code=True, + attn_implementation='eager', ) vision_encoder = encoder.vision_model hf_config = encoder.config diff --git a/nemo/lightning/io/artifact/base.py b/nemo/lightning/io/artifact/base.py index 4025634ebe28..9119b2474b17 100644 --- a/nemo/lightning/io/artifact/base.py +++ b/nemo/lightning/io/artifact/base.py @@ -6,8 +6,9 @@ class Artifact(ABC, Generic[ValueT]): - def __init__(self, attr: str): + def __init__(self, attr: str, required: bool = True): self.attr = attr + self.required = required @abstractmethod def dump(self, value: ValueT, path: Path) -> ValueT: diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 1cd45648ea20..69368599682e 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -91,6 +91,14 @@ def __call__(self, output_path: Optional[Path] = None, overwrite: bool = False) logging.error(f"An error occurred: {e}") raise + finally: + # Delete the lock file if it exists + if lock_path.exists(): + try: + os.remove(lock_path) + except OSError as e: + logging.warning(f"Failed to remove lock file {lock_path}: {e}") + return _output_path def local_path(self, base_path: Optional[Path] = None) -> Path: @@ -152,19 +160,26 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = return _trainer - def nemo_save(self, output_path: Path, trainer: pl.Trainer) -> None: + def nemo_save(self, output_path: Path, trainer: pl.Trainer, dump_io: bool = True) -> None: """ Saves the model's state to the specified path using the trainer's current strategy. Args: output_path (Path): The path where the model checkpoint will be saved. trainer (pl.Trainer): The trainer with the strategy to save the model. + dump_io (bool): If True, the IO configuration will be saved to the output path. 
""" trainer.strategy._setup_optimizers = False trainer.strategy._init_model_parallel = False trainer.strategy.setup(trainer) trainer.save_checkpoint(output_path) + from nemo.lightning.io.pl import TrainerContext + from nemo.utils.get_rank import is_global_rank_zero + + if is_global_rank_zero() and dump_io: + TrainerContext.from_trainer(trainer).io_dump(output_path) + def nemo_load( self, path: Path, trainer: Optional[pl.Trainer] = None, cpu: bool = True ) -> Tuple[pl.LightningModule, pl.Trainer]: diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index 741c1400474a..d0d4d0243ff7 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -432,13 +432,24 @@ def _io_init(self, **kwargs) -> fdl.Config[Self]: ------- fdl.Config[Self]: The initialized configuration object. """ - return fdl.Config(type(self), **kwargs) + try: + return fdl.Config(type(self), **kwargs) + except Exception as e: + error_msg = ( + f"Error creating fdl.Config for {type(self).__name__}: {str(e)}\n" + f"Arguments that caused the error: {kwargs}\n" + f"This may be due to unsupported argument types or nested configurations." + ) + raise RuntimeError(error_msg) from e def _io_wrap_init(cls): """Wraps the __init__ method of a class to add IO functionality.""" original_init = cls.__init__ + if getattr(cls, "__wrapped_init__", False): + return cls + @functools.wraps(original_init) def wrapped_init(self, *args, **kwargs): if hasattr(self, "io_transform_args"): @@ -453,6 +464,7 @@ def wrapped_init(self, *args, **kwargs): original_init(self, *args, **kwargs) cls.__init__ = wrapped_init + cls.__wrapped_init__ = True return cls @@ -502,6 +514,10 @@ def _io_path_elements_fn(x): def _artifact_transform(cfg: fdl.Config, output_path: Path): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): current_val = getattr(cfg, artifact.attr) + if current_val is None: + if artifact.required: + raise ValueError(f"Artifact '{artifact.attr}' is required but not provided") + continue new_val = artifact.dump(current_val, output_path) setattr(cfg, artifact.attr, new_val) diff --git a/nemo/lightning/pytorch/callbacks/model_transform.py b/nemo/lightning/pytorch/callbacks/model_transform.py index 5d48851843fc..8a07566f92c3 100644 --- a/nemo/lightning/pytorch/callbacks/model_transform.py +++ b/nemo/lightning/pytorch/callbacks/model_transform.py @@ -71,6 +71,11 @@ def _maybe_apply_transform(self, trainer): def apply_transform(self, trainer): self.model_transform(trainer.model) + from pytorch_lightning.utilities import model_summary + + logging.info( + f"After applying model_transform:\n" f"{model_summary.summarize(trainer.lightning_module, max_depth=1)}" + ) @property def _needs_to_call(self) -> bool: diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 3909c680cfc1..08a56f854e4a 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -25,6 +25,7 @@ def __init__( rampup_batch_size: Optional[List[int]] = None, dataloader_type: Literal["single", "cyclic", "batch"] = "single", init_consumed_samples: int = 0, + init_global_step: int = 0, ): self.seq_len = seq_len self.micro_batch_size = micro_batch_size @@ -35,6 +36,7 @@ def __init__( self.prev_consumed_samples = self.init_consumed_samples self.if_first_step = 0 self.prev_global_batch_size = None + self.init_global_step = init_global_step def setup(self, global_rank: int) -> None: from nemo.lightning.data import setup_microbatch_calculator 
@@ -92,9 +94,7 @@ def on_megatron_step_end(self, trainer: pl.Trainer, pl_module: pl.LightningModul self.prev_global_batch_size = self.current_global_batch_size - # TODO: Add consumed samples - consumed_samples = self.compute_consumed_samples(trainer.global_step + 1 - self.init_consumed_samples) - + consumed_samples = self.compute_consumed_samples(trainer.global_step + 1 - self.init_global_step) pl_module.log( 'consumed_samples', consumed_samples, diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index 5e43e09c0420..65b7c6292249 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -26,11 +26,17 @@ AnyT = TypeVar("AnyT") +def get_optim_config(optimizer: Optimizer): + try: + return optimizer.mcore_optimizer.config + except AttributeError: + raise ValueError("Failed to extract optimizer config from module.") + + class MegatronMixedPrecision(MixedPrecision): def __init__( self, precision: Literal["16-mixed", "bf16-mixed"], - amp_O2: bool = False, device="cuda", ) -> None: if precision == "bf16-mixed": @@ -39,21 +45,6 @@ def __init__( scaler = GradScaler(init_scale=2**32, growth_interval=1000, hysteresis=2) super().__init__(precision, device, scaler) - self.amp_O2 = amp_O2 - - def connect( - self, model: Module, optimizers: List[Optimizer], lr_schedulers: List[Any] - ) -> Tuple[Module, List[Optimizer], List[Any]]: - """Connects this plugin to the accelerator and the training process.""" - from nemo.core.optim import MainParamsOptimizerWrapper - - if not optimizers or not self.amp_O2 or isinstance(optimizers[0], MainParamsOptimizerWrapper): - return model, optimizers, lr_schedulers - - _optimizers = [*optimizers] - _optimizers[0] = self.convert_optimizer(_optimizers[0]) - - return model, _optimizers, lr_schedulers def convert_module(self, module: Module) -> Module: """Convert the module parameters to the precision type this plugin handles. @@ -68,11 +59,11 @@ def convert_module(self, module: Module) -> Module: config = get_model_config(module.module) config.fp16 = self.precision == "16-mixed" config.bf16 = self.precision == "bf16-mixed" - if isinstance(module.module, Float16Module): - new_float16_module = Float16Module(config, module.module.module) - module.module = new_float16_module - else: + config.autocast = False + if hasattr(module, 'module'): module.module = Float16Module(config, module.module) + else: + module = Float16Module(config, module) return module @@ -82,16 +73,10 @@ def convert_optimizer(self, optimizer: Optimizer) -> Optimizer: This is optional and depends on the precision limitations during optimization. """ - from nemo.core.optim import MainParamsOptimizerWrapper - - if isinstance(optimizer, MainParamsOptimizerWrapper) or not self.amp_O2: - return optimizer - - return MainParamsOptimizerWrapper( - optimizer, - fp32_grad_accum=True, - contiguous_grad_bucket=True, - ) + optim_config = get_optim_config(optimizer) + assert optim_config.bf16 == (self.precision == "bf16-mixed"), "BF16 enabled on model but not on optimizer" + assert optim_config.fp16 == (self.precision == "16-mixed"), "FP16 enabled on model but not on optimizer" + return optimizer def convert_input(self, data: AnyT) -> AnyT: """Convert model inputs (forward) to the floating point precision type of this plugin.
@@ -120,7 +105,7 @@ def optimizer_step( ) -> None: from nemo.core.optim import MainParamsOptimizerWrapper - if not self.amp_O2 and not isinstance(optimizer, MainParamsOptimizerWrapper): + if not isinstance(optimizer, MainParamsOptimizerWrapper): return super().optimizer_step(optimizer, model, closure, **kwargs) if self.scaler is None: diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 99eee9ad6bd3..f22ed5b40a20 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -4,7 +4,7 @@ import os import shutil from collections import OrderedDict -from contextlib import ExitStack +from contextlib import ExitStack, contextmanager from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any, ContextManager, Dict, List, Literal, Mapping, Optional, TypeVar, Union, cast @@ -621,6 +621,8 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr assert self.megatron_parallel is not None _strategy_lib.load_model_state_dict(self.megatron_parallel, checkpoint, strict=strict) + for opt in self.optimizers: + opt.reload_model_params() @property @override @@ -731,6 +733,14 @@ def parallelism(self) -> ParallelismConfig: pipeline_dtype=self.pipeline_dtype, ) + @contextmanager + @override + def tensor_init_context(self, empty_init: Optional[bool] = None): + # Materializaton happens in `setup()` + # @akoumparouli: using Parent's tensor_init_context causes mcore + # parameters to be initialized on GPU instead of (assumed) CPU. + yield + def ckpt_to_dir(filepath: Union[str, Path]) -> Path: """PTL considers checkpoints as .ckpt files. diff --git a/nemo/lightning/pytorch/trainer.py b/nemo/lightning/pytorch/trainer.py index da1a77c3c731..81f4d12bd3fb 100644 --- a/nemo/lightning/pytorch/trainer.py +++ b/nemo/lightning/pytorch/trainer.py @@ -10,18 +10,24 @@ class Trainer(pl.Trainer, IOMixin): + + def add_io(self, obj): + """Recurse to the leaves of a container and add io functionality to non-serializable leaves""" + if isinstance(obj, (dict, list)): + if isinstance(obj, dict): + obj = obj.values() + for item in obj: + self.add_io(item) + else: + if not serialization.find_node_traverser(type(obj)): + track_io(type(obj)) + return + def io_init(self, **kwargs) -> fdl.Config[Self]: # Each argument of the trainer can be stateful so we copy them cfg_kwargs = {k: deepcopy(v) for k, v in kwargs.items()} - for val in cfg_kwargs.values(): - if not serialization.find_node_traverser(type(val)): - track_io(type(val)) - elif isinstance(val, list): - for v in val: - if not serialization.find_node_traverser(type(v)): - track_io(type(v)) - + self.add_io(cfg_kwargs) return fdl.Config(type(self), **cfg_kwargs) def to_fabric(self, callbacks=None, loggers=None) -> Fabric: diff --git a/nemo/utils/callbacks/cuda_graph.py b/nemo/utils/callbacks/cuda_graph.py index ec0650a90e7d..e18a2a3d3b6c 100644 --- a/nemo/utils/callbacks/cuda_graph.py +++ b/nemo/utils/callbacks/cuda_graph.py @@ -139,16 +139,6 @@ def to_tensor(self, value, name): return value -def register_key(self, key, meta, value): - # PyTorch Lightning creates all metrics on GPU, but creating the metric on - # its input device is prefered. 
- # Refer to: https://github.com/Lightning-AI/pytorch-lightning/blob/2.0.7/src/lightning/pytorch/trainer/connectors/logger_connector/result.py#L409 - metric = _ResultMetric(meta, isinstance(value, torch.Tensor)) - device = value.device if isinstance(value, torch.Tensor) else self.device - metric = metric.to(device) - self[key] = metric - - def update_metrics(self, key, value, batch_size): # PyTorch Lightning always move all metrics to GPU, but moving the metric to # its input device is prefered. @@ -374,8 +364,6 @@ def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") # Use smart metrics to avoid syncs LightningModule.__orig_to_tensor__ = LightningModule._LightningModule__to_tensor LightningModule._LightningModule__to_tensor = to_tensor - _ResultCollection.__orig_register_key__ = _ResultCollection.register_key - _ResultCollection.register_key = register_key _ResultCollection.__orig_update_metrics__ = _ResultCollection.update_metrics _ResultCollection.update_metrics = update_metrics @@ -409,8 +397,6 @@ def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - LightningModule._LightningModule__to_tensor = LightningModule.__orig_to_tensor__ del LightningModule.__orig_to_tensor__ - _ResultCollection.register_key = _ResultCollection.__orig_register_key__ - del _ResultCollection.__orig_register_key__ _ResultCollection.update_metrics = _ResultCollection.__orig_update_metrics__ del _ResultCollection.__orig_update_metrics__ diff --git a/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py b/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py index 1f8c69b5b240..35039f8d02e9 100644 --- a/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py +++ b/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py @@ -62,7 +62,7 @@ def get_args(): help="Path to output mcore weights file (ends in .nemo).", ) parser.add_argument( - "--cpu-only", + "--cpu_only", action="store_true", help="Load model in cpu only. Useful if the model cannot fit in GPU memory, " "but this option makes the conversion script significantly slower.", @@ -73,7 +73,7 @@ def get_args(): help="Run conversion again and overwrite output file when the output file already exists", ) parser.add_argument( - "--ignore-if-missing", + "--ignore_if_missing", default="rotary_pos_emb.inv_freq", help="comma-separated list of state_dict keys that are known to be missing in mcore and can be safely ignored", ) @@ -158,8 +158,8 @@ def build_key_mapping(nemo_cfg): for wb in ('weight', 'bias') if has_layernorm_bias else ('weight',): mcore_to_nemo_mapping.update( { - f"{mcore_prefix}.{i}.self_attention.linear_qkv.layer_norm_{wb}": f"{nemo_prefix}.{i}.input_layernorm.{wb}", - f"{mcore_prefix}.{i}.mlp.linear_fc1.layer_norm_{wb}": f"{nemo_prefix}.{i}.post_attention_layernorm.{wb}", + f"{mcore_prefix}.{i}.input_layernorm.{wb}": f"{nemo_prefix}.{i}.input_layernorm.{wb}", + f"{mcore_prefix}.{i}.pre_mlp_layernorm.{wb}": f"{nemo_prefix}.{i}.post_attention_layernorm.{wb}", } ) diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py index 0837e0e6ccf2..4eb8cb6330ca 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py @@ -18,6 +18,8 @@ python convert_llama_hf_to_nemo.py \ --input_name_or_path \ --output_path + --precision bf16 \ + --llama31 True """ import os @@ -60,6 +62,13 @@ def get_args(): required=False, help="Path config for restoring. 
It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", ) + parser.add_argument( + "--llama31", + type=bool, + default=True, + required=False, + help="Whether the model is from LLaMa 3.1 family. LLaMa 3.1 enables scaling for RoPE frequencies.", + ) parser.add_argument("--precision", type=str, default="16", help="Model precision") args = parser.parse_args() return args @@ -110,6 +119,7 @@ def load_config(args, llama_config): while llama_config['vocab_size'] % base != 0: base //= 2 nemo_config.make_vocab_size_divisible_by = base + nemo_config.scale_positional_embedding = args.llama31 return nemo_config @@ -161,6 +171,7 @@ def convert(args): plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) nemo_config.precision = precision + nemo_config.micro_batch_size = 1 print(f"nemo_config: {nemo_config}") # Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together. @@ -298,12 +309,22 @@ def convert(args): # We make sure that the tokenizer can be instantiated later regardless of args.input_name_or_path if 'tokenizer_model' not in hf_config: - if hf_config['num_hidden_layers'] == 32: - model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B') - elif hf_config['num_hidden_layers'] == 80: - model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B') + if args.llama31: + if hf_config['num_hidden_layers'] == 32: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B') + elif hf_config['num_hidden_layers'] == 80: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-70B') + elif hf_config['num_hidden_layers'] == 126: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B') # 405B tokenizer is the same as 8B + else: + logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.") else: - logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.") + if hf_config['num_hidden_layers'] == 32: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B') + elif hf_config['num_hidden_layers'] == 80: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B') + else: + logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.") # cast to target precision and disable cpu init dtype = torch_dtype_from_precision(precision) diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py new file mode 100644 index 000000000000..f395e34765d0 --- /dev/null +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py @@ -0,0 +1,286 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. 
+ Example to run this conversion script: + python convert_llama_hf_to_nemo.py \ + --input_name_or_path \ + --input_state_dict \ + --output_path \ + --precision bf16 + --llama31 True +""" + +import os +from argparse import ArgumentParser +from collections import OrderedDict + +import torch +from omegaconf import OmegaConf +from pytorch_lightning.trainer.trainer import Trainer +from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import ( + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, + PipelineMixedPrecisionPlugin, +) +from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper, torch_dtype_from_precision +from nemo.utils import logging + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--input_name_or_path", + type=str, + default=None, + required=True, + help="Path to Huggingface LLaMA checkpoints", + ) + parser.add_argument( + "--input_state_dict", + type=str, + default=None, + required=True, + help="Path to Huggingface LLaMA checkpoints", + ) + + parser.add_argument( + "--llama31", + type=bool, + default=True, + required=False, + help="Apply scaling for RoPE frequencies", + ) + + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument( + "--hparams_file", + type=str, + default=os.path.join( + os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_llama_config.yaml' + ), + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--precision", type=str, default="16", help="Model precision") + args = parser.parse_args() + return args + + +def load_config(args, llama_config): + nemo_config = OmegaConf.load(args.hparams_file).model + + if llama_config.get('rope_theta', None): + nemo_config['rotary_base'] = llama_config['rope_theta'] + nemo_config.encoder_seq_length = llama_config['max_position_embeddings'] + nemo_config.num_layers = int(llama_config['num_hidden_layers']) + nemo_config.hidden_size = llama_config['hidden_size'] + nemo_config.ffn_hidden_size = llama_config['intermediate_size'] + nemo_config.num_attention_heads = llama_config['num_attention_heads'] + nemo_config.max_position_embeddings = llama_config['max_position_embeddings'] + nemo_config.init_method_std = llama_config['initializer_range'] + nemo_config.layernorm_epsilon = llama_config['rms_norm_eps'] + if 'num_key_value_heads' in llama_config: + nemo_config.num_query_groups = llama_config['num_key_value_heads'] + nemo_config.use_cpu_initialization = True + nemo_config.activation = 'fast-swiglu' + nemo_config.megatron_amp_O2 = True # True + nemo_config.scale_positional_embedding = args.llama31 + + # Tokenizer config + if 'tokenizer_model' in llama_config: + nemo_config.tokenizer.model = llama_config['tokenizer_model'] + else: + # Llama3 uses converted TikToken Tokenizer + tokenizer_dict = { + 'library': 'huggingface', + 'type': args.input_name_or_path, + 'use_fast': True, + } + nemo_config.tokenizer = tokenizer_dict + + if llama_config['rope_scaling'] is not None: + if llama_config['rope_scaling']['type'] == 'linear': + nemo_config['seq_len_interpolation_factor'] = llama_config['rope_scaling']['factor'] + else: + raise ValueError("Only linear rope scaling type is supported now") + if llama_config['rope_theta'] is not None: + nemo_config['rotary_base'] = llama_config['rope_theta'] + + base = 128 + while llama_config['vocab_size'] % base != 0: + base //= 2 + nemo_config.make_vocab_size_divisible_by = base + + return nemo_config + + +def convert(args): + logging.info(f"loading checkpoint {args.input_name_or_path}") + import torch + + model = LlamaForCausalLM.from_pretrained( + args.input_name_or_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True + ) + hf_config = vars(model.config) + if os.path.exists(f'{args.input_name_or_path}/tokenizer.model'): + tokenizer = LlamaTokenizer.from_pretrained(args.input_name_or_path) + hf_config['tokenizer_model'] = str(tokenizer.vocab_file) + else: + tokenizer = AutoTokenizer.from_pretrained(args.input_name_or_path) + print(f"hf_config: {hf_config}") + print("named parameters:") + for name, param in model.named_parameters(): + print(f"- {name}") + + nemo_config = load_config(args, hf_config) + + if args.precision in ["32", "16"]: + precision = int(float(args.precision)) + elif args.precision in ["bf16", "bf16-mixed"]: + if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): + precision = args.precision + else: + logging.warning("BF16 is not supported on this device. 
Using FP16 instead.") + precision = args.precision[2:] # prune bf in string + else: + precision = args.precision + + plugins = [] + if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: + scaler = None + if precision in [16, '16', '16-mixed']: + scaler = GradScaler( + init_scale=nemo_config.get('native_amp_init_scale', 2**32), + growth_interval=nemo_config.get('native_amp_growth_interval', 1000), + hysteresis=nemo_config.get('hysteresis', 2), + ) + # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed + plugin_precision = '16-mixed' + else: + plugin_precision = 'bf16-mixed' + + if nemo_config.get('megatron_amp_O2', False): + print('HALF PRECISION') + plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + + nemo_config.precision = precision + nemo_config.micro_batch_size = 1 + print(f"nemo_config: {nemo_config}") + + # Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together. + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) + + hidden_size = hf_config["hidden_size"] + head_num = hf_config["num_attention_heads"] + head_size = hidden_size // head_num + num_layers = hf_config["num_hidden_layers"] + + mcore_gpt = nemo_config.mcore_gpt + + assert mcore_gpt == nemo_config.get( + 'transformer_engine', False + ), "mcore_gpt transformer_engine must be enabled (or disabled) together." + + param_to_weights = lambda param: param.float() + + print('start init model') + del model + import time + + st = time.perf_counter() + model = MegatronGPTModel(nemo_config, trainer) + print(f'Model init took {time.perf_counter() - st} sec') + from functools import reduce + from glob import glob + + weights = glob(f'{args.input_state_dict}/*.pt') + st = time.perf_counter() + for weight_file in sorted(weights): + filename = os.path.basename(weight_file) + str_list = filename.split('.') + weight_type = str_list[-2] + str_name = '.'.join(str_list[1:-1]) + print(f'-- Assign weight_type={weight_type} to {str_name}') + if nemo_config.get('megatron_amp_O2', False): + current = reduce(getattr, [model, 'model', 'module'] + str_list[:-2]) + else: + current = reduce(getattr, [model, 'model'] + str_list[:-2]) + load = torch.load(weight_file) + if nemo_config.get('megatron_amp_O2', False): + if precision == 'bf16': + target_precision = torch.bfloat16 + elif precision == 16: + target_precision = torch.float16 + load = load.to(target_precision) + + if weight_type == 'weight': + assert current.weight.shape == load.shape + assert current.weight.dtype == load.dtype + current.weight = torch.nn.Parameter(load) + assert current.weight.norm() == load.norm() + elif weight_type == 'layer_norm_weight': + assert current.layer_norm_weight.dtype == load.dtype + assert current.layer_norm_weight.shape == load.shape + current.layer_norm_weight = torch.nn.Parameter(load) + assert current.layer_norm_weight.norm() == load.norm() + else: + raise ValueError(f'Unsupported weight type = {weight_type}') + del load + + print(f'Finish loading model in {time.perf_counter() - st} sec. 
Start to save model') + st = time.perf_counter() + print(f'Model save took {time.perf_counter() - st} sec.') + + model._save_restore_connector = NLPSaveRestoreConnector() + + # We make sure that the tokenizer can be instantiated later regardless of args.input_name_or_path + if 'tokenizer_model' not in hf_config: + if args.llama31: + if hf_config['num_hidden_layers'] == 32: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B') + elif hf_config['num_hidden_layers'] == 80: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-70B') + elif hf_config['num_hidden_layers'] == 126: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B') # 405B tokenizer is the same as 8B + else: + logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.") + else: + if hf_config['num_hidden_layers'] == 32: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B') + elif hf_config['num_hidden_layers'] == 80: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B') + else: + logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.") + + # cast to target precision and disable cpu init + dtype = torch_dtype_from_precision(precision) + model = model.to(dtype=dtype) + model.cfg.use_cpu_initialization = False + + model.save_to(args.output_path) + logging.info(f'NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + convert(args) diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py new file mode 100644 index 000000000000..940a9df5f9a8 --- /dev/null +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py @@ -0,0 +1,321 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. 
+ Example to run this conversion script: + python convert_llama_hf_to_nemo.py \ + --input_name_or_path \ + --output_path + --precision bf16 + --apply_rope_scaling True +""" + +import os +from argparse import ArgumentParser +from collections import OrderedDict + +import torch +from omegaconf import OmegaConf +from pytorch_lightning.trainer.trainer import Trainer +from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import ( + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, + PipelineMixedPrecisionPlugin, +) +from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper, torch_dtype_from_precision +from nemo.utils import logging + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--input_name_or_path", + type=str, + default=None, + required=True, + help="Path to Huggingface LLaMA checkpoints", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output to dict dir") + parser.add_argument( + "--hparams_file", + type=str, + default=os.path.join( + os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_llama_config.yaml' + ), + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument( + "--apply_rope_scaling", + type=bool, + default=True, + required=False, + help="Apply scaling for RoPE frequencies", + ) + parser.add_argument("--precision", type=str, default="16", help="Model precision") + args = parser.parse_args() + return args + + +def load_config(args, llama_config): + nemo_config = OmegaConf.load(args.hparams_file).model + + if llama_config.get('rope_theta', None): + nemo_config['rotary_base'] = llama_config['rope_theta'] + nemo_config.encoder_seq_length = llama_config['max_position_embeddings'] + nemo_config.num_layers = int(llama_config['num_hidden_layers']) + nemo_config.hidden_size = llama_config['hidden_size'] + nemo_config.ffn_hidden_size = llama_config['intermediate_size'] + nemo_config.num_attention_heads = llama_config['num_attention_heads'] + nemo_config.max_position_embeddings = llama_config['max_position_embeddings'] + nemo_config.init_method_std = llama_config['initializer_range'] + nemo_config.layernorm_epsilon = llama_config['rms_norm_eps'] + if 'num_key_value_heads' in llama_config: + nemo_config.num_query_groups = llama_config['num_key_value_heads'] + nemo_config.use_cpu_initialization = True + nemo_config.activation = 'fast-swiglu' + nemo_config.megatron_amp_O2 = True + + # Tokenizer config + if 'tokenizer_model' in llama_config: + nemo_config.tokenizer.model = llama_config['tokenizer_model'] + else: + # Llama3 uses converted TikToken Tokenizer + tokenizer_dict = { + 'library': 'huggingface', + 'type': args.input_name_or_path, + 'use_fast': True, + } + nemo_config.tokenizer = tokenizer_dict + + if llama_config['rope_scaling'] is not None: + if llama_config['rope_scaling']['type'] == 'linear': + nemo_config['seq_len_interpolation_factor'] = llama_config['rope_scaling']['factor'] + else: + raise ValueError("Only linear rope scaling type is supported now") + if llama_config['rope_theta'] is not None: + nemo_config['rotary_base'] = llama_config['rope_theta'] + + base = 128 + while 
llama_config['vocab_size'] % base != 0: + base //= 2 + nemo_config.make_vocab_size_divisible_by = base + + return nemo_config + + +def convert(args): + logging.info(f"loading checkpoint {args.input_name_or_path}") + import torch + + model = LlamaForCausalLM.from_pretrained( + args.input_name_or_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True + ) + hf_config = vars(model.config) + if os.path.exists(f'{args.input_name_or_path}/tokenizer.model'): + tokenizer = LlamaTokenizer.from_pretrained(args.input_name_or_path) + hf_config['tokenizer_model'] = str(tokenizer.vocab_file) + else: + tokenizer = AutoTokenizer.from_pretrained(args.input_name_or_path) + + print("named parameters:") + for name, param in model.named_parameters(): + print(f"- {name}") + + nemo_config = load_config(args, hf_config) + nemo_config.scale_positional_embedding = args.apply_rope_scaling + + if args.precision in ["32", "16"]: + precision = int(float(args.precision)) + elif args.precision in ["bf16", "bf16-mixed"]: + if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): + precision = args.precision + else: + logging.warning("BF16 is not supported on this device. Using FP16 instead.") + precision = args.precision[2:] # prune bf in string + else: + precision = args.precision + + plugins = [] + if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: + scaler = None + if precision in [16, '16', '16-mixed']: + scaler = GradScaler( + init_scale=nemo_config.get('native_amp_init_scale', 2**32), + growth_interval=nemo_config.get('native_amp_growth_interval', 1000), + hysteresis=nemo_config.get('hysteresis', 2), + ) + # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed + plugin_precision = '16-mixed' + else: + plugin_precision = 'bf16-mixed' + + if nemo_config.get('megatron_amp_O2', False): + print('HALF PRECISION') + plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + + nemo_config.precision = precision + print(f"nemo_config: {nemo_config}") + + # Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together. + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) + + hidden_size = hf_config["hidden_size"] + head_num = hf_config["num_attention_heads"] + head_size = hidden_size // head_num + num_layers = hf_config["num_hidden_layers"] + + mcore_gpt = nemo_config.mcore_gpt + + assert mcore_gpt == nemo_config.get( + 'transformer_engine', False + ), "mcore_gpt transformer_engine must be enabled (or disabled) together." 
+ + param_to_weights = lambda param: param.float() + + checkpoint = OrderedDict() + checkpoint['state_dict'] = OrderedDict() + + embed_weight = model.state_dict()[f'model.embed_tokens.weight'] + if mcore_gpt: + embed_weights_base_name = f'model.embedding.word_embeddings.weight' + else: + embed_weights_base_name = f'model.language_model.embedding.word_embeddings.weight' + checkpoint['state_dict'][embed_weights_base_name] = param_to_weights(embed_weight) + + # in hf, this is defined as register_buffer(..., persistent=False) so it won't be in the state dict + if f'model.layers.0.self_attn.rotary_emb.inv_freq' in model.state_dict(): + rotary_embed_weight = model.state_dict()[f'model.layers.0.self_attn.rotary_emb.inv_freq'] + if mcore_gpt: + rotary_embed_weight_base_name = f'model.rotary_pos_emb.inv_freq' + else: + rotary_embed_weight_base_name = f'model.language_model.rotary_pos_emb.inv_freq' + checkpoint['state_dict'][rotary_embed_weight_base_name] = param_to_weights(rotary_embed_weight) + + if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num: + num_query_groups = head_num + else: + num_query_groups = nemo_config.num_query_groups + assert head_num % num_query_groups == 0, 'head_num must be divisible by num_query_groups' + if mcore_gpt: + assert nemo_config.activation.startswith('fast-'), 'mcore only supports fast version of gated linear unit.' + + for l in range(int(num_layers)): + print(f"converting layer {l}") + old_tensor_shape = model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight'].size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + q = model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight'].view(*new_q_tensor_shape) + k = model.state_dict()[f'model.layers.{l}.self_attn.k_proj.weight'].view(*new_kv_tensor_shape) + v = model.state_dict()[f'model.layers.{l}.self_attn.v_proj.weight'].view(*new_kv_tensor_shape) + qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]) + heads_per_group = head_num // num_query_groups + for i in range(num_query_groups): + qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) + qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) + qkv_weights = torch.cat((qkv_weights, v[i : i + 1, :, :])) + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + if mcore_gpt: + qkv_weights_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.weight' + else: + qkv_weights_base_name = f'model.language_model.encoder.layers.{l}.self_attention.query_key_value.weight' + checkpoint['state_dict'][qkv_weights_base_name] = param_to_weights(qkv_weights) + + # attention dense + o_weight = model.state_dict()[f'model.layers.{l}.self_attn.o_proj.weight'] + if mcore_gpt: + o_weight_base_name = f'model.decoder.layers.{l}.self_attention.linear_proj.weight' + else: + o_weight_base_name = f'model.language_model.encoder.layers.{l}.self_attention.dense.weight' + checkpoint['state_dict'][o_weight_base_name] = param_to_weights(o_weight) + + # MLP + mlp_down_weight = model.state_dict()[f'model.layers.{l}.mlp.gate_proj.weight'] + mlp_gate_weight = model.state_dict()[f'model.layers.{l}.mlp.up_proj.weight'] + if mcore_gpt: + mlp_down_base_name = f'model.decoder.layers.{l}.mlp.linear_fc1.weight' + else: + mlp_down_base_name = f'model.language_model.encoder.layers.{l}.mlp.dense_h_to_4h.weight' + mlp_down_weight = torch.cat((mlp_down_weight, 
mlp_gate_weight), axis=0) + checkpoint['state_dict'][mlp_down_base_name] = param_to_weights(mlp_down_weight) + + mlp_up_weight = model.state_dict()[f'model.layers.{l}.mlp.down_proj.weight'] + if mcore_gpt: + mlp_up_base_name = f'model.decoder.layers.{l}.mlp.linear_fc2.weight' + else: + mlp_up_base_name = f'model.language_model.encoder.layers.{l}.mlp.dense_4h_to_h.weight' + checkpoint['state_dict'][mlp_up_base_name] = param_to_weights(mlp_up_weight) + + # LayerNorm + input_ln_weight = model.state_dict()[f'model.layers.{l}.input_layernorm.weight'] + if mcore_gpt: + input_ln_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_weight' + else: + input_ln_base_name = f'model.language_model.encoder.layers.{l}.input_layernorm.weight' + checkpoint['state_dict'][input_ln_base_name] = param_to_weights(input_ln_weight) + + post_attn_ln_weight = model.state_dict()[f'model.layers.{l}.post_attention_layernorm.weight'] + if mcore_gpt: + post_attn_ln_base_name = f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm_weight' + else: + post_attn_ln_base_name = f'model.language_model.encoder.layers.{l}.post_attention_layernorm.weight' + checkpoint['state_dict'][post_attn_ln_base_name] = param_to_weights(post_attn_ln_weight) + + print(f"done layer {l}") + + final_ln_weight = model.state_dict()[f'model.norm.weight'] + if mcore_gpt: + final_ln_base_name = f'model.decoder.final_layernorm.weight' + else: + final_ln_base_name = f'model.language_model.encoder.final_layernorm.weight' + checkpoint['state_dict'][final_ln_base_name] = param_to_weights(final_ln_weight) + + output_layer_weight = model.state_dict()[f'lm_head.weight'] + if mcore_gpt: + output_layer_base_name = f'model.output_layer.weight' + else: + output_layer_base_name = f'model.language_model.output_layer.weight' + checkpoint['state_dict'][output_layer_base_name] = param_to_weights(output_layer_weight) + + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config + + del model + import gc + + gc.collect() + + if nemo_config.get('megatron_amp_O2', False): + keys = list(checkpoint['state_dict'].keys()) + print('convert to O2') + for key in keys: + checkpoint['state_dict'][key.replace('model.', 'model.module.', 1)] = checkpoint['state_dict'].pop(key) + + # os.mkdir(args.output_path, exist_ok=True) + for key in checkpoint['state_dict']: + print(f'Saving {key} in {checkpoint["state_dict"][key].dtype}..') + save_location = f'{args.output_path}/{key[13:]}.pt' + torch.save(checkpoint['state_dict'][key], save_location) + + +if __name__ == '__main__': + args = get_args() + convert(args) diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index a8c8621c8d31..45765e6d1997 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -1735,10 +1735,14 @@ def nemo_tarred_manifest_path_with_offset(tmp_path_factory) -> Tuple[str, str]: TarWriter(f"{root}/audios_0.tar", shard_size=None) as tar_writer, SequentialJsonlWriter(root / "tarred_audio_filepaths.jsonl") as mft_writer, ): - tar_writer.write(recording.id, BytesIO(recording.sources[0].source)) + + def audio_path(n: int = None): + return recording.id + ("" if n is None else f"-sub{n}") + ".wav" + + tar_writer.write(audio_path(), BytesIO(recording.sources[0].source)) mft_writer.write( { # segment 0-3s - "audio_filepath": recording.id, + "audio_filepath": audio_path(), "offset": 0.0, "duration": 3.0, "text": "irrelevant", @@ -1748,7 +1752,7 @@ def 
nemo_tarred_manifest_path_with_offset(tmp_path_factory) -> Tuple[str, str]: ) mft_writer.write( { # segment 4-9s - "audio_filepath": recording.id + "-sub1", + "audio_filepath": audio_path(1), "offset": 4.0, "duration": 5.0, "text": "irrelevant-2", @@ -1758,7 +1762,7 @@ def nemo_tarred_manifest_path_with_offset(tmp_path_factory) -> Tuple[str, str]: ) mft_writer.write( { # full recording - for reference - "audio_filepath": recording.id + "-sub2", + "audio_filepath": audio_path(2), "offset": 0.0, "duration": 10.0, "text": "irrelevant irrelevant-2", @@ -1794,7 +1798,7 @@ def test_dataloader_from_tarred_nemo_manifest_with_offset(nemo_tarred_manifest_p assert len(batch) == 3 # Validate example containing full 10s recording. - full_cut = batch[-1] + full_cut = batch[1] assert full_cut.start == 0.0 assert full_cut.duration == 10.0 assert full_cut.supervisions[0].text == "irrelevant irrelevant-2" @@ -1803,7 +1807,7 @@ def test_dataloader_from_tarred_nemo_manifest_with_offset(nemo_tarred_manifest_p assert full_audio.shape[1] == full_cut.num_samples == 160000 # 10s * 16kHz # Validate segment 0-3s. - cut = batch[0] + cut = batch[2] assert cut.start == 0.0 assert cut.duration == 3.0 assert cut.supervisions[0].text == "irrelevant" @@ -1817,7 +1821,7 @@ def test_dataloader_from_tarred_nemo_manifest_with_offset(nemo_tarred_manifest_p # Note: LazyNeMoTarredIterator removes the offset information, as it creates a new recording # that's a "subset" of the original recording as a memory saving optimization. # Hence, we will not see cut.start == 4.0. - cut = batch[1] + cut = batch[0] assert cut.start == 0.0 assert cut.duration == 5.0 assert cut.supervisions[0].text == "irrelevant-2" diff --git a/tests/lightning/io/test_mixin.py b/tests/lightning/io/test_mixin.py index 824608db6bf0..3a520b8e74ae 100644 --- a/tests/lightning/io/test_mixin.py +++ b/tests/lightning/io/test_mixin.py @@ -14,3 +14,10 @@ def test_reinit(self): assert copied is not dummy assert copied.a == dummy.a assert copied.b == dummy.b + + def test_init(self): + outputs = [] + for i in range(1001): + outputs.append(DummyClass(i, i)) + + assert len(outputs) == 1001
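The QKV packing loop added in convert_llama_hf_to_nemo_save_dict.py interleaves each query group's attention heads with that group's shared key and value head before fusing them into a single linear_qkv weight. A minimal standalone sketch of that interleaving, using toy dimensions (the sizes below are illustrative assumptions, not real Llama shapes):

import torch

# Toy grouped-query-attention shapes: 8 query heads sharing 2 KV groups.
head_num, num_query_groups, head_size, hidden_size = 8, 2, 4, 32
heads_per_group = head_num // num_query_groups

q = torch.randn(head_num, head_size, hidden_size)
k = torch.randn(num_query_groups, head_size, hidden_size)
v = torch.randn(num_query_groups, head_size, hidden_size)

qkv = torch.empty((0, head_size, hidden_size))
for i in range(num_query_groups):
    # Each group contributes its query heads followed by one K head and one V head.
    qkv = torch.cat((qkv, q[i * heads_per_group : (i + 1) * heads_per_group]))
    qkv = torch.cat((qkv, k[i : i + 1]))
    qkv = torch.cat((qkv, v[i : i + 1]))

fused = qkv.reshape(head_size * (head_num + 2 * num_query_groups), hidden_size)
print(fused.shape)  # torch.Size([48, 32])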