diff --git a/.github/workflows/changelog-build.yml b/.github/workflows/changelog-build.yml
index 7e16c344acb8..b6d836f71cec 100644
--- a/.github/workflows/changelog-build.yml
+++ b/.github/workflows/changelog-build.yml
@@ -1,13 +1,13 @@
name: 'Changelog Build (Release)'
on:
+ workflow_dispatch:
push:
tags:
- '*'
-
+
jobs:
changelog:
- if: startsWith(github.ref, 'refs/tags/')
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
@@ -39,7 +39,7 @@ jobs:
ignorePreReleases: "false"
failOnError: "false"
fromTag: ${{ steps.previous_tag.outputs.tag_name }}
- toTag: ${{ github.ref_name }}
+ toTag: ${{ github.ref_name || github.sha }}
- name: Print Changelog
run: |
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 2919be733ac3..ffe6e00e2ad3 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -365,6 +365,34 @@ jobs:
# rm -rf llama2_qat_results
+ L2_Distill_Llama2:
+ needs: [cicd-test-container-setup]
+ uses: ./.github/workflows/_test_template.yml
+ with:
+ RUNNER: self-hosted-azure
+ SCRIPT: |
+ python examples/nlp/language_modeling/megatron_gpt_distillation.py \
+ trainer.devices=2 \
+ trainer.num_nodes=1 \
+ trainer.precision=bf16 \
+ trainer.max_steps=5 \
+ trainer.log_every_n_steps=5 \
+ trainer.val_check_interval=5 \
+ trainer.limit_val_batches=2 \
+ model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+ model.kd_teacher_restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+ model.tensor_model_parallel_size=2 \
+ model.pipeline_model_parallel_size=1 \
+ model.micro_batch_size=1 \
+ model.global_batch_size=4 \
+ model.optim.name=distributed_fused_adam \
+ model.optim.sched.warmup_steps=1 \
+ model.data.data_prefix=[1.0,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
+ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \
+ exp_manager.exp_dir=examples/nlp/megatron_llama_distill
+ AFTER_SCRIPT: |
+ rm -rf examples/nlp/megatron_llama_distill
+
# L2: ASR dev run
ASR_dev_run_Speech_to_Text:
needs: [cicd-test-container-setup]
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 3f4c4f3c19de..af09fa241c59 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,112 +1,60 @@
name: "NeMo Code release"
on:
- issue_comment:
- types: [created]
-
+ workflow_dispatch:
+ inputs:
+ branch:
+ description: Branch to release
+ required: true
+ type: string
jobs:
main:
- if: >
- github.event_name == 'issue_comment' &&
- github.event.issue.pull_request &&
- startsWith(github.event.comment.body, '/release-please') &&
- contains(fromJSON('["ko3n1g"]'), github.actor)
+ if: contains(fromJSON('["ko3n1g"]'), github.actor)
runs-on: ubuntu-latest
environment:
name: main
steps:
- - name: Update PR issue comment
- shell: bash
- env:
- message: ${{ github.event.comment.body }}
- run: |
- message="$message
-
- ---
-
- Releasebot 🤖: Release processes started...
- "
- message="${message//$'\n'/
}"
-
- curl -L \
- -X PATCH \
- -H "Accept: application/vnd.github+json" \
- -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
- -H "X-GitHub-Api-Version: 2022-11-28" \
- https://api.github.com/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }} \
- -d '{"body":"'"$message"'"}'
-
- - name: Get PR number
- shell: bash
- id: get-pr-num
- run: |
- PR_URL="${{ github.event.issue.pull_request.url }}"
- PR_NUM=${PR_URL##*/}
- echo "pr_number=$PR_NUM" >> $GITHUB_OUTPUT
-
- - name: Get Pull Request Information
- uses: actions/github-script@v6
- id: get-pr-branch
- with:
- result-encoding: string
- script: |
- const pr = await github.rest.pulls.get({
- owner: context.repo.owner,
- repo: context.repo.repo,
- pull_number: ${{ steps.get-pr-num.outputs.pr_number }}
- });
- console.log('Pull Request Information:', pr.data);
- return pr.data.head.ref;
- name: Checkout repository
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}
- ref: ${{ steps.get-pr-branch.outputs.result }}
+ ref: ${{ inputs.branch }}
- - name: Get version number
+ - name: Create release
id: version-number
run: |
cd ${{ github.run_id }}
VERSION=$(python -c "import nemo; print(nemo.__version__)")
- echo "VERSION=$VERSION" >> "$GITHUB_OUTPUT"
-
- - name: Extract changelog
- id: extract-changelog
- uses: peter-evans/find-comment@v3
- with:
- issue-number: ${{ steps.get-pr-num.outputs.pr_number }}
- body-includes: '# Detailed Changelogs'
-
- - name: Extract summary
- id: extract-summary
- uses: peter-evans/find-comment@v3
- with:
- issue-number: ${{ steps.get-pr-num.outputs.pr_number }}
- body-includes: '# Highlights'
-
- - name: Create Release doc
- id: create-release-doc
- env:
- SUMMARY: ${{ steps.extract-summary.outputs.comment-body }}
- CHANGELOG: ${{ steps.extract-changelog.outputs.comment-body }}
- run: |
- echo "TITLE<> $GITHUB_ENV
- echo "NVIDIA Neural Modules ${{ steps.version-number.outputs.VERSION }}" >> $GITHUB_ENV
- echo "EOF" >> $GITHUB_ENV
+ NAME="NVIDIA Neural Modules ${VERSION}"
+ CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md)
+ CHANGELOG=$(echo "$CHANGELOG" | sed '/./,$!d' | sed ':a;N;$!ba;s/\n$//')
+
+ PAYLOAD=$(jq \
+ -n \
+ -c \
+ --arg CI_COMMIT_BRANCH "${{ inputs.branch }}" \
+ --arg NAME "$NAME" \
+ --arg BODY "$CHANGELOG" \
+ '{
+ "tag_name": $CI_COMMIT_BRANCH,
+ "target_commitish": $CI_COMMIT_BRANCH,
+ "name": $NAME,
+ "body": $BODY,
+ "draft": false,
+ "prerelease": false,
+ "generate_release_notes": false
+ }'
+ )
- echo "BODY<> $GITHUB_ENV
- echo "$SUMMARY" >> $GITHUB_ENV
- echo "$CHANGELOG" >> $GITHUB_ENV
- echo "EOF" >> $GITHUB_ENV
-
- - name: Create Release
- uses: softprops/action-gh-release@v2
- with:
- name: ${{ env.TITLE }}
- tag_name: ${{ steps.version-number.outputs.VERSION }}
- body: ${{ env.BODY }}
+ curl -L \
+ -X POST \
+ -H "Accept: application/vnd.github+json" \
+ -H "Authorization: Bearer ${{ secrets.PAT }}" \
+ -H "X-GitHub-Api-Version: 2022-11-28" \
+ https://api.github.com/repos/NVIDIA/NeMo/releases \
+ -d "$PAYLOAD"
- name: Build, test, and release wheel
env:
@@ -114,6 +62,8 @@ jobs:
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
run: |
cd ${{ github.run_id }}
+ EXPECTED_VERSION=$(python -c 'import nemo; print(nemo.__version__)')
+
python3 -m pip install --upgrade build
python3 -m build
@@ -122,7 +72,6 @@ jobs:
cd ../
INSTALLED_VERSION=$(python -c 'import nemo; print(nemo.__version__)')
- EXPECTED_VERSION=${{ steps.version-number.outputs.VERSION }}
if [[ "$INSTALLED_VERSION" != "$EXPECTED_VERSION" ]]; then
echo 'Wheel has an outdated version, mission abort immediately!'
@@ -134,34 +83,6 @@ jobs:
python3 -m pip install --upgrade twine
python3 -m twine upload --repository pypi dist/*
- - name: Update PR issue comment
- shell: bash
- env:
- message: ${{ github.event.comment.body }}
- run: |
- message="$message
-
- ---
-
- Releasebot 🤖: Release done 🎉
- "
- message="${message//$'\n'/
}"
-
- curl -L \
- -X PATCH \
- -H "Accept: application/vnd.github+json" \
- -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
- -H "X-GitHub-Api-Version: 2022-11-28" \
- https://api.github.com/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }} \
- -d '{"body":"'"$message"'"}'
-
- - name: Close Pull
- run: |
- cd ${{ github.run_id }}
- gh pr close --comment "Releasebot 🤖: Closing PR" "${{ steps.get-pr-num.outputs.pr_number }}"
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
- name: notify
run: |
MESSAGE='{
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 000000000000..7fd5cd00b352
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,921 @@
+# Changelog
+
+## NVIDIA Neural Modules 2.0.0rc1
+
+### Highlights
+
+#### Large language models
+
+- PEFT: QLoRA support, LoRA/QLora for Mixture-of-Experts (MoE) dense layer
+- State Space Models & Hybrid Architecture support (Mamba2 and NV-Mamba2-hybrid)
+- Support Nemotron, Minitron, Gemma2, Qwen, RAG
+- Custom Tokenizer training in NeMo
+- Update the Auto-Configurator for EP, CP and FSDP
+
+#### Multimodal
+
+- NeVA: Add SOTA LLM backbone support (Mixtral/LLaMA3) and suite of model parallelism support (PP/EP)
+- Support Language Instructed Temporal-Localization Assistant (LITA) on top of video NeVA
+
+#### ASR
+
+- SpeechLM and SALM
+- Adapters for Canary Customization
+- Pytorch allocator in PyTorch 2.2 improves training speed up to 30% for all ASR models
+- Cuda Graphs for Transducer Inference
+- Replaced webdataset with Lhotse - gives up to 2x speedup
+- Transcription Improvements - Speedup and QoL Changes
+- ASR Prompt Formatter for multimodal Canary
+
+#### Export & Deploy
+
+- In framework PyTriton deployment with backends:
+  - PyTorch
+  - vLLM
+  - TRT-LLM update to 0.10
+- TRT-LLM C++ runtime
+
+### Detailed Changelogs
+
+#### ASR
+
+Changelog
+
+- Support dataloader as input to `audio` for transcription by @titu1994 :: PR: #9201
+- Clean up dev docs collection section by @yaoyu-33 :: PR: #9205
+- Fix Online_Offline_Microphone_VAD_Demo.ipynb by @stevehuang52 :: PR: #9251
+- Remove .nemo instead of renaming by @mikolajblaz :: PR: #9281
+- Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. by @galv :: PR: #9347
+- Revert "Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer." by @titu1994 :: PR: #9351
+- Prompt formatter API and canary transcribe tensor input support by @pzelasko :: PR: #9206
+- Fix prompt formatter's defaults=None case in multi-task model by @pzelasko :: PR: #9366
+- move AED chunked infer script by @stevehuang52 :: PR: #9367
+- Use model-cast-to-bfloat16 rather than AMP-to-bfloat16 for inference. by @galv :: PR: #9198
+- ci: Fix `L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_C… by @ko3n1g :: PR: #9399
+- Fix logging message for ASR by @titu1994 :: PR: #9469
+- Add support to change Multi task model prompt by @titu1994 :: PR: #9542
+- Enable encoder adapters for Canary and MultiTaskAED models by @titu1994 :: PR: #9409
+- Audio model collection by @anteju :: PR: #9263
+- TitaNet Batch Verify Speaker by @monica-sekoyan :: PR: #9337
+- Fix the arguments of forward_for_export function in msdd_models by @tango4j :: PR: #9624
+- chore: Pin branch in notebooks by @ko3n1g :: PR: #9697
+- refactor: notebook branch release by @ko3n1g :: PR: #9711
+- Canary Adapters tutorial (#9670) by @nithinraok :: PR: #9777
+- typos and branch name update to r2.0.0rc1 by @nithinraok :: PR: #9846
+- Fix RNNT alignments test by @artbataev :: PR: #9770
+- By default trust remote code from HF Datasets by @nithinraok :: PR: #9886
+- Temporarily disable cuda graph based RNN-T greedy inference for r2.0.0rc1 by @galv :: PR: #9904
+- Enable CUDA graphs by default, but require CUDA 12.6 for full graphs by @artbataev :: PR: #9919
+- update branch name for script by @nithinraok :: PR: #9936
+- updte branch by @nithinraok :: PR: #9942
+
+
+#### TTS
+
+Changelog
+
+- Clean up dev docs collection section by @yaoyu-33 :: PR: #9205
+- Add mel codec checkpoints by @anteju :: PR: #9228
+- GPU unit tests: Mark flaky tests to be fixed by @pablo-garay :: PR: #9559
+- chore: Pin branch in notebooks by @ko3n1g :: PR: #9697
+- refactor: notebook branch release by @ko3n1g :: PR: #9711
+
+
+
+#### LLM/Multimodal
+
+Changelog
+
+- Update nemo.export module for quantized models by @janekl :: PR: #9218
+- Add save option to the TRT-LLM export test script by @oyilmaz-nvidia :: PR: #9221
+- Checkpoint resuming compatible for 2403 container by @suiyoubi :: PR: #9199
+- Clean up dev docs collection section by @yaoyu-33 :: PR: #9205
+- use get with fallback when reading checkpoint_callback_params by @akoumpa :: PR: #9223
+- Revert rope fusion defaults by @cuichenx :: PR: #9237
+- fix import by @akoumpa :: PR: #9240
+- Add TRT-LLM params like max_num_tokens and opt_num_tokens by @oyilmaz-nvidia :: PR: #9210
+- sum-reduce grad_norm in DP+CP domain by @erhoo82 :: PR: #9262
+- Alit/bert convert fix by @JRD971000 :: PR: #9285
+- conv1d stable version by @JRD971000 :: PR: #9330
+- Fix trainer builder when exp_manager is not in config by @yaoyu-33 :: PR: #9293
+- Fix Peft Weights Loading in NeVA by @yaoyu-33 :: PR: #9341
+- Skip sequence_parallel allreduce when using Mcore DistOpt by @akoumpa :: PR: #9344
+- Fix FSDP gradient calculation with orig params by @janEbert :: PR: #9335
+- TRT-LLM Export Code Cleanup by @oyilmaz-nvidia :: PR: #9270
+- support null/None truncation field by @arendu :: PR: #9355
+- NeVa token fusion by @paul-gibbons :: PR: #9245
+- bugfix if using mcore distOpt with sft by @akoumpa :: PR: #9356
+- Re-org export code by @oyilmaz-nvidia :: PR: #9353
+- QLoRA by @cuichenx :: PR: #9340
+- PeFT fix for distOpt by @akoumpa :: PR: #9392
+- [NeMo-UX] Integrating mcore's DistributedDataParallel into MegatronStrategy by @marcromeyn :: PR: #9387
+- cherry pick of #9266 by @dimapihtar :: PR: #9411
+- Enable specifying alpha for PTQ INT8 SmoothQuant method by @janekl :: PR: #9423
+- add support for new mcore ds features by @dimapihtar :: PR: #9388
+- LoRA for MoE Layer by @cuichenx :: PR: #9396
+- Mistral-7B: apply user's precision to output checkpoint by @akoumpa :: PR: #9222
+- Add option to merge distributed optimizer buckets by @timmoon10 :: PR: #9414
+- TRT-LLM 0.10 Update by @oyilmaz-nvidia :: PR: #9402
+- In-framework deployment by @oyilmaz-nvidia :: PR: #9438
+- Bugfix missing variables and argument changes to MegatronPretrainingRandomSampler by @jstjohn :: PR: #9458
+- Hyena Operator by @guyjacob :: PR: #9264
+- Refactor Quantizer for reusing in QAT by @kevalmorabia97 :: PR: #9276
+- move load state dict after initialize parallel state in nlp_model by @ryxli :: PR: #9382
+- Enable user to optionally upgrade Megatron by @jstjohn :: PR: #9478
+- Fix unwrap model by @cuichenx :: PR: #9480
+- fix operator precedence by @akoumpa :: PR: #9403
+- [NeMo-UX] Adding context- & expert-parallelism to MegatronStrategy by @marcromeyn :: PR: #9525
+- update mcoreddp call by @akoumpa :: PR: #9345
+- mcore distOpt restore fix by @akoumpa :: PR: #9421
+- vLLM Export Support by @apanteleev :: PR: #9381
+- PL: Delete precision if using plugin. TODO switch to MegatronTrainerB… by @akoumpa :: PR: #9535
+- extend get_gpt_layer_modelopt_spec to support MoE by @akoumpa :: PR: #9532
+- fix mock data generation for legacy dataset by @dimapihtar :: PR: #9530
+- add reset learning rate functionality by @dimapihtar :: PR: #9372
+- Use closed-formula to round by multiple by @akoumpa :: PR: #9307
+- GPU unit tests: Mark flaky tests to be fixed by @pablo-garay :: PR: #9559
+- Consolidate gpt continue training script into pretraining script by @yaoyu-33 :: PR: #9413
+- Enable encoder adapters for Canary and MultiTaskAED models by @titu1994 :: PR: #9409
+- PTQ refinements by @janekl :: PR: #9574
+- Add ModelOpt QAT example for Llama2 SFT model by @kevalmorabia97 :: PR: #9326
+- Multimodal projection layer adapter fix for PP>1 by @paul-gibbons :: PR: #9445
+- Add offline quantization script for QLoRA deployment by @cuichenx :: PR: #9455
+- Make QLoRA more model-agnostic by @cuichenx :: PR: #9488
+- Set n_gpu to None in nemo export by @oyilmaz-nvidia :: PR: #9593
+- [NeMo-UX] Fix Megatron-optimizer by @marcromeyn :: PR: #9599
+- Chat template support for megatron_gpt_eval.py by @akoumpa :: PR: #9354
+- [NeMo-UX] Add PEFT by @cuichenx :: PR: #9490
+- Alit/mamba tmp by @JRD971000 :: PR: #9612
+- Enable MCore checkpointing optimizations by @mikolajblaz :: PR: #9505
+- Change mixtral moe key name for trt-llm by @oyilmaz-nvidia :: PR: #9620
+- fix ckpt load bug by @dimapihtar :: PR: #9621
+- Alit/mamba by @JRD971000 :: PR: #9575
+- Unwrap ckpt_io for model opt (async save) by @mikolajblaz :: PR: #9622
+- MCore T5 support for NeMo - Training by @huvunvidia :: PR: #9432
+- [Nemo-UX] Expose transformer_layer_spec inside GPTConfig by @marcromeyn :: PR: #9592
+- Update NeMo Clip to Use MCore Modules by @yaoyu-33 :: PR: #9594
+- Mistral + Mixtral Support for NeVa by @paul-gibbons :: PR: #9459
+- Adding support for mcore generate by @shanmugamr1992 :: PR: #9566
+- Improve error messaging during trt-llm export by @oyilmaz-nvidia :: PR: #9638
+- [Cherrypick] support lora when kv_channel != hidden_size / num_heads by @cuichenx :: PR: #9644
+- Parametrize FPS group by @mikolajblaz :: PR: #9648
+- Cherry-pick megatron export fix from main by @borisfom :: PR: #9643
+- add documentation for reset_lr feature by @dimapihtar
+- chore: Pin branch in notebooks by @ko3n1g :: PR: #9697
+- Cherry pick: LITA Integration by @Slyne :: PR: #9684
+- SDXL improvements (and support for Draft+) by @rohitrango :: PR: #9654
+- Gemma 2 by @cuichenx :: PR: #9672
+- Allows non-strict load with distributed checkpoints by @mikolajblaz :: PR: #9613
+- refactor: notebook branch release by @ko3n1g :: PR: #9711
+- [NeMo-UX] Make TE and Apex dependencies optional by @ashors1 :: PR: #9550
+- Alit/r2.0.0 by @JRD971000 :: PR: #9718
+- Manually cherry-pick from PR 9679 (PR to main - Support SFT/Eval/PEFT for mcore T5) by @huvunvidia :: PR: #9737
+- In framework export by @oyilmaz-nvidia :: PR: #9658
+- T5 changes based on mcore changes by @pablo-garay :: PR: #9829
+- [NeMo-UX] Use single instance of loss reductions in GPTModel by @hemildesai :: PR: #9801
+- deprecate NeMo NLP tutorial by @dimapihtar :: PR: #9864
+- Disable nvFuser setup with PyTorch 23.11 and later by @athitten :: PR: #9837
+- make torch_dist ckpt strategy as default by @dimapihtar :: PR: #9852
+- add rampup bs documentation by @dimapihtar :: PR: #9884
+- copy of #9576 by @dimapihtar :: PR: #9986
+- Support Nvidia Torch and Arch versions by @thomasdhc :: PR: #9897
+- Bug fix for pooler causing dist checkpointing exception by @shanmugamr1992 :: PR: #10008
+
+
+
+#### Export
+
+Changelog
+
+- Update nemo.export module for quantized models by @janekl :: PR: #9218
+- Add save option to the TRT-LLM export test script by @oyilmaz-nvidia :: PR: #9221
+- Add TRT-LLM params like max_num_tokens and opt_num_tokens by @oyilmaz-nvidia :: PR: #9210
+- TRT-LLM Export Code Cleanup by @oyilmaz-nvidia :: PR: #9270
+- Re-org export code by @oyilmaz-nvidia :: PR: #9353
+- Use TensorRT-LLM native parameter names in nemo.export module by @janekl :: PR: #9424
+- TRT-LLM 0.10 Update by @oyilmaz-nvidia :: PR: #9402
+- vLLM Export Support by @apanteleev :: PR: #9381
+- Add page context fmha option in TensorRTLLM export by @meatybobby :: PR: #9526
+- Test C++ runtime on demand in nemo_export.py to avoid possible OOMs by @janekl :: PR: #9544
+- Fix nemo export test by @oyilmaz-nvidia :: PR: #9547
+- Add tps and pps params to the export script by @oyilmaz-nvidia :: PR: #9558
+- Add Multimodal Exporter by @meatybobby :: PR: #9256
+- Set n_gpu to None in nemo export by @oyilmaz-nvidia :: PR: #9593
+- Inflight nemo model export support by @JimmyZhang12 :: PR: #9527
+- vLLM Export Improvements by @apanteleev :: PR: #9596
+- Akoumparouli/nemo ux mixtral export by @akoumpa :: PR: #9603
+- Change mixtral moe key name for trt-llm by @oyilmaz-nvidia :: PR: #9620
+- Fix the arguments of forward_for_export function in msdd_models by @tango4j :: PR: #9624
+- Improve error messaging during trt-llm export by @oyilmaz-nvidia :: PR: #9638
+- Cherry-pick megatron export fix from main by @borisfom :: PR: #9643
+- In framework export by @oyilmaz-nvidia :: PR: #9658
+- Add missing imports for torch dist ckpt in export by @oyilmaz-nvidia :: PR: #9826
+
+
+
+
+
+
+#### Bugfixes
+
+Changelog
+
+- use get with fallback when reading checkpoint_callback_params by @akoumpa :: PR: #9223
+- fix import by @akoumpa :: PR: #9240
+- Remove .nemo instead of renaming by @mikolajblaz :: PR: #9281
+- call set_expert_model_parallel_world_size instead of set_cpu_expert_m… by @akoumpa :: PR: #9275
+- Fix typos in Mixtral NeMo->HF and Starcoder2 NeMo->HF conversion scripts by @evellasques :: PR: #9325
+- Skip sequence_parallel allreduce when using Mcore DistOpt by @akoumpa :: PR: #9344
+- Add OpenAI format response to r2.0.0rc1 by @athitten :: PR: #9796
+- [NeMo UX] Support generating datasets using different train/valid/test distributions by @ashors1 :: PR: #9771
+- Add missing imports for torch dist ckpt in export by @oyilmaz-nvidia :: PR: #9826
+
+
+
+#### General Improvements
+
+Changelog
+
+- [Nemo CICD] run_cicd_for_release_branches_also by @pablo-garay :: PR: #9213
+- rename paths2audiofiles to audio by @github-actions[bot] :: PR: #9220
+- Fix ASR_Context_Biasing.ipynb contains FileNotFoundError by @github-actions[bot] :: PR: #9234
+- ci: Remove duplicated job by @ko3n1g :: PR: #9258
+- Fix document links by @yaoyu-33 :: PR: #9260
+- Pin transformers by @github-actions[bot] :: PR: #9273
+- Fix loading github raw images on notebook by @github-actions[bot] :: PR: #9283
+- Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward by @github-actions[bot] :: PR: #9278
+- Refactor Sequence Packing Script by @cuichenx :: PR: #9271
+- [Nemo-UX] Move code to collections + fix some small bugs by @marcromeyn :: PR: #9277
+- Fix typo in HF tutorial by @github-actions[bot] :: PR: #9304
+- Expand documentation for data parallelism and distributed optimizer by @timmoon10 :: PR: #9227
+- Install alerting by @ko3n1g :: PR: #9311
+- typos by @github-actions[bot] :: PR: #9315
+- FP8 feature documentation by @ksivaman :: PR: #9265
+- [Nemo CICD] Comment out flaky tests by @pablo-garay :: PR: #9333
+- Fixed typos in README.rst by @gdevakumar :: PR: #9322
+- Update README.rst to clarify installation via Conda by @SimonCW :: PR: #9323
+- [Nemo CICD] update flaky test by @pablo-garay :: PR: #9339
+- fix lora and ptuning and isort/black by @github-actions[bot] :: PR: #9295
+- Fix P-tuning for Llama based models by @github-actions[bot] :: PR: #9300
+- add large model stable training fix and contrastive loss update for variable seq by @github-actions[bot] :: PR: #9348
+- Guard cuda memory allocator update by @github-actions[bot] :: PR: #9313
+- [Nemo CICD] Remove unnecessary commented out code by @pablo-garay :: PR: #9364
+- Update Gemma conversion script by @yaoyu-33 :: PR: #9365
+- Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. (#9347) by @github-actions[bot] :: PR: #9371
+- Re-enable cuda graphs in training modes. by @github-actions[bot] :: PR: #9343
+- fix typo infer_seq_lenght -> infer_seq_length by @akoumpa :: PR: #9370
+- Make a backward compatibility for old MSDD configs in label models by @github-actions[bot] :: PR: #9378
+- Dgalvez/fix greedy batch strategy name r2.0.0rc0 by @github-actions[bot] :: PR: #9253
+- Update README.rst by @jgerh :: PR: #9393
+- Force diarizer to use CUDA if cuda is available and if device=None. by @github-actions[bot] :: PR: #9390
+- ci: Properly catch failed tests by introduction of workflow templates by @ko3n1g :: PR: #9324
+- Fix T5 G2P Input and Output Types by @github-actions[bot] :: PR: #9269
+- Huvu/rag pipeline citest by @huvunvidia :: PR: #9384
+- Fix circular import for MM dataprep notebook by @github-actions[bot] :: PR: #9292
+- add check if num layers is divisible by pp size by @github-actions[bot] :: PR: #9298
+- [Nemo CICD] timeouts fix by @pablo-garay :: PR: #9407
+- [NeMo-UX] Removing un-used ModelConfig class by @marcromeyn :: PR: #9389
+- Add tutorial for Llama-3-8B lora training and deployment by @shashank3959 :: PR: #9359
+- [NeMo-UX] Removing default_path from ModelConnector by @marcromeyn :: PR: #9401
+- Fix README by @ericharper :: PR: #9415
+- [SD] Fix SD CUDA Graph Failure by @alpha0422 :: PR: #9319
+- [NeMo-UX] Adding file-lock to Connector by @marcromeyn :: PR: #9400
+- Add Dev Container Bug Report by @pablo-garay :: PR: #9430
+- Akoumparouli/profiling docs by @akoumpa :: PR: #9420
+- ci: Enrich notifications by @ko3n1g :: PR: #9412
+- Fix failing RIR unit test with lhotse 1.24+ by @pzelasko :: PR: #9444
+- [NeMo-UX] Adding support for mcore distributed optimizer by @marcromeyn :: PR: #9435
+- Use ModelOpt build_tensorrt_llm for building engines for qnemo checkpoints by @janekl :: PR: #9452
+- ci(notifications): Fix extraction of last 2K chars by @ko3n1g :: PR: #9450
+- Update readme with mlperf news by @ericharper :: PR: #9457
+- [NeMo-UX] Add nsys callback by @ashors1 :: PR: #9461
+- [NeMo UX] Introducing optimizer module by @marcromeyn :: PR: #9454
+- Fix minor import bug in deploy module by @oyilmaz-nvidia :: PR: #9463
+- ci(notifications): Fetch all jobs by @ko3n1g :: PR: #9465
+- Update build_dataset.py by @stevehuang52 :: PR: #9467
+- bionemo: bn2/add pipelineparallel dtype by @skothenhill-nv :: PR: #9475
+- [NeMo-UX] Integrate experiment manager features with NeMo-UX APIs by @ashors1 :: PR: #9460
+- Add python_requires by @galv :: PR: #9431
+- [NeMo-UX] Fixing imports of NeMoLogging, AutoResume & ModelCheckpoint by @marcromeyn :: PR: #9476
+- Modelopt Refactor for SDXL Quantization by @suiyoubi :: PR: #9279
+- [NeMo-UX] Fixing defaults in llm.train & Mistral7BModel by @marcromeyn :: PR: #9486
+- In framework deploy using deploy script by @oyilmaz-nvidia :: PR: #9468
+- [NeMo-UX] Integrate tokenizer import into model.import_ckpt by @marcromeyn :: PR: #9485
+- append to file by @malay-nagda :: PR: #9483
+- [NeMo-UX] Fix bug in import_ckpt by @marcromeyn :: PR: #9492
+- Add nemotron news by @ericharper :: PR: #9510
+- Add CICD test for Stable Diffusion by @michal2409 :: PR: #9464
+- Akoumparouli/nemo ux mixtral by @akoumpa :: PR: #9446
+- [NeMo-UX] Llama and Gemma by @cuichenx :: PR: #9528
+- [NeMo-UX] minor logging bug fixes by @ashors1 :: PR: #9529
+- Update neva conversion script from and to HF by @yaoyu-33 :: PR: #9296
+- [Nemo-UX] IO fixes by @marcromeyn :: PR: #9512
+- Fix lhotse tests for v1.24.2 by @pzelasko :: PR: #9546
+- [Nemo CICD] Make GPU Unit Tests non-optional by @pablo-garay :: PR: #9551
+- Add Python AIStore SDK to container and bump min Lhotse version by @pzelasko :: PR: #9537
+- [NeMo-UX] Fix tokenizer IO by @marcromeyn :: PR: #9555
+- [NeMo UX] Move mistral_7b.py to mistral.py by @akoumpa :: PR: #9545
+- ci: Do not attempt to send slack on fork by @ko3n1g :: PR: #9556
+- Fix SDXL incorrect name in Docs by @suiyoubi :: PR: #9534
+- Bump PTL version by @athitten :: PR: #9557
+- [Resiliency] Straggler detection by @jbieniusiewi :: PR: #9473
+- [NeMo-UX] Switch to torch_dist as default distributed checkpointing backend by @ashors1 :: PR: #9541
+- [NeMo-UX] Checkpointing bug fixes by @ashors1 :: PR: #9562
+- Expose MCore path_to_cache option by @maanug-nv :: PR: #9570
+- [NeMo-UX] Fix Trainer serialization by @marcromeyn :: PR: #9571
+- Update click version requirement by @thomasdhc :: PR: #9580
+- [Fault tolerance] Heartbeat detection by @maanug-nv :: PR: #9352
+- [Nemo-UX] Add fabric-API for manual forward-pass by @marcromeyn :: PR: #9577
+- [Nemo-UX] Add SDK-factories to llm-collection by @marcromeyn :: PR: #9589
+- [NeMo-UX] Some improvements to NeMoLogger by @marcromeyn :: PR: #9591
+- Set no_sync_func & grad_sync_fucn by @akoumpa :: PR: #9601
+- [NeMo-UX] Fix nemo logger when trainer has no loggers by @ashors1 :: PR: #9607
+- Fix the dictionary format returned by the `scheduler` method by @sararb :: PR: #9609
+- [NeMo-UX] Dataloading enhancements and bug fixes by @ashors1 :: PR: #9595
+- Fix serialization of AutoResume by @sararb :: PR: #9616
+- Jsonl support by @adityavavre :: PR: #9611
+- Akoumparouli/mistral import instruct chat template fix by @akoumpa :: PR: #9567
+- Remove .cuda calls, use device isntead by @akoumpa :: PR: #9602
+- fix converter defautl args by @akoumpa :: PR: #9565
+- fix: remove non_blocking from PTL's .cuda call by @akoumpa :: PR: #9618
+- NeVA Minor Fixes by @yaoyu-33 :: PR: #9608
+- [NeMo-UX] fix pretrianing data sizes and weights by @cuichenx :: PR: #9627
+- [NeMo-UX] async checkpointing support by @ashors1 :: PR: #9466
+- Change default parallel_save to False by @mikolajblaz :: PR: #9632
+- Add REST API to deploy module by @athitten :: PR: #9539
+- ci: Timeout per step, not job by @ko3n1g :: PR: #9635
+- [NeMo-UX] Fix when optimizers are setup for PEFT by @marcromeyn :: PR: #9619
+- [NeMo-UX] Fix pipeline parallel bug by @ashors1 :: PR: #9637
+- Fixing import error fior llama-index (RAG pipeline) by @pablo-garay :: PR: #9662
+- llama CI fix by @rohitrango :: PR: #9663
+- [NeMo-UX] Make 'load_directly_on_device' configurable by @ashors1 :: PR: #9657
+- [Nemo-UX] Including all trainable-params in a PEFT-checkpoint by @marcromeyn :: PR: #9650
+- [NeMo-UX] Fix imports so local configuration of runs works again by @marcromeyn :: PR: #9690
+- Set TE flag in legacy -> mcore conversion script by @terrykong :: PR: #9722
+- Update starthere docs text by @erastorgueva-nv :: PR: #9724
+- TorchAudio installation workaround for incorrect `PYTORCH_VERSION` variable by @artbataev :: PR: #9736
+- [NeMo-UX] Match nemo 1's default behavior for drop_last and pad_samples_to_global_batch_size by @ashors1 :: PR: #9707
+- add a bit more for timeout (#9702) by @pablo-garay :: PR: #9754
+- Fix missing parallelisms by @maanug-nv :: PR: #9725
+- update branch by @nithinraok :: PR: #9764
+- Fix data preprocessing script by @cuichenx :: PR: #9759
+- vLLM 0.5.1 update by @apanteleev :: PR: #9779
+- upper bound hf-hub by @akoumpa :: PR: #9805
+- Fix few issues and docs for neva and clip in r2.0.0rc1 by @yaoyu-33 :: PR: #9681
+- add dummy vision and text transformer config (assumed mcore to be false) by @rohitrango :: PR: #9699
+- fix lita bugs by @Slyne :: PR: #9810
+- [NeMo-UX] Log `val_loss` by @ashors1 :: PR: #9814
+- [NeMo-UX] Fix some dataloading bugs by @ashors1 :: PR: #9807
+- [NeMo-UX] Adding recipes by @marcromeyn :: PR: #9720
+- [NeMo-UX] Set async_save from strategy rather than ModelCheckpoint by @ashors1 :: PR: #9800
+- Fix hf hub for 0.24+ by @titu1994 :: PR: #9806
+- [NeMo-UX] Fix a minor bug with async checkpointing by @ashors1 :: PR: #9856
+- [NeMo-UX] make progress bar easier to parse by @ashors1 :: PR: #9877
+- Docs: add "Nemo Fundamentals" page by @erastorgueva-nv :: PR: #9835
+- Create __init__.py by @stevehuang52 :: PR: #9892
+- [NeMo-UX] Fixes to make PreemptionCallback work by @hemildesai :: PR: #9830
+- Fix Docker build. Make Dockerfile consistent with CI by @artbataev :: PR: #9784
+- Multimodal data prep notebook fix by @cuichenx :: PR: #9910
+- [NeMo-UX] Add distributed checkpointing unit tests by @ashors1 :: PR: #9794
+- r2.0.0rc1 fix for dist checkpoint loading by @yaoyu-33 :: PR: #9854
+- [NeMo-UX] Rename sdk references to NeMo Run by @hemildesai :: PR: #9872
+- [NeMo-UX] Fix some serialization bugs by @ashors1 :: PR: #9868
+- add mixtral neva tutorial (moe + token fusion + siglip) by @paul-gibbons :: PR: #9926
+- [NeMo-UX] Add more NeMo Logger tests by @ashors1 :: PR: #9795
+- Akoumparouli/mixtral fixes for r2.0.0rc1 by @akoumpa :: PR: #9911
+- R2.0.0rc1 clip fix by @Slyne :: PR: #9871
+- [NeMo-UX] Add missing docstrings and update some defaults by @ashors1 :: PR: #9895
+- Add REST service requirements.txt by @oyilmaz-nvidia :: PR: #9923
+- add bert latest fix by @JRD971000 :: PR: #9921
+- remove empy reconfigure_limit_batches by @akoumpa :: PR: #9934
+- fix mem by @terrykong :: PR: #9964
+- Run a sample query for a quantized model conditionally by @janekl :: PR: #9965
+- Add pydantic-settings by @oyilmaz-nvidia :: PR: #9961
+- Resiliency features update by @jbieniusiewi :: PR: #9714
+- [NeMo-UX] Wrap task config save in a try/except by @ashors1 :: PR: #9956
+- [NeMo-UX] Update default PTL logging `save_dir` by @ashors1 :: PR: #9954
+- Fix lita tutorial by @Slyne :: PR: #9980
+- Add deploy and REST API support to NeMo 2.0 by @athitten :: PR: #9834
+- ci: Allow changelog manual (#10156) by @ko3n1g :: PR: #10157
+- docs: Add changelog by @ko3n1g :: PR: #10155
+- add manifest file by @ko3n1g :: PR: #10161
+
+
+
+## NVIDIA Neural Modules 2.0.0rc0
+
+### Highlights
+
+#### LLM and MM
+
+##### Models
+
+- Megatron Core RETRO
+ - Pre-training
+ - Zero-shot Evaluation
+
+- Pretraining, conversion, evaluation, SFT, and PEFT for:
+ - Mixtral 8X22B
+ - Llama 3
+ - SpaceGemma
+
+- Embedding Models Fine Tuning
+ - Mistral
+ - BERT
+
+- BERT models
+ - Context Parallel
+ - Distributed checkpoint
+
+- Video capabilities with NeVa
+
+##### Performance
+
+- Distributed Checkpointing
+ - Torch native backend
+ - Parallel read/write
+ - Async write
+
+- Multimodal LLM (LLAVA/NeVA)
+ - Pipeline Parallelism support
+ - Sequence packing support
+
+##### Export
+
+- Integration of Export & Deploy Modules into NeMo Framework container
+ - Upgrade to TRT-LLM 0.9
+
+#### Speech (ASR & TTS)
+
+##### Models
+
+- AED Multi Task Models (Canary) - Multi-Task Multi-Lingual Speech Recognition / Speech Translation model
+- Multimodal Domain - Speech LLM supporting SALM Model
+- Parakeet-tdt_ctc-1.1b Model - RTFx of > 1500 (can transcribe 1500 seconds of audio in 1 second)
+- Audio Codec 16kHz Small - NeMo Neural Audio Codec for discretizing speech for use in LLMs
+ - mel_codec_22khz_medium
+ - mel_codec_44khz_medium
+
+##### Perf Improvements
+
+- Transcribe() upgrade - Enables one line transcribe with files, tensors, data loaders
+- Frame looping algorithm for RNNT faster decoding - Improves Real Time Factor (RTF) by 2-3x
+- Cuda Graphs + Label-Looping algorithm for RNN-T and TDT Decoding - Transducer Greedy decoding at over 1500x RTFx, on par with CTC Non-Autoregressive models
+- Semi Sorted Batching support - External User contribution that speeds up training by 15-30%.
+
+##### Customization
+
+- Context biasing for CTC word stamping - Improve accuracy for custom vocabulary and pronunciation
+ - Longform Inference
+ - Longform inference support for AED models
+- Transcription of multi-channel audio for AED models
+
+##### Misc
+
+- Upgraded webdataset - Speech and LLM / Multimodal unified container
+
+### Detailed Changelogs
+
+#### ASR
+
+Changelog
+
+- Enable using hybrid asr models in CTC Segmentation tool by @erastorgueva-nv :: PR: #8828
+- TDT confidence fix by @GNroy :: PR: #8982
+- Fix union type annotations for autodoc+mock-import rendering by @pzelasko :: PR: #8956
+- NeMo dev doc restructure by @yaoyu-33 :: PR: #8896
+- Improved random seed configuration for Lhotse dataloaders with docs by @pzelasko :: PR: #9001
+- Fix #8948, allow preprocessor to be stream captured to a cuda graph when doing per_feature normalization by @galv :: PR: #8964
+- [ASR] Support for transcription of multi-channel audio for AED models by @anteju :: PR: #9007
+- Add ASR latest news by @titu1994 :: PR: #9073
+- Fix docs errors and most warnings by @erastorgueva-nv :: PR: #9006
+- PyTorch CUDA allocator optimization for dynamic batch shape dataloading in ASR by @pzelasko :: PR: #9061
+- RNN-T and TDT inference: use CUDA graphs by default by @artbataev :: PR: #8972
+- Fix #8891 by supported GPU-side batched CTC Greedy Decoding by @galv :: PR: #9100
+- Update branch for notebooks and ci in release by @ericharper :: PR: #9189
+- Enable CUDA graphs by default only for transcription by @artbataev :: PR: #9196
+- rename paths2audiofiles to audio by @nithinraok :: PR: #9209
+- Fix ASR_Context_Biasing.ipynb contains FileNotFoundError by @andrusenkoau :: PR: #9233
+- Cherrypick: Support dataloader as input to `audio` for transcription (#9201) by @titu1994 :: PR: #9235
+- Update Online_Offline_Microphone_VAD_Demo.ipynb by @stevehuang52 :: PR: #9252
+- Dgalvez/fix greedy batch strategy name r2.0.0rc0 by @galv :: PR: #9243
+- Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward by @galv :: PR: #9246
+- Fix loading github raw images on notebook by @nithinraok :: PR: #9282
+- typos by @nithinraok :: PR: #9314
+- Re-enable cuda graphs in training modes. by @galv :: PR: #9338
+- add large model stable training fix and contrastive loss update for variable seq by @nithinraok :: PR: #9259
+- Fix conv1d package in r2.0.0rc0 by @pablo-garay :: PR: #9369
+- Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. (#9347) by @titu1994 :: PR: #9350
+- Make a backward compatibility for old MSDD configs in label models by @tango4j :: PR: #9377
+- Force diarizer to use CUDA if cuda is available and if device=None. by @tango4j :: PR: #9380
+
+
+
+#### TTS
+
+Changelog
+
+- [TTS] Add tutorial for training audio codecs by @rlangman :: PR: #8723
+- Update radtts.py by @blisc :: PR: #9097
+- [Nemo CICD] RADTTS test optional by @pablo-garay :: PR: #9112
+- Remove Radtts CI test by @blisc :: PR: #9144
+- Fix T5 G2P Input and Output Types by @blisc :: PR: #9224
+
+
+
+#### LLM and MM
+
+Changelog
+
+- Rachitg/dpa by @rachitgarg91 :: PR: #8911
+- Remove precision args in trainer due to PTL update by @yaoyu-33 :: PR: #8908
+- Huvu/mcore retro by @huvunvidia :: PR: #8861
+- fsdp tp > 1 bug fix by @dimapihtar :: PR: #8947
+- Fix memory leak at loss func by @minitu :: PR: #8868
+- change the condition for get qkv tensor from linear_qkv output in mcoremixin by @HuiyingLi :: PR: #8965
+- Add safety checks for 'data' key in MegatronGPTModel cfg by @HuiyingLi :: PR: #8991
+- [NeMo-UX] Adding MegatronParallel by @cuichenx :: PR: #8987
+- Skip top_p computations when set to 1.0 by @odelalleau :: PR: #8905
+- Gemma bug by @cuichenx :: PR: #8962
+- [NeMo-UX] Adding megatron strategy by @marcromeyn :: PR: #8995
+- Quantized checkpoint support in export and deploy modules by @janekl :: PR: #8859
+- add geglu to mlp swap by @JRD971000 :: PR: #8999
+- add timeout for new_group by @acphile :: PR: #8998
+- Zero-shot evaluation pipeline for mcore RETRO by @huvunvidia :: PR: #8941
+- Added fusion for squared relu by @sanandaraj5597 :: PR: #8963
+- Developer Documents for mcore RETRO by @huvunvidia :: PR: #9026
+- [NeMo-UX] Adding GPTModel & MockDataModule by @marcromeyn :: PR: #9011
+- Adding unit test for mcore RETRO model by @huvunvidia :: PR: #9022
+- docs and simplification of cmd args by @arendu :: PR: #8979
+- [NeMo-UX] Add checkpoint-io to MegatronStrategy by @marcromeyn :: PR: #9057
+- Enable Sequence Packing and Pipeline Parallel in NeVA by @yaoyu-33 :: PR: #8957
+- Mingyuanm/add back fp8 support to sd by @Victor49152 :: PR: #9070
+- unfused lora by @arendu :: PR: #9004
+- Handle case where num_query_groups is set to null for LoRA config setup by @vysarge :: PR: #9075
+- Alit/griffin by @JRD971000 :: PR: #9021
+- Implement DistributedCheckpointIO by @mikolajblaz :: PR: #9016
+- Video Neva Pretraining + Inference Implementation by @paul-gibbons :: PR: #9095
+- HF to .nemo for Mixtral-8x22B-instruct by @akoumpa :: PR: #9060
+- mcore ds updates by @dimapihtar :: PR: #8951
+- Alit/griffin perf by @JRD971000 :: PR: #9107
+- Add assert for max_steps to be positive in MegatronGPTSFTModel by @athitten :: PR: #9110
+- Extend sequence length padding for GPT SFT to account for context parallel by @vysarge :: PR: #8869
+- Update gpt dataset config parameter for mock by @thomasdhc :: PR: #9118
+- Add Mcore DistributedDataParallel and distributed optimizer into Nemo by @gdengk :: PR: #9034
+- Revert "Add assert for max_steps to be positive in MegatronGPTSFTMode… by @pablo-garay :: PR: #9128
+- scripts to convert HF lora to nemo by @arendu :: PR: #9102
+- Prevent duplicated checkpoints by @mikolajblaz :: PR: #9015
+- add TN/ITN link in speech tools list by @erastorgueva-nv :: PR: #9142
+- Cleanup deprecated files and temporary changes by @cuichenx :: PR: #9088
+- Use DP+CP groups as the FSDP sharding domain by @erhoo82 :: PR: #9145
+- CUDA memory profile by @erhoo82 :: PR: #9096
+- Fix missing func for T5 model by @gdengk :: PR: #9141
+- Add knob for load_directly_on_device by @mikolajblaz :: PR: #9125
+- Revert rope fusion defaults by @cuichenx :: PR: #9238
+- Update nemo.export module for quantized models by @janekl :: PR: #9250
+- Fix circular import for MM dataprep notebook by @cuichenx :: PR: #9287
+- neva media_type + text generation default fix by @paul-gibbons :: PR: #9257
+- fix lora and ptuning and isort/black by @oyilmaz-nvidia :: PR: #9290
+- add check if num layers is divisible by pp size by @dimapihtar :: PR: #9208
+- Fix P-tuning for Llama based models by @apanteleev :: PR: #9297
+- add deprecation warnings by @pablo-garay :: PR: #9266
+- move pooler under post_process by @dimapihtar :: PR: #9328
+- add deprecation note for nmt by @dimapihtar :: PR: #9342
+- Fix incorrect checkpoint removal logic (#9192) by @mikolajblaz :: PR: #9204
+- fix fp16 precision issue by @dimapihtar :: PR: #9376
+- Fix module.training for Neva in FusedAttn backward which causes nan by @yaoyu-33 :: PR: #8877
+
+
+
+#### Export
+
+Changelog
+
+- Updates for TRT-LLM 0.9 by @oyilmaz-nvidia :: PR: #8873
+- Mingyuanm/sdxl export by @Victor49152 :: PR: #8926
+- Avoid unpacking NeMo checkpoints before exporting to TRT-LLM by @apanteleev :: PR: #8866
+- Update gemma for trt-llm 0.9 by @oyilmaz-nvidia :: PR: #8974
+- TRT-LLM export P-tuning related fixes by @apanteleev :: PR: #8863
+
+
+
+#### General Improvements
+
+Changelog
+
+- Update package info by @ericharper :: PR: #8793
+- [Nemo CICD] Update mcore 4.13.24 by @pablo-garay :: PR: #8917
+- Akoumparouli/low mem mixtral ckpt converter by @akoumpa :: PR: #8895
+- Adding RETRO tests to Action Tests (cicd-main.yml) by @huvunvidia :: PR: #8942
+- Akoumparouli/fix sd train 2 by @akoumpa :: PR: #8883
+- Update te install for jenkins by @ericharper :: PR: #8954
+- [Nemo CICD] Add last job depending on others for blocking check by @pablo-garay :: PR: #8959
+- Minor quantization pipeline updates by @janekl :: PR: #8924
+- Fix External CLIP Converter by @yaoyu-33 :: PR: #8960
+- PP support in LoRA merge script by @cuichenx :: PR: #8934
+- Update PR template by @ericharper :: PR: #8978
+- Update Latest News by @shashank3959 :: PR: #8837
+- Fix incorrect link to latest news in README by @shashank3959 :: PR: #8985
+- Update dependency install for LLM and MM by @ericharper :: PR: #8990
+- Temporarily remove mcore dep by @ericharper :: PR: #9010
+- [Nemo CICD] further specialize runners for more parallelism by @pablo-garay :: PR: #9036
+- Update mm dataprep notebook based on feedback by @cuichenx :: PR: #9029
+- Fix import in lora merge script by @cuichenx :: PR: #9032
+- [Nemo CICD] Run when labeled:Run CICD by @pablo-garay :: PR: #9044
+- [Nemo CICD] Add tag/label for 1-gpu runner by @pablo-garay :: PR: #9046
+- [Nemo CICD] checkout v4 by @pablo-garay :: PR: #9048
+- [Nemo CICD] Remove temp test change by @pablo-garay :: PR: #9049
+- remove in-place addition for dreambooth train with text encoder by @Victor49152 :: PR: #8825
+- Mingyuanm/sdxl quantization notebook by @Victor49152 :: PR: #9042
+- [Nemo CICD] Trigger on comment issued by @pablo-garay :: PR: #9062
+- zarr ckpt to torch_dist ckpt converter by @dimapihtar :: PR: #8842
+- Restore PTQ tests for Llama2 (reopened) by @janekl :: PR: #9064
+- add clip H config by @JRD971000 :: PR: #9082
+- [NeMo-UX] Add mixed-precision plugin by @marcromeyn :: PR: #9065
+- Comment baichuan test and update pr template by @ericharper :: PR: #9085
+- Add safe extraction of nemo tar files by @athitten :: PR: #8976
+- Improved `shard_id` parsing in `LazyNemoTarredIterator`, enables AIS dataloading by @pzelasko :: PR: #9077
+- [NeMo-UX] Add mistral-7b model by @marcromeyn :: PR: #9066
+- Llama3 Conversion Script Update by @suiyoubi :: PR: #9089
+- dehardcode test string by @JimmyZhang12 :: PR: #8865
+- [Nemo CICD] Try trigger cicd run on comment by @pablo-garay :: PR: #9111
+- Lhotse dataloading: RIR augmentation and nemo/tarred input support for RIR and noise aug by @pzelasko :: PR: #9109
+- mixtral evaluation PR by @Slyne :: PR: #8989
+- [Nemo CICD] Revert: run GHA cicd on comment by @pablo-garay :: PR: #9119
+- [Nemo CICD] Comment out flaky test: running too long by @pablo-garay :: PR: #9123
+- [Nemo CICD] Add timeout to unit tests by @pablo-garay :: PR: #9132
+- [Nemo CICD] Indicate optional test in name (prefix) by @pablo-garay :: PR: #9139
+- video neva null image+video folder path fix by @paul-gibbons :: PR: #9116
+- [NeMo-UX] Add data module by @cuichenx :: PR: #9133
+- NeMo Inference Requirements by @oyilmaz-nvidia :: PR: #9093
+- Remove debug print by @maanug-nv :: PR: #9074
+- Remove legacy CI by @pablo-garay :: PR: #9149
+- Update support for push_to_hf_hub() by @titu1994 :: PR: #9159
+- [Nemo CICD] comment out flaky PTQ tests by @pablo-garay :: PR: #9160
+- Update branch by @ericharper :: PR: #9211
+- dist adam transpose fix by @dimapihtar :: PR: #9239
+- [Nemo CICD] Increase time limit for Speech_Checkpoints_tests (#9186) by @pablo-garay :: PR: #9247
+- Pin transformers by @ericharper :: PR: #9261
+- Fix typo in HF tutorial by @titu1994 :: PR: #9302
+
+
+
+## NVIDIA Neural Modules 1.23.0
+
+### Highlights
+
+#### Models
+
+##### Nvidia Starcoder 2 - 15B
+
+- Announcement - https://developer.nvidia.com/blog/unlock-your-llm-coding-potential-with-starcoder2/
+- AI Foundation Model Inference - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-foundation/models/starcoder2-15b
+- https://huggingface.co/bigcode/starcoder2-15b
+
+##### NeMo Canary
+Announcement - https://nvidia.github.io/NeMo/blogs/2024/2024-02-canary/
+
+- https://huggingface.co/nvidia/canary-1b
+
+#### NeMo LLM
+
+- Falcon
+- Code Llama
+- StarCoder
+- GPT perf improvements
+- Context parallelism
+- Mistral
+- Mixtral (without expert parallelism)
+- Mcore GPT Dataset integration
+
+#### NeMo MM
+- CLIP
+- Stable Diffusion (supporting LoRA)
+- Imagen
+- ControlNet (for SD)
+- Instruct pix2pix (for SD)
+- LLAVA
+- NeVA
+- DreamFusion++
+- NSFW filtering
+
+#### NeMo ASR
+
+- Lhotse Dataloading support #7880
+- Canary: Multi task multi lingual ASR #8242
+- LongForm Audio for Diarization #7737
+- Faster algorithm for RNN-T Greedy #7926
+- Cache-Aware streaming notebook #8296
+
+#### NeMo TTS
+
+#### NeMo Vision
+
+#### Known Issues
+
+##### ASR
+
+###### RNNT WER calculation when fused batch size > 1 during validation / test step()
+
+Previously, the RNNT metric was stateful while the CTC one was not ([r1.22.0](https://github.com/NVIDIA/NeMo/blob/r1.22.0/nemo/collections/asr/metrics/rnnt_wer_bpe.py#L419-L420), [r1.23.0](https://github.com/NVIDIA/NeMo/blob/r1.23.0/nemo/collections/asr/metrics/wer.py#L333))
+
+Therefore, this calculation in the RNNT joint for the fused operation worked properly. However, with the unification of metrics in r1.23.0, a bug was introduced where only the last sub-batch computes the scores and the metrics are not accumulated across sub-batches. This is patched via https://github.com/NVIDIA/NeMo/pull/8587 and will be fixed in the next release.
+
+**Workaround**: Explicitly disable fused batch size during inference using the following snippet:
+
+```python
+from omegaconf import open_dict
+model = ...
+decoding_cfg = model.cfg.decoding
+with open_dict(decoding_cfg):
+ decoding_cfg.fused_batch_size = -1
+model.change_decoding_strategy(decoding_cfg)
+```
+
+Note: This bug does not affect scores calculated via model.transcribe() (since it does not calculate metrics during inference, just text), or using the `transcribe_speech.py` or `speech_to_text_eval.py` in `examples/asr`.
+
+###### Two failing unit tests due to a change in expected results, caused by lhotse version update
+
+#### Container
+
+For additional information regarding NeMo containers, please visit: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo
+
+`docker pull nvcr.io/nvidia/nemo:24.01.speech`
+
+#### ASR
+
+Changelog
+
+- Update link to yaml file in ASR_with_Transducers.ipynb by @Faith-Nchifor :: PR: #8014
+- Use convert_hf_dataset_to_nemo by @karpnv :: PR: #8017
+- Update asr_language_modeling.rst: Add a missing word by @martin0258 :: PR: #8007
+- spelling mistake by @orena1 :: PR: #7903
+- update asr eval by @stevehuang52 :: PR: #8045
+- fix noise aug by @stevehuang52 :: PR: #8057
+- Various fixes for typos and urls by @titu1994 :: PR: #8066
+- [Fix] Increase length check tolerance to prevent test failing by @anteju :: PR: #8067
+- Add text metrics to asr eval by @stevehuang52 :: PR: #8087
+- fix device setting to allow using accelerator cpu by @orena1 :: PR: #8084
+- .ctm in data simulator annotator compliant with RT-09 specification by @popcornell :: PR: #8004
+- Fix AST eval by @stevehuang52 :: PR: #8112
+- fix: numba.*_num_threads resets torch num_threads #8141 by @itzsimpl :: PR: #8145
+- Update dependencies by @titu1994 :: PR: #8156
+- NeMo + Lhotse integration by @pzelasko :: PR: #7880
+- Speedup RNN-T greedy decoding by @artbataev :: PR: #7926
+- [docker] Install k2 before NeMo for faster image rebuilding by @pzelasko :: PR: #8204
+- [docs] Add --force_codec to tarred dataset creation examples by @pzelasko :: PR: #8227
+- Temporarily use the previous RNN-T decoding algorithm as default by @artbataev :: PR: #8226
+- Make TDT inference not require duration params by @hainan-xv :: PR: #8207
+- Cache Aware Streaming tutorial notebook by @erastorgueva-nv :: PR: #8296
+- fix path location and branch by @nithinraok :: PR: #8304
+- Attention encoder-decoder models for multiple speech-to-text tasks … by @titu1994 :: PR: #8324
+- Remove asr webapp by @titu1994 :: PR: #8347
+- remove _target_ at model level in aed model config [ASR] by @krishnacpuvvada :: PR: #8351
+- Add change_vocabulary and save_tokenizers() support to Multitask ASR models by @titu1994 :: PR: #8357
+- Change default beam size by @titu1994 :: PR: #8371
+- adding jenkins test for speech_to_text_aed model by @krishnacpuvvada :: PR: #8368
+- Add Finetuning tutorial with HF Datasets by @nithinraok :: PR: #8356
+- wer fix by @tbartley94 :: PR: #8404
+- add ensemble decoding fix by @nithinraok :: PR: #8427
+- Update k2 by @artbataev :: PR: #8492
+
+
+
+#### TTS
+
+Changelog
+
+- [TTS] Scale sampler steps by number of devices by @rlangman :: PR: #7947
+- Add All Multimodal Source Code Part 2: Text to image, x to nerf by @yaoyu-33 :: PR: #7970
+- [TTS] Add period discriminator and feature matching loss to codec recipe by @rlangman :: PR: #7884
+- Added VectorQuantizer base class by @anteju :: PR: #8011
+
+
+
+#### LLMS
+
+Changelog
+
+- Add interface to set NCCL options of each process group by @erhoo82 :: PR: #7923
+- Support O2 training of PEFT and SFT by @cuichenx :: PR: #7971
+- [NLP] Access scaler only in FP16 case by @janekl :: PR: #7916
+- [NLP] Minor improvements in Llama conversion script by @janekl :: PR: #7978
+- [NLP] Use helpers from utils_funcs.py in Llama conversion by @janekl :: PR: #7979
+- [NLP] Remove replace_sampler_ddp (deprecated in Trainer) by @janekl :: PR: #7981
+- Reworked MegatronPretrainingRandomBatchSampler to correctly handle epochs > 1 by @trias702 :: PR: #7920
+- Remove deprecated arguments from TE's TransformerLayer by @jbaczek :: PR: #7917
+- Add All Multimodal Source Code by @yaoyu-33 :: PR: #7791
+- First draft of mcore bert model in NeMo by @shanmugamr1992 :: PR: #7814
+- Support Falcon Variants (7B/40B/180B) in Mcore NeMo by @xuanzic :: PR: #7666
+- FSDP + Tensor Parallelism by @erhoo82 :: PR: #7897
+- Packed Sequence by @cuichenx :: PR: #7945
+- Adding method back that was removed accidentally by @ericharper :: PR: #8038
+- [NLP] ArtifactItem with init=True to make it debuggable by @janekl :: PR: #7980
+- SFT patch: (1) enable sequence parallelism and (2) enable profile by @erhoo82 :: PR: #7963
+- migration to PTL 2.0 for spellmapper model by @bene-ges :: PR: #7924
+- Change the megatron config lr scheduler default and fix to change partitions script by @shan18 :: PR: #8094
+- (1) Add SHARP interface to M-CORE, (2) use send/recv to send train loss to the first rank instead of b-cast by @erhoo82 :: PR: #7793
+- Reconfigure limit_val_batches only for int by @athitten :: PR: #8099
+- Fixing wrapper and moving it to base class by @shanmugamr1992 :: PR: #8055
+- fix gated_linear_unit bug by @Agoniii :: PR: #8042
+- Fix Adapter for MCore models by @cuichenx :: PR: #8124
+- add war fix for sync issues by @gshennvm :: PR: #8130
+- Improve PEFT UX by @cuichenx :: PR: #8131
+- Enhance flexibility by passing callbacks as method argument by @michal2409 :: PR: #8015
+- context parallelism by @xrennvidia :: PR: #7739
+- Make pipelined TP comm overlap available with mcore by @erhoo82 :: PR: #8005
+- remove deprecated scripts by @arendu :: PR: #8138
+- adding OnlineSampleMapping by @arendu :: PR: #8137
+- Add distopt support for FP8 params and BF16 optimizer state by @timmoon10 :: PR: #7909
+- Revert adding OnlineSampleMapping by @pablo-garay :: PR: #8164
+- Token count and sequence length logging for MegatronGPTSFTModel by @vysarge :: PR: #8136
+- Use latest apex internal API by @jbaczek :: PR: #8129
+- tune specific params in the base model by @arendu :: PR: #7745
+- Virtual pipeline parallel support for MegatronGPTSFTModel by @vysarge :: PR: #7964
+- removed deprecated peft model by @arendu :: PR: #8183
+- remove more deprecated files by @arendu :: PR: #8169
+- Pre-generate cu_seqlens argmin and max_seqlen to remove host-to-device sync by @erhoo82 :: PR: #8108
+- Add the interface to use SHARP to FSDP strategy by @erhoo82 :: PR: #8202
+- Multimodal required NLP base model changes by @yaoyu-33 :: PR: #8188
+- [NLP] Improve and unify loading state_dict for community models by @janekl :: PR: #7977
+- Rename Finetuning Scripts by @cuichenx :: PR: #8201
+- Final multimodal PR with our recent developments on MM side by @yaoyu-33 :: PR: #8127
+- Add include_text parameter to SFT dataloaders by @Kipok :: PR: #8198
+- Add random_seed argument to generate by @Kipok :: PR: #8162
+- Added support for neptune logger by @harishankar-gopalan :: PR: #8210
+- Pre-compute max_seqlen and cu_seqlens_argmin in all model-parallel cases by @erhoo82 :: PR: #8222
+- Use PackedSeqParams in accordance with changes in Megatron-LM by @cuichenx :: PR: #8205
+- Fix to peft & virtual pipeline parallel unsupported check by @vysarge :: PR: #8216
+- Fixed the tp overlap switch by @sanandaraj5597 :: PR: #8195
+- add knobs for rope/swiglu fusion by @lhb8125 :: PR: #8184
+- Added sample cpu_offloading switch to YAML by @sanandaraj5597 :: PR: #8148
+- Syncing random seed between ranks in generate by @Kipok :: PR: #8230
+- add first_val_step to mcore scheduler by @JimmyZhang12 :: PR: #8150
+- Correct padding for SFT input data to account for sequence parallel + TE's fp8 op dimension requirements by @vysarge :: PR: #8240
+- Mistral 7b conversion script by @akoumpa :: PR: #8052
+- switch to mcore dataset [with FIM support] by @dimapihtar :: PR: #8149
+- Mixtral to NeMo conversion script. by @akoumpa :: PR: #8155
+- fixes to accomendate mcore changes by @HuiyingLi :: PR: #8261
+- Allow MegatronPretrainingRandomSampler to do multi-epoch training by @trias702 :: PR: #8239
+- Add dist ckpt support for regular optimizers by @mikolajblaz :: PR: #7749
+- add deallocate pipeline output optimization by @JimmyZhang12 :: PR: #8279
+- Fix memory leak caused by context parallelism hanging references by omegaconf by @JimmyZhang12 :: PR: #8299
+- distributed fused adam + rampup bs support by @dimapihtar :: PR: #8302
+- Update PEFT Doc by @cuichenx :: PR: #8262
+- Converter script fixes for mixtral/mistral by @akoumpa :: PR: #8272
+- Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP>1 by @erhoo82 :: PR: #8334
+- Enable megatron core loggers for GPT pretraining by @ashbhandare :: PR: #8354
+- mcore ds fix by @dimapihtar :: PR: #8283
+- release updates by @dimapihtar :: PR: #8378
+- Mcore customization doc by @HuiyingLi :: PR: #8298
+- updated link to pubmed by @nithinraok :: PR: #8402
+- mcore customization doc minor fix by @HuiyingLi :: PR: #8421
+- Fixing mcore bert for TP, PP and SP by @shanmugamr1992 :: PR: #8336
+- Add settings to suppress bf16 compile errors in CI on V100 by @athitten :: PR: #8481
+- MoE parameter passing by @akoumpa :: PR: #8255
+- Add fp8 support for SD/Update notebook paths by @Victor49152 :: PR: #8489
+
+
+
+#### NeMo Tools
+
+Changelog
+
+- SDE bugfix log by @Jorjeous :: PR: #8430
+
+
+
+#### General Improvements
+
+Changelog
+
+- Add news section to README by @ericharper :: PR: #7984
+- Fixing conversion script to work for code llama by @shanmugamr1992 :: PR: #7997
+- Fix crash when converting to mcore a model using rotary embeddings by @odelalleau :: PR: #7998
+- Added a procedure for Windows users, README by @Jorjeous :: PR: #7942
+- Update manifest.py to speedup loading tarred datasets by @stevehuang52 :: PR: #7900
+- [Fix] Fixed name of a test by @anteju :: PR: #7986
+- Fix lora merge script by @cuichenx :: PR: #8113
+- Support transcoding audio formats when saving tarred datasets (FLAC, OPUS) by @pzelasko :: PR: #8102
+- README edit to change Apple Silicon install instructions (to fix a break introduced by pytorch 2) by @stephenmcconnachie :: PR: #8122
+- Fixes NVIDIA/apex installation to not erroneously install the pkg by @terrykong :: PR: #8126
+- Graphviz fix by @GNroy :: PR: #7843
+- Update README.rst by @fayejf :: PR: #8154
+- Fix TP>1 issue for conversion script by @cuichenx :: PR: #8144
+- Support torch jit script by @artbataev :: PR: #8027
+- NeMo Multimodal Docs and Tests Initial PR by @yaoyu-33 :: PR: #8028
+- Remove left-over prints in NeMo+Lhotse code by @pzelasko :: PR: #8180
+- Upgrade to DLFW PyTorch 23.12 by @ericharper :: PR: #8163
+- Add Lhotse support for key in NeMo manifests by @pzelasko :: PR: #8197
+- Fix CPU Initialization and TP>1 for LoRA Merge Script by @cuichenx :: PR: #8199
+- Add support in Neural Typecheck to disable semantic checks by @titu1994 :: PR: #8212
+- Pin lhotse=1.19.2 in r1.23.0 by @pzelasko :: PR: #8303
+- Multimodal r1.23.0 bug fix by @yaoyu-33 :: PR: #8315
+- MCore dataset compatibility for tokenizers by @vysarge :: PR: #8390
+- Update NFA video download link by @erastorgueva-nv :: PR: #8406
+- Update MM Dataprep Tutorial by @cuichenx :: PR: #8410
+- Fix dreambooth data sampler issue by @yaoyu-33 :: PR: #8400
+- Fix a bug in CTM line processing function for multi-speaker data simulations by @tango4j :: PR: #8416
+- Akoumparouli/mistral bugfix by @akoumpa :: PR: #8353
+- pin to 0.5.0 by @ericharper :: PR: #8465
+- Update NeMo Multimodal Requirements by @yaoyu-33 :: PR: #8515
+- Fix link in multimodal dataprep tutorial by @cuichenx :: PR: #8517
+
+
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 2f75014b86d1..d015796096fc 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -13,6 +13,7 @@ NVIDIA NeMo Framework is an end-to-end, cloud-native framework designed to build
- Activation Recomputation
- Positional Embeddings and Positional Interpolation
- Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) with `TensorRT Model Optimizer `_
+- Knowledge Distillation-based training with `TensorRT Model Optimizer `_
- Sequence Packing
`NVIDIA NeMo Framework `_ has separate collections for:
diff --git a/docs/source/nlp/distillation.rst b/docs/source/nlp/distillation.rst
new file mode 100644
index 000000000000..01e5d599cb8c
--- /dev/null
+++ b/docs/source/nlp/distillation.rst
@@ -0,0 +1,58 @@
+.. _megatron_distillation:
+
+Distillation
+==========================
+
+Knowledge Distillation (KD)
+--------------------------------
+
+KD involves using information from an existing trained model to train a second (usually smaller, faster) model, thereby "distilling" knowledge from one to the other.
+
+Distillation has two primary benefits: faster convergence and higher end accuracy than traditional training.
+
+In NeMo, distillation is enabled by the `NVIDIA TensorRT Model Optimizer (ModelOpt) `_ library, a library for optimizing deep-learning models for inference on GPUs.
+
+The logits-distillation process consists of the following steps:
+
+1. Loading both the student and teacher model checkpoints (both must use the same parallelism strategy, if any).
+2. Training until convergence: forward passes are run on both models (the backward pass only on the student), with a distillation loss computed between the two models' logits (a minimal sketch of this loss follows this list).
+3. Saving the final student model.
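+
+The snippet below is a simplified, standalone sketch of the per-token logit loss used in step 2. It mirrors the default (non-reverse) branch of the ``LogitsKLLoss`` criterion added in ``megatron_gpt_distillation.py``, omitting temperature scaling and the tensor-parallel reductions; the helper name is illustrative only.
+
+.. code-block:: python
+
+    import torch
+    import torch.nn.functional as F
+
+    def logits_distillation_loss(student_logits: torch.Tensor, teacher_logits: torch.Tensor) -> torch.Tensor:
+        # Both inputs have shape [seq, batch, vocab]; the result has shape [batch, seq]
+        # so it can be masked and reduced like the regular LM loss.
+        loss = torch.sum(
+            F.kl_div(
+                F.log_softmax(student_logits.float(), dim=-1),
+                F.softmax(teacher_logits.float(), dim=-1),
+                reduction="none",
+            ),
+            dim=-1,  # sum the pointwise KL terms over the vocabulary dimension
+        )
+        return loss.transpose(0, 1).contiguous()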
+
+
+Example
+^^^^^^^
+The example below shows how to run the distillation script for Llama models.
+
+The script must be launched with the number of processes equal to the tensor-parallel size. This is achieved with the ``torchrun`` command below:
+
+.. code-block:: bash
+
+STUDENT_CKPT="path/to/student.nemo" # can also be None (will use default architecture found in examples/nlp/language_modeling/conf/megatron_llama_distill.yaml)
+TEACHER_CKPT="path/to/teacher.nemo"
+TOKENIZER="path/to/tokenizer.model"
+DATA_PATHS="[1.0,path/to/documents]"
+FINAL_SAVE_FILE="final_checkpoint.nemo"
+TP=4
+
+NPROC=$TP
+launch_config="torchrun --nproc_per_node=$NPROC"
+
+${launch_config} examples/nlp/language_modeling/megatron_gpt_distillation.py \
+ model.restore_from_path=$STUDENT_CKPT \
+ model.kd_teacher_restore_from_path=$TEACHER_CKPT \
+ model.tensor_model_parallel_size=${TP} \
+ model.tokenizer.model=$TOKENIZER \
+ model.data.data_prefix=$DATA_PATHS \
+ model.nemo_path=$FINAL_SAVE_FILE \
+ trainer.precision=bf16 \
+ trainer.devices=$NPROC
+
+For large models, the command can be used in a multi-node setting, for example with the `NeMo Framework Launcher `_ using Slurm; a plain ``torchrun`` sketch for two nodes follows.
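+
+The exact launch mechanics depend on your scheduler. The sketch below is illustrative only: the node and GPU counts, the tensor-parallel size, ``NODE_RANK``, ``MASTER_ADDR``, and the port are placeholders to adapt to your cluster, and the remaining variables are the ones defined in the single-node example above.
+
+.. code-block:: bash
+
+    # 2 nodes x 4 GPUs each = 8 processes in total; TP=4 within each node, data parallelism across nodes.
+    # Run once per node (e.g. via srun), with NODE_RANK set to 0 or 1 on the respective node.
+    torchrun --nnodes=2 --nproc_per_node=4 --node_rank=$NODE_RANK \
+        --master_addr=$MASTER_ADDR --master_port=29500 \
+        examples/nlp/language_modeling/megatron_gpt_distillation.py \
+        model.restore_from_path=$STUDENT_CKPT \
+        model.kd_teacher_restore_from_path=$TEACHER_CKPT \
+        model.tensor_model_parallel_size=4 \
+        model.tokenizer.model=$TOKENIZER \
+        model.data.data_prefix=$DATA_PATHS \
+        model.nemo_path=$FINAL_SAVE_FILE \
+        trainer.precision=bf16 \
+        trainer.num_nodes=2 \
+        trainer.devices=4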
+
+
+Limitations
+^^^^^^^^^^^
+* Only Megatron Core-based GPT models are supported
+* Only logit-pair distillation is supported for now
+* Pipeline parallelism not yet supported
+* FSDP strategy not yet supported
diff --git a/docs/source/starthere/intro.rst b/docs/source/starthere/intro.rst
index 6060726d5ba8..435e3e24350e 100644
--- a/docs/source/starthere/intro.rst
+++ b/docs/source/starthere/intro.rst
@@ -102,7 +102,7 @@ This final step involves installing the TensorRT Model Optimizer package.
.. code-block:: bash
- pip install nvidia-modelopt[torch]~=0.13.0 --extra-index-url https://pypi.nvidia.com
+ pip install nvidia-modelopt[torch]~=0.15.0 --extra-index-url https://pypi.nvidia.com
.. code-block:: bash
diff --git a/examples/asr/speech_to_text_eval.py b/examples/asr/speech_to_text_eval.py
index 7b59ffe3fbfc..0024783d2362 100644
--- a/examples/asr/speech_to_text_eval.py
+++ b/examples/asr/speech_to_text_eval.py
@@ -64,7 +64,7 @@
import json
import os
-from dataclasses import dataclass, is_dataclass
+from dataclasses import dataclass, field, is_dataclass
from typing import Optional
import torch
@@ -99,8 +99,13 @@ class EvaluationConfig(transcribe_speech.TranscriptionConfig):
only_score_manifest: bool = False
scores_per_sample: bool = False
- text_processing: Optional[TextProcessingConfig] = TextProcessingConfig(
- punctuation_marks=".,?", separate_punctuation=False, do_lowercase=False, rm_punctuation=False,
+ text_processing: Optional[TextProcessingConfig] = field(
+ default_factory=lambda: TextProcessingConfig(
+ punctuation_marks=".,?",
+ separate_punctuation=False,
+ do_lowercase=False,
+ rm_punctuation=False,
+ )
)
diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py
index d54ee34c18cd..7f5bc0659150 100644
--- a/examples/asr/transcribe_speech.py
+++ b/examples/asr/transcribe_speech.py
@@ -116,7 +116,7 @@
class ModelChangeConfig:
# Sub-config for changes specific to the Conformer Encoder
- conformer: ConformerChangeConfig = ConformerChangeConfig()
+ conformer: ConformerChangeConfig = field(default_factory=ConformerChangeConfig)
@dataclass
@@ -164,14 +164,14 @@ class TranscriptionConfig:
overwrite_transcripts: bool = True
# Decoding strategy for CTC models
- ctc_decoding: CTCDecodingConfig = CTCDecodingConfig()
+ ctc_decoding: CTCDecodingConfig = field(default_factory=CTCDecodingConfig)
# Decoding strategy for RNNT models
# enable CUDA graphs for transcription
- rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1)
+ rnnt_decoding: RNNTDecodingConfig = field(default_factory=lambda: RNNTDecodingConfig(fused_batch_size=-1))
# Decoding strategy for AED models
- multitask_decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig()
+ multitask_decoding: MultiTaskDecodingConfig = field(default_factory=MultiTaskDecodingConfig)
# Prompt slots for prompted models, e.g. Canary-1B. Examples of acceptable prompt inputs:
# Implicit single-turn assuming default role='user' (works with Canary-1B)
# +prompt.source_lang=en +prompt.target_lang=es +prompt.task=asr +prompt.pnc=yes
@@ -187,7 +187,7 @@ class TranscriptionConfig:
att_context_size: Optional[list] = None
# Use this for model-specific changes before transcription
- model_change: ModelChangeConfig = ModelChangeConfig()
+ model_change: ModelChangeConfig = field(default_factory=ModelChangeConfig)
# Config for word / character error rate calculation
calculate_wer: bool = True
diff --git a/examples/llm/megatron_gpt_pretraining.py b/examples/llm/megatron_gpt_pretraining.py
index d3d049e4296e..73e96a23bf81 100644
--- a/examples/llm/megatron_gpt_pretraining.py
+++ b/examples/llm/megatron_gpt_pretraining.py
@@ -92,7 +92,7 @@ def get_args():
callbacks=callbacks,
log_every_n_steps=1,
limit_val_batches=2,
- plugins=nl.MegatronMixedPrecision(precision="bf16-mixed", amp_O2=False),
+ plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
)
nemo_logger = NeMoLogger(
diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index 85609c2dd9b0..95cb1dcf48ec 100755
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -117,6 +117,7 @@ model:
batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595.
num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
+ scale_positional_embedding: False # Apply scaling for RoPE frequencies
## Reset learning rate schedule.
# 1. reset_lr=True, reset_lr_steps=False. When pre-training an existing checkpoint "from scratch" on a different dataset.
diff --git a/examples/nlp/language_modeling/conf/megatron_llama_distill.yaml b/examples/nlp/language_modeling/conf/megatron_llama_distill.yaml
new file mode 100644
index 000000000000..052de2ff7d48
--- /dev/null
+++ b/examples/nlp/language_modeling/conf/megatron_llama_distill.yaml
@@ -0,0 +1,222 @@
+name: megatron_llama_distill
+
+trainer:
+ devices: 1
+ num_nodes: 1
+ accelerator: gpu
+ precision: 32
+ logger: False # logger provided by exp_manager
+ enable_checkpointing: False
+ use_distributed_sampler: False
+ max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
+ max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+ log_every_n_steps: 10
+ val_check_interval: 100
+ limit_val_batches: 50
+ limit_test_batches: 500
+ accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
+ gradient_clip_val: 1.0
+ benchmark: False
+ enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually
+
+exp_manager:
+ explicit_log_dir: null
+ exp_dir: null
+ name: ${name}
+ create_wandb_logger: False
+ wandb_logger_kwargs:
+ project: null
+ name: null
+ resume_if_exists: True
+ resume_ignore_no_checkpoint: True
+ create_checkpoint_callback: True
+ checkpoint_callback_params:
+ monitor: val_loss
+ save_top_k: 10
+ mode: min
+ always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
+ save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
+ filename: "megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}"
+ model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
+
+model:
+ restore_from_path: null # overwrite if starting from pretrained
+ restore_from_ckpt: null # overwrite if resuming training
+ kd_teacher_restore_from_path: null # overwrite always! (path to teacher weights)
+ nemo_path: null # overwrite always! (path to save final model)
+
+ mcore_gpt: True
+ # specify micro_batch_size, global_batch_size, and model parallelism
+ # gradient accumulation will be done automatically based on data_parallel_size
+ micro_batch_size: 1 # limited by GPU memory
+ global_batch_size: 8 # will use more micro batches to reach global batch size
+ tensor_model_parallel_size: 1 # intra-layer model parallelism
+ pipeline_model_parallel_size: 1 # inter-layer model parallelism
+ virtual_pipeline_model_parallel_size: null # interleaved pipeline
+
+ # model architecture
+ encoder_seq_length: 4096
+ max_position_embeddings: ${.encoder_seq_length}
+ num_layers: 8 # 7b: 32 | 13b: 40 | 70b: 80
+ hidden_size: 4096 # 7b: 4096 | 13b: 5120 | 70b: 8192
+ ffn_hidden_size: 11008 # Transformer FFN hidden size. Usually 4 * hidden_size. | 7b: 11008 | 13b: 13824 | 70b: 28672
+ num_attention_heads: 32 # 7b: 32 | 13b: 40 | 70b: 64
+ init_method_std: 0.02 # Standard deviation of the zero-mean normal distribution used for weight initialization.
+ use_scaled_init_method: True # use scaled residuals initialization
+ hidden_dropout: 0.0 # Dropout probability for hidden state transformer.
+ attention_dropout: 0.0 # Dropout probability for attention
+ ffn_dropout: 0.0 # Dropout probability in the feed-forward layer.
+ kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
+ apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
+ normalization: "rmsnorm" # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
+ layernorm_epsilon: 1e-5
+ do_layer_norm_weight_decay: False # True means weight decay on all params
+ make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
+ pre_process: True # add embedding
+ post_process: True # add pooler
+ persist_layer_norm: True # Use of persistent fused layer norm kernel.
+ bias: False # Whether to use bias terms in all weight matrices.
+ activation: "fast-swiglu" # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu']
+ headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
+ transformer_block_type: "pre_ln" # Options ['pre_ln', 'post_ln', 'normformer']
+ openai_gelu: False # Use OpenAI's GELU instead of the default GeLU
+ normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
+ position_embedding_type: "rope" # Position embedding type. Options ['learned_absolute', 'rope']
+ rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this.
+ attention_type: "multihead" # Attention type. Options ['multihead']
+ share_embeddings_and_output_weights: False # Share embedding and output layer weights.
+ overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+ batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+ num_query_groups: 32 # Number of query groups for group query attention. If None, normal attention is used. | 7b: 32 | 13b: 40 | 70b: 8
+
+ tokenizer:
+ library: "sentencepiece"
+ type: null
+ model: ??? # /path/to/tokenizer.model
+ vocab_file: null
+ merge_file: null
+ delimiter: null # only used for tabular tokenizer
+ sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers.
+
+ # Mixed precision
+ native_amp_init_scale: 4294967296 # 2 ** 32
+ native_amp_growth_interval: 1000
+ hysteresis: 2 # Gradient scale hysteresis
+ fp32_residual_connection: False # Move residual connections to fp32
+ fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16
+
+ # Megatron O2-style half-precision
+ megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
+ grad_allreduce_chunk_size_mb: 125
+
+ # Fusion
+ grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.
+ gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2.
+ bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
+ bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
+ masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
+ get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+ apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope
+
+ # Miscellaneous
+ seed: 1234
+ resume_from_checkpoint: null # manually set the checkpoint file to load from
+ use_cpu_initialization: False # Init weights on the CPU (slow for large models)
+ onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
+ apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
+ gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
+ sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages
+
+ ## Activation Checkpointing
+ # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed.
+ # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+).
+ # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+ # 'full' will checkpoint the entire transformer layer.
+ activations_checkpoint_granularity: null # 'selective' or 'full'
+ activations_checkpoint_method: null # 'uniform', 'block'
+ # 'uniform' divides the total number of transformer layers and checkpoints the input activation
+ # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model.
+ # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+ activations_checkpoint_num_layers: null
+ # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory.
+ # when using 'block' this will checkpoint the first activations_checkpoint_num_layers per pipeline stage.
+ num_micro_batches_with_partial_activation_checkpoints: null
+ # This feature is valid only when used with pipeline-model-parallelism.
+ # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed
+ # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is
+ # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint
+ # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'.
+ # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage.
+ activations_checkpoint_layers_per_pipeline: null
+ # This feature is valid only when used with pipeline-model-parallelism.
+ # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later
+ # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 checkpoint 3 layers fewer than
+ # stage 0 and stage 2 checkpoint 6 layers fewer than stage 0, and so on. This is possible because a later pipeline stage
+ # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints',
+ # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path.
+
+ ## Sequence Parallelism
+ # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
+ # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+ sequence_parallel: False
+
+ ## Transformer Engine
+ transformer_engine: True
+ fp8: False # enables fp8 in TransformerLayer forward
+ fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
+ fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID
+ fp8_margin: 0 # scaling margin
+ fp8_interval: 1 # scaling update interval
+ fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
+ fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
+ reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration
+ use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False.
+
+ data:
+ # Path to data must be specified by the user.
+ # Supports List, String and Dictionary
+ # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]",
+ # Or see example below:
+ # data_prefix:
+ # - .5
+ # - /raid/data/pile/my-gpt3_00_text_document
+ # - .5
+ # - /raid/data/pile/my-gpt3_01_text_document
+ # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test}
+ # Or see example below:
+ # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}"
+ data_prefix: ???
+ index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
+ data_impl: mmap
+ splits_string: 98,1,1
+ seq_length: 4096
+ skip_warmup: True
+ num_workers: 2
+ dataloader_type: single # cyclic
+ reset_position_ids: False # Reset position ids after end-of-document token
+ reset_attention_mask: False # Reset attention mask after end-of-document token
+ eod_mask_loss: False # Mask loss for the end of document tokens
+ validation_drop_last: True # Set to False if the last partial validation batch is to be consumed
+ no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token
+ pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size
+ shuffle_documents: True # Set to False to disable document shuffling. The sample index will still be shuffled
+
+ # Nsys profiling options
+ nsys_profile:
+ enabled: False
+ start_step: 10 # Global batch to start profiling
+ end_step: 10 # Global batch to end profiling
+ ranks: [0] # Global rank IDs to profile
+ gen_shape: False # Generate model and kernel details including input shapes
+
+ optim:
+ name: fused_adam
+ lr: 1e-4
+ weight_decay: 0.1
+ betas:
+ - 0.9
+ - 0.95
+ sched:
+ name: CosineAnnealing
+ warmup_steps: 50
+ min_lr: 1e-5
diff --git a/examples/nlp/language_modeling/megatron_gpt_distillation.py b/examples/nlp/language_modeling/megatron_gpt_distillation.py
new file mode 100644
index 000000000000..b3ecdcfc5522
--- /dev/null
+++ b/examples/nlp/language_modeling/megatron_gpt_distillation.py
@@ -0,0 +1,576 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+from pathlib import Path
+from typing import Any, Dict
+
+import modelopt.torch.distill as mtd
+import modelopt.torch.opt as mto
+import torch.multiprocessing as mp
+from omegaconf import DictConfig, OmegaConf, open_dict
+from pytorch_lightning.trainer.trainer import Trainer
+
+try:
+ from megatron.core import parallel_state, tensor_parallel
+ from megatron.core.models.gpt import GPTModel as MCoreGPTModel
+ from megatron.core.transformer.module import Float16Module as MCoreFloat16Module
+except ImportError:
+ raise AssertionError("ModelOpt only supports Megatron-Core.")
+import types
+from abc import ABCMeta
+from importlib.metadata import version
+from typing import Tuple
+
+import torch
+import torch.nn.functional as F
+from megatron.core.dist_checkpointing.mapping import ShardedStateDict
+from megatron.core.parallel_state import get_tensor_model_parallel_group
+from megatron.core.transformer import TransformerConfig
+from pkg_resources import packaging
+from torch import Tensor
+from torch.nn.modules.loss import _Loss
+
+from nemo.collections.common.parts.utils import extend_instance
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import (
+ EmbeddingScalingMixin,
+ MegatronGPTModel,
+ get_specs,
+)
+from nemo.collections.nlp.modules.common.megatron.module import Float16Module
+from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group
+from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder
+from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+from nemo.utils.exp_manager import exp_manager
+from nemo.utils.model_utils import load_config, unwrap_model
+
+mp.set_start_method("spawn", force=True)
+
+# Model config fields which affect the structure of the model.
+# These will be the values taken from the teacher's config file,
+# while the rest remain the same as student's.
+MODEL_ARCHITECHTURE_KEYS = [
+ "encoder_seq_length",
+ "max_position_embeddings",
+ "num_layers",
+ "hidden_size",
+ "ffn_hidden_size",
+ "num_attention_heads",
+ "init_method_std",
+ "use_scaled_init_method",
+ "hidden_dropout",
+ "attention_dropout",
+ "ffn_dropout",
+ "kv_channels",
+ "apply_query_key_layer_scaling",
+ "normalization",
+ "layernorm_epsilon",
+ "do_layer_norm_weight_decay",
+ "make_vocab_size_divisible_by",
+ "pre_process",
+ "post_process",
+ "persist_layer_norm",
+ "bias",
+ "activation",
+ "headscale",
+ "transformer_block_type",
+ "openai_gelu",
+ "normalize_attention_scores",
+ "position_embedding_type",
+ "rotary_percentage",
+ "attention_type",
+ "share_embeddings_and_output_weights",
+ "overlap_p2p_comm",
+ "batch_p2p_comm",
+ "num_query_groups",
+ "seq_len_interpolation_factor",
+ "rotary_base",
+ "scale_positional_embedding",
+]
+
+
+class DistillationMegatronGPTModel(MegatronGPTModel):
+ """ModelOpt Distillation-enabled subclass of `MegatronGPTModel`."""
+
+ def __init__(self, cfg: DictConfig, trainer: Trainer):
+ """
+ Constructor.
+
+ Args:
+ cfg: Model configuration.
+ trainer: NeMo trainer instance.
+ """
+ logging.info("Distillation: Enabled.")
+ assert cfg.kd_teacher_restore_from_path is not None, "Path to teacher weights must be provided."
+ assert cfg.pipeline_model_parallel_size == 1, "Distillation mode does not yet support Pipeline Parallel."
+
+ super().__init__(cfg, trainer)
+
+ logging.info("\n\n************** Final model configuration ***********")
+ logging.info(f'\n{OmegaConf.to_yaml(cfg)}')
+
+ def model_provider_func(self, pre_process, post_process):
+ """Model depends on pipeline paralellism."""
+ if not self.mcore_gpt:
+ raise AssertionError("ModelOpt Distillation only supports MCore model edition.")
+
+ model = MCoreGPTModel(
+ config=self.transformer_config,
+ transformer_layer_spec=get_specs(
+ self.spec_name,
+ self.transformer_config,
+ self.transformer_engine,
+ self.cfg.get('hyena', None),
+ ),
+ vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size),
+ max_sequence_length=self.cfg.get('encoder_seq_length', 512),
+ pre_process=pre_process,
+ post_process=post_process,
+ parallel_output=True,
+ share_embeddings_and_output_weights=self.cfg.get('share_embeddings_and_output_weights', True),
+ position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'),
+ rotary_percent=self.cfg.get('rotary_percentage', 1.0),
+ seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None),
+ rotary_base=self.cfg.get('rotary_base', 10000),
+ )
+ if self.cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage():
+ extend_instance(model.embedding, EmbeddingScalingMixin)
+
+ # [ModelOpt] Distillation mode.
+ distill_cfg = load_distillation_config(self.transformer_config)
+ # Initialize DistillationModel.
+ kd_config = {
+ "teacher_model": (_teacher_provider, [self.cfg, copy.deepcopy(self.trainer)], {}),
+ "criterion": distill_cfg["criterion"],
+ "loss_balancer": distill_cfg["loss_balancer"],
+ }
+ model = mtd.convert(model, mode=[("kd_loss", kd_config)])
+
+ # Additional tweaks needed for MCore/Nemo.
+ adjust_distillation_model_for_mcore(model, distill_cfg)
+
+ return model
+
+ def get_forward_output_and_loss_func(self, validation_step=False, tuning=False):
+ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None):
+
+ # Get data batch
+ batch = self.get_batch(dataloader_iter, tuning)
+
+ # Transfer needed data to GPU
+ required_keys = set()
+ max_seqlen = batch['max_seqlen'].squeeze() if 'max_seqlen' in batch else None
+ cu_seqlens_argmin = batch['cu_seqlens_argmin'] if 'cu_seqlens_argmin' in batch else None
+ if parallel_state.get_pipeline_model_parallel_world_size() == 1:
+ required_keys.update(batch.keys())
+ else:
+ required_keys.add('attention_mask')
+ if 'cu_seqlens' in batch:
+ required_keys.add('cu_seqlens')
+ if parallel_state.is_pipeline_first_stage():
+ required_keys.update(('tokens', 'position_ids'))
+ if parallel_state.is_pipeline_last_stage():
+ required_keys.update(('labels', 'loss_mask'))
+ if self.get_attention_mask_from_fusion and 'attention_mask' in required_keys:
+ required_keys.remove('attention_mask')
+ batch = {
+ key: val.cuda(non_blocking=True) if key in required_keys and isinstance(val, torch.Tensor) else None
+ for key, val in batch.items()
+ }
+
+ # slice batch along sequence dimension for context parallelism
+ batch = self.get_batch_on_this_context_parallel_rank(batch)
+
+ # Model forward pass
+ forward_args = {
+ 'input_ids': batch['tokens'],
+ 'position_ids': batch['position_ids'],
+ 'attention_mask': None if self.get_attention_mask_from_fusion else batch['attention_mask'],
+ 'labels': batch['labels'] if 'labels' in batch else None,
+ 'loss_mask': batch['loss_mask'],
+ }
+
+ # TODO: @eharper can we add this to mcore?
+ forward_args.pop('loss_mask')
+
+ if 'cu_seqlens' in batch: # packed sequence from GPTSFTPackedDataset
+ # these args are passed eventually into TEDotProductAttention.forward()
+ cu_seqlens = batch['cu_seqlens'].squeeze() # remove batch size dimension (mbs=1)
+ # remove -1 "paddings" added in collate_fn
+ if cu_seqlens_argmin is not None:
+ cu_seqlens = cu_seqlens[: cu_seqlens_argmin.item()]
+ else:
+ cu_seqlens = cu_seqlens[: torch.argmin(cu_seqlens)]
+
+ try:
+ from megatron.core.packed_seq_params import PackedSeqParams
+ except (ImportError, ModuleNotFoundError) as e:
+ mcore_version = packaging.version.Version(version('megatron-core'))
+ logging.error(
+ f"megatron-core v{mcore_version} does not support training with packed sequence. "
+ "Please use megatron-core >= 0.5.0, or set model.data.train_ds.packed_sequence=False"
+ )
+ raise e
+
+ forward_args['packed_seq_params'] = PackedSeqParams(
+ cu_seqlens_q=cu_seqlens,
+ cu_seqlens_kv=cu_seqlens,
+ max_seqlen_q=max_seqlen,
+ max_seqlen_kv=max_seqlen,
+ qkv_format='thd',
+ )
+
+ output_tensor = model(**forward_args)
+
+ def loss_func(output_tensor):
+ if validation_step:
+ loss_for_ub = self.loss_func(
+ batch['loss_mask'], batch['num_valid_tokens_in_ub'], output_tensor, validation_step=True
+ )
+ else:
+ # [ModelOpt] KD Loss for a micro-batch (ub)
+ unwrapped_model = unwrap_model(model, (Float16Module, MCoreFloat16Module))
+ loss_for_ub = unwrapped_model.compute_kd_loss(
+ loss_reduction_fn=lambda x: self.loss_func(
+ batch['loss_mask'], batch['num_valid_tokens_in_ub'], x
+ )
+ )
+ cp_size = parallel_state.get_context_parallel_world_size()
+ if validation_step and not self.validation_drop_last:
+ num_valid_tokens_in_ub = batch['num_valid_tokens_in_ub']
+ if loss_for_ub.isnan():
+ assert batch['loss_mask'].count_nonzero() == 0, 'Got NaN loss with non-empty input'
+ loss_sum_for_ub = torch.zeros_like(loss_for_ub)
+ num_valid_tokens_in_ub = 0
+ else:
+ if self.sample_weight == 'constant':
+ num_valid_tokens_in_ub = 1
+ loss_sum_for_ub = num_valid_tokens_in_ub * loss_for_ub
+
+ loss_sum_and_ub_size_all_gpu = torch.cat(
+ [
+ loss_sum_for_ub.clone().detach().view(1),
+ torch.tensor([num_valid_tokens_in_ub]).cuda().clone().detach(),
+ ]
+ )
+ # Could potentially reduce num_valid_samples_in_microbatch and use that to aggregate instead of len(self._validation_ds)
+ torch.distributed.all_reduce(
+ loss_sum_and_ub_size_all_gpu, group=parallel_state.get_data_parallel_group()
+ )
+ return loss_for_ub * cp_size, {'loss_sum_and_ub_size': loss_sum_and_ub_size_all_gpu}
+ else:
+ reduced_loss = average_losses_across_data_parallel_group([loss_for_ub])
+ return loss_for_ub * cp_size, {'avg': reduced_loss}
+
+ return output_tensor, loss_func
+
+ return fwd_output_and_loss_func
+
+ def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor, validation_step=False):
+ loss = super().loss_func(loss_mask, num_valid_tokens_in_ub, output_tensor)
+ if not validation_step and self.cfg.tensor_model_parallel_size > 1:
+ # [ModelOpt] KD loss requires extra all-reduce to ensure same values across MP-TP partitions.
+ loss = torch.sum(tensor_parallel.gather_from_tensor_model_parallel_region(loss.reshape(1)))
+ return loss
+
+ def configure_optimizers(self):
+ with self.model.hide_teacher_model():
+ return super().configure_optimizers()
+
+
+########################################################
+
+
+def load_distillation_config(student_cfg: TransformerConfig) -> Dict[str, Any]:
+ """Create a default distillation config for MCore GPT Models.
+
+ Args:
+ student_cfg: Model config for student model.
+ """
+ logit_pair = ("output_layer", "output_layer") # logit module names for MCoreGPTModel
+ tp_enabled = student_cfg.tensor_model_parallel_size > 1
+
+ cfg = {
+ "criterion": {tuple(logit_pair): LogitsKLLoss(tensor_parallel=tp_enabled)},
+ "loss_balancer": None,
+ "skip_lm_loss": True,
+ }
+ return cfg
+
+
+class BaseLoss(_Loss, metaclass=ABCMeta):
+ """Abstract base class for Megatron distillation losses."""
+
+ def __init__(self, tensor_parallel: bool = False):
+ """Constructor.
+
+ Args:
+ tensor_parallel: Whether tensor parallelism is enabled or not.
+ """
+ super().__init__()
+ self._tensor_parallel = tensor_parallel
+
+ def pre_forward(self, predictions: Tensor, targets: Tensor) -> Tuple[Tensor, Tensor]:
+ """Performs projection of student tensor to match teacher's size if necessary."""
+ if isinstance(predictions, tuple):
+ # `ColumnParallelLinear` returns bias too
+ predictions, targets = predictions[0], targets[0]
+
+ targets = targets.detach()
+
+ return predictions, targets
+
+ def post_forward(self, loss: Tensor) -> Tensor:
+ """Reshapes tensor from [s, b] to [b, s] for upcoming loss masking."""
+ loss = loss.transpose(0, 1).contiguous()
+ return loss
+
+
+class LogitsKLLoss(BaseLoss):
+ """Calculates KL-Divergence loss between two logits tensors without reducing the sequence dim."""
+
+ def __init__(
+ self,
+ tensor_parallel: bool = False,
+ temperature: float = 1.0,
+ reverse: bool = False,
+ ):
+ """
+ Constructor.
+
+ Args:
+ tensor_parallel: Whether tensor parallelism is enabled or not.
+ temperature: Divide tensors by this value prior to calculating loss.
+ reverse: Whether to reverse the loss as KLD(teacher, student) instead of KLD(student, teacher)
+ """
+ super().__init__(tensor_parallel)
+ self._temperature = temperature
+ self._reverse = reverse
+
+ def forward(self, predictions: Tensor, targets: Tensor) -> Tensor:
+ """
+ Forward function.
+
+ Args:
+ predictions: Student model tensors (size [s, b, h])
+ targets: Teacher model tensors (size [s, b, h])
+
+ Returns:
+ KLD loss of tensors (size [b, s])
+ """
+ predictions, targets = self.pre_forward(predictions, targets)
+
+ # Division by temp should happen prior to finding max for both student and teacher.
+ # Currently we don't use temperature in any of our runs (temp=1.0)
+ output_teacher = targets.float() / self._temperature
+ output_student = predictions.float() / self._temperature
+
+ # Compute the local softmax, then reweight it to obtain the global softmax.
+ if self._tensor_parallel:
+
+ # Maximum value along vocab dimension across all GPUs.
+ teacher_logits_max, _ = torch.max(output_teacher, dim=-1)
+ torch.distributed.all_reduce(
+ teacher_logits_max,
+ op=torch.distributed.ReduceOp.MAX,
+ group=get_tensor_model_parallel_group(),
+ )
+ output_teacher = output_teacher - teacher_logits_max.unsqueeze(dim=-1)
+
+ denom_teacher = torch.sum(torch.exp(output_teacher), dim=-1)
+ # We can't use `gather_from_tensor_model_parallel_region` here since it discards
+ # gradients from other ranks - we need to all_reduce the gradients as well.
+ denom_teacher = all_reduce_autograd(denom_teacher, group=get_tensor_model_parallel_group())
+
+ # Maximum value along vocab dimension across all GPUs.
+ student_logits_max, _ = torch.max(output_student, dim=-1)
+ torch.distributed.all_reduce(
+ student_logits_max,
+ op=torch.distributed.ReduceOp.MAX,
+ group=get_tensor_model_parallel_group(),
+ )
+ output_student = output_student - student_logits_max.unsqueeze(dim=-1).detach()
+
+ denom_student = torch.sum(torch.exp(output_student), dim=-1)
+ denom_student = all_reduce_autograd(denom_student, group=get_tensor_model_parallel_group())
+
+ slen, bsz, sharded_vocab_size = output_student.shape
+ student_log_prob = output_student - torch.log(denom_student).view(slen, bsz, 1).expand(
+ slen, bsz, sharded_vocab_size
+ )
+ teacher_log_prob = output_teacher - torch.log(denom_teacher).view(slen, bsz, 1).expand(
+ slen, bsz, sharded_vocab_size
+ )
+
+ if self._reverse:
+ loss = torch.sum(
+ F.kl_div(teacher_log_prob, student_log_prob, reduction="none", log_target=True),
+ dim=-1,
+ )
+ else:
+ loss = torch.sum(
+ F.kl_div(student_log_prob, teacher_log_prob, reduction="none", log_target=True),
+ dim=-1,
+ )
+
+ else:
+ if self._reverse:
+ loss = torch.sum(
+ F.kl_div(
+ F.log_softmax(output_teacher, dim=-1), F.softmax(output_student, dim=-1), reduction="none"
+ ),
+ dim=-1,
+ )
+ else:
+ loss = torch.sum(
+ F.kl_div(
+ F.log_softmax(output_student, dim=-1), F.softmax(output_teacher, dim=-1), reduction="none"
+ ),
+ dim=-1,
+ )
+
+ return self.post_forward(loss)
+
+
+class _AllReduce(torch.autograd.Function):
+ """Implementation from old PyTorch `torch.distributed.nn.parallel`."""
+
+ @staticmethod
+ def forward(ctx, op, group, tensor):
+ ctx.group, ctx.op = group, op
+ tensor = tensor.clone()
+ torch.distributed.all_reduce(tensor, op=op, group=group)
+ return tensor
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ return (None, None, _AllReduce.apply(ctx.op, ctx.group, grad_output))
+
+
+def all_reduce_autograd(tensor, op=torch.distributed.ReduceOp.SUM, group=torch.distributed.group.WORLD):
+ return _AllReduce.apply(op, group, tensor)
+
+
+def adjust_distillation_model_for_mcore(model: mtd.DistillationModel, distill_cfg: Dict[str, Any]):
+ """Extra modifcations to ``mtd.DistillationModel`` requried for Megatron-Core."""
+
+ # HACK: Get rid of ModelOpt Distillation state
+ mto.ModeloptStateManager(model)._state.pop()
+
+ # HACK: Hide teacher during `sharded_state_dict` method.
+ def _sharded_state_dict(self, *args, **kwargs) -> ShardedStateDict:
+ with self.hide_teacher_model():
+ return self._sharded_state_dict(*args, **kwargs)
+
+ model._sharded_state_dict = model.sharded_state_dict
+ model.sharded_state_dict = types.MethodType(_sharded_state_dict, model)
+
+ # HACK: Skip `lm_loss` computation during training when it is not needed for backprop.
+ def _compute_language_model_loss(self, labels, logits) -> Tensor:
+ if self.training:
+ return torch.zeros_like(labels)
+ return self._compute_language_model_loss(labels, logits)
+
+ if distill_cfg["skip_lm_loss"]:
+ model._compute_language_model_loss = model.compute_language_model_loss
+ model.compute_language_model_loss = types.MethodType(_compute_language_model_loss, model)
+
+
+########################################################
+
+
+def _teacher_provider(cfg: DictConfig, trainer: Trainer) -> MCoreGPTModel:
+ """Teacher model factory (must be a non-local function to pickle)."""
+ logging.info("Distillation: Loading teacher weights...")
+ teacher_model_cfg = _merge_model_arch_fields(cfg, cfg.kd_teacher_restore_from_path)
+
+ model = MegatronGPTModel.restore_from(
+ cfg.kd_teacher_restore_from_path,
+ override_config_path=teacher_model_cfg,
+ trainer=trainer,
+ )
+ teacher_model_module_list = model.get_model_module_list()
+ logging.info("Distillation: ... teacher weights loaded.")
+ return teacher_model_module_list[0]
+
+
+def _merge_model_arch_fields(cfg: DictConfig, model_load_path: str) -> DictConfig:
+ """Overwrite model-architecture fields of a config with a checkpoint's."""
+ model_cfg = load_config(model_load_path)
+ model_arch_keys = [k for k in MODEL_ARCHITECHTURE_KEYS if k in model_cfg]
+ model_arch_cfg = OmegaConf.masked_copy(model_cfg, model_arch_keys)
+ with open_dict(cfg):
+ cfg = OmegaConf.merge(cfg, model_arch_cfg)
+ # Add tokenizer from model if not provided
+ if OmegaConf.is_missing(cfg.tokenizer, "model"):
+ cfg.tokenizer = model_cfg.tokenizer
+ return cfg
+
+
+########################################################
+
+
+@hydra_runner(config_path="conf", config_name="megatron_llama_distill")
+def main(cfg) -> None:
+ logging.info("\n\n************** Experiment configuration ***********")
+ logging.info(f'\n{OmegaConf.to_yaml(cfg)}')
+
+ trainer = MegatronTrainerBuilder(cfg).create_trainer()
+ exp_manager(trainer, cfg.exp_manager)
+
+ with open_dict(cfg):
+ cfg.model.name = "modelopt" # Convert TE layernorm spec to unfused format
+ # HACK: Checkpoint-loading process hangs/loops if this isn't present here for some reason.
+ cfg.model.target = "nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel"
+
+ # Continual training
+ if cfg.model.get("restore_from_path") is not None:
+ # Option 1: Restore only the model weights from a .nemo file
+ logging.info(f"Continual training: loading weights from {cfg.model.restore_from_path}")
+
+ # Merge model config's architecture fields with the one from the checkpoint
+ cfg.model = _merge_model_arch_fields(cfg.model, cfg.model.restore_from_path)
+
+ model = DistillationMegatronGPTModel.restore_from(
+ restore_path=cfg.model.restore_from_path,
+ override_config_path=cfg.model,
+ trainer=trainer,
+ save_restore_connector=NLPSaveRestoreConnector(),
+ )
+ logging.info("... weights loaded.")
+ elif cfg.model.get("restore_from_ckpt") is not None:
+ # Option 2: Restore both model weights and optimizer states from a PTL checkpoint
+ logging.info(f"Continual training: loading weights and optimizer states from {cfg.model.restore_from_ckpt}")
+ trainer.ckpt_path = Path(cfg.model.restore_from_ckpt)
+ model = DistillationMegatronGPTModel(cfg.model, trainer)
+ logging.info("... weights and optimizer states loaded.")
+
+ # Start new pretraining or resume from a checkpoint if it exists
+ else:
+ logging.info("Instantiating new model ...")
+ model = DistillationMegatronGPTModel(cfg.model, trainer)
+ logging.info("... model instantiated.")
+
+ trainer.fit(model)
+
+ if cfg.model.nemo_path:
+ model.save_to(cfg.model.nemo_path)
+ else:
+ logging.warning("Skipping saving final model as no `model.nemo_path` provided.")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/nemo/collections/asr/data/data_simulation.py b/nemo/collections/asr/data/data_simulation.py
index 5ee2ad19b951..369a95d9ee9a 100644
--- a/nemo/collections/asr/data/data_simulation.py
+++ b/nemo/collections/asr/data/data_simulation.py
@@ -1122,7 +1122,7 @@ def _generate_session(
alignments=self._alignments,
session_name=filename,
speaker_id=speaker_ids[speaker_turn],
- start=int(start / self._params.data_simulator.sr),
+ start=float(start / self._params.data_simulator.sr),
)
self.annotator.annote_lists['ctm'].extend(new_ctm_entries)
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index 55663a04a5a0..c4c88091cb28 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -30,7 +30,7 @@
from lhotse.dataset.dataloading import resolve_seed
from lhotse.lazy import LazyIteratorChain, LazyJsonlIterator
from lhotse.serialization import open_best
-from lhotse.utils import compute_num_samples
+from lhotse.utils import compute_num_samples, ifnone
from nemo.collections.common.parts.preprocessing.manifest import get_full_path
@@ -332,14 +332,16 @@ def __iter__(self) -> Generator[Cut, None, None]:
# Handle NeMo tarred manifests with offsets.
# They have multiple JSONL entries where audio paths end with '-sub1', '-sub2', etc. for each offset.
- offset_pattern = re.compile(r'^.+(-sub\d+)$')
+ offset_pattern = re.compile(r'^(?P<stem>.+)(?P<sub>-sub\d+)(?P<ext>\.\w+)?$')
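+ # e.g. both "audio-sub1.flac" and "audio-sub2.flac" map back to the same base name "audio.flac" via `basename` below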
for sid in shard_ids:
manifest_path = self.paths[sid] if len(self.paths) > 1 else self.paths[0]
def basename(d: dict) -> str:
return (
- k[: -len(m.group(1))] if (m := offset_pattern.match(k := d["audio_filepath"])) is not None else k
+ m.group("stem") + ifnone(m.group("ext"), "")
+ if (m := offset_pattern.match(k := d["audio_filepath"])) is not None
+ else k
)
shard_manifest: dict[str, list[dict]] = groupby(basename, self.shard_id_to_manifest[sid])
diff --git a/nemo/collections/common/parts/utils.py b/nemo/collections/common/parts/utils.py
index e8eb1b999292..75783815548a 100644
--- a/nemo/collections/common/parts/utils.py
+++ b/nemo/collections/common/parts/utils.py
@@ -12,10 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import logging
import math
import os
from typing import Iterable, List
+logger = logging.getLogger(__name__)
+
import einops
import torch
import torch.nn as nn
@@ -109,6 +112,31 @@ def extend_instance(obj, mixin):
) # mixin needs to go first for our forward() logic to work
+def apply_rope_scaling(freqs):
+ # Apply scaling for RoPE frequencies
+ logger.info("apply rope scaling ...")
+ # Values obtained from grid search
+ scale_factor = 8
+ low_freq_factor = 1
+ high_freq_factor = 4
+ old_context_len = 8192 # original llama3 length
+
+ low_freq_wavelen = old_context_len / low_freq_factor
+ high_freq_wavelen = old_context_len / high_freq_factor
+ new_freqs = []
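+ # Three regimes: keep high-frequency components unchanged, divide low-frequency components
+ # by `scale_factor`, and smoothly interpolate between the two in the transition band.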
+ for freq in freqs:
+ wavelen = 2 * math.pi / freq
+ if wavelen < high_freq_wavelen:
+ new_freqs.append(freq)
+ elif wavelen > low_freq_wavelen:
+ new_freqs.append(freq / scale_factor)
+ else:
+ assert low_freq_wavelen != high_freq_wavelen
+ smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+ new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
+ return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)
+
+
def mask_sequence_tensor(tensor: torch.Tensor, lengths: torch.Tensor):
"""
For tensors containing sequences, zero out out-of-bound elements given lengths of every element in the batch.
diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py
index 6622054c4f98..b2d9b5ba8cca 100644
--- a/nemo/collections/llm/gpt/data/pre_training.py
+++ b/nemo/collections/llm/gpt/data/pre_training.py
@@ -190,6 +190,7 @@ def test_dataloader(self) -> EVAL_DATALOADERS:
def _create_dataloader(self, dataset, mode, **kwargs) -> WrappedDataLoader:
self.init_global_step = self.trainer.global_step
+ self.data_sampler.init_global_step = self.init_global_step
dataloader = WrappedDataLoader(
mode=mode,
dataset=dataset,
diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py
index 913144d1bf5f..71b60d5df59f 100644
--- a/nemo/collections/llm/peft/lora.py
+++ b/nemo/collections/llm/peft/lora.py
@@ -17,12 +17,26 @@ class to provide a specific implementation of the forward method.
"""
def forward(self, x):
- linear_output, bias = self.to_wrap(x)
- if isinstance(linear_output, tuple) and len(linear_output) == 2:
- linear_output, layernorm_output = linear_output
- adapter_output = self.adapter(layernorm_output)
- else:
- adapter_output = self.adapter(x)
+ linear_output = self.to_wrap(x)
+ assert isinstance(
+ linear_output, tuple
+ ), f"{self.to_wrap} should return a tuple but instead returns {linear_output}"
+ """ Four cases for the wrapped module's return values
+ 1. nothing: (out, None)
+ 2. return_bias: (out, bias)
+ 3. return_layernorm_output: ((out, ln_out), None)
+ 4. both: (out, bias, ln_out)
+ """
+ if len(linear_output) == 2:
+ linear_output, bias = linear_output
+ if isinstance(linear_output, tuple) and len(linear_output) == 2:
+ linear_output, layernorm_output = linear_output
+ x = layernorm_output
+ elif len(linear_output) == 3:
+ linear_output, bias, layernorm_output = linear_output
+ x = layernorm_output
+
+ adapter_output = self.adapter(x)
return linear_output + adapter_output, bias
@@ -72,6 +86,8 @@ class LoRA(PEFT):
alpha: int = 32
dropout: float = 0.0
dropout_position: Literal['pre', 'post'] = 'post'
+ lora_A_init_method: str = "xavier"
+ lora_B_init_method: str = "zero"
def transform(self, m: nn.Module, name=None, prefix=None):
"""
@@ -96,6 +112,11 @@ def transform(self, m: nn.Module, name=None, prefix=None):
input_is_parallel = False
in_features = m.in_features
out_features = m.out_features * tp_size
+ # LoRA is applied after layernorm, so layernorm output must be returned
+ m.return_layernorm_output = True
+ # perf optimization for LoRA + SP
+ if m.config.sequence_parallel and not m.ub_overlap_ag:
+ m.return_layernorm_output_gathered = True
else: # name in ['linear_proj', 'linear_fc2']
# Row Parallel Linear
input_is_parallel = True
@@ -110,8 +131,8 @@ def transform(self, m: nn.Module, name=None, prefix=None):
activation='identity',
norm_position=None,
norm_type=None,
- column_init_method="normal",
- row_init_method="zero",
+ column_init_method=self.lora_A_init_method,
+ row_init_method=self.lora_B_init_method,
gather_output=False,
input_is_parallel=input_is_parallel,
dropout=self.dropout,
diff --git a/nemo/collections/llm/tokenizer.py b/nemo/collections/llm/tokenizer.py
index 3943e24ba799..808642758ebe 100644
--- a/nemo/collections/llm/tokenizer.py
+++ b/nemo/collections/llm/tokenizer.py
@@ -9,8 +9,8 @@
track_io(
AutoTokenizer,
artifacts=[
- FileArtifact("vocab_file"),
- FileArtifact("merges_file"),
+ FileArtifact("vocab_file", required=False),
+ FileArtifact("merges_file", required=False),
],
)
__all__.append("AutoTokenizer")
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 22ee37ec361b..0198c07c82d2 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -30,7 +30,7 @@
from pytorch_lightning.loops.fetchers import _DataFetcherWrapper
from pytorch_lightning.trainer.trainer import Trainer
-from nemo.collections.common.parts.utils import extend_instance
+from nemo.collections.common.parts.utils import apply_rope_scaling, extend_instance
from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import (
MegatronCorePretrainingSampler,
MegatronPretrainingRandomSampler,
@@ -77,6 +77,7 @@
from nemo.utils.te_utils import is_float8tensor
try:
+ import megatron.core as core
from megatron.core import InferenceParams, parallel_state, tensor_parallel
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset
@@ -429,6 +430,7 @@ def get_inference_config(self):
def model_provider_func(self, pre_process, post_process):
"""Model depends on pipeline paralellism."""
if self.mcore_gpt:
+
model = MCoreGPTModel(
config=self.transformer_config,
transformer_layer_spec=get_specs(
@@ -448,6 +450,10 @@ def model_provider_func(self, pre_process, post_process):
seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None),
rotary_base=self.cfg.get('rotary_base', 10000),
)
+
+ if self.cfg.get('scale_positional_embedding', False):
+ model.rotary_pos_emb.inv_freq = apply_rope_scaling(model.rotary_pos_emb.inv_freq)
+
if self.cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage():
extend_instance(model.embedding, EmbeddingScalingMixin)
else:
diff --git a/nemo/export/multimodal/build.py b/nemo/export/multimodal/build.py
index 03afec176325..8ee3fa1c05e7 100644
--- a/nemo/export/multimodal/build.py
+++ b/nemo/export/multimodal/build.py
@@ -208,7 +208,10 @@ def forward(self, images):
return vision_x
encoder = AutoModel.from_pretrained(
- vision_config["from_pretrained"], torch_dtype=torch.bfloat16, trust_remote_code=True
+ vision_config["from_pretrained"],
+ torch_dtype=torch.bfloat16,
+ trust_remote_code=True,
+ attn_implementation='eager',
)
vision_encoder = encoder.vision_model
hf_config = encoder.config
@@ -326,7 +329,10 @@ def forward(self, images):
return vision_x
encoder = AutoModel.from_pretrained(
- vision_config["from_pretrained"], torch_dtype=torch.bfloat16, trust_remote_code=True
+ vision_config["from_pretrained"],
+ torch_dtype=torch.bfloat16,
+ trust_remote_code=True,
+ attn_implementation='eager',
)
vision_encoder = encoder.vision_model
hf_config = encoder.config
diff --git a/nemo/lightning/io/artifact/base.py b/nemo/lightning/io/artifact/base.py
index 4025634ebe28..9119b2474b17 100644
--- a/nemo/lightning/io/artifact/base.py
+++ b/nemo/lightning/io/artifact/base.py
@@ -6,8 +6,9 @@
class Artifact(ABC, Generic[ValueT]):
- def __init__(self, attr: str):
+ def __init__(self, attr: str, required: bool = True):
self.attr = attr
+ self.required = required
@abstractmethod
def dump(self, value: ValueT, path: Path) -> ValueT:
diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py
index 1cd45648ea20..69368599682e 100644
--- a/nemo/lightning/io/connector.py
+++ b/nemo/lightning/io/connector.py
@@ -91,6 +91,14 @@ def __call__(self, output_path: Optional[Path] = None, overwrite: bool = False)
logging.error(f"An error occurred: {e}")
raise
+ finally:
+ # Delete the lock file if it exists
+ if lock_path.exists():
+ try:
+ os.remove(lock_path)
+ except OSError as e:
+ logging.warning(f"Failed to remove lock file {lock_path}: {e}")
+
return _output_path
def local_path(self, base_path: Optional[Path] = None) -> Path:
@@ -152,19 +160,26 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] =
return _trainer
- def nemo_save(self, output_path: Path, trainer: pl.Trainer) -> None:
+ def nemo_save(self, output_path: Path, trainer: pl.Trainer, dump_io: bool = True) -> None:
"""
Saves the model's state to the specified path using the trainer's current strategy.
Args:
output_path (Path): The path where the model checkpoint will be saved.
trainer (pl.Trainer): The trainer with the strategy to save the model.
+ dump_io (bool): If True, the IO configuration will be saved to the output path.
"""
trainer.strategy._setup_optimizers = False
trainer.strategy._init_model_parallel = False
trainer.strategy.setup(trainer)
trainer.save_checkpoint(output_path)
+ from nemo.lightning.io.pl import TrainerContext
+ from nemo.utils.get_rank import is_global_rank_zero
+
+ if is_global_rank_zero() and dump_io:
+ TrainerContext.from_trainer(trainer).io_dump(output_path)
+
def nemo_load(
self, path: Path, trainer: Optional[pl.Trainer] = None, cpu: bool = True
) -> Tuple[pl.LightningModule, pl.Trainer]:
diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py
index 741c1400474a..d0d4d0243ff7 100644
--- a/nemo/lightning/io/mixin.py
+++ b/nemo/lightning/io/mixin.py
@@ -432,13 +432,24 @@ def _io_init(self, **kwargs) -> fdl.Config[Self]:
-------
fdl.Config[Self]: The initialized configuration object.
"""
- return fdl.Config(type(self), **kwargs)
+ try:
+ return fdl.Config(type(self), **kwargs)
+ except Exception as e:
+ error_msg = (
+ f"Error creating fdl.Config for {type(self).__name__}: {str(e)}\n"
+ f"Arguments that caused the error: {kwargs}\n"
+ f"This may be due to unsupported argument types or nested configurations."
+ )
+ raise RuntimeError(error_msg) from e
def _io_wrap_init(cls):
"""Wraps the __init__ method of a class to add IO functionality."""
original_init = cls.__init__
+ if getattr(cls, "__wrapped_init__", False):
+ return cls
+
@functools.wraps(original_init)
def wrapped_init(self, *args, **kwargs):
if hasattr(self, "io_transform_args"):
@@ -453,6 +464,7 @@ def wrapped_init(self, *args, **kwargs):
original_init(self, *args, **kwargs)
cls.__init__ = wrapped_init
+ cls.__wrapped_init__ = True
return cls
@@ -502,6 +514,10 @@ def _io_path_elements_fn(x):
def _artifact_transform(cfg: fdl.Config, output_path: Path):
for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []):
current_val = getattr(cfg, artifact.attr)
+ if current_val is None:
+ if artifact.required:
+ raise ValueError(f"Artifact '{artifact.attr}' is required but not provided")
+ continue
new_val = artifact.dump(current_val, output_path)
setattr(cfg, artifact.attr, new_val)
diff --git a/nemo/lightning/pytorch/callbacks/model_transform.py b/nemo/lightning/pytorch/callbacks/model_transform.py
index 5d48851843fc..8a07566f92c3 100644
--- a/nemo/lightning/pytorch/callbacks/model_transform.py
+++ b/nemo/lightning/pytorch/callbacks/model_transform.py
@@ -71,6 +71,11 @@ def _maybe_apply_transform(self, trainer):
def apply_transform(self, trainer):
self.model_transform(trainer.model)
+ from pytorch_lightning.utilities import model_summary
+
+ logging.info(
+ f"After applying model_transform:\n" f"{model_summary.summarize(trainer.lightning_module, max_depth=1)}"
+ )
@property
def _needs_to_call(self) -> bool:
diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py
index 3909c680cfc1..08a56f854e4a 100644
--- a/nemo/lightning/pytorch/plugins/data_sampler.py
+++ b/nemo/lightning/pytorch/plugins/data_sampler.py
@@ -25,6 +25,7 @@ def __init__(
rampup_batch_size: Optional[List[int]] = None,
dataloader_type: Literal["single", "cyclic", "batch"] = "single",
init_consumed_samples: int = 0,
+ init_global_step: int = 0,
):
self.seq_len = seq_len
self.micro_batch_size = micro_batch_size
@@ -35,6 +36,7 @@ def __init__(
self.prev_consumed_samples = self.init_consumed_samples
self.if_first_step = 0
self.prev_global_batch_size = None
+ self.init_global_step = init_global_step
def setup(self, global_rank: int) -> None:
from nemo.lightning.data import setup_microbatch_calculator
@@ -92,9 +94,7 @@ def on_megatron_step_end(self, trainer: pl.Trainer, pl_module: pl.LightningModul
self.prev_global_batch_size = self.current_global_batch_size
- # TODO: Add consumed samples
- consumed_samples = self.compute_consumed_samples(trainer.global_step + 1 - self.init_consumed_samples)
-
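+ # Subtract init_global_step so only steps taken since this run started are counted.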
+ consumed_samples = self.compute_consumed_samples(trainer.global_step + 1 - self.init_global_step)
pl_module.log(
'consumed_samples',
consumed_samples,
diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py
index 5e43e09c0420..65b7c6292249 100644
--- a/nemo/lightning/pytorch/plugins/mixed_precision.py
+++ b/nemo/lightning/pytorch/plugins/mixed_precision.py
@@ -26,11 +26,17 @@
AnyT = TypeVar("AnyT")
+def get_optim_config(optimizer: Optimizer):
+ try:
+ return optimizer.mcore_optimizer.config
+ except AttributeError as e:
+ raise ValueError("Failed to extract optimizer config from optimizer.") from e
+
+
class MegatronMixedPrecision(MixedPrecision):
def __init__(
self,
precision: Literal["16-mixed", "bf16-mixed"],
- amp_O2: bool = False,
device="cuda",
) -> None:
if precision == "bf16-mixed":
@@ -39,21 +45,6 @@ def __init__(
scaler = GradScaler(init_scale=2**32, growth_interval=1000, hysteresis=2)
super().__init__(precision, device, scaler)
- self.amp_O2 = amp_O2
-
- def connect(
- self, model: Module, optimizers: List[Optimizer], lr_schedulers: List[Any]
- ) -> Tuple[Module, List[Optimizer], List[Any]]:
- """Connects this plugin to the accelerator and the training process."""
- from nemo.core.optim import MainParamsOptimizerWrapper
-
- if not optimizers or not self.amp_O2 or isinstance(optimizers[0], MainParamsOptimizerWrapper):
- return model, optimizers, lr_schedulers
-
- _optimizers = [*optimizers]
- _optimizers[0] = self.convert_optimizer(_optimizers[0])
-
- return model, _optimizers, lr_schedulers
def convert_module(self, module: Module) -> Module:
"""Convert the module parameters to the precision type this plugin handles.
@@ -68,11 +59,11 @@ def convert_module(self, module: Module) -> Module:
config = get_model_config(module.module)
config.fp16 = self.precision == "16-mixed"
config.bf16 = self.precision == "bf16-mixed"
- if isinstance(module.module, Float16Module):
- new_float16_module = Float16Module(config, module.module.module)
- module.module = new_float16_module
- else:
+ config.autocast = False
+ if hasattr(module, 'module'):
module.module = Float16Module(config, module.module)
+ else:
+ module = Float16Module(config, module)
return module
@@ -82,16 +73,10 @@ def convert_optimizer(self, optimizer: Optimizer) -> Optimizer:
This is optional and depends on the precision limitations during optimization.
"""
- from nemo.core.optim import MainParamsOptimizerWrapper
-
- if isinstance(optimizer, MainParamsOptimizerWrapper) or not self.amp_O2:
- return optimizer
-
- return MainParamsOptimizerWrapper(
- optimizer,
- fp32_grad_accum=True,
- contiguous_grad_bucket=True,
- )
+ optim_config = get_optim_config(optimizer)
+ assert optim_config.bf16 == (self.precision == "bf16-mixed"), "BF16 enabled on model but not on optimizer"
+ assert optim_config.fp16 == (self.precision == "16-mixed"), "FP16 enabled on model but not on optimizer"
+ return optimizer
def convert_input(self, data: AnyT) -> AnyT:
"""Convert model inputs (forward) to the floating point precision type of this plugin.
@@ -120,7 +105,7 @@ def optimizer_step(
) -> None:
from nemo.core.optim import MainParamsOptimizerWrapper
- if not self.amp_O2 and not isinstance(optimizer, MainParamsOptimizerWrapper):
+ if not isinstance(optimizer, MainParamsOptimizerWrapper):
return super().optimizer_step(optimizer, model, closure, **kwargs)
if self.scaler is None:
diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py
index 99eee9ad6bd3..f22ed5b40a20 100644
--- a/nemo/lightning/pytorch/strategies.py
+++ b/nemo/lightning/pytorch/strategies.py
@@ -4,7 +4,7 @@
import os
import shutil
from collections import OrderedDict
-from contextlib import ExitStack
+from contextlib import ExitStack, contextmanager
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Dict, List, Literal, Mapping, Optional, TypeVar, Union, cast
@@ -621,6 +621,8 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr
assert self.megatron_parallel is not None
_strategy_lib.load_model_state_dict(self.megatron_parallel, checkpoint, strict=strict)
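+ # Reload the optimizer's main parameters from the freshly loaded model weights.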
+ for opt in self.optimizers:
+ opt.reload_model_params()
@property
@override
@@ -731,6 +733,14 @@ def parallelism(self) -> ParallelismConfig:
pipeline_dtype=self.pipeline_dtype,
)
+ @contextmanager
+ @override
+ def tensor_init_context(self, empty_init: Optional[bool] = None):
+ # Materialization happens in `setup()`
+ # @akoumparouli: using Parent's tensor_init_context causes mcore
+ # parameters to be initialized on GPU instead of (assumed) CPU.
+ yield
+
def ckpt_to_dir(filepath: Union[str, Path]) -> Path:
"""PTL considers checkpoints as .ckpt files.
diff --git a/nemo/lightning/pytorch/trainer.py b/nemo/lightning/pytorch/trainer.py
index da1a77c3c731..81f4d12bd3fb 100644
--- a/nemo/lightning/pytorch/trainer.py
+++ b/nemo/lightning/pytorch/trainer.py
@@ -10,18 +10,24 @@
class Trainer(pl.Trainer, IOMixin):
+
+ def add_io(self, obj):
+ """Recurse to the leaves of a container and add io functionality to non-serializable leaves"""
+ if isinstance(obj, (dict, list)):
+ if isinstance(obj, dict):
+ obj = obj.values()
+ for item in obj:
+ self.add_io(item)
+ else:
+ if not serialization.find_node_traverser(type(obj)):
+ track_io(type(obj))
+ return
+
def io_init(self, **kwargs) -> fdl.Config[Self]:
# Each argument of the trainer can be stateful so we copy them
cfg_kwargs = {k: deepcopy(v) for k, v in kwargs.items()}
- for val in cfg_kwargs.values():
- if not serialization.find_node_traverser(type(val)):
- track_io(type(val))
- elif isinstance(val, list):
- for v in val:
- if not serialization.find_node_traverser(type(v)):
- track_io(type(v))
-
+ self.add_io(cfg_kwargs)
return fdl.Config(type(self), **cfg_kwargs)
def to_fabric(self, callbacks=None, loggers=None) -> Fabric:
diff --git a/nemo/utils/callbacks/cuda_graph.py b/nemo/utils/callbacks/cuda_graph.py
index ec0650a90e7d..e18a2a3d3b6c 100644
--- a/nemo/utils/callbacks/cuda_graph.py
+++ b/nemo/utils/callbacks/cuda_graph.py
@@ -139,16 +139,6 @@ def to_tensor(self, value, name):
return value
-def register_key(self, key, meta, value):
- # PyTorch Lightning creates all metrics on GPU, but creating the metric on
- # its input device is prefered.
- # Refer to: https://github.com/Lightning-AI/pytorch-lightning/blob/2.0.7/src/lightning/pytorch/trainer/connectors/logger_connector/result.py#L409
- metric = _ResultMetric(meta, isinstance(value, torch.Tensor))
- device = value.device if isinstance(value, torch.Tensor) else self.device
- metric = metric.to(device)
- self[key] = metric
-
-
def update_metrics(self, key, value, batch_size):
# PyTorch Lightning always move all metrics to GPU, but moving the metric to
# its input device is prefered.
@@ -374,8 +364,6 @@ def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule")
# Use smart metrics to avoid syncs
LightningModule.__orig_to_tensor__ = LightningModule._LightningModule__to_tensor
LightningModule._LightningModule__to_tensor = to_tensor
- _ResultCollection.__orig_register_key__ = _ResultCollection.register_key
- _ResultCollection.register_key = register_key
_ResultCollection.__orig_update_metrics__ = _ResultCollection.update_metrics
_ResultCollection.update_metrics = update_metrics
@@ -409,8 +397,6 @@ def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -
LightningModule._LightningModule__to_tensor = LightningModule.__orig_to_tensor__
del LightningModule.__orig_to_tensor__
- _ResultCollection.register_key = _ResultCollection.__orig_register_key__
- del _ResultCollection.__orig_register_key__
_ResultCollection.update_metrics = _ResultCollection.__orig_update_metrics__
del _ResultCollection.__orig_update_metrics__
diff --git a/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py b/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py
index 1f8c69b5b240..35039f8d02e9 100644
--- a/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py
+++ b/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py
@@ -62,7 +62,7 @@ def get_args():
help="Path to output mcore weights file (ends in .nemo).",
)
parser.add_argument(
- "--cpu-only",
+ "--cpu_only",
action="store_true",
help="Load model in cpu only. Useful if the model cannot fit in GPU memory, "
"but this option makes the conversion script significantly slower.",
@@ -73,7 +73,7 @@ def get_args():
help="Run conversion again and overwrite output file when the output file already exists",
)
parser.add_argument(
- "--ignore-if-missing",
+ "--ignore_if_missing",
default="rotary_pos_emb.inv_freq",
help="comma-separated list of state_dict keys that are known to be missing in mcore and can be safely ignored",
)
@@ -158,8 +158,8 @@ def build_key_mapping(nemo_cfg):
for wb in ('weight', 'bias') if has_layernorm_bias else ('weight',):
mcore_to_nemo_mapping.update(
{
- f"{mcore_prefix}.{i}.self_attention.linear_qkv.layer_norm_{wb}": f"{nemo_prefix}.{i}.input_layernorm.{wb}",
- f"{mcore_prefix}.{i}.mlp.linear_fc1.layer_norm_{wb}": f"{nemo_prefix}.{i}.post_attention_layernorm.{wb}",
+ f"{mcore_prefix}.{i}.input_layernorm.{wb}": f"{nemo_prefix}.{i}.input_layernorm.{wb}",
+ f"{mcore_prefix}.{i}.pre_mlp_layernorm.{wb}": f"{nemo_prefix}.{i}.post_attention_layernorm.{wb}",
}
)
diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py
index 0837e0e6ccf2..4eb8cb6330ca 100644
--- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py
+++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py
@@ -18,6 +18,8 @@
python convert_llama_hf_to_nemo.py \
--input_name_or_path \
- --output_path
+ --output_path \
+ --precision bf16 \
+ --llama31 True
"""
import os
@@ -60,6 +62,13 @@ def get_args():
required=False,
help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml",
)
+ parser.add_argument(
+ "--llama31",
+ type=bool,
+ default=True,
+ required=False,
+ help="Whether the model is from LLaMa 3.1 family. LLaMa 3.1 enables scaling for RoPE frequencies.",
+ )
parser.add_argument("--precision", type=str, default="16", help="Model precision")
args = parser.parse_args()
return args
@@ -110,6 +119,7 @@ def load_config(args, llama_config):
while llama_config['vocab_size'] % base != 0:
base //= 2
nemo_config.make_vocab_size_divisible_by = base
+ nemo_config.scale_positional_embedding = args.llama31
return nemo_config
@@ -161,6 +171,7 @@ def convert(args):
plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
nemo_config.precision = precision
+ nemo_config.micro_batch_size = 1
print(f"nemo_config: {nemo_config}")
# Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together.
@@ -298,12 +309,22 @@ def convert(args):
# We make sure that the tokenizer can be instantiated later regardless of args.input_name_or_path
if 'tokenizer_model' not in hf_config:
- if hf_config['num_hidden_layers'] == 32:
- model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B')
- elif hf_config['num_hidden_layers'] == 80:
- model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B')
+ if args.llama31:
+ if hf_config['num_hidden_layers'] == 32:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B')
+ elif hf_config['num_hidden_layers'] == 80:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-70B')
+ elif hf_config['num_hidden_layers'] == 126:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B') # 405B tokenizer is the same as 8B
+ else:
+ logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.")
else:
- logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.")
+ if hf_config['num_hidden_layers'] == 32:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B')
+ elif hf_config['num_hidden_layers'] == 80:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B')
+ else:
+ logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.")
# cast to target precision and disable cpu init
dtype = torch_dtype_from_precision(precision)
diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py
new file mode 100644
index 000000000000..f395e34765d0
--- /dev/null
+++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py
@@ -0,0 +1,286 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""
+Conversion script to build a NeMo checkpoint from a Huggingface LLaMA checkpoint together with the
+per-tensor state dict produced by convert_llama_hf_to_nemo_save_dict.py.
+ Example to run this conversion script:
+ python convert_llama_hf_to_nemo_load.py \
+ --input_name_or_path \
+ --input_state_dict \
+ --output_path \
+ --precision bf16 \
+ --llama31 True
+"""
+
+import os
+from argparse import ArgumentParser
+from collections import OrderedDict
+
+import torch
+from omegaconf import OmegaConf
+from pytorch_lightning.trainer.trainer import Trainer
+from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer
+
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.parts.nlp_overrides import (
+ GradScaler,
+ MegatronHalfPrecisionPlugin,
+ NLPDDPStrategy,
+ NLPSaveRestoreConnector,
+ PipelineMixedPrecisionPlugin,
+)
+from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper, torch_dtype_from_precision
+from nemo.utils import logging
+
+
+def get_args():
+ parser = ArgumentParser()
+ parser.add_argument(
+ "--input_name_or_path",
+ type=str,
+ default=None,
+ required=True,
+ help="Path to Huggingface LLaMA checkpoints",
+ )
+ parser.add_argument(
+ "--input_state_dict",
+ type=str,
+ default=None,
+ required=True,
+ help="Path to Huggingface LLaMA checkpoints",
+ )
+
+ parser.add_argument(
+ "--llama31",
+ type=bool,
+ default=True,
+ required=False,
+ help="Apply scaling for RoPE frequencies",
+ )
+
+ parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.")
+ parser.add_argument(
+ "--hparams_file",
+ type=str,
+ default=os.path.join(
+ os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_llama_config.yaml'
+ ),
+ required=False,
+ help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml",
+ )
+ parser.add_argument("--precision", type=str, default="16", help="Model precision")
+ args = parser.parse_args()
+ return args
+
+
+def load_config(args, llama_config):
+ nemo_config = OmegaConf.load(args.hparams_file).model
+
+ if llama_config.get('rope_theta', None):
+ nemo_config['rotary_base'] = llama_config['rope_theta']
+ nemo_config.encoder_seq_length = llama_config['max_position_embeddings']
+ nemo_config.num_layers = int(llama_config['num_hidden_layers'])
+ nemo_config.hidden_size = llama_config['hidden_size']
+ nemo_config.ffn_hidden_size = llama_config['intermediate_size']
+ nemo_config.num_attention_heads = llama_config['num_attention_heads']
+ nemo_config.max_position_embeddings = llama_config['max_position_embeddings']
+ nemo_config.init_method_std = llama_config['initializer_range']
+ nemo_config.layernorm_epsilon = llama_config['rms_norm_eps']
+ if 'num_key_value_heads' in llama_config:
+ nemo_config.num_query_groups = llama_config['num_key_value_heads']
+ nemo_config.use_cpu_initialization = True
+ nemo_config.activation = 'fast-swiglu'
+ nemo_config.megatron_amp_O2 = True
+ nemo_config.scale_positional_embedding = args.llama31
+
+ # Tokenizer config
+ if 'tokenizer_model' in llama_config:
+ nemo_config.tokenizer.model = llama_config['tokenizer_model']
+ else:
+ # Llama3 uses converted TikToken Tokenizer
+ tokenizer_dict = {
+ 'library': 'huggingface',
+ 'type': args.input_name_or_path,
+ 'use_fast': True,
+ }
+ nemo_config.tokenizer = tokenizer_dict
+
+ if llama_config['rope_scaling'] is not None:
+ if llama_config['rope_scaling']['type'] == 'linear':
+ nemo_config['seq_len_interpolation_factor'] = llama_config['rope_scaling']['factor']
+ else:
+ raise ValueError("Only linear rope scaling type is supported now")
+ if llama_config['rope_theta'] is not None:
+ nemo_config['rotary_base'] = llama_config['rope_theta']
+
+ base = 128
+ while llama_config['vocab_size'] % base != 0:
+ base //= 2
+ nemo_config.make_vocab_size_divisible_by = base
+
+ return nemo_config
+
+
+def convert(args):
+ logging.info(f"loading checkpoint {args.input_name_or_path}")
+ import torch
+
+ model = LlamaForCausalLM.from_pretrained(
+ args.input_name_or_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
+ )
+ hf_config = vars(model.config)
+ if os.path.exists(f'{args.input_name_or_path}/tokenizer.model'):
+ tokenizer = LlamaTokenizer.from_pretrained(args.input_name_or_path)
+ hf_config['tokenizer_model'] = str(tokenizer.vocab_file)
+ else:
+ tokenizer = AutoTokenizer.from_pretrained(args.input_name_or_path)
+ print(f"hf_config: {hf_config}")
+ print("named parameters:")
+ for name, param in model.named_parameters():
+ print(f"- {name}")
+
+ nemo_config = load_config(args, hf_config)
+
+ if args.precision in ["32", "16"]:
+ precision = int(float(args.precision))
+ elif args.precision in ["bf16", "bf16-mixed"]:
+ if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+ precision = args.precision
+ else:
+ logging.warning("BF16 is not supported on this device. Using FP16 instead.")
+ precision = args.precision[2:] # prune bf in string
+ else:
+ precision = args.precision
+
+ plugins = []
+ if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']:
+ scaler = None
+ if precision in [16, '16', '16-mixed']:
+ scaler = GradScaler(
+ init_scale=nemo_config.get('native_amp_init_scale', 2**32),
+ growth_interval=nemo_config.get('native_amp_growth_interval', 1000),
+ hysteresis=nemo_config.get('hysteresis', 2),
+ )
+ # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed
+ plugin_precision = '16-mixed'
+ else:
+ plugin_precision = 'bf16-mixed'
+
+ if nemo_config.get('megatron_amp_O2', False):
+ print('HALF PRECISION')
+ plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
+ else:
+ plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
+
+ nemo_config.precision = precision
+ nemo_config.micro_batch_size = 1
+ print(f"nemo_config: {nemo_config}")
+
+ # Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together.
+ trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy())
+
+ hidden_size = hf_config["hidden_size"]
+ head_num = hf_config["num_attention_heads"]
+ head_size = hidden_size // head_num
+ num_layers = hf_config["num_hidden_layers"]
+
+ mcore_gpt = nemo_config.mcore_gpt
+
+ assert mcore_gpt == nemo_config.get(
+ 'transformer_engine', False
+ ), "mcore_gpt transformer_engine must be enabled (or disabled) together."
+
+ param_to_weights = lambda param: param.float()
+
+ print('start init model')
+ del model
+ import time
+
+ st = time.perf_counter()
+ model = MegatronGPTModel(nemo_config, trainer)
+ print(f'Model init took {time.perf_counter() - st} sec')
+ from functools import reduce
+ from glob import glob
+
+ weights = glob(f'{args.input_state_dict}/*.pt')
+ st = time.perf_counter()
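+ # Files written by convert_llama_hf_to_nemo_save_dict.py are named after the parameter's module
+ # path (e.g. decoder.layers.0.self_attention.linear_qkv.weight.pt), so splitting the file name
+ # on '.' recovers the attribute chain and the weight type.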
+ for weight_file in sorted(weights):
+ filename = os.path.basename(weight_file)
+ str_list = filename.split('.')
+ weight_type = str_list[-2]
+ str_name = '.'.join(str_list[1:-1])
+ print(f'-- Assign weight_type={weight_type} to {str_name}')
+ if nemo_config.get('megatron_amp_O2', False):
+ current = reduce(getattr, [model, 'model', 'module'] + str_list[:-2])
+ else:
+ current = reduce(getattr, [model, 'model'] + str_list[:-2])
+ load = torch.load(weight_file)
+ if nemo_config.get('megatron_amp_O2', False):
+ if precision == 'bf16':
+ target_precision = torch.bfloat16
+ elif precision == 16:
+ target_precision = torch.float16
+ else:
+ # Fall back to the shared precision-to-dtype helper for any other precision value.
+ target_precision = torch_dtype_from_precision(precision)
+ load = load.to(target_precision)
+
+ if weight_type == 'weight':
+ assert current.weight.shape == load.shape
+ assert current.weight.dtype == load.dtype
+ current.weight = torch.nn.Parameter(load)
+ assert current.weight.norm() == load.norm()
+ elif weight_type == 'layer_norm_weight':
+ assert current.layer_norm_weight.dtype == load.dtype
+ assert current.layer_norm_weight.shape == load.shape
+ current.layer_norm_weight = torch.nn.Parameter(load)
+ assert current.layer_norm_weight.norm() == load.norm()
+ else:
+ raise ValueError(f'Unsupported weight type = {weight_type}')
+ del load
+
+ print(f'Finished loading weights in {time.perf_counter() - st} sec. Starting to save model')
+ st = time.perf_counter()
+
+ model._save_restore_connector = NLPSaveRestoreConnector()
+
+ # We make sure that the tokenizer can be instantiated later regardless of args.input_name_or_path
+ if 'tokenizer_model' not in hf_config:
+ if args.llama31:
+ if hf_config['num_hidden_layers'] == 32:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B')
+ elif hf_config['num_hidden_layers'] == 80:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-70B')
+ elif hf_config['num_hidden_layers'] == 126:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B') # 405B tokenizer is the same as 8B
+ else:
+ logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.")
+ else:
+ if hf_config['num_hidden_layers'] == 32:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B')
+ elif hf_config['num_hidden_layers'] == 80:
+ model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B')
+ else:
+ logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.")
+
+ # cast to target precision and disable cpu init
+ dtype = torch_dtype_from_precision(precision)
+ model = model.to(dtype=dtype)
+ model.cfg.use_cpu_initialization = False
+
+ model.save_to(args.output_path)
+ print(f'Model save took {time.perf_counter() - st} sec.')
+ logging.info(f'NeMo model saved to: {args.output_path}')
+
+
+if __name__ == '__main__':
+ args = get_args()
+ convert(args)
diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py
new file mode 100644
index 000000000000..940a9df5f9a8
--- /dev/null
+++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py
@@ -0,0 +1,321 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""
+Conversion script to dump a Huggingface LLaMA checkpoint into a directory of per-tensor .pt files
+(one file per converted NeMo/Megatron parameter), later consumed by convert_llama_hf_to_nemo_load.py.
+ Example to run this conversion script:
+ python convert_llama_hf_to_nemo_save_dict.py \
+ --input_name_or_path \
+ --output_path \
+ --precision bf16 \
+ --apply_rope_scaling True
+"""
+
+import os
+from argparse import ArgumentParser
+from collections import OrderedDict
+
+import torch
+from omegaconf import OmegaConf
+from pytorch_lightning.trainer.trainer import Trainer
+from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer
+
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.parts.nlp_overrides import (
+ GradScaler,
+ MegatronHalfPrecisionPlugin,
+ NLPDDPStrategy,
+ NLPSaveRestoreConnector,
+ PipelineMixedPrecisionPlugin,
+)
+from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper, torch_dtype_from_precision
+from nemo.utils import logging
+
+
+def get_args():
+ parser = ArgumentParser()
+ parser.add_argument(
+ "--input_name_or_path",
+ type=str,
+ default=None,
+ required=True,
+ help="Path to Huggingface LLaMA checkpoints",
+ )
+ parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output to dict dir")
+ parser.add_argument(
+ "--hparams_file",
+ type=str,
+ default=os.path.join(
+ os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_llama_config.yaml'
+ ),
+ required=False,
+ help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml",
+ )
+ parser.add_argument(
+ "--apply_rope_scaling",
+ type=bool,
+ default=True,
+ required=False,
+ help="Apply scaling for RoPE frequencies",
+ )
+ parser.add_argument("--precision", type=str, default="16", help="Model precision")
+ args = parser.parse_args()
+ return args
+
+
+def load_config(args, llama_config):
+ nemo_config = OmegaConf.load(args.hparams_file).model
+
+ if llama_config.get('rope_theta', None):
+ nemo_config['rotary_base'] = llama_config['rope_theta']
+ nemo_config.encoder_seq_length = llama_config['max_position_embeddings']
+ nemo_config.num_layers = int(llama_config['num_hidden_layers'])
+ nemo_config.hidden_size = llama_config['hidden_size']
+ nemo_config.ffn_hidden_size = llama_config['intermediate_size']
+ nemo_config.num_attention_heads = llama_config['num_attention_heads']
+ nemo_config.max_position_embeddings = llama_config['max_position_embeddings']
+ nemo_config.init_method_std = llama_config['initializer_range']
+ nemo_config.layernorm_epsilon = llama_config['rms_norm_eps']
+ if 'num_key_value_heads' in llama_config:
+ nemo_config.num_query_groups = llama_config['num_key_value_heads']
+ nemo_config.use_cpu_initialization = True
+ nemo_config.activation = 'fast-swiglu'
+ nemo_config.megatron_amp_O2 = True
+
+ # Tokenizer config
+ if 'tokenizer_model' in llama_config:
+ nemo_config.tokenizer.model = llama_config['tokenizer_model']
+ else:
+ # Llama3 uses converted TikToken Tokenizer
+ tokenizer_dict = {
+ 'library': 'huggingface',
+ 'type': args.input_name_or_path,
+ 'use_fast': True,
+ }
+ nemo_config.tokenizer = tokenizer_dict
+
+ if llama_config['rope_scaling'] is not None:
+ if llama_config['rope_scaling']['type'] == 'linear':
+ nemo_config['seq_len_interpolation_factor'] = llama_config['rope_scaling']['factor']
+ else:
+ raise ValueError("Only linear rope scaling type is supported now")
+ if llama_config['rope_theta'] is not None:
+ nemo_config['rotary_base'] = llama_config['rope_theta']
+
+ base = 128
+ while llama_config['vocab_size'] % base != 0:
+ base //= 2
+ nemo_config.make_vocab_size_divisible_by = base
+
+ return nemo_config
+
+
+def convert(args):
+ logging.info(f"loading checkpoint {args.input_name_or_path}")
+ import torch
+
+ model = LlamaForCausalLM.from_pretrained(
+ args.input_name_or_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
+ )
+ hf_config = vars(model.config)
+ if os.path.exists(f'{args.input_name_or_path}/tokenizer.model'):
+ tokenizer = LlamaTokenizer.from_pretrained(args.input_name_or_path)
+ hf_config['tokenizer_model'] = str(tokenizer.vocab_file)
+ else:
+ tokenizer = AutoTokenizer.from_pretrained(args.input_name_or_path)
+
+ print("named parameters:")
+ for name, param in model.named_parameters():
+ print(f"- {name}")
+
+ nemo_config = load_config(args, hf_config)
+ nemo_config.scale_positional_embedding = args.apply_rope_scaling
+
+ if args.precision in ["32", "16"]:
+ precision = int(float(args.precision))
+ elif args.precision in ["bf16", "bf16-mixed"]:
+ if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+ precision = args.precision
+ else:
+ logging.warning("BF16 is not supported on this device. Using FP16 instead.")
+ precision = args.precision[2:] # prune bf in string
+ else:
+ precision = args.precision
+
+ plugins = []
+ if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']:
+ scaler = None
+ if precision in [16, '16', '16-mixed']:
+ scaler = GradScaler(
+ init_scale=nemo_config.get('native_amp_init_scale', 2**32),
+ growth_interval=nemo_config.get('native_amp_growth_interval', 1000),
+ hysteresis=nemo_config.get('hysteresis', 2),
+ )
+ # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed
+ plugin_precision = '16-mixed'
+ else:
+ plugin_precision = 'bf16-mixed'
+
+ if nemo_config.get('megatron_amp_O2', False):
+ print('HALF PRECISION')
+ plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
+ else:
+ plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
+
+ nemo_config.precision = precision
+ print(f"nemo_config: {nemo_config}")
+
+ # Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together.
+ trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy())
+
+ hidden_size = hf_config["hidden_size"]
+ head_num = hf_config["num_attention_heads"]
+ head_size = hidden_size // head_num
+ num_layers = hf_config["num_hidden_layers"]
+
+ mcore_gpt = nemo_config.mcore_gpt
+
+ assert mcore_gpt == nemo_config.get(
+ 'transformer_engine', False
+ ), "mcore_gpt transformer_engine must be enabled (or disabled) together."
+
+ param_to_weights = lambda param: param.float()
+
+ checkpoint = OrderedDict()
+ checkpoint['state_dict'] = OrderedDict()
+
+ embed_weight = model.state_dict()[f'model.embed_tokens.weight']
+ if mcore_gpt:
+ embed_weights_base_name = f'model.embedding.word_embeddings.weight'
+ else:
+ embed_weights_base_name = f'model.language_model.embedding.word_embeddings.weight'
+ checkpoint['state_dict'][embed_weights_base_name] = param_to_weights(embed_weight)
+
+ # in hf, this is defined as register_buffer(..., persistent=False) so it won't be in the state dict
+ if f'model.layers.0.self_attn.rotary_emb.inv_freq' in model.state_dict():
+ rotary_embed_weight = model.state_dict()[f'model.layers.0.self_attn.rotary_emb.inv_freq']
+ if mcore_gpt:
+ rotary_embed_weight_base_name = f'model.rotary_pos_emb.inv_freq'
+ else:
+ rotary_embed_weight_base_name = f'model.language_model.rotary_pos_emb.inv_freq'
+ checkpoint['state_dict'][rotary_embed_weight_base_name] = param_to_weights(rotary_embed_weight)
+
+ if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num:
+ num_query_groups = head_num
+ else:
+ num_query_groups = nemo_config.num_query_groups
+ assert head_num % num_query_groups == 0, 'head_num must be divisible by num_query_groups'
+ if mcore_gpt:
+ assert nemo_config.activation.startswith('fast-'), 'mcore only supports fast version of gated linear unit.'
+
+ for l in range(int(num_layers)):
+ print(f"converting layer {l}")
+ old_tensor_shape = model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight'].size()
+ new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:]
+ new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:]
+ q = model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight'].view(*new_q_tensor_shape)
+ k = model.state_dict()[f'model.layers.{l}.self_attn.k_proj.weight'].view(*new_kv_tensor_shape)
+ v = model.state_dict()[f'model.layers.{l}.self_attn.v_proj.weight'].view(*new_kv_tensor_shape)
+ qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:])
+ heads_per_group = head_num // num_query_groups
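+ # Build Megatron's fused QKV layout: for each query group, that group's query heads are
+ # followed by its single key head and value head.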
+ for i in range(num_query_groups):
+ qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :]))
+ qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :]))
+ qkv_weights = torch.cat((qkv_weights, v[i : i + 1, :, :]))
+ qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size])
+ if mcore_gpt:
+ qkv_weights_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.weight'
+ else:
+ qkv_weights_base_name = f'model.language_model.encoder.layers.{l}.self_attention.query_key_value.weight'
+ checkpoint['state_dict'][qkv_weights_base_name] = param_to_weights(qkv_weights)
+
+ # attention dense
+ o_weight = model.state_dict()[f'model.layers.{l}.self_attn.o_proj.weight']
+ if mcore_gpt:
+ o_weight_base_name = f'model.decoder.layers.{l}.self_attention.linear_proj.weight'
+ else:
+ o_weight_base_name = f'model.language_model.encoder.layers.{l}.self_attention.dense.weight'
+ checkpoint['state_dict'][o_weight_base_name] = param_to_weights(o_weight)
+
+ # MLP
+ mlp_down_weight = model.state_dict()[f'model.layers.{l}.mlp.gate_proj.weight']
+ mlp_gate_weight = model.state_dict()[f'model.layers.{l}.mlp.up_proj.weight']
+ if mcore_gpt:
+ mlp_down_base_name = f'model.decoder.layers.{l}.mlp.linear_fc1.weight'
+ else:
+ mlp_down_base_name = f'model.language_model.encoder.layers.{l}.mlp.dense_h_to_4h.weight'
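+ # gate_proj and up_proj are concatenated into one fused weight for the gated (swiglu) MLP fc1.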
+ mlp_down_weight = torch.cat((mlp_down_weight, mlp_gate_weight), axis=0)
+ checkpoint['state_dict'][mlp_down_base_name] = param_to_weights(mlp_down_weight)
+
+ mlp_up_weight = model.state_dict()[f'model.layers.{l}.mlp.down_proj.weight']
+ if mcore_gpt:
+ mlp_up_base_name = f'model.decoder.layers.{l}.mlp.linear_fc2.weight'
+ else:
+ mlp_up_base_name = f'model.language_model.encoder.layers.{l}.mlp.dense_4h_to_h.weight'
+ checkpoint['state_dict'][mlp_up_base_name] = param_to_weights(mlp_up_weight)
+
+ # LayerNorm
+ input_ln_weight = model.state_dict()[f'model.layers.{l}.input_layernorm.weight']
+ if mcore_gpt:
+ input_ln_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_weight'
+ else:
+ input_ln_base_name = f'model.language_model.encoder.layers.{l}.input_layernorm.weight'
+ checkpoint['state_dict'][input_ln_base_name] = param_to_weights(input_ln_weight)
+
+ post_attn_ln_weight = model.state_dict()[f'model.layers.{l}.post_attention_layernorm.weight']
+ if mcore_gpt:
+ post_attn_ln_base_name = f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm_weight'
+ else:
+ post_attn_ln_base_name = f'model.language_model.encoder.layers.{l}.post_attention_layernorm.weight'
+ checkpoint['state_dict'][post_attn_ln_base_name] = param_to_weights(post_attn_ln_weight)
+
+ print(f"done layer {l}")
+
+ final_ln_weight = model.state_dict()[f'model.norm.weight']
+ if mcore_gpt:
+ final_ln_base_name = f'model.decoder.final_layernorm.weight'
+ else:
+ final_ln_base_name = f'model.language_model.encoder.final_layernorm.weight'
+ checkpoint['state_dict'][final_ln_base_name] = param_to_weights(final_ln_weight)
+
+ output_layer_weight = model.state_dict()[f'lm_head.weight']
+ if mcore_gpt:
+ output_layer_base_name = f'model.output_layer.weight'
+ else:
+ output_layer_base_name = f'model.language_model.output_layer.weight'
+ checkpoint['state_dict'][output_layer_base_name] = param_to_weights(output_layer_weight)
+
+ checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config
+
+ del model
+ import gc
+
+ gc.collect()
+
+ if nemo_config.get('megatron_amp_O2', False):
+ keys = list(checkpoint['state_dict'].keys())
+ print('convert to O2')
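+ # megatron_amp_O2 wraps the model in Float16Module, so parameters live under 'model.module.*'.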
+ for key in keys:
+ checkpoint['state_dict'][key.replace('model.', 'model.module.', 1)] = checkpoint['state_dict'].pop(key)
+
+ os.makedirs(args.output_path, exist_ok=True)
+ for key in checkpoint['state_dict']:
+ print(f'Saving {key} in {checkpoint["state_dict"][key].dtype}..')
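+ # key[13:] drops the leading 'model.module.' prefix (megatron_amp_O2 is forced on above).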
+ save_location = f'{args.output_path}/{key[13:]}.pt'
+ torch.save(checkpoint['state_dict'][key], save_location)
+
+
+if __name__ == '__main__':
+ args = get_args()
+ convert(args)
diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py
index a8c8621c8d31..45765e6d1997 100644
--- a/tests/collections/common/test_lhotse_dataloading.py
+++ b/tests/collections/common/test_lhotse_dataloading.py
@@ -1735,10 +1735,14 @@ def nemo_tarred_manifest_path_with_offset(tmp_path_factory) -> Tuple[str, str]:
TarWriter(f"{root}/audios_0.tar", shard_size=None) as tar_writer,
SequentialJsonlWriter(root / "tarred_audio_filepaths.jsonl") as mft_writer,
):
- tar_writer.write(recording.id, BytesIO(recording.sources[0].source))
+
+ def audio_path(n: int = None):
+ return recording.id + ("" if n is None else f"-sub{n}") + ".wav"
+
+ tar_writer.write(audio_path(), BytesIO(recording.sources[0].source))
mft_writer.write(
{ # segment 0-3s
- "audio_filepath": recording.id,
+ "audio_filepath": audio_path(),
"offset": 0.0,
"duration": 3.0,
"text": "irrelevant",
@@ -1748,7 +1752,7 @@ def nemo_tarred_manifest_path_with_offset(tmp_path_factory) -> Tuple[str, str]:
)
mft_writer.write(
{ # segment 4-9s
- "audio_filepath": recording.id + "-sub1",
+ "audio_filepath": audio_path(1),
"offset": 4.0,
"duration": 5.0,
"text": "irrelevant-2",
@@ -1758,7 +1762,7 @@ def nemo_tarred_manifest_path_with_offset(tmp_path_factory) -> Tuple[str, str]:
)
mft_writer.write(
{ # full recording - for reference
- "audio_filepath": recording.id + "-sub2",
+ "audio_filepath": audio_path(2),
"offset": 0.0,
"duration": 10.0,
"text": "irrelevant irrelevant-2",
@@ -1794,7 +1798,7 @@ def test_dataloader_from_tarred_nemo_manifest_with_offset(nemo_tarred_manifest_p
assert len(batch) == 3
# Validate example containing full 10s recording.
- full_cut = batch[-1]
+ full_cut = batch[1]
assert full_cut.start == 0.0
assert full_cut.duration == 10.0
assert full_cut.supervisions[0].text == "irrelevant irrelevant-2"
@@ -1803,7 +1807,7 @@ def test_dataloader_from_tarred_nemo_manifest_with_offset(nemo_tarred_manifest_p
assert full_audio.shape[1] == full_cut.num_samples == 160000 # 10s * 16kHz
# Validate segment 0-3s.
- cut = batch[0]
+ cut = batch[2]
assert cut.start == 0.0
assert cut.duration == 3.0
assert cut.supervisions[0].text == "irrelevant"
@@ -1817,7 +1821,7 @@ def test_dataloader_from_tarred_nemo_manifest_with_offset(nemo_tarred_manifest_p
# Note: LazyNeMoTarredIterator removes the offset information, as it creates a new recording
# that's a "subset" of the original recording as a memory saving optimization.
# Hence, we will not see cut.start == 4.0.
- cut = batch[1]
+ cut = batch[0]
assert cut.start == 0.0
assert cut.duration == 5.0
assert cut.supervisions[0].text == "irrelevant-2"
diff --git a/tests/lightning/io/test_mixin.py b/tests/lightning/io/test_mixin.py
index 824608db6bf0..3a520b8e74ae 100644
--- a/tests/lightning/io/test_mixin.py
+++ b/tests/lightning/io/test_mixin.py
@@ -14,3 +14,10 @@ def test_reinit(self):
assert copied is not dummy
assert copied.a == dummy.a
assert copied.b == dummy.b
+
+ def test_init(self):
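+ # Constructing many tracked objects must stay cheap and must not re-wrap __init__ (see __wrapped_init__ guard in io/mixin.py).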
+ outputs = []
+ for i in range(1001):
+ outputs.append(DummyClass(i, i))
+
+ assert len(outputs) == 1001