diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 4ef6c5a9f9df..8c61c767b4f1 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -34,22 +34,27 @@ on: description: Last 2000 characters of the test step's log value: ${{ jobs.main.outputs.log }} jobs: + main: runs-on: ${{ inputs.RUNNER }} outputs: conclusion: ${{ steps.main.conclusion }} log: ${{ steps.main.outputs.log }} - permissions: - actions: write # Required for cancelling workflows steps: - name: Docker system cleanup run: | docker system prune -a --filter "until=48h" --force + - name: Docker pull image + run: | + docker pull nemoci.azurecr.io/nemo_container_${{ github.run_id }} + - id: main name: Run main script timeout-minutes: ${{ inputs.TIMEOUT }} run: | + mkdir -p ${{ github.run_id }} + cd ${{ github.run_id }}/ set +e ( set -e @@ -65,9 +70,7 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: failure() && inputs.IS_OPTIONAL == false - - name: after_script if: always() && inputs.AFTER_SCRIPT != ':' run: | - docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}' - \ No newline at end of file + docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}' \ No newline at end of file diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 253e114c78f3..2919be733ac3 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -93,8 +93,8 @@ jobs: exit 0 ' ### \'\' - - + + L0_Unit_Tests_GPU: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -105,14 +105,16 @@ jobs: NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads IS_OPTIONAL: true - L0_Unit_Tests_CPU: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-cpu - TIMEOUT: 60 - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + # # TODO refactor: Commenting this test out until it is fixed & works properly again (test passes again) + # OPTIONAL_L0_Unit_Tests_CPU: + # needs: [cicd-test-container-setup] + # uses: ./.github/workflows/_test_template.yml + # with: + # RUNNER: self-hosted-azure-cpu + # TIMEOUT: 60 + # SCRIPT: | + # CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + # IS_OPTIONAL: true L0_Setup_Test_Data_And_Models: needs: [cicd-test-container-setup] @@ -532,55 +534,45 @@ jobs: AFTER_SCRIPT: | rm -rf examples/asr/speech_to_text_results - - # L2_Speech_to_Text_AED: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure-gpus-1 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/asr/speech_multitask/speech_to_text_aed.py \ - 
# model.prompt_format=canary \ - # model.model_defaults.asr_enc_hidden=256 \ - # model.model_defaults.lm_dec_hidden=256 \ - # model.encoder.n_layers=12 \ - # model.transf_encoder.num_layers=0 \ - # model.transf_decoder.config_dict.num_layers=12 \ - # model.train_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_train.json \ - # ++model.train_ds.is_tarred=false \ - # model.train_ds.batch_duration=60 \ - # +model.train_ds.text_field="answer" \ - # +model.train_ds.lang_field="target_lang" \ - # model.validation_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - # +model.validation_ds.text_field="answer" \ - # +model.validation_ds.lang_field="target_lang" \ - # model.test_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - # +model.test_ds.text_field="answer" \ - # +model.test_ds.lang_field="target_lang" \ - # model.tokenizer.langs.spl_tokens.dir=/home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32 \ - # model.tokenizer.langs.spl_tokens.type="bpe" \ - # model.tokenizer.langs.en.dir=/home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4 \ - # model.tokenizer.langs.en.type=bpe \ - # ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \ - # ++model.tokenizer.langs.es.type=bpe \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.use_distributed_sampler=false \ - # +trainer.fast_dev_run=True \ - # exp_manager.exp_dir=examples/asr/speech_to_text_aed_results - # rm -rf examples/asr/speech_to_text_results - + L2_Speech_to_Text_AED: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/speech_multitask/speech_to_text_aed.py \ + model.prompt_format=canary \ + model.model_defaults.asr_enc_hidden=256 \ + model.model_defaults.lm_dec_hidden=256 \ + model.encoder.n_layers=12 \ + model.transf_encoder.num_layers=0 \ + model.transf_decoder.config_dict.num_layers=12 \ + model.train_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_train.json \ + model.train_ds.batch_duration=60 \ + model.train_ds.use_bucketing=false \ + model.train_ds.shuffle_buffer_size=100 \ + model.train_ds.num_workers=0 \ + +model.train_ds.text_field="answer" \ + +model.train_ds.lang_field="target_lang" \ + model.validation_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ + +model.validation_ds.text_field="answer" \ + +model.validation_ds.lang_field="target_lang" \ + model.validation_ds.num_workers=0 \ + model.test_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ + +model.test_ds.text_field="answer" \ + +model.test_ds.lang_field="target_lang" \ + model.test_ds.num_workers=0 \ + spl_tokens.model_dir=/home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32 \ + model.tokenizer.langs.en.dir=/home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4 \ + model.tokenizer.langs.en.type=bpe \ + ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \ + ++model.tokenizer.langs.es.type=bpe \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_aed_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_results # L2: Speaker dev run L2_Speaker_dev_run_Speaker_Recognition: @@ -817,6 +809,66 @@ jobs: AFTER_SCRIPT: | rm -rf stt_test_res.json + # L2: Speech 
Transcription + L2_Speech_Transcription_Canary_Transcribe_Full_Manifest: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/transcribe_speech.py \ + dataset_manifest=/home/TestData/asr/canary/dev-other-wav-10-canary-fields.json \ + output_filename=preds.json \ + batch_size=10 \ + pretrained_name=nvidia/canary-1b \ + num_workers=0 \ + amp=false \ + compute_dtype=bfloat16 \ + matmul_precision=medium + AFTER_SCRIPT: | + rm -rf preds.json transcribe.log + + L2_Speech_Transcription_Canary_Transcribe_With_Prompt: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/transcribe_speech.py \ + dataset_manifest=/home/TestData/asr/canary/dev-other-wav-10.json \ + output_filename=preds.json \ + batch_size=10 \ + pretrained_name=nvidia/canary-1b \ + num_workers=0 \ + amp=false \ + compute_dtype=bfloat16 \ + matmul_precision=medium \ + +prompt.source_lang="en" \ + +prompt.target_lang="en" \ + +prompt.task="asr" \ + +prompt.pnc="no" + AFTER_SCRIPT: | + rm -rf preds.json transcribe.log + + L2_Speech_Transcription_Canary_Transcribe_Audio_Dir: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/transcribe_speech.py \ + audio_dir=/home/TestData/asr/canary/dev-other-wav \ + output_filename=preds.json \ + batch_size=10 \ + pretrained_name=nvidia/canary-1b \ + num_workers=0 \ + amp=false \ + compute_dtype=bfloat16 \ + matmul_precision=medium + AFTER_SCRIPT: | + rm -rf preds.json + + # L2: Transducer alignment L2_Transducer_alignment_Running_pytest: needs: [cicd-test-container-setup] @@ -975,7 +1027,6 @@ jobs: data.test_ds.use_cache=false \ data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv - # L2: Intent and Slot Classification Tasks L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] @@ -1013,11 +1064,11 @@ jobs: rm -rf checkpoints2 # TODO: add when megatron-bert is supported again - # stage('L2: Model Parallel Size 2 Megatron Text Classification') { + # stage("L2: Model Parallel Size 2 Megatron Text Classification") { # when { # anyOf{ - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true @@ -1042,11 +1093,11 @@ jobs: # } # } - # stage('L2: Model Parallel Size 2 Megatron Autoresume') { + # stage("L2: Model Parallel Size 2 Megatron Autoresume") { # when { # anyOf{ - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true @@ -1073,11 +1124,11 @@ jobs: # } # } - # stage('L2: Model Parallel Size 2 Megatron Evaluation from .nemo') { + # stage("L2: Model Parallel Size 2 Megatron Evaluation from .nemo") { # when { # anyOf{ - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true @@ -1094,11 +1145,11 @@ jobs: # } # } - # stage('L2: Model Parallel Size 2 Megatron Train from .nemo') { + # stage("L2: Model Parallel Size 2 Megatron Train from .nemo") { # when { # anyOf{ - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true @@ -1500,18 +1551,18 @@ jobs: model.encoder.num_layers=4 \ model.encoder.hidden_size=64 \ model.encoder.num_attention_heads=8 \ - 
model.encoder.activation='swiglu' \ + model.encoder.activation="swiglu" \ model.encoder.masked_softmax_fusion=False \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=2 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ + model.decoder.activation="swiglu" \ model.decoder.masked_softmax_fusion=False \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ model.micro_batch_size=2 \ model.global_batch_size=4 \ @@ -1545,18 +1596,18 @@ jobs: model.encoder.num_layers=4 \ model.encoder.hidden_size=64 \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ + model.encoder.activation="swiglu" \ model.encoder.masked_softmax_fusion=False \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=2 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ + model.decoder.activation="swiglu" \ model.decoder.masked_softmax_fusion=False \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ model.micro_batch_size=2 \ model.global_batch_size=4 \ @@ -1598,24 +1649,24 @@ jobs: model.encoder.hidden_size=64 \ model.encoder.arch=perceiver \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ + model.encoder.activation="swiglu" \ model.encoder.masked_softmax_fusion=False \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=2 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ + model.decoder.activation="swiglu" \ model.decoder.masked_softmax_fusion=False \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ model.micro_batch_size=2 \ model.global_batch_size=4 \ model.data.data_impl=text_mmap \ model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string='"800,100,100"' \ + model.data.splits_string="\"800,100,100\"" \ model.data.whole_word_masking=False \ model.tokenizer.library=sentencepiece \ model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ @@ -1643,24 +1694,24 @@ jobs: model.encoder.hidden_size=64 \ model.encoder.arch=perceiver \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ + model.encoder.activation="swiglu" \ model.encoder.masked_softmax_fusion=False \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=2 \ 
model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ + model.decoder.activation="swiglu" \ model.decoder.masked_softmax_fusion=False \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ model.micro_batch_size=2 \ model.global_batch_size=4 \ model.data.data_impl=text_mmap \ model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string='"800,100,100"' \ + model.data.splits_string="\"800,100,100\"" \ model.data.whole_word_masking=False \ model.tokenizer.library=sentencepiece \ model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ @@ -1672,16 +1723,16 @@ jobs: AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/megatron_mim_results - # stage('L2: NMT Bottleneck Fallback') { + # stage("L2: NMT Bottleneck Fallback") { # when { # anyOf { - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true # parallel { - # stage('L2: seq2seq (no bottleneck)') { + # stage("L2: seq2seq (no bottleneck)") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1718,16 +1769,16 @@ jobs: # } # } # } - # stage('L2: NMT Bottleneck Architecture') { + # stage("L2: NMT Bottleneck Architecture") { # when { # anyOf { - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true # parallel { - # stage('Bridge Encoder (identity)') { + # stage("Bridge Encoder (identity)") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1762,7 +1813,7 @@ jobs: # exp_manager=null # } # } - # stage('Perceiver Encoder (params)') { + # stage("Perceiver Encoder (params)") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1799,16 +1850,16 @@ jobs: # } # } # } - # stage('L2: NMT Bottleneck LVM') { + # stage("L2: NMT Bottleneck LVM") { # when { # anyOf { - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true # parallel { - # stage('VAE') { + # stage("VAE") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1843,7 +1894,7 @@ jobs: # exp_manager=null # } # } - # stage('MIM') { + # stage("MIM") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -2050,7 +2101,7 @@ jobs: model.num_layers=8 \ model.hidden_size=256 \ model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ + model.activations_checkpoint_method="block" \ model.activations_checkpoint_num_layers=1 \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings @@ -2080,7 +2131,7 @@ jobs: model.num_layers=8 \ model.hidden_size=256 \ model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ + model.activations_checkpoint_method="block" \ model.activations_checkpoint_num_layers=1 \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ 
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings @@ -2099,7 +2150,7 @@ jobs: trainer.devices=2 \ trainer.precision=bf16 \ trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ + model.data.data_prefix=["none"] \ exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ model.mcore_gpt=True \ model.tensor_model_parallel_size=1 \ @@ -2114,7 +2165,7 @@ jobs: model.init_method_std=0.023 \ model.optim.lr=6.0e-4 \ model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ + model.data.splits_string="\"98,2,0\"" \ model.data.dataloader_type=cyclic \ trainer.max_steps=10 @@ -2123,7 +2174,7 @@ jobs: trainer.devices=2 \ trainer.precision=bf16 \ trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ + model.data.data_prefix=["none"] \ exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ model.mcore_gpt=True \ model.tensor_model_parallel_size=1 \ @@ -2138,7 +2189,7 @@ jobs: model.init_method_std=0.023 \ model.optim.lr=6.0e-4 \ model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ + model.data.splits_string="\"98,2,0\"" \ model.data.dataloader_type=cyclic \ trainer.max_steps=20 AFTER_SCRIPT: | @@ -2292,16 +2343,16 @@ jobs: # from pandas.testing import assert_frame_equal # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator # import torch - # if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): + # if not (torch.cuda.is_available() and "A100" in torch.cuda.get_device_name()): # import sys # sys.exit(0) - # event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] + # event_file = list(pathlib.Path("examples/nlp/language_modeling/retro_results/megatron_retro/smalltest").glob("events.out.tfevents*"))[0] # ea = EventAccumulator(str(event_file)).Reload() # vals = [] - # for i in ea.Scalars('reduced_train_loss'): + # for i in ea.Scalars("reduced_train_loss"): # vals.append(i.value) - # training_curve = pd.DataFrame({'loss': vals}) - # gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') + # training_curve = pd.DataFrame({"loss": vals}) + # gt_curve = pd.read_csv("/home/TestData/nlp/megatron_retro/expected_learning_curve.csv") # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" # rm -rf examples/nlp/language_modeling/retro_results @@ -2317,13 +2368,13 @@ jobs: python examples/nlp/rag/rag_indexing.py \ trainer.num_nodes=1 \ trainer.devices=1 \ - trainer.precision='bf16-mixed' \ - indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \ + trainer.precision="bf16-mixed" \ + indexing.embedder.model_path="/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo" \ indexing.embedder.embed_batch_size=128 \ - indexing.data.data_path='/home/TestData/nlp/rag_pipeline/testing_data/corpus_data/sample_data' \ + indexing.data.data_path="/home/TestData/nlp/rag_pipeline/testing_data/corpus_data/sample_data" \ indexing.data.chunk_size=256 \ indexing.data.chunk_overlap=10 \ - indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index' + indexing.index_path="/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index" L2_RAG_Pipeline_Generating: needs: [cicd-test-container-setup] @@ -2333,14 +2384,14 @@ jobs: SCRIPT: | python examples/nlp/rag/rag_generating.py \ trainer.devices=1 \ - trainer.precision='bf16-mixed' \ - 
indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \ - indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index' \ - generating.llm.model_path='/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo' \ + trainer.precision="bf16-mixed" \ + indexing.embedder.model_path="/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo" \ + indexing.index_path="/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index" \ + generating.llm.model_path="/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo" \ generating.inference.tokens_to_generate=50 \ generating.inference.greedy=False \ generating.inference.temperature=1.0 \ - generating.query='Which art schools did I applied to?' + generating.query="Which art schools did I applied to?" L2_BioMegatron_Bert_NER_Task: needs: [cicd-test-container-setup] @@ -3098,29 +3149,44 @@ jobs: L2_Megatron_GPT_Reranker: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - rm -rf /home/TestData/nlp/megatron_ir/working_dir + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir - python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \ - exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ - model.global_batch_size=4 \ - model.micro_batch_size=4 \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.max_epochs=null \ - trainer.max_steps=20 \ - trainer.val_check_interval=10 \ - model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ - model.peft.lora_tuning.adapter_dim=8 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \ - model.data.validation_ds.write_embeddings_to_file=True \ - model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] - AFTER_SCRIPT: | - rm -rf /home/TestData/nlp/megatron_ir/working_dir + python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \ + exp_manager.exp_dir="/home/TestData/nlp/megatron_ir/working_dir" \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.max_epochs=null \ + trainer.max_steps=20 \ + trainer.val_check_interval=10 \ + model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \ + model.data.validation_ds.write_embeddings_to_file=True \ + model.data.validation_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/val_embs" \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] + + + rm -rf /home/TestData/nlp/megatron_ir/working_dir + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] @@ -3131,7 +3197,7 @@ jobs: rm -rf 
/home/TestData/nlp/megatron_ir/working_dir python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ - exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ + exp_manager.exp_dir="/home/TestData/nlp/megatron_ir/working_dir" \ model.global_batch_size=4 \ model.micro_batch_size=4 \ trainer.devices=1 \ @@ -3139,25 +3205,25 @@ jobs: trainer.max_epochs=null \ trainer.max_steps=20 \ trainer.val_check_interval=10 \ - model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ model.peft.lora_tuning.adapter_dim=8 \ model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \ model.data.validation_ds.write_embeddings_to_file=True \ - model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ + model.data.validation_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/val_embs" \ model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ trainer.devices=1 \ trainer.num_nodes=1 \ - model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ - model.peft.restore_from_path='/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo' \ + model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ + model.peft.restore_from_path="/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo" \ model.global_batch_size=4 \ model.micro_batch_size=4 \ model.peft.lora_tuning.adapter_dim=8 \ model.data.test_ds.write_embeddings_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/test_embs' \ + model.data.test_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/test_embs" \ model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] AFTER_SCRIPT: | @@ -3203,15 +3269,15 @@ jobs: trainer.devices=2 \ model.megatron_amp_O2=True \ model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ + model.data.test_ds.names=["quarel4"] \ model.global_batch_size=2 \ model.micro_batch_size=1 \ model.data.test_ds.tokens_to_generate=10 \ model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_pp2/out' \ + model.data.test_ds.output_file_path_prefix="/home/TestData/nlp/lora_tuning_pp2/out" \ inference.greedy=True \ inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl' + inference.outfile_path="/home/TestData/nlp/lora_tuning_pp2/out.jsonl" AFTER_SCRIPT: | rm -rf /home/TestData/nlp/lora_tuning_pp2 @@ -3235,7 +3301,7 @@ jobs: model.pipeline_model_parallel_size=1 \ model.tensor_model_parallel_size=2 \ model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.peft.peft_scheme='lora' \ + model.peft.peft_scheme="lora" \ model.answer_only_loss=True \ model.micro_batch_size=1 \ model.global_batch_size=1 \ @@ -3252,15 +3318,15 @@ 
jobs: model.tensor_model_parallel_size=2 \ trainer.devices=2 \ model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ + model.data.test_ds.names=["quarel4"] \ model.global_batch_size=2 \ model.micro_batch_size=1 \ model.data.test_ds.tokens_to_generate=10 \ model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \ + model.data.test_ds.output_file_path_prefix="/home/TestData/nlp/lora_tuning_tp2/out" \ inference.greedy=True \ inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl' + inference.outfile_path="/home/TestData/nlp/lora_tuning_tp2/out.jsonl" AFTER_SCRIPT: | rm -rf /home/TestData/nlp/lora_tuning_tp2 @@ -3300,12 +3366,12 @@ jobs: +model.tp_comm_overlap_ag=False \ +model.tp_comm_overlap_rs=False \ +model.tp_comm_overlap_disable_qkv=True \ - model.peft.peft_scheme='lora' \ + model.peft.peft_scheme="lora" \ model.peft.lora_tuning.adapter_dim=16 \ model.peft.lora_tuning.alpha=32 \ model.peft.lora_tuning.column_init_method="kaiming" \ - +model.peft.lora_tuning.dropout_position='pre' \ - model.peft.lora_tuning.target_modules=['attention'] \ + +model.peft.lora_tuning.dropout_position="pre" \ + model.peft.lora_tuning.target_modules=["attention"] \ model.peft.lora_tuning.adapter_dropout=0.1 \ +model.peft.lora_tuning.a2a_experimental=1 \ model.answer_only_loss=True \ @@ -3328,7 +3394,7 @@ jobs: SCRIPT: | python examples/nlp/language_modeling/megatron_gpt_eval.py \ gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ - prompts=['How to fix GPU memory? A:'] \ + prompts=["How to fix GPU memory? A:"] \ tensor_model_parallel_size=1 \ inference.tokens_to_generate=32 \ trainer.precision=32 @@ -3555,8 +3621,8 @@ jobs: model.decoder.num_layers=4 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ + model.encoder.transformer_block_type="pre_ln" \ + model.decoder.transformer_block_type="pre_ln" \ model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ model.data.data_impl=text_mmap \ @@ -3588,8 +3654,8 @@ jobs: model.decoder.num_layers=4 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ + model.encoder.transformer_block_type="pre_ln" \ + model.decoder.transformer_block_type="pre_ln" \ model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ model.data.data_impl=text_mmap \ @@ -3984,7 +4050,7 @@ jobs: SCRIPT: | python examples/nlp/language_modeling/megatron_t5_eval.py \ --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt 'How do I fix my GPU memory issue? I am seeing out of memory.' \ + --prompt "How do I fix my GPU memory issue? I am seeing out of memory." 
\ --tensor_model_parallel_size 1 L2_Megatron_Core_T5_Eval: @@ -3995,7 +4061,7 @@ jobs: SCRIPT: | NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_t5_eval.py \ --model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \ - --prompt 'How do I fix my GPU memory issue? I am seeing out of memory.' \ + --prompt "How do I fix my GPU memory issue? I am seeing out of memory." \ --tensor_model_parallel_size 1 L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: @@ -4020,18 +4086,18 @@ jobs: model.encoder.num_layers=4 \ model.encoder.hidden_size=64 \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ + model.encoder.activation="reglu" \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=4 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ + model.decoder.activation="reglu" \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' + model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" python examples/nlp/language_modeling/megatron_bart_pretraining.py \ trainer.devices=2 \ @@ -4050,18 +4116,18 @@ jobs: model.encoder.num_layers=4 \ model.encoder.hidden_size=64 \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ + model.encoder.activation="reglu" \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=4 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ + model.decoder.activation="reglu" \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' + model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/bart_pretrain_results @@ -4482,7 +4548,7 @@ jobs: AFTER_SCRIPT: | rm -f 
examples/asr/evaluation_transcripts.json - L2_Stable_Diffusion_Training: + OPTIONAL_L2_Stable_Diffusion_Training: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml with: @@ -4530,6 +4596,7 @@ jobs: model.data.synthetic_data=True AFTER_SCRIPT: | rm -rf examples/multimodal/text_to_image/sd_train_results + IS_OPTIONAL: true L2_NeMo_2_GPT_Pretraining_no_transformer_engine: needs: [cicd-test-container-setup] @@ -4566,7 +4633,7 @@ jobs: - gpu-test - cicd-test-container-setup - L0_Unit_Tests_GPU - - L0_Unit_Tests_CPU + #- OPTIONAL_L0_Unit_Tests_CPU - L2_Community_LLM_Checkpoints_tests_Bert - L2_Community_LLM_Checkpoints_tests_Llama - L2_Community_LLM_Checkpoints_tests_StarCoder @@ -4664,7 +4731,7 @@ jobs: - L2_TTS_Fast_dev_runs_1_Mixer-TTS - L2_TTS_Fast_dev_runs_1_Hifigan - Speech_Checkpoints_tests - - L2_Stable_Diffusion_Training + #- OPTIONAL_L2_Stable_Diffusion_Training - L2_NeMo_2_GPT_Pretraining_no_transformer_engine if: always() runs-on: ubuntu-latest diff --git a/docs/source/performance/performance_summary.md b/docs/source/performance/performance_summary.md new file mode 100644 index 000000000000..c5bdda7b040d --- /dev/null +++ b/docs/source/performance/performance_summary.md @@ -0,0 +1,42 @@ + +# Performance Benchmarks + +## Large Language Models** + +### Pretraining + +- The results in the table below show pre-training performance for various tasks at FP8 precision. + - Container: [NeMo24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) + - System: DGX-H100 + +| Model | #-GPUs | GBS | MBS | Sequence Length| TP | PP | CP | VP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. time to train in days (10T tokens, 1K GPUs)*** | +| ----- | ------ | --- | --- | ---------------| -- | -- | -- | -- | ------------------ | ----------------------- | ------------------------------------------------------ | +| GPT3-5B | 64 | 2048 | 4 | 2048 | 1 | 1 | 1 | 1 | 22521 | 736 | ***5*** | +| GPT3-20B | 64 | 256 | 2 | 2048 | 2 | 1 | 1 | 1 | 5851 | 750 | ***19*** | +| GPT3-175B | 128 | 256 | 1 | 2048 | 4 | 8 | 1 | 6 | 726 | 782 | **156** | +| GPT3-175B | 512 | 2048 | 2 | 2048 | 4 | 8 | 1 | 6 | 782 | [842](https://mlcommons.org/benchmarks/training/) | **145** | +| LLAMA2-7B | 8 | 128 | 1 | 4096 | 1 | 1 | 1 | 1 | 16847 | 776 | ***7*** | +| LLAMA2-13B | 16 | 128 | 1 | 4096 | 1 | 4 | 1 | 10 | 8646 | 754 | ***13*** | +| LLAMA2-70B | 64 | 128 | 1 | 4096 | 4 | 4 | 1 | 20 | 1707 | 759 | ***66*** | +| Nemotron-8B | 64 | 256 | 4 | 4096 | 2 | 1 | 1 | 1 | 12701 | 653 | ***9*** | +| Nemotron-22B | 64 | 256 | 2 | 4096 | 2 | 4 | 1 | 10 | 4256 | 554 | ***27*** | +| Nemotron-340B | 128 | 32 | 1 | 4096 | 8 | 8 | 1 | 12 | 322 | 678 | ***351*** | +| LLAMA3-8B | 8 | 128 | 1 | 8192 | 1 | 1 | 2 | 1 | 12036 | 697 | ***9*** | +| LLAMA3-70B | 64 | 128 | 1 | 8192 | 4 | 4 | 2 | 5 | 1533 | 738 | ***74*** | + +### Finetuning + +- The results in the table below show finetuning performance of LLAMA2 models with SFT (supervised fine-tuning), and LoRA (Low-rank adaptors) at FP8 precision. + - Container: [NeMo24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) + - System: DGX-H100 +- For fine-tuning, we use `SQuAD-v1.1 `__ dataset, and the inputs are packed to 4096 tokens. + + +| Model | Task | #-GPUs | GBS | MBS | Packed Sequence Length | TP | PP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. 
time to finetune in mins (10M tokens)*** | +| ----- | ---- | --- | --- | --- | --------------- | -- | -- | ------------------ | ----------------------- | -------------------------------------------------- | +| LLAMA2-7B | SFT | 8 | 32 | 1 | 4096 | 1 | 1 | 17120 | 682 | ***1.2*** | +| LLAMA2-13B | SFT | 8 | 32 | 1 | 4096 | 1 | 4 | 9741 | 754 | ***2.1*** | +| LLAMA2-70B | SFT | 16 | 32 | 1 | 4096 | 4 | 4 | 1833 | 756 | ***5.7*** | +| LLAMA2-7B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 25206 | 673 | ***0.8*** | +| LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14161 | 733 | ***1.5*** | +| LLAMA2-70B | LoRA | 8 | 32 | 1 | 4096 | 2 | 4 | 2557 | 705 | ***8.1*** | diff --git a/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml b/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml index 77260e515a90..4c6aa643c19d 100644 --- a/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml +++ b/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml @@ -251,7 +251,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 16 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: bf16-mixed # Should be set to bf16-mixed/16-mixed for O1 and O2 to enable the AMP. log_every_n_steps: 100 # Interval of logging. enable_progress_bar: True num_sanity_val_steps: 2 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it @@ -259,6 +259,7 @@ trainer: sync_batchnorm: true enable_checkpointing: False # Provided by exp_manager logger: false # Provided by exp_manager + use_distributed_sampler: false # Lhotse has its own distributed sampler exp_manager: exp_dir: null diff --git a/nemo/README.md b/nemo/README.md index 869ce2f50031..a6025e77822a 100644 --- a/nemo/README.md +++ b/nemo/README.md @@ -10,3 +10,7 @@ NeMo (**Ne**ural **Mo**dules) is a toolkit for creating AI applications built ar * Vision - collection of modules and models for building computer vision networks * Multimodal - collection of modules and models for building multimodal networks * Audio - collection of modules and models for building audio processing networks + +**Performance** + +Performance benchmarks for pre-training and fine-tuning of various models can be found [here](../docs/source/performance/performance_summary.md) diff --git a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py index b1fc17bf8836..4c2afd8865d4 100644 --- a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py +++ b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py @@ -67,7 +67,9 @@ class PromptedAudioToTextLhotseDataset(torch.utils.data.Dataset): def __init__( self, tokenizer: TokenizerSpec, - prompt_format_fn: Callable[[CutSet, TokenizerWrapper], Sequence[Sequence[int]]], + prompt_format_fn: Callable[ + [CutSet, TokenizerWrapper], tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]] + ], ): super().__init__() self.tokenizer = TokenizerWrapper(tokenizer) @@ -95,7 +97,7 @@ def __getitem__(self, cuts: CutSet) -> PromptedAudioToTextMiniBatch: prompted_transcript_lens=prompts_with_answers_lens, ) - def _collate_tokens(self, tokens: list[list[int]]) -> tuple[torch.Tensor, torch.Tensor]: + def _collate_tokens(self, tokens: list[list[int] | torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: tokens = [torch.as_tensor(t) for t in tokens] token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) tokens = collate_vectors(tokens, 
padding_value=self.padding_value) @@ -196,19 +198,19 @@ def canary( }, ) ] - if text := ' '.join(s.text for s in cut.supervisions if s.text is not None): - # Create answer_ids only if there is some transcript in the data. - turns.extend( - dict( - role="assistant", - slots={ - "text": text, - formatter.PROMPT_LANGUAGE_SLOT: ifnone( - cut.supervisions[0].language, cut.custom.get("target_lang") - ), - }, - ), - ) + # If data has no transcript, create empty response with only. + text = ' '.join(s.text for s in cut.supervisions if s.text is not None) + turns.append( + dict( + role="assistant", + slots={ + "text": text, + formatter.PROMPT_LANGUAGE_SLOT: ifnone( + cut.supervisions[0].language, cut.custom.get("target_lang") + ), + }, + ), + ) encoded = formatter.encode_dialog(turns) prompts_with_answers.append(encoded["input_ids"]) prompts.append(encoded["context_ids"]) diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index 301b2b0ad026..ddc86257a5f1 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -1038,13 +1038,11 @@ def predict_step( signal = batch.audio signal_len = batch.audio_lens - transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + _, _, enc_states, enc_mask = self.forward( input_signal=signal, input_signal_length=signal_len, processed_signal=processed_signal, processed_signal_length=processed_signal_length, - transcript=batch.prompt, - transcript_length=batch.prompt_lens, ) text = self.decoding.decode_predictions_tensor( diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index bae2c9ffdc67..415096a0c9d5 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -21,6 +21,7 @@ from omegaconf import OmegaConf from torch.utils.data import DataLoader +from nemo.collections.asr.data.audio_to_text_lhotse_prompted import PromptedAudioToTextMiniBatch from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.parts.mixins.streaming import StreamingEncoder from nemo.collections.asr.parts.preprocessing.features import normalize_batch @@ -1643,7 +1644,16 @@ def _get_batch_preds(self, keep_logits=False): tokens = self.input_tokens.to(device).repeat(feat_signal.size(0), 1) tokens_len = torch.tensor([tokens.size(1)] * tokens.size(0), device=device).long() - batch_input = (feat_signal, feat_signal_len, None, None, tokens, tokens_len) + batch_input = PromptedAudioToTextMiniBatch( + audio=feat_signal, + audio_lens=feat_signal_len, + transcript=None, + transcript_lens=None, + prompt=tokens, + prompt_lens=tokens_len, + prompted_transcript=None, + prompted_transcript_lens=None, + ) predictions = self.asr_model.predict_step(batch_input, has_processed_signal=True) self.all_preds.extend(predictions) del predictions diff --git a/nemo/collections/common/prompts/canary.py b/nemo/collections/common/prompts/canary.py index f2b1e58c3bb2..e3f537957832 100644 --- a/nemo/collections/common/prompts/canary.py +++ b/nemo/collections/common/prompts/canary.py @@ -1,3 +1,5 @@ +from typing import Any + from nemo.collections.common.prompts.formatter import Modality, PromptFormatter from nemo.collections.common.tokenizers.canary_tokenizer import ( CANARY_BOS, @@ -33,6 +35,11 @@ class CanaryPromptFormatter(PromptFormatter): }, } + def _validate_slot_values(self, expected: dict[str, Modality], 
received: dict[str, Any]) -> None: + if "taskname" in received and "task" not in received: + received["task"] = received.pop("taskname") + return super()._validate_slot_values(expected=expected, received=received) + def encode_turn(self, prompt_template: str, expected_slots: dict, slot_values: dict) -> list[int]: # This method handles a level of indirection for Canary. # It maps values provided in trcfg to the actual special tokens diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 1be5c41e4919..33a21990e8f7 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -1,3 +1,4 @@ +import math from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING, List, Optional, Union @@ -32,6 +33,7 @@ class FineTuningDataModule(pl.LightningDataModule): num_workers (int, optional): The number of worker processes for data loading. Defaults to 8. pin_memory (bool, optional): Whether to pin memory during data loading for faster GPU training. Defaults to True. persistent_workers (bool, optional): Whether to keep data loading workers persistent across epochs. Defaults to False. + max_train_steps (int, optional): Maximum number of steps to train. Used to calculate samples mapping for the mmap dataset """ def __init__( @@ -60,18 +62,40 @@ def __init__( self.num_workers = num_workers self.pin_memory = pin_memory self.persistent_workers = persistent_workers + self.micro_batch_size = micro_batch_size + self.global_batch_size = global_batch_size + self.rampup_batch_size = rampup_batch_size + self.data_sampler = None + self.max_train_samples = None + + def setup(self, stage: str): self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, - micro_batch_size=micro_batch_size, - global_batch_size=global_batch_size, - rampup_batch_size=rampup_batch_size, + micro_batch_size=self.micro_batch_size, + global_batch_size=self.global_batch_size, + rampup_batch_size=self.rampup_batch_size, + dataloader_type="batch", ) + # Follows the calculation in nemo.collections.nlp.data.language_modeling.megatron. 
+ # base_dataset_utils.get_datasets_weights_and_num_samples + self.max_train_samples = int(math.ceil(self.global_batch_size * self.trainer.max_steps * 1.005)) + def train_dataloader(self) -> DataLoader: - return self._create_dataloader(self._create_dataset(str(self.train_path))) + return self._create_dataloader( + self._create_dataset( + str(self.train_path), + max_num_samples=self.max_train_samples, + ) + ) def val_dataloader(self) -> DataLoader: - return self._create_dataloader(self._create_dataset(str(self.validation_path))) + return self._create_dataloader( + self._create_dataset( + str(self.validation_path), + is_test=True, + ), + ) def test_dataloader(self) -> DataLoader: return self._create_dataloader( @@ -85,7 +109,12 @@ def test_dataloader(self) -> DataLoader: @lru_cache def _create_dataset(self, path, **kwargs): return create_sft_dataset( - path, tokenizer=self.tokenizer, seq_length=self.seq_length, memmap_workers=self.memmap_workers, **kwargs + path, + tokenizer=self.tokenizer, + seq_length=self.seq_length, + memmap_workers=self.memmap_workers, + seed=self.seed, + **kwargs, ) def _create_dataloader(self, dataset, **kwargs) -> DataLoader: diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index 7e4cf8b6c74e..e226678d4894 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -111,7 +111,7 @@ def make_vocab_size_divisible_by(mistral_vocab_size): num_layers=source.num_hidden_layers, hidden_size=source.hidden_size, ffn_hidden_size=source.intermediate_size, - kv_channels=source.get('head_dim', source.hidden_size // source.num_attention_heads), + kv_channels=getattr(source, 'head_dim', source.hidden_size // source.num_attention_heads), num_attention_heads=source.num_attention_heads, # max_position_embeddings=source.max_position_embeddings, init_method_std=source.initializer_range, diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index 868ad5b332b5..d8a265fd41d1 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -155,7 +155,7 @@ def config(self) -> MixtralConfig8x7B | MixtralConfig8x22B: num_layers=config.num_hidden_layers, hidden_size=config.hidden_size, ffn_hidden_size=config.intermediate_size, - kv_channels=config.get('head_dim', config.hidden_size // config.num_attention_heads), + kv_channels=getattr(config, 'head_dim', config.hidden_size // config.num_attention_heads), max_position_embeddings=config.max_position_embeddings, # TODO seq_length=config.max_position_embeddings, # RoPE diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index a26354fee8be..59acdec6f8e2 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -122,14 +122,45 @@ def add_megatron_sampler( global_batch_size: int, rampup_batch_size: Optional[List[int]] = None, consumed_samples: int = 0, - dataloader_type: Literal["single", "cyclic"] = "single", + dataloader_type: Literal["single", "cyclic", "batch"] = "single", drop_last: bool = True, pad_samples_to_global_batch_size: bool = False, # data_sharding: bool = False ) -> DataLoader: + """ + This function takes an existing PyTorch `DataLoader` and configures it to use a Megatron sampler. + The Megatron sampler is responsible for splitting the data into batches + during training with Megatron. + + Args: + dataloader (DataLoader): The original PyTorch DataLoader to wrap. + micro_batch_size (int): The size of each micro-batch. 
+ global_batch_size (int): The effective size of the training batch across all data parallel devices. + rampup_batch_size (Optional[List[int]]): A list of target batch sizes for a gradual + rampup schedule during training (optional). + consumed_samples (int, optional): The number of samples consumed before + starting this iteration (defaults to 0). + dataloader_type (Literal["single", "cyclic", "batch"], optional): The type of + Megatron sampler to use. Valid options are: + - "single": Uses `MegatronPretrainingSampler` for single pass data sampling. + - "cyclic": Uses `MegatronPretrainingRandomSampler` for cyclic data sampling. + - "batch": Uses `MegatronPretrainingBatchSampler` for batch sampling. This is the option to + use for fine-tuning workloads, where sequence lengths are variable between samples. + Sampling the entire global batch together ensures that sequences in a global batch are + padded to the same lengths. + Defaults to "single". + drop_last (bool, optional): Whether to drop the last incomplete batch + (defaults to True). + pad_samples_to_global_batch_size (bool, optional): Whether to pad the last incomplete + batch to the `global_batch_size` (defaults to False, only applies when + `drop_last` is False). + + Returns: + DataLoader: A new DataLoader instance with the configured Megatron sampler. + """ + from megatron.core import parallel_state - ## TODO: expose drop_last and pad_samples_to_global_batch_size args if dataloader_type == 'single': batch_sampler = MegatronPretrainingSampler( total_samples=len(dataloader.dataset), @@ -152,6 +183,21 @@ def add_megatron_sampler( drop_last=drop_last, # data_sharding=data_sharding ) + elif dataloader_type == 'batch': + from nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers import ( + MegatronPretrainingBatchSampler, + ) + + batch_sampler = MegatronPretrainingBatchSampler( + total_samples=len(dataloader.dataset), + consumed_samples=consumed_samples, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + data_parallel_rank=parallel_state.get_data_parallel_rank(), + data_parallel_size=parallel_state.get_data_parallel_world_size(), + drop_last=drop_last, + pad_samples_to_global_batch_size=not drop_last, + ) else: raise Exception(f'{dataloader_type} dataloader type is not supported.') diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index dfc78c30a929..741c1400474a 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -293,9 +293,12 @@ def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Pa """ connector = self._get_connector(path) ckpt_path: Path = connector.local_path(base_path=base_path) - ckpt_path = connector(ckpt_path, overwrite=overwrite) + # If already in multiproc environment (e.g. 
due to torchrun invocation) run only on RANK = 0 + from nemo.utils.get_rank import is_global_rank_zero - connector.on_import_ckpt(self) + if is_global_rank_zero(): + ckpt_path = connector(ckpt_path, overwrite=overwrite) + connector.on_import_ckpt(self) return ckpt_path diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 5b2ac7ff1adb..37978510972d 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -230,6 +230,20 @@ def forward( _num_microbatches: int = num_microbatches or self.infer_num_microbatches(data) pipeline = self.pipeline + + use_global_batch_sampler = self.trainer.datamodule.data_sampler.dataloader_type == 'batch' + if use_global_batch_sampler: + from nemo.collections.nlp.modules.common.megatron.utils import get_iterator_k_split + + # The current way of using a batch sampler + split to micro iterator results in + # extraneous padding, and is only implemented to ensure bit-exactness with NeMo 1. + # This part in NeMo 1 was written when megatron fwd_bwd_function did not support unequal + # sequence lengths, but it does now. Hence this part should be revisited in the future. + batch = next(data) + if isinstance(batch, tuple) and len(batch) == 3: + batch = batch[0] + data = get_iterator_k_split(batch, _num_microbatches, True) + data_iterator: List[Iterator[DataT]] = self.to_data_iterator_list(data) context = self._build_context({**locals()}) diff --git a/nemo/lightning/pytorch/callbacks/nsys.py b/nemo/lightning/pytorch/callbacks/nsys.py index 9848fdb2b8fd..70ebf943b333 100644 --- a/nemo/lightning/pytorch/callbacks/nsys.py +++ b/nemo/lightning/pytorch/callbacks/nsys.py @@ -35,15 +35,15 @@ def __init__( ranks: List[int] = [0], gen_shape: bool = False, ): - assert type(start_step) == int, f'Nsys start_step must be of type int. Found: {type(start_step)}' + assert type(start_step) is int, f'Nsys start_step must be of type int. Found: {type(start_step)}' self._nsys_profile_start_step = start_step - assert type(end_step) == int, f'Nsys end_step must be of type int. Found: {type(start_step)}' + assert type(end_step) is int, f'Nsys end_step must be of type int. 
Found: {type(start_step)}' self._nsys_profile_end_step = end_step assert ( self._nsys_profile_end_step >= self._nsys_profile_start_step - ), f'Nsys end_step must be greater than or equal to nsys start_step' + ), 'Nsys end_step must be greater than or equal to nsys start_step' self._nsys_profile_ranks = ranks self._nsys_profile_gen_shape = gen_shape @@ -60,8 +60,9 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx: int) -> Opt """ device = trainer.strategy.root_device + current_step = trainer.strategy.current_epoch_step if device.type == 'cuda': - if batch_idx == self._nsys_profile_start_step and get_rank() in self._nsys_profile_ranks: + if current_step == self._nsys_profile_start_step and get_rank() in self._nsys_profile_ranks: logging.info("====== Start nsys profiling ======") torch.cuda.cudart().cudaProfilerStart() if self._nsys_profile_gen_shape: @@ -76,8 +77,9 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int) """ device = trainer.strategy.root_device + current_step = trainer.strategy.current_epoch_step if device.type == 'cuda': - if batch_idx == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks: + if current_step == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks: logging.info("====== End nsys profiling ======") torch.cuda.cudart().cudaProfilerStop() torch.autograd.profiler.emit_nvtx().__exit__(None, None, None) diff --git a/nemo/lightning/pytorch/callbacks/progress.py b/nemo/lightning/pytorch/callbacks/progress.py index 9ccf871f820f..1b1276953a61 100644 --- a/nemo/lightning/pytorch/callbacks/progress.py +++ b/nemo/lightning/pytorch/callbacks/progress.py @@ -8,15 +8,6 @@ class MegatronProgressBar(TQDMProgressBar): for megatron models. """ - def get_current_epoch_step(self, trainer) -> int: - """ - Get the value of step within an epoch. - """ - return max( - trainer.fit_loop.epoch_loop.automatic_optimization.optim_progress.optimizer.step.current.completed, - trainer.fit_loop.epoch_loop.manual_optimization.optim_step_progress.current.completed, - ) - def init_train_tqdm(self): """ Override bar_format to not have 's/it'. @@ -41,7 +32,7 @@ def on_train_batch_end(self, trainer, pl_module, *_, **__): """ Override parent class on_train_batch_end to update progress bar per global batch instead of per microbatch. 
""" - n = self.get_current_epoch_step(trainer) + n = trainer.strategy.current_epoch_step if self._should_update(n, self.train_progress_bar.total): _update_n(self.train_progress_bar, n) self.train_progress_bar.set_postfix(self.get_metrics(trainer, pl_module)) diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 7796e4bb92de..3909c680cfc1 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -23,7 +23,7 @@ def __init__( micro_batch_size: int = 4, global_batch_size: int = 8, rampup_batch_size: Optional[List[int]] = None, - dataloader_type: Literal["single", "cyclic"] = "single", + dataloader_type: Literal["single", "cyclic", "batch"] = "single", init_consumed_samples: int = 0, ): self.seq_len = seq_len diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index f44906ef7b7b..99eee9ad6bd3 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -647,6 +647,16 @@ def checkpoint_io(self) -> CheckpointIO: def checkpoint_io(self, io: CheckpointIO) -> None: self._checkpoint_io = io + @property + def current_epoch_step(self) -> int: + """ + Get the value of step within an epoch. + """ + return max( + self.trainer.fit_loop.epoch_loop.automatic_optimization.optim_progress.optimizer.step.current.completed, + self.trainer.fit_loop.epoch_loop.manual_optimization.optim_step_progress.current.completed, + ) + def _get_data_step(self, step_type: str) -> Optional[_ModuleStepFunction]: for fn_name in [f"{step_type}_data_step", "data_step"]: if hasattr(self.lightning_module, fn_name): diff --git a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py index 5785db656217..425a6c696120 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py @@ -132,7 +132,7 @@ def load_config(mistral_config, tokenizer, config_path): # Tokenizer config if hasattr(tokenizer, 'vocab_file'): nemo_config.tokenizer.model = tokenizer.vocab_file - else: + elif os.path.exists(os.path.join(config_path, 'tekken.json')): # Load tekken.json, extract the 'vocab' field & write it to file. vocab_path = os.path.join(config_path, 'tekken.json') assert os.path.exists(vocab_path), f"Expected {vocab_path} to exist" @@ -156,6 +156,14 @@ def load_config(mistral_config, tokenizer, config_path): 'sentencepiece_legacy': False, } nemo_config.tokenizer = tokenizer_dict + else: + # Otherwise use HF + tokenizer_dict = { + 'library': 'huggingface', + 'type': args.input_name_or_path, + 'use_fast': True, + } + nemo_config.tokenizer = tokenizer_dict # TODO(@akoumparouli): rope_scaling. 
diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py
index f44906ef7b7b..99eee9ad6bd3 100644
--- a/nemo/lightning/pytorch/strategies.py
+++ b/nemo/lightning/pytorch/strategies.py
@@ -647,6 +647,16 @@ def checkpoint_io(self) -> CheckpointIO:
     def checkpoint_io(self, io: CheckpointIO) -> None:
         self._checkpoint_io = io
 
+    @property
+    def current_epoch_step(self) -> int:
+        """
+        Get the value of step within an epoch.
+        """
+        return max(
+            self.trainer.fit_loop.epoch_loop.automatic_optimization.optim_progress.optimizer.step.current.completed,
+            self.trainer.fit_loop.epoch_loop.manual_optimization.optim_step_progress.current.completed,
+        )
+
     def _get_data_step(self, step_type: str) -> Optional[_ModuleStepFunction]:
         for fn_name in [f"{step_type}_data_step", "data_step"]:
             if hasattr(self.lightning_module, fn_name):
diff --git a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py
index 5785db656217..425a6c696120 100644
--- a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py
+++ b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py
@@ -132,7 +132,7 @@ def load_config(mistral_config, tokenizer, config_path):
     # Tokenizer config
     if hasattr(tokenizer, 'vocab_file'):
         nemo_config.tokenizer.model = tokenizer.vocab_file
-    else:
+    elif os.path.exists(os.path.join(config_path, 'tekken.json')):
         # Load tekken.json, extract the 'vocab' field & write it to file.
         vocab_path = os.path.join(config_path, 'tekken.json')
         assert os.path.exists(vocab_path), f"Expected {vocab_path} to exist"
@@ -156,6 +156,14 @@ def load_config(mistral_config, tokenizer, config_path):
             'sentencepiece_legacy': False,
         }
         nemo_config.tokenizer = tokenizer_dict
+    else:
+        # Otherwise use HF
+        tokenizer_dict = {
+            'library': 'huggingface',
+            'type': args.input_name_or_path,
+            'use_fast': True,
+        }
+        nemo_config.tokenizer = tokenizer_dict
 
     # TODO(@akoumparouli): rope_scaling.
     nemo_config['rotary_base'] = mistral_config['rope_theta']
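Condensed, the tokenizer selection above now falls through three cases: SentencePiece vocab file, tekken.json, then the Hugging Face tokenizer itself. The helper below is purely illustrative (it is not part of the script) and returns only a label plus the key detail per branch:

    import os


    def pick_tokenizer_source(tokenizer, config_path, input_name_or_path):
        """Illustrative only: mirrors the branch order in load_config() above."""
        if hasattr(tokenizer, 'vocab_file'):
            # 1) SentencePiece vocab shipped with the checkpoint.
            return 'sentencepiece', tokenizer.vocab_file
        if os.path.exists(os.path.join(config_path, 'tekken.json')):
            # 2) Vocab extracted from tekken.json and written to a local file.
            return 'tekken', os.path.join(config_path, 'tekken.json')
        # 3) New fallback: delegate to the Hugging Face tokenizer named on the command line.
        return 'huggingface', {'library': 'huggingface', 'type': input_name_or_path, 'use_fast': True}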
diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py
index 99d1795aea9c..c50267ef6b42 100644
--- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py
@@ -40,7 +40,7 @@ def get_args():
     )
     parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output HF checkpoint.")
     parser.add_argument('--hf_model_name', type=str, default="mistralai/Mistral-7B-v0.1", help="Name of HF checkpoint")
-    parser.add_argument("--precision", type=str, default="32", help="Model precision")
+    parser.add_argument("--precision", type=str, default="bf16", help="Model precision")
     args = parser.parse_args()
     return args
 
@@ -48,7 +48,8 @@ def get_args():
 def load_config(hf_model_name, nemo_config):
     hf_config = AutoConfig.from_pretrained(hf_model_name)
     # SWA; nemo_config.window_size is list [left-bound, right-bound]
-    hf_config.sliding_window = nemo_config.window_size[0]
+    if hasattr(nemo_config, 'window_size'):
+        hf_config.sliding_window = nemo_config.window_size[0]
     hf_config.max_position_embeddings = nemo_config.encoder_seq_length
     hf_config.num_hidden_layers = nemo_config.num_layers
     hf_config.hidden_size = nemo_config.hidden_size
@@ -83,6 +84,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None:
         model_config.use_cpu_initialization = True
     else:
         map_location = None
+    model_config.perform_initialization = False
 
     if cpu_only:
         logging.info("******** Loading model on CPU. This will take a significant amount of time.")
@@ -129,7 +131,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None:
     num_layers = model.cfg.num_layers
     num_query_groups = model.cfg.get("num_query_groups", head_num)  # different num_query_groups for 70B
 
-    head_size = hidden_size // head_num
+    head_size = model.cfg.get('kv_channels', hidden_size // head_num)
     heads_per_group = head_num // num_query_groups
     qkv_total_dim = head_num + 2 * num_query_groups
 
@@ -215,6 +217,14 @@ def convert(in_file, precision=None, cpu_only=True) -> None:
 
 
 if __name__ == '__main__':
+    import transformers
+
+    type(transformers.__version__)
+    from packaging.version import Version
+
+    if Version(transformers.__version__) < Version('4.44.0'):
+        logging.warning("You need to use transformers >= v4.44.0")
+
     args = get_args()
     hf_state_dict, nemo_config, dtype = convert(args.input_name_or_path, args.precision)
diff --git a/tests/collections/asr/test_asr_multitask_model_bpe.py b/tests/collections/asr/test_asr_multitask_model_bpe.py
index 4e805c8f34de..b6267ad7d1cf 100644
--- a/tests/collections/asr/test_asr_multitask_model_bpe.py
+++ b/tests/collections/asr/test_asr_multitask_model_bpe.py
@@ -17,18 +17,29 @@
 
 import pytest
 import torch
+from lhotse import CutSet
+from lhotse.testing.dummies import DummyManifest
 from omegaconf import DictConfig
 
+from nemo.collections.asr.data.audio_to_text_lhotse_prompted import (
+    PromptedAudioToTextLhotseDataset,
+    PromptedAudioToTextMiniBatch,
+    canary,
+)
 from nemo.collections.asr.models.aed_multitask_models import EncDecMultiTaskModel
 from nemo.collections.asr.parts.submodules import multitask_beam_decoding as beam_decode
 from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
+from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
 from nemo.collections.common.prompts.canary import CanaryPromptFormatter
 from nemo.collections.common.tokenizers import CanaryTokenizer
 
 
 @pytest.fixture()
 def asr_model(test_data_dir):
-    preprocessor = {'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'params': dict({})}
+    preprocessor = {
+        'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor',
+        'params': {"window_size": 0.02, "window_stride": 0.01, "features": 64},
+    }
 
     model_defaults = {'asr_enc_hidden': 128, 'lm_enc_hidden': 64, 'lm_dec_hidden': 64}
 
@@ -384,3 +395,113 @@ def test_build_tokenizer(self, asr_model, test_data_dir):
 
         for i, j in zip(ids1, ids2):
             assert i == j
+
+    @pytest.mark.unit
+    def test_predict_step(self, asr_model, test_data_dir):
+        cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True)
+        c = cuts[0]
+        c.supervisions[0].language = "en"
+        c.source_lang = "en"
+        c.target_lang = "en"
+        c.task = "asr"
+        c.pnc = "no"
+        dataset = PromptedAudioToTextLhotseDataset(tokenizer=asr_model.tokenizer, prompt_format_fn=canary)
+        batch = dataset[cuts]
+
+        # Numpy array test
+        outputs = asr_model.predict_step(batch)
+        print(outputs)
+        assert len(outputs) == 1
+        assert isinstance(outputs[0], str)
+
+    @pytest.mark.unit
+    def test_FrameBatchMultiTaskAED(self, asr_model, test_data_dir):
+        model = FrameBatchMultiTaskAED(asr_model, batch_size=1)
+
+        audio_file = os.path.join(test_data_dir, "asr", "train", "an4", "wav", "an46-mmap-b.wav")
+        meta = {
+            'audio_filepath': audio_file,
+            'duration': 100000,
+            'source_lang': 'en',
+            'taskname': 'asr',
+            'target_lang': 'en',
+            'pnc': 'yes',
+            'answer': 'nothing',
+        }
+        model.read_audio_file(audio_file, delay=0.0, model_stride_in_secs=40.0, meta_data=meta)
+        outputs = model.transcribe()
+        assert isinstance(outputs, str)
+
+
+@pytest.mark.unit
+def test_prompted_dataset(asr_model):
+    dataset = PromptedAudioToTextLhotseDataset(tokenizer=asr_model.tokenizer, prompt_format_fn=canary)
+
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=3, with_data=True)
+
+    c = cuts[0]
+    c.supervisions[0].language = "en"
+    c.source_lang = "en"
+    c.target_lang = "en"
+    c.task = "asr"
+    c.pnc = "no"
+
+    c = cuts[1]
+    c.supervisions[0].language = "de"
+    c.supervisions[0].text = "unerheblich"
+    c.source_lang = "en"
+    c.target_lang = "de"
+    c.taskname = "ast"  # note: testing for "taskname" as we support it together with "task"
+    c.pnc = "yes"
+
+    c = cuts[2]
+    c.supervisions[0].language = "en"
+    c.supervisions[0].text = ""
+    c.source_lang = "en"
+    c.target_lang = "en"
+    c.task = "asr"
+    c.pnc = "yes"
+
+    batch = dataset[cuts]
+
+    assert isinstance(batch, PromptedAudioToTextMiniBatch)
+    assert batch.audio.shape == (3, 16000)
+    assert batch.audio_lens.tolist() == [16000, 16000, 16000]
+
+    # Test example 0 (transcription)
+    i = 0
+    assert (
+        asr_model.tokenizer.ids_to_text(batch.prompt[i]) == '<|startoftranscript|><|en|><|transcribe|><|en|><|nopnc|>'
+    )
+    assert batch.prompt_lens[i] == 5
+    assert asr_model.tokenizer.ids_to_text(batch.transcript[i]) == 'i##r##r##el##e##v##a##nt'
+    assert batch.transcript_lens[i] == 8
+    assert (
+        asr_model.tokenizer.ids_to_text(batch.prompted_transcript[i])
+        == '<|startoftranscript|><|en|><|transcribe|><|en|><|nopnc|>i##r##r##el##e##v##a##nt<|endoftext|>'
+    )
+    assert batch.prompted_transcript_lens[i] == 14
+
+    # Test example 1 (translation)
+    i = 1
+    assert asr_model.tokenizer.ids_to_text(batch.prompt[i]) == '<|startoftranscript|><|en|><|translate|><|de|><|pnc|>'
+    assert batch.prompt_lens[i] == 5
+    assert asr_model.tokenizer.ids_to_text(batch.transcript[i]) == 'u##ne##r##h##e##b##l##i##c##h'
+    assert batch.transcript_lens[i] == 10
+    assert (
+        asr_model.tokenizer.ids_to_text(batch.prompted_transcript[i])
+        == '<|startoftranscript|><|en|><|translate|><|de|><|pnc|>u##ne##r##h##e##b##l##i##c##h<|endoftext|>'
+    )
+    assert batch.prompted_transcript_lens[i] == 16
+
+    # Test example 2 (no transcript, e.g. noise)
+    i = 2
+    assert asr_model.tokenizer.ids_to_text(batch.prompt[i]) == '<|startoftranscript|><|en|><|transcribe|><|en|><|pnc|>'
+    assert batch.prompt_lens[i] == 5
+    assert asr_model.tokenizer.ids_to_text(batch.transcript[i]) == '' * 10
+    assert batch.transcript_lens[i] == 0
+    assert (
+        asr_model.tokenizer.ids_to_text(batch.prompted_transcript[i])
+        == '<|startoftranscript|><|en|><|transcribe|><|en|><|pnc|><|endoftext|>' + '' * 10
+    )
+    assert batch.prompted_transcript_lens[i] == 6
diff --git a/tests/lightning/pytorch/callbacks/test_nsys.py b/tests/lightning/pytorch/callbacks/test_nsys.py
index e8734ad1c1ac..d87da58b8ad0 100644
--- a/tests/lightning/pytorch/callbacks/test_nsys.py
+++ b/tests/lightning/pytorch/callbacks/test_nsys.py
@@ -68,6 +68,7 @@ def test_on_train_batch_start_profiling(
         mock_get_rank.return_value = 0
 
         callback = NsysCallback(start_step=10, end_step=20, ranks=[0], gen_shape=True)
+        mock_trainer.strategy.current_epoch_step = 10
         callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10)
 
         mock_cudart().cudaProfilerStart.assert_called_once()
@@ -80,6 +81,7 @@ def test_on_train_batch_start_no_profiling(self, mock_cudart, mock_get_rank, moc
         mock_get_rank.return_value = 0
 
         callback = NsysCallback(start_step=10, end_step=20, ranks=[0])
+        mock_trainer.strategy.current_epoch_step = 9
         callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 9)
 
         mock_cudart().cudaProfilerStart.assert_not_called()
@@ -94,6 +96,7 @@ def test_on_train_batch_end_profiling(
         mock_get_rank.return_value = 0
 
         callback = NsysCallback(start_step=10, end_step=20, ranks=[0])
+        mock_trainer.strategy.current_epoch_step = 20
         callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 20)
 
         mock_cudart().cudaProfilerStop.assert_called_once()
@@ -163,6 +166,7 @@ def test_profiling_range(
         mock_get_rank.return_value = 0
 
         callback = NsysCallback(start_step=start_step, end_step=end_step, ranks=[0])
+        mock_trainer.strategy.current_epoch_step = batch_idx
         callback.on_train_batch_start(mock_trainer, mock_pl_module, None, batch_idx)
 
         if expected_call:
@@ -183,13 +187,16 @@ def test_single_profile_range(self, mock_cudart, mock_get_rank, mock_trainer, mo
         mock_trainer.strategy.root_device.type = 'cuda'
 
         # Start of range
+        mock_trainer.strategy.current_epoch_step = 10
         callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10)
         assert mock_cudart().cudaProfilerStart.call_count == 1, "cudaProfilerStart was not called"
 
         # Middle of range
+        mock_trainer.strategy.current_epoch_step = 25
         callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 25)
         assert mock_cudart().cudaProfilerStart.call_count == 1, "cudaProfilerStart was called again"
 
         # End of range
+        mock_trainer.strategy.current_epoch_step = 40
         callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 40)
         assert mock_cudart().cudaProfilerStop.call_count == 1, "cudaProfilerStop was not called"
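The tests above pin trainer.strategy.current_epoch_step directly because the callback no longer reads batch_idx. The new strategy property counts completed optimizer steps within the epoch, which (per the MegatronProgressBar docstring above) advance once per global batch rather than once per micro-batch. A toy illustration of the difference, with made-up numbers and no claim about the exact loop structure in NeMo:

    # Toy numbers only: assume 4 micro-batches make up one global batch / optimizer step.
    microbatches_per_global_batch = 4

    for batch_idx in range(8):
        completed_optimizer_steps = (batch_idx + 1) // microbatches_per_global_batch
        # With start_step=1, keying on batch_idx would start profiling at the 2nd micro-batch,
        # while keying on current_epoch_step waits until one full optimizer step has completed.
        print(f"batch_idx={batch_idx}  approx current_epoch_step={completed_optimizer_steps}")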