From ed244d943948b105982f5cb17b51991e8acebfb7 Mon Sep 17 00:00:00 2001 From: Joosung Yoon <7252598+eagle705@users.noreply.github.com> Date: Fri, 15 Nov 2024 21:23:04 +0900 Subject: [PATCH 01/48] Fix head_size in NeMo to HF checkpoint converters for width pruned model support (#11230) * update attn head_size to kv_channels for width pruning support Signed-off-by: Joosung * Update llama ckpt converter usage about tokenizer args Signed-off-by: Joosung --------- Signed-off-by: Joosung Co-authored-by: Joosung --- .../convert_baichuan2_nemo_to_hf.py | 2 +- .../convert_chatglm_nemo_to_hf.py | 2 +- .../convert_llama_nemo_to_hf.py | 40 ++++++++++++------- .../convert_mistral_7b_nemo_to_hf.py | 2 +- .../convert_mixtral_nemo_to_hf.py | 2 +- .../convert_qwen2_nemo_to_hf.py | 2 +- .../convert_starcoder2_nemo_to_hf.py | 2 +- 7 files changed, 32 insertions(+), 20 deletions(-) diff --git a/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py index ec048e4b6f19..f84350f75f60 100644 --- a/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py @@ -128,7 +128,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> ffn_hidden_size = model.cfg.ffn_hidden_size num_query_groups = model.cfg.get("num_query_groups", head_num) # different num_query_groups for 70B - head_size = hidden_size // head_num + head_size = model.cfg.get("kv_channels") or (hidden_size // head_num) # equivalent to hf's head_dim heads_per_group = head_num // num_query_groups qkv_total_dim = head_num + 2 * num_query_groups diff --git a/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py b/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py index 5a8e52ee8be5..11279c15c0c9 100644 --- a/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py @@ -126,7 +126,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> num_layers = model.cfg.num_layers num_query_groups = model.cfg.get("num_query_groups", head_num) # different num_query_groups for 70B - head_size = hidden_size // head_num + head_size = model.cfg.get("kv_channels") or (hidden_size // head_num) # equivalent to hf's head_dim heads_per_group = head_num // num_query_groups # 32 / 2 = 16 qkv_total_dim = head_num + 2 * num_query_groups # 32 + 2 * 2 = 36 diff --git a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py index a3c40676a980..126d50be8355 100644 --- a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py @@ -26,7 +26,7 @@ from nemo.utils import logging """ -Script to convert a llama2 checkpoint in nemo (mcore path) into a HuggingFace checkpoint. +Script to convert a llama checkpoint in nemo (mcore path) into a HuggingFace checkpoint. This script can be used to 1) generate only the HF weights, or 2) generate an entire HF model folder. 
1) Generate only HF weights from a nemo file: @@ -37,13 +37,21 @@ 2) Generate the full HF model folder + python convert_llama_nemo_to_hf.py \ + --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \ + --output_path /path/to/pytorch_model.bin \ + --hf_input_path /path/to/input_hf_folder \ + --hf_output_path /path/to/output_hf_folder + +3) Generate the full HF model folder with a custom tokenizer + python convert_llama_nemo_to_hf.py \ --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \ --output_path /path/to/pytorch_model.bin \ --hf_input_path /path/to/input_hf_folder \ --hf_output_path /path/to/output_hf_folder \ - --input_tokenizer /path/to/tokenizer \ - --hf_output_tokenizer /path/to/output_tokenizer \ + --input_tokenizer /path/to/custom_nemo_tokenizer.model \ + --hf_output_tokenizer /path/to/output_tokenizer Use the --cpu-only flag if the model cannot fit in the GPU (e.g. Llama2 70b). However this option makes the conversion script significantly slower. @@ -143,7 +151,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> ffn_hidden_size = model.cfg.ffn_hidden_size num_query_groups = model.cfg.get("num_query_groups", head_num) # different num_query_groups for 70B - head_size = hidden_size // head_num + head_size = model.cfg.get("kv_channels") or (hidden_size // head_num) # equivalent to hf's head_dim heads_per_group = head_num // num_query_groups qkv_total_dim = head_num + 2 * num_query_groups @@ -246,21 +254,25 @@ def replace_hf_weights_and_tokenizer( nemo_exported = torch.load(weights_file) if tokenizer_path: - tokenizer = LlamaTokenizer.from_pretrained( - tokenizer_path, - local_files_only=True, - legacy=False, - ) - tmp_tokenizer = convert_slow_tokenizer.convert_slow_tokenizer(tokenizer) - fast_tokenizer = LlamaTokenizerFast(tokenizer_object=tmp_tokenizer) - tokenizer_length = len(fast_tokenizer) - model.resize_token_embeddings(tokenizer_length) + try: + tokenizer = LlamaTokenizer.from_pretrained( + tokenizer_path, + local_files_only=True, + legacy=False, + ) + tmp_tokenizer = convert_slow_tokenizer.convert_slow_tokenizer(tokenizer) + fast_tokenizer = LlamaTokenizerFast(tokenizer_object=tmp_tokenizer) + tokenizer_length = len(fast_tokenizer) + model.resize_token_embeddings(tokenizer_length) + except: + tokenizer = None + logging.warning("Could not load custom tokenizer, proceeding with default tokenizer") model.load_state_dict(nemo_exported) model.save_pretrained(output_hf_path) logging.info(f"Full HF model saved to {output_hf_path}") - if tokenizer_path: + if tokenizer_path and (tokenizer is not None): fast_tokenizer.save_pretrained(output_hf_tokenizer) tokenizer.save_pretrained(output_hf_tokenizer) logging.info(f"Tokenizer saved to {output_hf_tokenizer}") diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py index b8c30a1b929d..70e7f0b751d3 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py @@ -134,7 +134,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None: num_layers = model.cfg.num_layers num_query_groups = model.cfg.get("num_query_groups", head_num) # different num_query_groups for 70B - head_size = model.cfg.get('kv_channels', hidden_size // head_num) + head_size = model.cfg.get("kv_channels") or (hidden_size // head_num) # equivalent to hf's head_dim heads_per_group = head_num // num_query_groups qkv_total_dim = head_num + 
2 * num_query_groups diff --git a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py index 2bac2eaad616..2961b11870d5 100644 --- a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py @@ -137,7 +137,7 @@ def convert(in_file, precision=None) -> None: num_layers = model.cfg.num_layers num_query_groups = model.cfg.get("num_query_groups", head_num) # different num_query_groups for 70B - head_size = hidden_size // head_num + head_size = model.cfg.get("kv_channels") or (hidden_size // head_num) # equivalent to hf's head_dim heads_per_group = head_num // num_query_groups qkv_total_dim = head_num + 2 * num_query_groups diff --git a/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py index 6080499ffdf8..164d1a28b6c0 100644 --- a/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py @@ -142,7 +142,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> ffn_hidden_size = model.cfg.ffn_hidden_size num_query_groups = model.cfg.get("num_query_groups", head_num) - head_size = hidden_size // head_num + head_size = model.cfg.get("kv_channels") or (hidden_size // head_num) # equivalent to hf's head_dim heads_per_group = head_num // num_query_groups qkv_total_dim = head_num + 2 * num_query_groups diff --git a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py index 4b65533b74ec..a271e44f0bdb 100644 --- a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py @@ -141,7 +141,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None: num_layers = model.cfg.num_layers num_query_groups = model.cfg.get("num_query_groups", head_num) # different num_query_groups for 70B - head_size = hidden_size // head_num + head_size = model.cfg.get("kv_channels") or (hidden_size // head_num) # equivalent to hf's head_dim heads_per_group = head_num // num_query_groups qkv_total_dim = head_num + 2 * num_query_groups From 9c2ed1edb8af865f9521f2743af2e24e1225ef49 Mon Sep 17 00:00:00 2001 From: meatybobby Date: Fri, 15 Nov 2024 08:59:01 -0800 Subject: [PATCH 02/48] Add openai-gelu in gated activation (#11293) --- nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py index db1aec0f5a55..b0e134ab0c35 100644 --- a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py +++ b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py @@ -161,7 +161,7 @@ def convert_model_to_trt_llm_ckpt( or nemo_model_config.get("layernorm_zero_centered_gamma", False), "tp_size": training_tp_size, "split_gated_activation": nemo_model_config.get("activation", "gelu") - in ["swiglu", "geglu", "fast-swiglu", "fast-geglu"] + in ["swiglu", "geglu", "fast-swiglu", "fast-geglu", "openai-gelu"] and (decoder_type == "gptnext" or is_mcore), "num_attention_heads": num_attention_heads, "num_kv_heads": num_kv_heads, @@ -336,7 +336,7 @@ def dist_model_to_trt_llm_ckpt( "apply_layernorm_1p": nemo_model_config.get("normalization", "") == "layernorm1p", "tp_size": tp_size, "split_gated_activation": nemo_model_config.get("activation", 
"gelu") - in ["swiglu", "geglu", "fast-swiglu", "fast-geglu"], + in ["swiglu", "geglu", "fast-swiglu", "fast-geglu", "openai-gelu"], "num_attention_heads": nemo_model_config["num_attention_heads"], "num_kv_heads": nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']), "convert_on_device": True, From 83a21665a0971bc09407bc0496bd40401853ebac Mon Sep 17 00:00:00 2001 From: gvenkatakris Date: Fri, 15 Nov 2024 09:53:49 -0800 Subject: [PATCH 03/48] Fixes per comments (#11280) * Fixes per comments Signed-off-by: Gomathy Venkata Krishnan * Update README Signed-off-by: Gomathy Venkata Krishnan --------- Signed-off-by: Gomathy Venkata Krishnan --- tutorials/llm/llama-3/README.rst | 4 +-- .../01_data_preparation.ipynb | 2 +- .../02_teacher_finetuning.ipynb | 12 +++---- .../03_a_depth_pruning.ipynb | 6 ++-- .../03_b_width_pruning.ipynb | 14 ++++---- ...04_a_distilling_depth_pruned_student.ipynb | 6 ++-- ...04_b_distilling_width_pruned_student.ipynb | 6 ++-- .../05_display_results.ipynb | 31 +++++++---------- .../llama-3/pruning-distillation/README.rst | 33 ++++++++++--------- .../pruning-distillation/introduction.ipynb | 20 +++++------ 10 files changed, 65 insertions(+), 69 deletions(-) diff --git a/tutorials/llm/llama-3/README.rst b/tutorials/llm/llama-3/README.rst index 3bb1a0896b82..1d12b8847c0d 100755 --- a/tutorials/llm/llama-3/README.rst +++ b/tutorials/llm/llama-3/README.rst @@ -2,7 +2,7 @@ Getting Started with Llama 3 and Llama 3.1 ========================================== -This repository contains jupyter notebook tutorials using NeMo Framework for Llama-3 and Llama-3.1 models by Meta. +This repository contains Jupyter Notebook tutorials using the NeMo Framework for Llama-3 and Llama-3.1 models by Meta. .. list-table:: :widths: 100 25 100 @@ -16,7 +16,7 @@ This repository contains jupyter notebook tutorials using NeMo Framework for Lla - Perform LoRA PEFT on Llama 3 8B Instruct using a dataset for bio-medical domain question answering. Deploy multiple LoRA adapters with NVIDIA NIM. * - `Llama 3.1 Law-Domain LoRA Fine-Tuning and Deployment with NeMo Framework and NVIDIA NIM <./sdg-law-title-generation>`_ - `Law StackExchange `_ - - Perform LoRA PEFT on Llama 3.1 8B Instruct using a synthetically augmented version of Law StackExchange with NeMo Framework, followed by deployment with NVIDIA NIM. As a pre-requisite, follow the tutorial for `data curation using NeMo Curator `__. + - Perform LoRA PEFT on Llama 3.1 8B Instruct using a synthetically augmented version of Law StackExchange with NeMo Framework, followed by deployment with NVIDIA NIM. As a prerequisite, follow the tutorial for `data curation using NeMo Curator `_. * - `Llama 3.1 Pruning and Distillation with NeMo Framework <./pruning-distillation>`_ - `WikiText-103-v1 `_ - Perform pruning and distillation on Llama 3.1 8B using the WikiText-103-v1 dataset with NeMo Framework. diff --git a/tutorials/llm/llama-3/pruning-distillation/01_data_preparation.ipynb b/tutorials/llm/llama-3/pruning-distillation/01_data_preparation.ipynb index 1f84dd2719e6..8548c0cfb1d0 100644 --- a/tutorials/llm/llama-3/pruning-distillation/01_data_preparation.ipynb +++ b/tutorials/llm/llama-3/pruning-distillation/01_data_preparation.ipynb @@ -9,7 +9,7 @@ "\n", "The dataset has to be preprocessed using the [preprocess_data_for_megatron.py](https://github.com/NVIDIA/NeMo/blob/main/scripts/nlp_language_modeling/preprocess_data_for_megatron.py) script included in the NeMo Framework. 
This step will also tokenize data using the `meta-llama/Meta-Llama-3.1-8B` tokenizer model to convert the data into a memory map format.\n", "\n", - "> `NOTE:` In the block of code below, pass the paths to your train, test and validation data files." + "> `NOTE:` In the block of code below, pass the paths to your train, test, and validation data files." ] }, { diff --git a/tutorials/llm/llama-3/pruning-distillation/02_teacher_finetuning.ipynb b/tutorials/llm/llama-3/pruning-distillation/02_teacher_finetuning.ipynb index 8d08793bbe9a..7d58ac4779aa 100644 --- a/tutorials/llm/llama-3/pruning-distillation/02_teacher_finetuning.ipynb +++ b/tutorials/llm/llama-3/pruning-distillation/02_teacher_finetuning.ipynb @@ -6,15 +6,15 @@ "metadata": {}, "source": [ "\n", - "### Step 2: Finetune the teacher on the dataset\n", + "### Step 2: Fine-tune the teacher on the dataset\n", "\n", - "NeMo framework includes a standard python script [megatron_gpt_pretraining.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_pretraining.py) for training a model. Once you have your model downloaded and the dataset ready, fine-tuning the teacher model with NeMo is essentially just running this script!\n", + "NeMo Framework includes a standard Python script, [megatron_gpt_pretraining.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_pretraining.py), for training a model. Once you have your model downloaded and the dataset ready, fine-tuning the teacher model with NeMo is essentially just running this script!\n", "\n", - "We finetune the unpruned model on our dataset to correct the distribution shift across the original dataset the model was trained on. Per the [blog](https://developer.nvidia.com/blog/how-to-prune-and-distill-llama-3-1-8b-to-an-nvidia-llama-3-1-minitron-4b-model/) and [tech report](https://arxiv.org/pdf/2408.11796), experiments showed that, without correcting for the distribution shift, the teacher provides suboptimal guidance on the dataset when being distilled.\n", + "We fine-tune the unpruned model on our dataset to correct the distribution shift from the original dataset the model was trained on. According to the [blog](https://developer.nvidia.com/blog/how-to-prune-and-distill-llama-3-1-8b-to-an-nvidia-llama-3-1-minitron-4b-model/) and [tech report](https://arxiv.org/pdf/2408.11796), experiments showed that without correcting for this distribution shift, the teacher provides suboptimal guidance on the dataset during distillation.\n", "\n", "For this demonstration, this training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps.\n", "\n", - "> `NOTE:` In the block of code below, pass the paths to your pre-processed train, test and validation data files as well as path to the teacher .nemo model." + "> `NOTE:` In the block of code below, pass the paths to your pre-processed train, test, and validation data files, as well as the path to the teacher .nemo model." ] }, { @@ -124,8 +124,8 @@ "id": "3040a993-8423-475f-8bc6-d1dd1ce16a83", "metadata": {}, "source": [ - "This will create a finetuned teacher model named `megatron_llama_ft.nemo` in `./distill_trainings/megatron_llama_ft/checkpoints/`. We'll use this later.\n", - "> `NOTE:`This script takes at least 20 minutes to run (depending on GPU) and will generate the finetuned teacher model." + "This will create a fine-tuned teacher model named `megatron_llama_ft.nemo` in `./distill_trainings/megatron_llama_ft/checkpoints/`. 
We'll use this later.\n", + "> `NOTE:`This script takes at least 20 minutes to run (depending on GPU) and will generate the fine-tuned teacher model." ] } ], diff --git a/tutorials/llm/llama-3/pruning-distillation/03_a_depth_pruning.ipynb b/tutorials/llm/llama-3/pruning-distillation/03_a_depth_pruning.ipynb index a195c2f3a405..d64f8c15bd00 100644 --- a/tutorials/llm/llama-3/pruning-distillation/03_a_depth_pruning.ipynb +++ b/tutorials/llm/llama-3/pruning-distillation/03_a_depth_pruning.ipynb @@ -5,8 +5,8 @@ "id": "8bc99d2f-9ac6-40c2-b072-12b6cb7b9aca", "metadata": {}, "source": [ - "### Step 3: Prune the finetuned-teacher model to create a student\n", - "In this step, we will explore two methods to prune the finetuned teacher model. Refer to the ``NOTE`` in the **_step-by-step instructions_** section of [introduction.ipynb](./introduction.ipynb) to decide which pruning techniques you would like to explore.\n", + "### Step 3: Prune the fine-tuned teacher model to create a student\n", + "In this step, we will explore two methods to prune the fine-tuned teacher model. Refer to the ``NOTE`` in the **_step-by-step instructions_** section of [introduction.ipynb](./introduction.ipynb) to decide which pruning techniques you would like to explore.\n", "\n", "In the first method, depth-pruning, we trim the layers of the model." ] @@ -21,7 +21,7 @@ "\n", "Per the [blog](https://developer.nvidia.com/blog/how-to-prune-and-distill-llama-3-1-8b-to-an-nvidia-llama-3-1-minitron-4b-model/) and [tech report](https://arxiv.org/pdf/2408.11796), removing contiguous layers from the second last block (layers 16 to 31 continuously) yields the best overall results. \n", "\n", - "> `NOTE:` In the block of code below, pass the paths to your finetuned teacher .nemo model." + "> `NOTE:` In the block of code below, pass the paths to your fine-tuned teacher .nemo model." ] }, { diff --git a/tutorials/llm/llama-3/pruning-distillation/03_b_width_pruning.ipynb b/tutorials/llm/llama-3/pruning-distillation/03_b_width_pruning.ipynb index 7d91d36cbb32..5c4a47872afb 100644 --- a/tutorials/llm/llama-3/pruning-distillation/03_b_width_pruning.ipynb +++ b/tutorials/llm/llama-3/pruning-distillation/03_b_width_pruning.ipynb @@ -5,8 +5,8 @@ "id": "8bc99d2f-9ac6-40c2-b072-12b6cb7b9aca", "metadata": {}, "source": [ - "### Step 3: Prune the finetuned-teacher model to create a student\n", - "In the second method, we will width-prune. In width-pruning, we trim the neurons, attention heads and embedding channels. \n", + "### Step 3: Step 3: Prune the fine-tuned teacher model to create a student\n", + "In the second method, we will width-prune. In width-pruning, we trim the neurons, attention heads, and embedding channels.\n", "\n", "Refer to the ``NOTE`` in the **_step-by-step instructions_** section of [introduction.ipynb](./introduction.ipynb) to decide which pruning techniques you would like to explore." 
] @@ -20,15 +20,15 @@ "source": [ "#### Step 3.b.: Using width-pruning\n", "To width-prune the model, we do the following:\n", - "- prune (trim) the MLP intermediate dimension from 14336 to 9216.\n", - "- prune the hidden size from 4096 to 3072.\n", - "- and retrain the attention headcount and number of layers\n", + "- Prune (trim) the MLP intermediate dimension from 14336 to 9216.\n", + "- Prune the hidden size from 4096 to 3072.\n", + "- Retrain the attention headcount and number of layers\n", "\n", - "For width-pruning we will use the [megatron_gpt_prune.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_prune.py) script in the NeMo Framework. To see the detailed list of parameters for width-pruning, you can view the [megatron_gpt_prune.yaml](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml) file.\n", + "For width-pruning, we will use the [megatron_gpt_prune.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_prune.py) script in the NeMo Framework. To see the detailed list of parameters for width-pruning, you can view the [megatron_gpt_prune.yaml](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml) file.\n", "\n", "We use the above parameters to get a competitive model for this demonstration. You can use other strategies or parameters from the [blog](https://developer.nvidia.com/blog/how-to-prune-and-distill-llama-3-1-8b-to-an-nvidia-llama-3-1-minitron-4b-model/) or the [tech report](https://arxiv.org/pdf/2408.11796) for your experiments. \n", "\n", - "> `NOTE:` In the block of code below, pass the paths to your finetuned teacher .nemo model.\n", + "> `NOTE:` In the block of code below, pass the paths to your fine-tuned teacher .nemo model.\n", "\n", "> `TIP:` You can increase the ``batch_size`` (upto 1024) to speed up the width-pruning script execution." ] diff --git a/tutorials/llm/llama-3/pruning-distillation/04_a_distilling_depth_pruned_student.ipynb b/tutorials/llm/llama-3/pruning-distillation/04_a_distilling_depth_pruned_student.ipynb index ccbe1cbf394b..488225837731 100644 --- a/tutorials/llm/llama-3/pruning-distillation/04_a_distilling_depth_pruned_student.ipynb +++ b/tutorials/llm/llama-3/pruning-distillation/04_a_distilling_depth_pruned_student.ipynb @@ -6,9 +6,9 @@ "metadata": {}, "source": [ "### Step 4: Distill knowledge from teacher into student\n", - "Distillation of a model with NeMo Framework is also possible using a python script: [megatron_gpt_distillation.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_distillation.py). In this notebook, we will explore distillation with the depth-pruned model as the `STUDENT` model. \n", + "Distillation of a model with NeMo Framework is also possible using a Python script: [megatron_gpt_distillation.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_distillation.py). In this notebook, we will explore distillation with the depth-pruned model as the `STUDENT` model.\n", "\n", - "For this demonstration, the `TEACHER` would be the finetuned teacher model `megatron_llama_ft.nemo` and the `STUDENT` model would be the pruned 4B model. This training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps." + "For this demonstration, the `TEACHER` would be the fine-tuned teacher model `megatron_llama_ft.nemo` and the `STUDENT` model would be the pruned 4B model. 
This training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps." ] }, { @@ -19,7 +19,7 @@ "#### Step 4.a.: Using depth-pruned student\n", "While distilling knowledge from the teacher to depth-pruned model, the `STUDENT` model would be `4b_depth_pruned_model.nemo` as produced by the [depth-pruning](./03_a_depth_pruning.ipynb) notebook. This training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps.\n", "\n", - "> `NOTE:` In the block of code below, pass the paths to your pre-processed train, test and validation data files as well as path to the teacher and student .nemo models." + "> `NOTE:` In the block of code below, pass the paths to your pre-processed train, test, and validation data files, as well as path to the teacher and student .nemo models." ] }, { diff --git a/tutorials/llm/llama-3/pruning-distillation/04_b_distilling_width_pruned_student.ipynb b/tutorials/llm/llama-3/pruning-distillation/04_b_distilling_width_pruned_student.ipynb index 48e81c96cdcf..95110dd19dd9 100644 --- a/tutorials/llm/llama-3/pruning-distillation/04_b_distilling_width_pruned_student.ipynb +++ b/tutorials/llm/llama-3/pruning-distillation/04_b_distilling_width_pruned_student.ipynb @@ -6,10 +6,10 @@ "metadata": {}, "source": [ "### Step 4: Distill knowledge from teacher into student\n", - "Distillation of a model with NeMo Framework is also possible using a python script: [megatron_gpt_distillation.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_distillation.py). \n", + "Distillation of a model with NeMo Framework is also possible using a Python script: [megatron_gpt_distillation.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_distillation.py). \n", "In this notebook, we will explore distillation with the width-pruned model as the `STUDENT` model.\n", "\n", - "For this demonstration, the `TEACHER` would be the finetuned teacher model `megatron_llama_ft.nemo` and the `STUDENT` model would be the pruned 4B model. This training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps." + "For this demonstration, the `TEACHER` would be the fine-tuned teacher model `megatron_llama_ft.nemo` and the `STUDENT` model would be the pruned 4B model. This training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps." ] }, { @@ -20,7 +20,7 @@ "#### Step 4.b.: Using width-pruned student\n", "While distilling knowledge from the teacher to width-pruned model, the `STUDENT` model would be `4b_width_pruned_model.nemo` as produced by the [width-pruning](./03_b_width_pruning.ipynb) notebook. This training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps.\n", "\n", - "> `NOTE:` In the block of code below, pass the paths to your pre-processed train, test and validation data files as well as path to the teacher and student .nemo models." + "> `NOTE:` In the block of code below, pass the paths to your pre-processed train, test, and validation data files, as well as path to the teacher and student .nemo models." 
] }, { diff --git a/tutorials/llm/llama-3/pruning-distillation/05_display_results.ipynb b/tutorials/llm/llama-3/pruning-distillation/05_display_results.ipynb index 0264cc288957..dcb483c55ab6 100644 --- a/tutorials/llm/llama-3/pruning-distillation/05_display_results.ipynb +++ b/tutorials/llm/llama-3/pruning-distillation/05_display_results.ipynb @@ -8,7 +8,8 @@ "### Step 5: Display the validation loss\n", "\n", "Now that the results are in, let's visualize the validation loss of the two distilled models using the `tensorboard` library. \n", - "> `NOTE:` This notebook demonstrates the use of the teacher finetuning, pruning and the distillation script. These scripts should ideally be run on a multi-node cluster with a larger `GLOBAL_BATCH_SIZE` and `STEPS` to see improvement in the validation loss." + "\n", + "> `NOTE:` This notebook demonstrates the use of the teacher fine-tuning, pruning, and the distillation script. These scripts should ideally be run on a multi-node cluster with a larger `GLOBAL_BATCH_SIZE` and `STEPS` to see improvement in the validation loss." ] }, { @@ -16,8 +17,8 @@ "id": "b5822d62-8131-4046-8c22-0bf0fce81df7", "metadata": {}, "source": [ - "#### Validation Loss using depth-pruned model as student in distillation script\n", - "Here is an image of the validation loss over 30 steps of running the training step in the distillation script when we distill the knowledge from the finetuned teacher model to the depth-pruned student." + "#### Validation Loss Using Depth-Pruned Model as Student in Distillation Script\n", + "Here is an image of the validation loss over 30 steps of running the training step in the distillation script, where we distill the knowledge from the fine-tuned teacher model to the depth-pruned student." ] }, { @@ -35,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "id": "db6fcf26-8ae8-40e1-875a-0a10bf85be81", "metadata": { "tags": [] @@ -44,7 +45,7 @@ { "data": { "text/html": [ - "
Validation Loss over 30 Training Steps with Depth-Pruned model as Student
" + "
Validation Loss over 30 Training Steps with Depth-Pruned Model as Student
" ], "text/plain": [ "" @@ -68,7 +69,7 @@ ], "source": [ "from IPython.display import Image, display, HTML\n", - "title = \"Validation Loss over 30 Training Steps with Depth-Pruned model as Student\"\n", + "title = \"Validation Loss over 30 Training Steps with Depth-Pruned Model as Student\"\n", "display(HTML(f\"
{title}
\"))\n", "display(Image(url=\"https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/val_loss_depth_pruned_student_distillation.png\", width=400))" ] @@ -78,8 +79,8 @@ "id": "f10041ae-6533-47de-9f76-f97d4469c27a", "metadata": {}, "source": [ - "#### Validation Loss using width-pruned model as student in distillation script\n", - "Here is an image of the validation loss over 30 steps of running the training step in the distillation script when we distill the knowledge from the finetuned teacher model to the width-pruned student." + "#### Validation Loss Using Width-Pruned Model as Student in Distillation Script\n", + "Here is an image of the validation loss over 30 steps of running the training step in the distillation script, where we distill the knowledge from the fine-tuned teacher model to the width-pruned student." ] }, { @@ -97,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "ecd79583-f662-40c6-a690-9f4bb847de4e", "metadata": { "tags": [] @@ -106,7 +107,7 @@ { "data": { "text/html": [ - "
Validation Loss over 30 Training Steps with Width-Pruned model as Student
" + "
Validation Loss over 30 Training Steps with Width-Pruned Model as Student
" ], "text/plain": [ "" @@ -130,18 +131,10 @@ ], "source": [ "from IPython.display import Image, display, HTML\n", - "title = \"Validation Loss over 30 Training Steps with Width-Pruned model as Student\"\n", + "title = \"Validation Loss over 30 Training Steps with Width-Pruned Model as Student\"\n", "display(HTML(f\"
{title}
\"))\n", "display(Image(url=\"https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/val_loss_width_pruned_student_distillation.png\", width=400))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ab6ed6f-8bc3-4188-919f-7cee842635ed", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/tutorials/llm/llama-3/pruning-distillation/README.rst b/tutorials/llm/llama-3/pruning-distillation/README.rst index 34febcffa366..45cb119ffcd8 100644 --- a/tutorials/llm/llama-3/pruning-distillation/README.rst +++ b/tutorials/llm/llama-3/pruning-distillation/README.rst @@ -1,13 +1,13 @@ Llama 3.1 Pruning and Distillation with NeMo Framework ======================================================================================= -`Llama 3.1 `_ are open-source large language models by Meta that deliver state-of-the-art performance on popular industry benchmarks. They have been pretrained on over 15 trillion tokens, and support a 128K token context length. They are available in three sizes, 8B, 70B, and 405B, and each size has two variants—base pretrained and instruction tuned. +`Llama 3.1 `_ models, developed by Meta, are open-source large language models that deliver state-of-the-art performance on popular industry benchmarks. Pretrained on over 15 trillion tokens, they support a 128K token context length. These models are available in three sizes: 8B, 70B, and 405B. Each size offers two variants: base pretrained and instruction tuned. -`NVIDIA NeMo Framework `_ provides tools to perform teacher finetuning, pruning and distillation on Llama 3.1 to fit your use case. +`NVIDIA NeMo Framework `_ provides tools to perform teacher fine-tuning, pruning, and distillation on Llama 3.1 to fit your use case. `NVIDIA TensorRT Model Optimizer `_ is a library (referred to as **Model Optimizer**, or **ModelOpt**) comprising state-of-the-art model optimization techniques including `quantization `_, `sparsity `_, `distillation `_, and `pruning `_ to compress models. -`LLM Pruning and Distillation in Practice: The Minitron Approach `_ provides tools to perform teacher finetuning, pruning and distillation on Llama 3.1 as described in the `tech report `_. +`LLM Pruning and Distillation in Practice: The Minitron Approach `_ provides tools to perform teacher fine-tuning, pruning, and distillation on Llama 3.1 as described in the `tech report `_. `How to Prune and Distill Llama-3.1 8B to an NVIDIA Llama-3.1-Minitron 4B Model `_ provides practical and effective structured compression best practices for LLMs that combine depth, width, attention, and MLP pruning with knowledge distillation-based retraining. These strategies are presented in the `Compact Language Models via Pruning and Knowledge Distillation `_ paper. @@ -16,30 +16,33 @@ Llama 3.1 Pruning and Distillation with NeMo Framework Objectives ---------- -This tutorial shows how to perform depth-pruning, teacher finetuning and distillation on **Llama 3.1 8B** using the `WikiText-103-v1 `_ dataset with NeMo Framework. The `WikiText-103-v1 `_ language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. For this demonstration, we will perform teacher correction by running a light finetuning procedure on the ``Meta Llama 3.1 8B`` teacher model to generate a finetuned teacher model ``megatron_llama_ft.nemo`` needed for optimal distillation. This finetuned teacher model is then trimmed. 
There are two methods to prune a model: depth-pruning and width-pruning. We will be exploring both pruning techniques which will yield ``4b_depth_pruned_model.nemo`` and ``4b_width_pruned_model.nemo`` respectively. These models will serve as a starting point for distillation to create the final distilled 4B models. +This tutorial demonstrates how to perform depth-pruning, width-pruning, teacher fine-tuning, and distillation on **Llama 3.1 8B** using the `WikiText-103-v1 _ dataset with the NeMo Framework. The WikiText-103-v1 `_ language modeling dataset comprises over 100 million tokens extracted from verified Good and Featured articles on Wikipedia. + +For this demonstration, we will perform teacher correction by running a light fine-tuning procedure on the ``Meta LLama 3.1 8B`` teacher model to generate a fine-tuned teacher model, ``megatron_llama_ft.nemo``, needed for optimal distillation. This fine-tuned teacher model is then trimmed. There are two methods to prune a model: depth-pruning and width-pruning. We will explore both techniques, yielding ``4b_depth_pruned_model.nemo`` and ``4b_width_pruned_model.nemo``, respectively. These models will serve as starting points for distillation to create the final distilled 4B models. + We are using models utilizing the ``meta-llama/Meta-Llama-3.1-8B`` tokenizer for this demonstration. -``NOTE:`` A subset of functions is being demonstrated in the notebooks. Some features like Neural Architecture Search (NAS) are unavailable but will be supported in future releases. +``NOTE:`` A subset of functions is being demonstrated in the notebooks. Some features like Neural Architecture Search (NAS) are unavailable, but will be supported in future releases. Requirements ------------- * System Configuration - * Access to at least 8 NVIDIA GPU with an individual memory of at least 80GB, for example: 8 x H100-80GB or 8 x A100-80GB. + * Access to at least 8 NVIDIA GPUs, each with a memory of at least 80GB (e.g., 8 x H100-80GB or 8 x A100-80GB). * A Docker-enabled environment, with `NVIDIA Container Runtime `_ installed, which will make the container GPU-aware. -* `Authenticate with NVIDIA NGC `_, and download `NGC CLI Tool `_. You will use this tool to download the model and customize it with NeMo Framework. +* `Authenticate with NVIDIA NGC `_ and download `NGC CLI Tool `_. You will use this tool to download the model and customize it with NeMo Framework. * Get your Hugging Face `access token `_, which will be used to obtain the tokenizer required during training. -``NOTE:`` The default configuration in the notebook runs on 8 x 80GB NVIDIA GPUs but you can potentially reduce Tensor Parallel size ``(TENSOR_PARALLEL_SIZE)`` along with the Micro-Batchsize ``(MICRO_BATCH_SIZE)`` in the teacher finetuning and distillation scripts to accommodate lower resource availability. +``NOTE:`` The default configuration in the notebook runs on 8 x 80GB NVIDIA GPUs. However, you can potentially reduce the Tensor Parallel size ``(TENSOR_PARALLEL_SIZE)`` along with the Micro-Batchsize ``(MICRO_BATCH_SIZE)`` in the teacher fine-tuning and distillation scripts to accommodate lower resource availability. -Create a pruned and distilled model with NeMo Framework +Create a Pruned and Distilled Model with NeMo Framework ------------------------------------------------------------------------------ -For pruning and distilling the model, you will use the NeMo Framework which is available as a `docker container `_. 
+For pruning and distilling the model, you will use the NeMo Framework, which is available as a `Docker container `_. -``NOTE:`` These notebooks use `NVIDIA TensorRT Model Optimizer `_ under the hood for pruning and distillation. +``NOTE:`` These notebooks use the `NVIDIA TensorRT Model Optimizer `_ under the hood for pruning and distillation. 1. Download the `Llama 3.1 8B .nemo `_ from NVIDIA NGC using the `NGC CLI `_. Generate the ``NGC_API_KEY`` following these `instructions `_. The following command saves the ``.nemo`` format model in a folder named ``llama-3_1-8b-nemo_v1.0`` in the current directory. You can specify another path using the ``-d`` option in the CLI tool. @@ -75,7 +78,7 @@ For pruning and distilling the model, you will use the NeMo Framework which is a 4. Then, navigate to `this notebook <./introduction.ipynb>`_ to get started. -This directory contains a list of notebooks which will go over all the steps to create a distilled 4B model. +This directory contains a list of notebooks that cover all the steps to create a distilled 4B model. :: @@ -91,7 +94,7 @@ This directory contains a list of notebooks which will go over all the steps to Results ------------------------------------------------------------------------------ -``NOTE:`` This notebook demonstrates the use of the teacher finetuning, pruning and the distillation scripts. These scripts should ideally be run on a multi-node cluster with a larger ``GLOBAL_BATCH_SIZE`` and ``STEPS`` to see improvement in the validation loss. +``NOTE:`` This notebook demonstrates the use of the teacher fine-tuning, pruning, and the distillation scripts. These scripts should ideally be run on a multi-node cluster with a larger ``GLOBAL_BATCH_SIZE`` and ``STEPS`` to see improvement in the validation loss. Here are the validation loss plots over 30 steps of running the training step in the distillation script (at the end of the `notebook <./05_display_results.ipynb>`_). @@ -100,11 +103,11 @@ Here are the validation loss plots over 30 steps of running the training step in :alt: Diagram showing the validation loss over 30 steps of running the training step in the distillation script when using the depth-pruned model as the student :align: center - Figure 1: Validation Loss Plot when using the depth-pruned model as the student + Figure 1: Validation Loss Plot When Using the Depth-Pruned Model as the Student .. 
figure:: https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/val_loss_width_pruned_student_distillation.png :width: 400px :alt: Diagram showing the validation loss over 30 steps of running the training step in the distillation script when using the width-pruned model as the student :align: center - Figure 2: Validation Loss Plot when using the width-pruned model as the student \ No newline at end of file + Figure 2: Validation Loss Plot When Using the Width-Pruned Model as the Student \ No newline at end of file diff --git a/tutorials/llm/llama-3/pruning-distillation/introduction.ipynb b/tutorials/llm/llama-3/pruning-distillation/introduction.ipynb index 1a3efc9f5f1e..71a5a6cfb03c 100644 --- a/tutorials/llm/llama-3/pruning-distillation/introduction.ipynb +++ b/tutorials/llm/llama-3/pruning-distillation/introduction.ipynb @@ -7,7 +7,7 @@ "tags": [] }, "source": [ - "# Pruning and Distillation of Llama 3.1 model with NeMo Framework" + "# Efficient Model Reduction with Pruning and Distillation of Llama 3.1 Using NeMo Framework" ] }, { @@ -15,15 +15,15 @@ "id": "03fd1cf4-c67a-4b8d-a5e5-46531be0f991", "metadata": {}, "source": [ - "This demonstration showcases performing pruning and distillation on **Llama 3.1-8B** with the [WikiText-103-v1](https://huggingface.co/datasets/Salesforce/wikitext/viewer/wikitext-103-v1) dataset using NeMo Framework. The [WikiText-103-v1](https://huggingface.co/datasets/Salesforce/wikitext/viewer/wikitext-103-v1) language modeling dataset is a collection of over 100 million tokens extracted from the set of verified 'Good' and 'Featured' articles on Wikipedia. \n", + "This tutorial demonstrates how to perform depth-pruning, teacher fine-tuning, and distillation on **Llama 3.1-8B** using the [WikiText-103-v1](https://huggingface.co/datasets/Salesforce/wikitext/viewer/wikitext-103-v1) dataset with NeMo Framework. The [WikiText-103-v1](https://huggingface.co/datasets/Salesforce/wikitext/viewer/wikitext-103-v1) language modeling dataset comprises over 100 million tokens extracted from verified Good and Featured articles on Wikipedia.\n", "\n", - "For this demonstration, we will perform a light finetuning procedure on the `Meta Llama 3.1 8B` teacher model to generate a finetuned teacher model. This finetuned teacher model will then be trimmed. There are two methods to prune a model: depth-pruning and width-pruning. This workflow will showcase both methods which will yield `4b_depth_pruned_model.nemo` and `4b_width_pruned_model.nemo` respectively, that will serve as a starting point for distillation to the final 4B models. \n", + "For this demonstration, we will perform teacher correction by running a light fine-tuning procedure on the `Meta Llama 3.1 8B` teacher model to generate a fine-tuned teacher model, `megatron_llama_ft.nemo`, needed for optimal distillation. This fine-tuned teacher model is then trimmed. There are two methods to prune a model: depth-pruning and width-pruning. We will explore both techniques, yielding `4b_depth_pruned_model.nemo` and `4b_width_pruned_model.nemo`, respectively. These models will serve as starting points for distillation to create the final distilled 4B models.\n", "\n", "> We are using models utilizing the `meta-llama/Meta-Llama-3.1-8B` tokenizer for this demonstration.\n", "\n", "> `NOTE:` Ensure that you run this notebook inside the [NeMo Framework container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) which has all the required dependencies. 
\n", "\n", - "**Instructions are available in the associated tutorial README to download the model and the container.**" + "**Instructions for downloading the model and the container are available in the [README](./README.rst).**" ] }, { @@ -49,8 +49,8 @@ "source": [ "---\n", "## Prerequisites\n", - "Ensure you have the following -\n", - "1. **Get the teacher model**: Download the `Meta Llama 3.1 8B .nemo` model. You must follow the instructions in the associated README to download and mount the folder to the NeMo FW container." + "Ensure you meet the prerequisites listed in this section.\n", + "1. **Get the teacher model**: Download the `Meta Llama 3.1 8B .nemo` model. You must follow the instructions in the associated README to download and mount the folder to the NeMo Framework container." ] }, { @@ -149,12 +149,12 @@ }, "source": [ "---\n", - "## Step-by-step instructions\n", + "## Step-by-Step Instructions\n", "\n", "This workflow is structured into seven notebooks:\n", "1. [Prepare the dataset](./01_data_preparation.ipynb)\n", - "2. [Finetune the teacher on the dataset](./02_teacher_finetuning.ipynb)\n", - "3. Prune the finetuned-teacher model to create a student \n", + "2. [Fine-tune the teacher on the dataset](./02_teacher_finetuning.ipynb)\n", + "3. Prune the fine-tuned teacher model to create a student \n", " - 3.a. [Using depth-pruning](./03_a_depth_pruning.ipynb)\n", " - 3.b. [Using width-pruning](./03_b_width_pruning.ipynb)\n", "4. Distill knowledge from teacher into student\n", @@ -162,7 +162,7 @@ " - 4.b. [Using width-pruned student](./04_b_distilling_width_pruned_student.ipynb)\n", "5. [Display the validation loss](./05_display_results.ipynb)\n", "\n", - "> `NOTE:` We are exploring two methods to prune the finetuned teacher model: [depth-pruning](./03_a_depth_pruning.ipynb) and [width-pruning](./03_b_width_pruning.ipynb). Per the [tech report](https://arxiv.org/pdf/2408.11796), we can observe that width-pruning generally outperforms depth-pruning so users can choose to perform either [depth-pruning](./03_a_depth_pruning.ipynb) or [width-pruning](./03_b_width_pruning.ipynb) or both methods." + "> `NOTE:` We are exploring two methods to prune the fine-tuned teacher model: [depth-pruning](./03_a_depth_pruning.ipynb) and [width-pruning](./03_b_width_pruning.ipynb). Per the [tech report](https://arxiv.org/pdf/2408.11796), we can observe that width-pruning generally outperforms depth-pruning so users can choose to perform either [depth-pruning](./03_a_depth_pruning.ipynb) or [width-pruning](./03_b_width_pruning.ipynb) or both methods." ] } ], From 5db91e7a640953928fe746eed4ab1314c9e43834 Mon Sep 17 00:00:00 2001 From: Jason Date: Fri, 15 Nov 2024 14:09:50 -0500 Subject: [PATCH 04/48] Add T5TTS (#11193) * added training and inference recipes for T5-TTS. * fix some attention errors * add copyright headers. * added TODO and detail error log info. * fixed missing a corner case. * added classes to __all__ * fixed to return either self-attention scores or cross-attention scores in ParallelTransformerLayer_ class. 
Signed-off-by: XuesongYang

---------

Signed-off-by: Jason
Signed-off-by: blisc
Signed-off-by: Xuesong Yang <16880-xueyang@users.noreply.gitlab-master.nvidia.com>
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Signed-off-by: XuesongYang
Co-authored-by: blisc
Co-authored-by: Xuesong Yang <16880-xueyang@users.noreply.gitlab-master.nvidia.com>
Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Co-authored-by: XuesongYang
---
 .../conf/megatron_t5_speechllm_inference.yaml | 160 +
 ...megatron_t5_speechllm_inference_model.yaml | 213 ++
 ...n_t5_speechllm_inference_multiencoder.yaml | 218 ++
 .../conf/megatron_t5_speechllm_medium.yaml | 161 +
 .../megatron_t5_speechllm_multiencoder.yaml | 223 ++
 .../conf/megatron_t5_speechlm_model.yaml | 221 ++
 .../tts/speechllm/megatron_t5_speechllm.py | 57 +
 .../megatron_t5_speechllm_inference.py | 53 +
 .../common/parts/preprocessing/collections.py | 126 +
 .../common/parts/preprocessing/manifest.py | 17 +-
 .../tokenizers/sentencepiece_tokenizer.py | 10 +-
 .../megatron/base_prompt_learning_dataset.py | 52 +-
 .../nlp/modules/common/megatron/attention.py | 162 +-
 .../common/megatron/megatron_decoders.py | 5 +-
 .../megatron/megatron_encoder_decoder.py | 56 +-
 .../common/megatron/megatron_encoders.py | 51 +-
 .../megatron/megatron_transformer_decoder.py | 49 +-
 .../megatron/megatron_transformer_encoder.py | 221 +-
 .../nlp/modules/common/megatron/module.py | 7 +-
 .../megatron/token_level_encoder_decoder.py | 458 ++-
 .../modules/common/megatron/transformer.py | 153 +-
 .../nlp/modules/common/megatron/utils.py | 23 +-
 .../tts/data/speechllm/__init__.py | 13 +
 .../data/speechllm/t5_speechllm_dataset.py | 1355 +++++++++
 .../speechllm/t5_speechllm_tarred_dataset.py | 986 ++++++
 .../tts/g2p/models/zh_cn_pinyin.py | 15 +-
 .../tts/models/speechllm/__init__.py | 13 +
 .../megatron_base_speechllm_prompt_model.py | 444 +++
 .../speechllm/megatron_t5_speechllm_model.py | 2672 +++++++++++++++++
 nemo/collections/tts/parts/utils/helpers.py | 70 +-
 .../tts/parts/utils/tts_dataset_utils.py | 14 +-
 requirements/requirements_tts.txt | 2 +
 32 files changed, 8180 insertions(+), 100 deletions(-)
 create mode 100644 examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml
 create mode 100644 examples/tts/speechllm/conf/megatron_t5_speechllm_inference_model.yaml
 create mode 100644 examples/tts/speechllm/conf/megatron_t5_speechllm_inference_multiencoder.yaml
 create mode 100644 examples/tts/speechllm/conf/megatron_t5_speechllm_medium.yaml
 create mode 100644 examples/tts/speechllm/conf/megatron_t5_speechllm_multiencoder.yaml
 create mode 100644 examples/tts/speechllm/conf/megatron_t5_speechlm_model.yaml
 create mode 100644 examples/tts/speechllm/megatron_t5_speechllm.py
 create mode 100644 examples/tts/speechllm/megatron_t5_speechllm_inference.py
 create mode 100644 nemo/collections/tts/data/speechllm/__init__.py
 create mode 100644 nemo/collections/tts/data/speechllm/t5_speechllm_dataset.py
 create mode 100644 nemo/collections/tts/data/speechllm/t5_speechllm_tarred_dataset.py
 create mode 100644 nemo/collections/tts/models/speechllm/__init__.py
 create mode 100644 nemo/collections/tts/models/speechllm/megatron_base_speechllm_prompt_model.py
 create mode 100644 nemo/collections/tts/models/speechllm/megatron_t5_speechllm_model.py

diff --git a/examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml b/examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml
new file mode 100644
index 000000000000..8b37077bfdd5
--- /dev/null
+++
b/examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml @@ -0,0 +1,160 @@ +name: megatron_t5_speechllm_tts_inference +checkpoint_path: ??? + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 32 + logger: False + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 10000 + max_steps: -1 + log_every_n_steps: 10 + val_check_interval: null + check_val_every_n_epoch: 3 + gradient_clip_val: 1.0 + +exp_manager: + exp_dir: null + name: ${name} + create_wandb_logger: False + resume_if_exists: False + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 2 + mode: min + save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below + filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}" + model_parallel_size: ${model.tensor_model_parallel_size} + save_best_model: True + create_early_stopping_callback: False + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + +model: + seed: 1234 + nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved + virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference' + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + global_batch_size: 16 + micro_batch_size: 16 # micro batch size should equal global batch size when pipeline parallel = 1 + validation_global_batch_size: ${model.global_batch_size} + validation_micro_batch_size: ${model.micro_batch_size} + validation_drop_last: False + report_validation_metric: False + validation_metric: accuracy + num_speech_tokens: 10112 # Vocabulary size pertaining to speech + seq_pattern: "parallel" # parallel, delay_parallel, flatten + temperature: 0.85 # Temperature to be used for inference + top_k: 80 # Top k to be used for inference + max_inference_timesteps: 1000 # Maximum number of timesteps to run inference for + + restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required + save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. + existing_tasks: [] + new_tasks: ["squad"] + codecmodel_type: nemo_codec + codecmodel_path: ??? + english_only_model: true + context_conditioning: decoder + use_flash_attention: false + lm_vocab_size: 30000 + task_templates: + - taskname: "squad" + prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}" + total_virtual_tokens: 3 + virtual_token_splits: [3] + truncate_field: context + answer_field: answer + + p_tuning: # P-tuning specific params + encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default + num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp + dropout: 0.0 + + prompt_tuning: # Prompt tunin specific params + new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new tasks + new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random + + data: + grapheme_prefix: null + train_ds: null + validation_ds: null + test_ds: ??? 
+ max_seq_length: 1536 + sample_rate: 24000 + add_eos: true + add_bos: false + decoder_starts_with_pad: False + add_eos_to_decoder_output: True + add_sentinel_to_input: True + ul2_prompt_token: null # , , + shuffle: true + num_workers: 4 + pin_memory: true + speech_offset: 30000 + train_task: all + sup_data_path: None + num_speech_codebooks: 8 + codebook_fps: 86 + context_duration_min: 2.9 + context_duration_max: 2.9 + context_slice_method: "fixed" + phoneme_probability: 1.0 + g2p: + english: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 0.8 + ignore_ambiguous_words: False + use_chars: True + use_stresses: True + grapheme_prefix: ${model.data.grapheme_prefix} + spanish: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict" + phoneme_probability: 0.8 + use_chars: True + use_stresses: True + ignore_ambiguous_words: False + grapheme_prefix: ${model.data.grapheme_prefix} + locale: "es-ES" + mandarin: + _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p + phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" + word_segmenter: "jieba" + phoneme_prefix: "" + phoneme_case: "lower" + tone_prefix: "#" + ascii_letter_prefix: ${model.data.grapheme_prefix} + ascii_letter_case: "upper" + german: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict" + heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym" + phoneme_probability: 0.8 + ignore_ambiguous_words: False + use_chars: True + use_stresses: True + grapheme_case: mixed + grapheme_prefix: ${model.data.grapheme_prefix} + locale: "de-DE" + + optim: + name: fused_adam + lr: 5e-5 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 \ No newline at end of file diff --git a/examples/tts/speechllm/conf/megatron_t5_speechllm_inference_model.yaml b/examples/tts/speechllm/conf/megatron_t5_speechllm_inference_model.yaml new file mode 100644 index 000000000000..1858edf9e667 --- /dev/null +++ b/examples/tts/speechllm/conf/megatron_t5_speechllm_inference_model.yaml @@ -0,0 +1,213 @@ +name: megatron_t5_speechllm_tts_inference +checkpoint_path: ??? 
+ +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 32 + logger: False + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 10000 + max_steps: -1 + log_every_n_steps: 10 + val_check_interval: null + check_val_every_n_epoch: 3 + gradient_clip_val: 1.0 + +exp_manager: + exp_dir: null + name: ${name} + create_wandb_logger: False + resume_if_exists: False + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 2 + mode: min + save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below + filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}" + model_parallel_size: ${model.tensor_model_parallel_size} + save_best_model: True + create_early_stopping_callback: False + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + +model: + seed: 1234 + nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved + virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference' + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + global_batch_size: 16 + micro_batch_size: 16 # micro batch size should equal global batch size when pipeline parallel = 1 + validation_global_batch_size: ${model.global_batch_size} + validation_micro_batch_size: ${model.micro_batch_size} + validation_drop_last: False + report_validation_metric: False + validation_metric: accuracy + num_speech_tokens: 10112 # Vocabulary size pertaining to speech + seq_pattern: "parallel" # parallel, delay_parallel, flatten + temperature: 0.85 # Temperature to be used for inference + top_k: 80 # Top k to be used for inference + max_inference_timesteps: 1000 # Maximum number of timesteps to run inference for + + restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. + existing_tasks: [] + new_tasks: ["squad"] + codecmodel_type: nemo_codec + codecmodel_path: ??? + english_only_model: true + context_conditioning: decoder + train_from_scratch: true + override_tokenizer_vocab_file: ??? 
+ use_flash_attention: false + lm_vocab_size: 30000 + + frozen_model: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + pipeline_model_parallel_split_rank: 0 + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + gradient_as_bucket_view: true + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + fp16_lm_cross_entropy: false + seed: 1234 + use_cpu_initialization: false + apex_transformer_log_level: 30 + tokenizer: + library: megatron + type: BertWordPieceCase + model: null + vocab_file: null + merge_file: null + optim: + name: null + data: + dataset_type: t5 + encoder: + arch: transformer + bias_activation_fusion: false + use_flash_attention: ${model.use_flash_attention} + num_layers: 12 + hidden_size: 768 + ffn_hidden_size: 2048 + num_attention_heads: 12 + init_method_std: 0.015 + hidden_dropout: 0.1 + attention_dropout: 0.1 + kv_channels: 64 + activation: geglu + decoder: + arch: transformer + bias_activation_fusion: false + use_flash_attention: ${model.use_flash_attention} + num_layers: 12 + hidden_size: 768 + ffn_hidden_size: 2048 + num_attention_heads: 12 + init_method_std: 0.015 + hidden_dropout: 0.1 + attention_dropout: 0.1 + kv_channels: 64 + activation: geglu + + task_templates: + - taskname: "squad" + prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}" + total_virtual_tokens: 3 + virtual_token_splits: [3] + truncate_field: context + answer_field: answer + + p_tuning: # P-tuning specific params + encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default + num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp + dropout: 0.0 + + prompt_tuning: # Prompt tunin specific params + new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new tasks + new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random + + data: + grapheme_prefix: null + train_ds: null + validation_ds: null + test_ds: ??? 
+ max_seq_length: 1536 + sample_rate: 24000 + add_eos: true + add_bos: false + decoder_starts_with_pad: False + add_eos_to_decoder_output: True + add_sentinel_to_input: True + ul2_prompt_token: null # , , + shuffle: true + num_workers: 4 + pin_memory: true + speech_offset: 30000 + train_task: all + sup_data_path: None + num_speech_codebooks: 8 + codebook_fps: 86 + context_duration_min: 2.9 + context_duration_max: 2.9 + context_slice_method: "fixed" + phoneme_probability: 1.0 + g2p: + english: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 0.8 + ignore_ambiguous_words: False + use_chars: True + use_stresses: True + grapheme_prefix: ${model.data.grapheme_prefix} + spanish: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict" + phoneme_probability: 0.8 + use_chars: True + use_stresses: True + ignore_ambiguous_words: False + grapheme_prefix: ${model.data.grapheme_prefix} + locale: "es-ES" + mandarin: + _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p + phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" + word_segmenter: "jieba" + phoneme_prefix: "" + phoneme_case: "lower" + tone_prefix: "#" + ascii_letter_prefix: ${model.data.grapheme_prefix} + ascii_letter_case: "upper" + german: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict" + heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym" + phoneme_probability: 0.8 + ignore_ambiguous_words: False + use_chars: True + use_stresses: True + grapheme_case: mixed + grapheme_prefix: ${model.data.grapheme_prefix} + locale: "de-DE" + + optim: + name: fused_adam + lr: 5e-5 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 diff --git a/examples/tts/speechllm/conf/megatron_t5_speechllm_inference_multiencoder.yaml b/examples/tts/speechllm/conf/megatron_t5_speechllm_inference_multiencoder.yaml new file mode 100644 index 000000000000..8ad967d20538 --- /dev/null +++ b/examples/tts/speechllm/conf/megatron_t5_speechllm_inference_multiencoder.yaml @@ -0,0 +1,218 @@ +name: megatron_t5_speechllm +checkpoint_path: ??? 
+ +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 32 + logger: False + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 + max_steps: 250000 + log_every_n_steps: 10 + val_check_interval: null + check_val_every_n_epoch: 1 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + resume_if_exists: False + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 3 + mode: min + save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below + filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}" + model_parallel_size: ${model.tensor_model_parallel_size} + save_best_model: True + create_early_stopping_callback: False + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + +model: + seed: 1234 + nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved + virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference' + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + global_batch_size: 2 + micro_batch_size: 2 # micro batch size should equal global batch size when pipeline parallel = 1 + validation_global_batch_size: ${model.global_batch_size} + validation_micro_batch_size: ${model.micro_batch_size} + validation_drop_last: False + report_validation_metric: False + validation_metric: accuracy + num_speech_tokens: 10112 # Vocabulary size pertaining to speech + seq_pattern: "parallel" # parallel, delay_parallel, flatten + temperature: 0.85 # Temperature to be used for inference + top_k: 80 # Top k to be used for inference + max_inference_timesteps: 2000 # Maximum number of timesteps to run inference for + restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. + existing_tasks: [] + new_tasks: ["squad"] + codecmodel_type: nemo_codec + codecmodel_path: ??? + english_only_model: true + context_conditioning: encoder + train_from_scratch: true + override_tokenizer_vocab_file: ??? 
+ use_flash_attention: false + lm_vocab_size: 30000 + enc_output_to_layers: [[0,1,2],[3,4,5,6,7,8]] + + frozen_model: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + pipeline_model_parallel_split_rank: 0 + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + gradient_as_bucket_view: true + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + fp16_lm_cross_entropy: false + seed: 1234 + use_cpu_initialization: false + apex_transformer_log_level: 30 + tokenizer: + library: megatron + type: BertWordPieceCase + model: null + vocab_file: null + merge_file: null + optim: + name: null + data: + dataset_type: t5 + encoder: + arch: multi_transformer + n_transformers: 2 + bias_activation_fusion: false + use_flash_attention: ${model.use_flash_attention} + num_layers: 6 + hidden_size: 768 + ffn_hidden_size: 2048 + num_attention_heads: 12 + init_method_std: 0.015 + hidden_dropout: 0.1 + attention_dropout: 0.1 + kv_channels: 64 + activation: geglu + decoder: + arch: transformer + bias_activation_fusion: false + use_flash_attention: ${model.use_flash_attention} + num_layers: 12 + hidden_size: 768 + ffn_hidden_size: 2048 + num_attention_heads: 12 + init_method_std: 0.015 + hidden_dropout: 0.1 + attention_dropout: 0.1 + kv_channels: 64 + activation: geglu + + task_templates: + - taskname: "squad" + prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}" + total_virtual_tokens: 3 + virtual_token_splits: [3] + truncate_field: context + answer_field: answer + + p_tuning: # P-tuning specific params + encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default + num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp + dropout: 0.0 + + prompt_tuning: # Prompt tunin specific params + new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new tasks + new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random + + data: + grapheme_prefix: null + train_ds: null + validation_ds: null + test_ds: ??? 
+ max_seq_length: 2048 + sample_rate: 24000 + add_eos: true + add_bos: false + use_attention_prior: false + attention_prior_scaling_factor: 0.05 + cross_attention_epsilon: 0.0 + decoder_starts_with_pad: False + add_eos_to_decoder_output: True + add_sentinel_to_input: True + ul2_prompt_token: null # , , + shuffle: true + num_workers: 4 + pin_memory: true + speech_offset: 30128 + train_task: all + sup_data_path: None + num_speech_codebooks: 8 + codebook_fps: 86 + context_duration_min: 2.9 + context_duration_max: 2.9 + context_slice_method: "fixed" + phoneme_probability: 1.0 + encoder_type: ${model.frozen_model.encoder.arch} + g2p: + english: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 0.8 + ignore_ambiguous_words: False + use_chars: True + use_stresses: True + grapheme_prefix: ${model.data.grapheme_prefix} + spanish: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict" + phoneme_probability: 0.8 + use_chars: True + use_stresses: True + ignore_ambiguous_words: False + grapheme_prefix: ${model.data.grapheme_prefix} + locale: "es-ES" + mandarin: + _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p + phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" + word_segmenter: "jieba" + phoneme_prefix: "" + phoneme_case: "lower" + tone_prefix: "#" + ascii_letter_prefix: ${model.data.grapheme_prefix} + ascii_letter_case: "upper" + german: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict" + heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym" + phoneme_probability: 0.8 + ignore_ambiguous_words: False + use_chars: True + use_stresses: True + grapheme_case: mixed + grapheme_prefix: ${model.data.grapheme_prefix} + locale: "de-DE" + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 diff --git a/examples/tts/speechllm/conf/megatron_t5_speechllm_medium.yaml b/examples/tts/speechllm/conf/megatron_t5_speechllm_medium.yaml new file mode 100644 index 000000000000..bd31f0712fdf --- /dev/null +++ b/examples/tts/speechllm/conf/megatron_t5_speechllm_medium.yaml @@ -0,0 +1,161 @@ +name: megatron_t5_speechllm_medium + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 32 + logger: False + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 + max_steps: 1000000 + log_every_n_steps: 10 + val_check_interval: null + check_val_every_n_epoch: 1 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 3 + mode: min + save_nemo_on_train_end: False + filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}" + model_parallel_size: ${model.tensor_model_parallel_size} + save_best_model: True + create_early_stopping_callback: False + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + +model: + seed: 1234 + nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved + virtual_prompt_style: "p-tuning" # one of 
'prompt-tuning', 'p-tuning', or 'inference' + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + global_batch_size: 2 + micro_batch_size: 2 # micro batch size should equal global batch size when pipeline parallel = 1 + validation_global_batch_size: ${model.global_batch_size} + validation_micro_batch_size: ${model.micro_batch_size} + validation_drop_last: False + report_validation_metric: False + validation_metric: accuracy + num_speech_tokens: 10112 # Vocabulary size pertaining to speech + seq_pattern: "parallel" # parallel, delay_parallel, flatten + attn_prior_scaledown_start_step: 10000 + attn_prior_end_step: 11000 + return_all_crossattention_probs: True + num_cross_attention_heads: 12 # 12 for 220m, 16 for 3b. + restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required + save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. + existing_tasks: [] + new_tasks: ["squad"] + freeze_model: false + use_alignment_loss: true + codecmodel_type: nemo_codec + codecmodel_path: ??? + english_only_model: true + context_conditioning: decoder + use_flash_attention: false + lm_vocab_size: 30000 + + task_templates: + - taskname: "squad" + prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}" + total_virtual_tokens: 3 + virtual_token_splits: [3] + truncate_field: context + answer_field: answer + + p_tuning: # P-tuning specific params + encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default + num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp + dropout: 0.0 + + prompt_tuning: # Prompt tunin specific params + new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new tasks + new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random + + data: + grapheme_prefix: null + train_ds: ??? + validation_ds: ??? 
+ max_seq_length: 2048 + sample_rate: 24000 + add_eos: true + add_bos: false + use_attention_prior: true + attention_prior_scaling_factor: 0.05 + cross_attention_epsilon: 0.0 + decoder_starts_with_pad: False + add_eos_to_decoder_output: True + add_sentinel_to_input: True + ul2_prompt_token: null # , , + shuffle: true + num_workers: 4 + pin_memory: true + speech_offset: 30128 + train_task: all + num_speech_codebooks: 8 + codebook_fps: 86 + context_duration_min: 2.9 + context_duration_max: 2.9 + g2p: + english: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 0.8 + ignore_ambiguous_words: False + use_chars: True + use_stresses: True + grapheme_prefix: ${model.data.grapheme_prefix} + spanish: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict" + phoneme_probability: 0.8 + use_chars: True + use_stresses: True + ignore_ambiguous_words: False + grapheme_prefix: ${model.data.grapheme_prefix} + locale: "es-ES" + mandarin: + _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p + phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" + word_segmenter: "jieba" + phoneme_prefix: "" + phoneme_case: "lower" + tone_prefix: "#" + ascii_letter_prefix: ${model.data.grapheme_prefix} + ascii_letter_case: "upper" + german: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict" + heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym" + phoneme_probability: 0.8 + ignore_ambiguous_words: False + use_chars: True + use_stresses: True + grapheme_case: mixed + grapheme_prefix: ${model.data.grapheme_prefix} + locale: "de-DE" + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 \ No newline at end of file diff --git a/examples/tts/speechllm/conf/megatron_t5_speechllm_multiencoder.yaml b/examples/tts/speechllm/conf/megatron_t5_speechllm_multiencoder.yaml new file mode 100644 index 000000000000..bf3f65ff9e00 --- /dev/null +++ b/examples/tts/speechllm/conf/megatron_t5_speechllm_multiencoder.yaml @@ -0,0 +1,223 @@ +name: megatron_t5_speechllm + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 32 + logger: False + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 + max_steps: 250000 + log_every_n_steps: 10 + val_check_interval: null + check_val_every_n_epoch: 1 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 3 + mode: min + save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below + filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}" + model_parallel_size: ${model.tensor_model_parallel_size} + save_best_model: True + create_early_stopping_callback: False + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + +model: + seed: 1234 + nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved + virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 
'p-tuning', or 'inference' + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + global_batch_size: 2 + micro_batch_size: 2 # micro batch size should equal global batch size when pipeline parallel = 1 + validation_global_batch_size: ${model.global_batch_size} + validation_micro_batch_size: ${model.micro_batch_size} + validation_drop_last: False + report_validation_metric: False + validation_metric: accuracy + num_speech_tokens: 10112 # Vocabulary size pertaining to speech + seq_pattern: "parallel" # parallel, delay_parallel, flatten + attn_prior_scaledown_start_step: 10000 + attn_prior_end_step: 11000 + return_all_crossattention_probs: True + num_cross_attention_heads: 12 # 12 for 220m, 16 for 3b. + restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. + existing_tasks: [] + new_tasks: ["squad"] + freeze_model: false + use_alignment_loss: true + codecmodel_type: nemo_codec + codecmodel_path: ??? + english_only_model: true + context_conditioning: encoder + train_from_scratch: true + override_tokenizer_vocab_file: ??? + use_flash_attention: false + lm_vocab_size: 30000 + enc_output_to_layers: [[0,1,2],[3,4,5,6,7,8]] + + frozen_model: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + pipeline_model_parallel_split_rank: 0 + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + gradient_as_bucket_view: true + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + fp16_lm_cross_entropy: false + seed: 1234 + use_cpu_initialization: false + apex_transformer_log_level: 30 + tokenizer: + library: megatron + type: BertWordPieceCase + model: null + vocab_file: null + merge_file: null + optim: + name: null + data: + dataset_type: t5 + encoder: + arch: multi_transformer + n_transformers: 2 + bias_activation_fusion: false + use_flash_attention: ${model.use_flash_attention} + num_layers: 6 + hidden_size: 768 + ffn_hidden_size: 2048 + num_attention_heads: 12 + init_method_std: 0.015 + hidden_dropout: 0.1 + attention_dropout: 0.1 + kv_channels: 64 + activation: geglu + decoder: + arch: transformer + bias_activation_fusion: false + use_flash_attention: ${model.use_flash_attention} + num_layers: 12 + hidden_size: 768 + ffn_hidden_size: 2048 + num_attention_heads: 12 + init_method_std: 0.015 + hidden_dropout: 0.1 + attention_dropout: 0.1 + kv_channels: 64 + activation: geglu + + task_templates: + - taskname: "squad" + prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}" + total_virtual_tokens: 3 + virtual_token_splits: [3] + truncate_field: context + answer_field: answer + + p_tuning: # P-tuning specific params + encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default + num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp + dropout: 0.0 + + prompt_tuning: # Prompt tunin specific params + new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new tasks + new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random + + data: + grapheme_prefix: null + train_ds: ??? + validation_ds: ??? 
+ max_seq_length: 2048 + sample_rate: 24000 + add_eos: true + add_bos: false + use_attention_prior: true + attention_prior_scaling_factor: 0.05 + cross_attention_epsilon: 0.0 + decoder_starts_with_pad: False + add_eos_to_decoder_output: True + add_sentinel_to_input: True + ul2_prompt_token: null # , , + shuffle: true + num_workers: 4 + pin_memory: true + speech_offset: 30128 + train_task: all + num_speech_codebooks: 8 + codebook_fps: 86 + context_duration_min: 2.9 + context_duration_max: 2.9 + encoder_type: ${model.frozen_model.encoder.arch} + g2p: + english: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 0.8 + ignore_ambiguous_words: False + use_chars: True + use_stresses: True + grapheme_prefix: ${model.data.grapheme_prefix} + spanish: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict" + phoneme_probability: 0.8 + use_chars: True + use_stresses: True + ignore_ambiguous_words: False + grapheme_prefix: ${model.data.grapheme_prefix} + locale: "es-ES" + mandarin: + _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p + phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" + word_segmenter: "jieba" + phoneme_prefix: "" + phoneme_case: "lower" + tone_prefix: "#" + ascii_letter_prefix: ${model.data.grapheme_prefix} + ascii_letter_case: "upper" + german: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict" + heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym" + phoneme_probability: 0.8 + ignore_ambiguous_words: False + use_chars: True + use_stresses: True + grapheme_case: mixed + grapheme_prefix: ${model.data.grapheme_prefix} + locale: "de-DE" + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 1000 + constant_steps: 0 + min_lr: 1e-5 + monitor: val_loss + reduce_on_plateau: false diff --git a/examples/tts/speechllm/conf/megatron_t5_speechlm_model.yaml b/examples/tts/speechllm/conf/megatron_t5_speechlm_model.yaml new file mode 100644 index 000000000000..d69bfb979182 --- /dev/null +++ b/examples/tts/speechllm/conf/megatron_t5_speechlm_model.yaml @@ -0,0 +1,221 @@ +name: megatron_t5_speechllm + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 32 + logger: False + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 + max_steps: 250000 + log_every_n_steps: 10 + val_check_interval: null + check_val_every_n_epoch: 1 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 3 + mode: min + save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below + filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}" + model_parallel_size: ${model.tensor_model_parallel_size} + save_best_model: True + create_early_stopping_callback: False + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + +model: + seed: 1234 + nemo_path: ${name}.nemo # .nemo 
filename/absolute path to where the virtual prompt model parameters will be saved + virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference' + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + global_batch_size: 2 + micro_batch_size: 2 # micro batch size should equal global batch size when pipeline parallel = 1 + validation_global_batch_size: ${model.global_batch_size} + validation_micro_batch_size: ${model.micro_batch_size} + validation_drop_last: False + report_validation_metric: False + validation_metric: accuracy + num_speech_tokens: 10112 # Vocabulary size pertaining to speech + seq_pattern: "parallel" # parallel, delay_parallel, flatten + attn_prior_scaledown_start_step: 10000 + attn_prior_end_step: 11000 + return_all_crossattention_probs: True + num_cross_attention_heads: 12 # 12 for 220m, 16 for 3b. + restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. + existing_tasks: [] + new_tasks: ["squad"] + freeze_model: false + use_alignment_loss: true + codecmodel_type: nemo_codec + codecmodel_path: ??? + english_only_model: true + context_conditioning: decoder + train_from_scratch: true + override_tokenizer_vocab_file: ??? + use_flash_attention: false + lm_vocab_size: 30000 + + frozen_model: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + pipeline_model_parallel_split_rank: 0 + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + gradient_as_bucket_view: true + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + fp16_lm_cross_entropy: false + seed: 1234 + use_cpu_initialization: false + apex_transformer_log_level: 30 + tokenizer: + library: megatron + type: BertWordPieceCase + model: null + vocab_file: null + merge_file: null + optim: + name: null + data: + dataset_type: t5 + encoder: + arch: transformer + bias_activation_fusion: false + use_flash_attention: ${model.use_flash_attention} + num_layers: 12 + hidden_size: 768 + ffn_hidden_size: 2048 + num_attention_heads: 12 + init_method_std: 0.015 + hidden_dropout: 0.1 + attention_dropout: 0.1 + kv_channels: 64 + activation: geglu + decoder: + arch: transformer + bias_activation_fusion: false + use_flash_attention: ${model.use_flash_attention} + num_layers: 12 + hidden_size: 768 + ffn_hidden_size: 2048 + num_attention_heads: 12 + init_method_std: 0.015 + hidden_dropout: 0.1 + attention_dropout: 0.1 + kv_channels: 64 + activation: geglu + + task_templates: + - taskname: "squad" + prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}" + total_virtual_tokens: 3 + virtual_token_splits: [3] + truncate_field: context + answer_field: answer + + p_tuning: # P-tuning specific params + encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default + num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp + dropout: 0.0 + + prompt_tuning: # Prompt tunin specific params + new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new tasks + new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random + + data: + use_ipa: false + grapheme_prefix: null + train_ds: ??? + validation_ds: ??? 
+ max_seq_length: 2048 + sample_rate: 24000 + add_eos: true + add_bos: false + use_attention_prior: true + attention_prior_scaling_factor: 0.05 + cross_attention_epsilon: 0.0 + decoder_starts_with_pad: False + add_eos_to_decoder_output: True + add_sentinel_to_input: True + ul2_prompt_token: null # , , + shuffle: true + num_workers: 4 + pin_memory: true + speech_offset: 30128 + train_task: all + num_speech_codebooks: 8 + codebook_fps: 86 + context_duration_min: 2.9 + context_duration_max: 2.9 + g2p: + english: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 0.8 + ignore_ambiguous_words: False + use_chars: True + use_stresses: True + grapheme_prefix: ${model.data.grapheme_prefix} + spanish: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict" + phoneme_probability: 0.8 + use_chars: True + use_stresses: True + ignore_ambiguous_words: False + grapheme_prefix: ${model.data.grapheme_prefix} + locale: "es-ES" + mandarin: + _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p + phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" + word_segmenter: "jieba" + phoneme_prefix: "" + phoneme_case: "lower" + tone_prefix: "#" + ascii_letter_prefix: ${model.data.grapheme_prefix} + ascii_letter_case: "upper" + german: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict" + heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym" + phoneme_probability: 0.8 + ignore_ambiguous_words: False + use_chars: True + use_stresses: True + grapheme_case: mixed + grapheme_prefix: ${model.data.grapheme_prefix} + locale: "de-DE" + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 1000 + constant_steps: 0 + min_lr: 1e-5 + monitor: val_loss + reduce_on_plateau: false diff --git a/examples/tts/speechllm/megatron_t5_speechllm.py b/examples/tts/speechllm/megatron_t5_speechllm.py new file mode 100644 index 000000000000..c4ec1a77f944 --- /dev/null +++ b/examples/tts/speechllm/megatron_t5_speechllm.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
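Editorial note on the configuration files above: they rely on OmegaConf features throughout. The `${...}` references (for example `${model.data.grapheme_prefix}` and `${model.frozen_model.encoder.arch}`) are interpolations that resolve when the config is accessed, and `???` is OmegaConf's mandatory-value marker, so those entries must be supplied (typically as Hydra command-line overrides) before the model is built. The toy snippet below is an illustration only, not part of the patch; it uses a reduced subset of the keys and only the OmegaConf API that the driver scripts in this patch already import.

from omegaconf import OmegaConf

# Toy subset of the config above; only meant to show how ${...} interpolation resolves.
cfg = OmegaConf.create(
    {
        "model": {
            "frozen_model": {"encoder": {"arch": "multi_transformer"}},
            "data": {
                "grapheme_prefix": None,
                "encoder_type": "${model.frozen_model.encoder.arch}",
            },
        }
    }
)

# Interpolations resolve lazily on access.
assert cfg.model.data.encoder_type == "multi_transformer"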
+ +from omegaconf.omegaconf import OmegaConf, open_dict + +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector +from nemo.collections.tts.models.speechllm.megatron_t5_speechllm_model import MegatronT5SpeechLMModel +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + + +@hydra_runner(config_path="conf", config_name="megatron_t5_speechllm_medium.yaml") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + # MegatronTrainerBuilder compat checks + if "gradient_as_bucket_view" not in cfg.model: + with open_dict(cfg): + cfg.model.gradient_as_bucket_view = False + + trainer = MegatronTrainerBuilder(cfg).create_trainer() + exp_manager(trainer, cfg.exp_manager) + + # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams + with open_dict(cfg): + cfg.model.precision = cfg.trainer.precision + + # load existing or init new soft prompt T5 model + if cfg.model.get("restore_path", None) is not None: + logging.info(f"cfg.model.restore_path {cfg.model.restore_path}") + model = MegatronT5SpeechLMModel.restore_from( + cfg.model.restore_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector() + ) + else: + logging.info(f"cfg.model.restore_path is None") + model = MegatronT5SpeechLMModel(cfg.model, trainer=trainer) + model.maybe_init_from_pretrained_checkpoint(cfg=cfg) + + trainer.fit(model) + + +if __name__ == '__main__': + main() diff --git a/examples/tts/speechllm/megatron_t5_speechllm_inference.py b/examples/tts/speechllm/megatron_t5_speechllm_inference.py new file mode 100644 index 000000000000..48d46952a993 --- /dev/null +++ b/examples/tts/speechllm/megatron_t5_speechllm_inference.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
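A further note on the training-side configs, where train_ds and validation_ds are left as ???: the prompt-learning dataset changes later in this patch (base_prompt_learning_dataset.py) accept either plain manifest paths or a list of [sampling_probability, path] pairs, and they repeat the smaller datasets so the final mix approximates the requested probabilities without subsampling the larger ones. The self-contained sketch below mirrors that repeat-factor computation with made-up lengths and probabilities; it is an illustration, not the NeMo code itself.

def blending_repeats(dataset_lengths, target_probs):
    # Normalize the requested probabilities and compute the current mix.
    target_probs = [p / sum(target_probs) for p in target_probs]
    total = sum(dataset_lengths)
    current_probs = [n / total for n in dataset_lengths]

    # Grow the target pool so over-represented datasets never get subsampled.
    new_total = total
    for n, tgt, cur in zip(dataset_lengths, target_probs, current_probs):
        if tgt < cur:
            new_total = max(new_total, int(n / tgt))

    # Repeat each dataset enough times to approximate its target probability.
    return [
        max(int(new_total * tgt) // n, 1)
        for n, tgt in zip(dataset_lengths, target_probs)
    ]

# Example: a 100-item set mixed 50/50 with a 10,000-item set is repeated 100x.
print(blending_repeats([100, 10_000], [0.5, 0.5]))  # -> [100, 1]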
+ +from omegaconf.omegaconf import OmegaConf, open_dict + +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.tts.models.speechllm.megatron_t5_speechllm_model import MegatronT5SpeechLMModel +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + + +@hydra_runner(config_path="conf", config_name="megatron_t5_speechllm_inference.yaml") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + # MegatronTrainerBuilder compat checks + if "gradient_as_bucket_view" not in cfg.model: + with open_dict(cfg): + cfg.model.gradient_as_bucket_view = False + + trainer = MegatronTrainerBuilder(cfg).create_trainer() + exp_manager(trainer, cfg.exp_manager) + + # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams + with open_dict(cfg): + cfg.model.precision = cfg.trainer.precision + + # load existing or init new soft prompt T5 model + checkpoint_path = cfg.get('checkpoint_path', None) + assert checkpoint_path is not None, "Please specify checkpoint_path in the config file" + model = MegatronT5SpeechLMModel.load_from_checkpoint( + checkpoint_path=checkpoint_path, trainer=trainer, cfg=cfg.model + ) + model.eval() + model = model.cuda() + trainer.test(model) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/common/parts/preprocessing/collections.py b/nemo/collections/common/parts/preprocessing/collections.py index b16ac50e4d56..915f406a3e88 100644 --- a/nemo/collections/common/parts/preprocessing/collections.py +++ b/nemo/collections/common/parts/preprocessing/collections.py @@ -308,6 +308,132 @@ def __init__( super().__init__(data) +class InstructionTuningAudioText(_Collection): + """`AudioText` collector from asr structured json files.""" + + OUTPUT_TYPE = collections.namedtuple( + typename='InstructionTuningText', + field_names='id context context_type context_duration question question_type answer answer_type answer_duration speaker', + ) + + def __init__( + self, + manifests_files: Union[str, List[str]], + min_duration: Optional[float] = None, + max_duration: Optional[float] = None, + max_seq_length: Optional[float] = None, + max_number: Optional[int] = None, + do_sort_by_duration: bool = False, + index_by_file_id: bool = False, + decoder_only_model: bool = False, + use_phoneme_tokenizer: bool = False, + ): + """Parse lists of audio files, durations and transcripts texts. + Args: + manifests_files: Either single string file or list of such - + manifests to yield items from. + *args: Args to pass to `AudioText` constructor. + **kwargs: Kwargs to pass to `AudioText` constructor. 
+ """ + + output_type = self.OUTPUT_TYPE + self.use_phoneme_tokenizer = use_phoneme_tokenizer + data, duration_filtered, num_filtered, total_duration = [], 0.0, 0, 0.0 + if index_by_file_id: + self.mapping = {} + + for item in manifest.item_iter(manifests_files): + + id = item['id'] + context = item['context'] + context_duration = item['context_duration'] + context_type = item['context_type'] + question = item['question'] + question_type = item['question_type'] + speaker = item['speaker'] + answer = item['answer'] + answer_duration = item['answer_duration'] + answer_type = item['answer_type'] + task = item['task'] + + task = 'tts' if task is None else task + duration = answer_duration if task == 'tts' else context_duration + if min_duration is not None and duration < min_duration: + duration_filtered += duration + num_filtered += 1 + continue + + if max_duration is not None and duration > max_duration: + duration_filtered += duration + num_filtered += 1 + continue + + # Check segment length + approx_context_len = min(self._get_len(context_type, context, context_duration) * 0.3, 400) + approx_question_len = self._get_len(question_type, question, None) + approx_answer_len = self._get_len(answer_type, answer, answer_duration) + + if ( + decoder_only_model and approx_context_len + approx_question_len + approx_answer_len >= max_seq_length + ) or (approx_context_len + approx_question_len >= max_seq_length or approx_answer_len >= max_seq_length): + duration_filtered += duration + num_filtered += 1 + continue + + total_duration += duration + data.append( + output_type( + id, + context, + context_type, + context_duration, + question, + question_type, + answer, + answer_type, + answer_duration, + speaker, + ) + ) + + if index_by_file_id: + file_id, _ = os.path.splitext(os.path.basename(context)) + if ".context" in file_id: + file_id = file_id[:-8] + if file_id not in self.mapping: + self.mapping[file_id] = [] + self.mapping[file_id].append(len(data) - 1) + + # Max number of entities filter. + if len(data) == max_number: + break + + if do_sort_by_duration: + if index_by_file_id: + logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.") + else: + data.sort(key=lambda entity: entity.duration) + + logging.info("Dataset loaded with %d files totalling %.2f hours", len(data), total_duration / 3600) + logging.info("%d files were filtered totalling %.2f hours", num_filtered, duration_filtered / 3600) + + super().__init__(data) + + def _get_len(self, field_type, data, duration_data): + if field_type == "SPEECH": + return duration_data * 76 # TODO: add explanation for the hardcoded value. + elif field_type == "TEXT": + if self.use_phoneme_tokenizer: + # Approx len is number of characters + return len(data) + else: + return len(data.split(' ')) + 3 # # TODO: add explanation for the hardcoded value. 
+ elif field_type == "TOKENS": + return len(data) + 3 + else: + raise ValueError(f"Unknown field type {field_type}.") + + class ASRAudioText(AudioText): """`AudioText` collector from asr structured json files.""" diff --git a/nemo/collections/common/parts/preprocessing/manifest.py b/nemo/collections/common/parts/preprocessing/manifest.py index 1d49bd7c7019..e2ad08bd04c2 100644 --- a/nemo/collections/common/parts/preprocessing/manifest.py +++ b/nemo/collections/common/parts/preprocessing/manifest.py @@ -110,6 +110,8 @@ def __parse_item(line: str, manifest_file: str) -> Dict[str, Any]: item['audio_file'] = item.pop('audio_filename') elif 'audio_filepath' in item: item['audio_file'] = item.pop('audio_filepath') + elif 'context' in item: + item['audio_file'] = item['context'] # Video File if 'video_filename' in item: @@ -132,7 +134,9 @@ def __parse_item(line: str, manifest_file: str) -> Dict[str, Any]: item['video_file'] = get_full_path(audio_file=item['video_file'], manifest_file=manifest_file) # Duration. - if 'duration' not in item: + if 'context_duration' in item and 'duration' not in item: + item['duration'] = item['context_duration'] + elif 'duration' not in item: raise ValueError( f"Manifest file {manifest_file} has invalid json line structure: {line} without proper duration key." ) @@ -184,6 +188,15 @@ def __parse_item(line: str, manifest_file: str) -> Dict[str, Any]: orig_sr=item.get('orig_sample_rate', None), token_labels=item.get('token_labels', None), lang=item.get('lang', None), + context=item.get('context', None), + context_type=item.get('context_type', None), + context_duration=item.get('context_duration', None), + answer=item.get('answer', None), + answer_type=item.get('answer_type', None), + answer_duration=item.get('answer_duration', None), + question=item.get('question', None), + question_type=item.get('question_type', None), + task=item.get('task', None), ) return item @@ -247,7 +260,7 @@ def get_full_path( if ( (len(audio_file) < audio_file_len_limit) and not os.path.isabs(audio_file) - and not os.path.isfile(audio_file) + # and not os.path.isfile(audio_file) # Commented out because it slows down dataloading ): # If audio_file is not available and the path is not absolute, the full path is assumed # to be relative to the manifest file parent directory or data directory. 
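Since the manifest changes above add several speech-LLM specific fields, here is a minimal sketch of the manifest entry shape they expect. The field names follow the keys read by InstructionTuningAudioText and by manifest.__parse_item in this patch; the values and file names are made up, and the snippet only mirrors the new fallback mapping rather than calling NeMo itself.

import json

# Hypothetical manifest line; only the field names are taken from the parser above.
line = json.dumps(
    {
        "id": 0,
        "context": "audio/spk1_context.wav",
        "context_type": "SPEECH",
        "context_duration": 2.9,
        "question": "Text to speech this sentence",
        "question_type": "TEXT",
        "answer": "audio/spk1_answer.wav",
        "answer_type": "SPEECH",
        "answer_duration": 4.1,
        "speaker": "spk1",
        "task": "tts",
    }
)

item = json.loads(line)
# Mirrors the fallbacks added to __parse_item: when no audio_filepath/duration
# is given, 'context' and 'context_duration' stand in for them.
if "audio_file" not in item and "context" in item:
    item["audio_file"] = item["context"]
if "duration" not in item and "context_duration" in item:
    item["duration"] = item["context_duration"]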
diff --git a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py index a8ea949019c1..56a4b04dfe0f 100644 --- a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py +++ b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py @@ -25,7 +25,7 @@ from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.utils import logging -__all__ = ['SentencePieceTokenizer', 'create_spt_model'] +__all__ = ['SentencePieceTokenizer', 'SentencePieceSpeechLLMTTSTokenizer', 'create_spt_model'] class SentencePieceTokenizer(TokenizerSpec, ChatTemplateMixin): @@ -315,6 +315,14 @@ def vocab(self): return main_vocab + special_tokens +class SentencePieceSpeechLLMTTSTokenizer(SentencePieceTokenizer): + def add_phone_tokens_to_special_tokens(self): + for i, word in enumerate(self.vocab): + if word.startswith("p{"): + self.special_token_to_id[word] = i + self.id_to_special_token[i] = word + + def create_spt_model( data_file: str, vocab_size: int, diff --git a/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py index bbd14f47a651..ea5f8c5a930b 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import omegaconf import torch from nemo.collections.nlp.modules.common import VirtualPromptSource @@ -70,8 +71,55 @@ def __init__( # Datasets are a list of file path strings to .json or .jsonl files elif isinstance(datasets[0], str): for path in datasets: - dataset = open(path, 'r', encoding='utf-8') - self.load_data(dataset) + with open(path, 'r', encoding='utf-8') as dataset: + dataset_examples = self.load_data(dataset) + self.examples.extend(dataset_examples) + elif isinstance(datasets[0], omegaconf.ListConfig) or isinstance(datasets[0], list): + # Dataset is a list of tuples with the first element being the probability of sampling from the dataset + # This code repeates the smaller datasets to approximately match the target probabilities + total_examples = 0 + dataset_lengths = [] + target_probs = [] + datasets_examples_list = [] + for prob_and_path in datasets: + prob = prob_and_path[0] + path = prob_and_path[1] + with open(path, 'r', encoding='utf-8') as dataset: + dataset_examples = self.load_data(dataset) + datasets_examples_list.append(dataset_examples) + dataset_lengths.append(len(dataset_examples)) + total_examples += len(dataset_examples) + target_probs.append(prob) + + # Normalize the target probs + target_probs = [prob / sum(target_probs) for prob in target_probs] + current_probs = [dataset_lengths[i] / total_examples for i in range(len(dataset_lengths))] + + # Increase number of examples needed without reducing the larger datasets with low target probs + new_total_examples = total_examples + for dataset_idx in range(len(datasets)): + if target_probs[dataset_idx] < current_probs[dataset_idx]: + target_total_examples = int(dataset_lengths[dataset_idx] / target_probs[dataset_idx]) + new_total_examples = max(new_total_examples, target_total_examples) + + final_total_examples = 0 + final_dataset_lengths = [] + for dataset_idx in range(len(datasets)): + num_samples_required = int(new_total_examples * target_probs[dataset_idx]) + 
num_repeat = max( + int(round(num_samples_required // dataset_lengths[dataset_idx])), 1 + ) # At least 1 repeat + logging.info("dataset idx {}, num_repeat {}".format(dataset_idx, num_repeat)) + dataset_examples_repeated = datasets_examples_list[dataset_idx] * num_repeat + final_dataset_lengths.append(len(dataset_examples_repeated)) + final_total_examples += len(dataset_examples_repeated) + self.examples.extend(dataset_examples_repeated) + + final_probs = [final_dataset_lengths[i] / final_total_examples for i in range(len(final_dataset_lengths))] + logging.info("Target probs: {}".format(target_probs)) + logging.info("Final probs: {}".format(final_probs)) + logging.info("Initial total examples: {}".format(total_examples)) + logging.info("Final total examples: {}".format(final_total_examples)) else: raise ValueError("Datasets must be a list of dicts or a list of filepath strings") diff --git a/nemo/collections/nlp/modules/common/megatron/attention.py b/nemo/collections/nlp/modules/common/megatron/attention.py index c1b4e3023e42..d5784081f6f0 100644 --- a/nemo/collections/nlp/modules/common/megatron/attention.py +++ b/nemo/collections/nlp/modules/common/megatron/attention.py @@ -380,6 +380,7 @@ def forward( rotary_pos_emb=None, # rotary positional embedding relative_position_bias=None, checkpoint_core_attention=False, + return_scores=False, ): # hidden_states: [sq, b, h] @@ -398,7 +399,9 @@ def forward( # Some consistency check. if inference_max_sequence_len: - assert self.inference_current_sequence_len < self.inference_key_memory.size(0) + # Added equals to as inference key_memory size refers to cross-attention key size + # which is already equal to the current "sequence length" + assert self.inference_current_sequence_len <= self.inference_key_memory.size(0) assert inference_max_sequence_len == self.inference_key_memory.size(0) # This is added for safety. 
In case inference_max_sequence_len # is not provided, make sure there is no potential memory left @@ -433,28 +436,40 @@ def forward( (query_layer, key_layer, value_layer) = tensor_parallel.split_tensor_along_last_dim( mixed_x_layer, 3, contiguous_split_chunks=True ) - else: + else: # Else in cross_attention # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] - mixed_kv_layer, _ = self.key_value(encoder_output) - if self.is_adapter_available(): - lora_kv_adapter = self.get_adapter_module(AdapterName.LORA_KV_ADAPTER) - if lora_kv_adapter and self.adapter_cfg[AdapterName.LORA_KV_ADAPTER]['enabled']: - lora_mixed_kv_layer = lora_kv_adapter(encoder_output) - mixed_kv_layer = mixed_kv_layer + lora_mixed_kv_layer - - # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] - new_tensor_shape = mixed_kv_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, - 2 * self.hidden_size_per_attention_head, - ) - if self.megatron_legacy: - mixed_kv_layer = self._transpose_last_dim(mixed_kv_layer, 2, True) - mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) + if ( + inference_max_sequence_len is None + ) or self.inference_current_sequence_len < inference_max_sequence_len: + # If we are in traning and inference_max_sequence_len is None + # Or we haven't cached the key and value part of cross attention in the decoder on step 0, + # Do the caching + mixed_kv_layer, _ = self.key_value(encoder_output) + if self.is_adapter_available(): + lora_kv_adapter = self.get_adapter_module(AdapterName.LORA_KV_ADAPTER) + if lora_kv_adapter and self.adapter_cfg[AdapterName.LORA_KV_ADAPTER]['enabled']: + lora_mixed_kv_layer = lora_kv_adapter(encoder_output) + mixed_kv_layer = mixed_kv_layer + lora_mixed_kv_layer + + # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] + new_tensor_shape = mixed_kv_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 2 * self.hidden_size_per_attention_head, + ) + if self.megatron_legacy: + mixed_kv_layer = self._transpose_last_dim(mixed_kv_layer, 2, True) + mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) - # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] - (key_layer, value_layer) = tensor_parallel.split_tensor_along_last_dim( - mixed_kv_layer, 2, contiguous_split_chunks=True - ) + # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] + (key_layer, value_layer) = tensor_parallel.split_tensor_along_last_dim( + mixed_kv_layer, 2, contiguous_split_chunks=True + ) + else: + # else if we are in inference and have already cached key, value, can just read cache + key_layer = self.inference_key_memory[: self.inference_current_sequence_len, ...] + value_layer = self.inference_value_memory[: self.inference_current_sequence_len, ...] + if attention_mask is not None: + attention_mask = attention_mask[..., -1, :].unsqueeze(-2) # Attention head [sq, b, h] --> [sq, b, hp] query_layer, _ = self.query(hidden_states) @@ -490,7 +505,9 @@ def forward( if rotary_pos_emb is not None: rotary_pos_emb = rotary_pos_emb if isinstance(rotary_pos_emb, tuple) else ((rotary_pos_emb,) * 2) - if inference_max_sequence_len: + # If we are in cross attention (inference_current_sequence_len == inference_max_sequence_len == inference_key_memory.size(0)) + # We only need to cache this once + if inference_max_sequence_len and self.inference_current_sequence_len < inference_max_sequence_len: # Adjust the range variables. start = self.inference_current_sequence_len self.inference_current_sequence_len += key_layer.size(0) @@ -501,7 +518,7 @@ def forward( key_layer = self.inference_key_memory[:end, ...] 
value_layer = self.inference_value_memory[:end, ...] # Adjust attention mask - if attention_mask is not None: + if attention_mask is not None and self.attention_type == AttnType.self_attn: attention_mask = attention_mask[..., start:end, :end] # adjust the key rotary positional embedding if rotary_pos_emb is not None: @@ -569,7 +586,10 @@ def forward( relative_position_bias=relative_position_bias, headscale_tensor=self.head_scale_tensor if self.headscale else None, inference_mode=inference_max_sequence_len is not None and query_layer.shape[0] == 1, + return_scores=return_scores, ) + if return_scores: + context_layer, attention_probs = context_layer # ================= # Output. [sq, b, h] @@ -585,6 +605,9 @@ def forward( if get_key_value: output = [output, present] + if return_scores: + output = [output, attention_probs] + return output, bias @@ -857,6 +880,7 @@ def forward( relative_position_bias=None, headscale_tensor=None, inference_mode=None, + return_scores=None, ): b, np, sq, sk, hn = ( query_layer.size(1), @@ -914,9 +938,27 @@ def forward( # relative_position_bias [b, np, sq, sk] # context_layer [b, np, sq, hn] # ================================================== - context_layer = self.attn_fn( - query_layer, key_layer, value_layer, attention_mask, relative_position_bias, inference_mode - ) + if not return_scores: + context_layer = self.attn_fn( + query_layer, + key_layer, + value_layer, + attention_mask, + relative_position_bias, + inference_mode, + ) + else: + # SpeechLLM TTS modifications + context_layer = self.torch_attention_with_prior( + query_layer, + key_layer, + value_layer, + attention_mask, + relative_position_bias, + inference_mode, + return_scores=return_scores, + ) + context_layer, attention_probs = context_layer if headscale_tensor is not None: context_layer = context_layer * headscale_tensor @@ -928,7 +970,10 @@ def forward( new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) context_layer = context_layer.view(*new_context_layer_shape) - return context_layer + if return_scores: + return context_layer, attention_probs + else: + return context_layer def torch_attention(self, query_layer, key_layer, value_layer, attention_mask, attention_bias, inference_mode): sq, b, np, hn = query_layer.shape @@ -986,6 +1031,69 @@ def torch_attention(self, query_layer, key_layer, value_layer, attention_mask, a return context_layer + def torch_attention_with_prior( + self, query_layer, key_layer, value_layer, attention_mask, attention_bias, inference_mode, return_scores=False + ): + sq, b, np, hn = query_layer.shape + sk = key_layer.shape[0] + + if self.multi_query_attention: + query_layer = rearrange(query_layer, 'sq b np hn -> b (np sq) hn') + key_layer = rearrange(key_layer, 'sk b 1 hn -> b hn sk') + value_layer = rearrange(value_layer, 'sv b np hn -> (b np) sv hn') + else: + query_layer = rearrange(query_layer, 'sq b np hn -> (b np) sq hn') + key_layer = rearrange(key_layer, 'sk b np hn -> (b np) hn sk') + value_layer = rearrange(value_layer, 'sv b np hn -> (b np) sv hn') + + matmul_input_buffer = torch.empty( + query_layer.shape[0], + query_layer.shape[1], + key_layer.shape[2], + dtype=query_layer.dtype, + device=query_layer.device, + ) + + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer, + key_layer, + beta=0.0, + alpha=(1.0 / self.norm_factor) if self.normalize_attention_scores else 1.0, + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(b, np, sq, sk) + + if attention_bias is not None: + # 
attention_bias is not None only for cross attention layers right now in T5 + attention_scores = torch.log_softmax(attention_scores, dim=-1) + attention_bias + + _attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + + if not self.sequence_parallel: + with tensor_parallel.random.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(_attention_probs) + else: + attention_probs = self.attention_dropout(_attention_probs) + + # change view [b * np, sq, sk] + attention_probs = rearrange(attention_probs, 'b np sq sk -> (b np) sq sk') + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer) + + # change view [b, np, sq, hn] + context_layer = rearrange(context_layer, '(b np) sq hn -> b np sq hn', np=np) + + if return_scores: + # return context_layer, _attention_probs + return context_layer, attention_scores + else: + return context_layer + def flash_attention(self, query_layer, key_layer, value_layer, attention_mask, attention_bias, inference_mode): query_layer = rearrange(query_layer, 'sq b np hn -> b sq np hn') key_layer = rearrange(key_layer, 'sk b np hn -> b sk np hn') diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py index 712ce10b81b5..d2945a061584 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py @@ -13,7 +13,7 @@ # limitations under the License. """Transformer based language model.""" -from ast import Mod +from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType from nemo.collections.nlp.modules.common.megatron.megatron_transformer_decoder import MegatronTransformerDecoderModule from nemo.collections.nlp.modules.common.megatron.retrieval_transformer import ( MegatronRetrievalTransformerDecoderModule, @@ -87,7 +87,7 @@ def get_decoder_model( transformer_block_type="pre_ln", hidden_steps=-1, parent_model_type=ModelType.encoder_or_decoder, - layer_type=None, + layer_type=LayerType.decoder, chunk_size=64, layer_number_offset=0, # this is use only for attention norm_factor scaling megatron_legacy=False, @@ -158,6 +158,7 @@ def get_decoder_model( moe_dropout=moe_dropout, position_embedding_type=position_embedding_type, use_flash_attention=use_flash_attention, + layer_type=layer_type, ) elif arch == "retro": decoder = MegatronRetrievalTransformerDecoderModule( diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_encoder_decoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_encoder_decoder.py index c4192dacb45a..744a6e18c8b1 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_encoder_decoder.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_encoder_decoder.py @@ -13,7 +13,6 @@ # limitations under the License. """Transformer based language model.""" -from ast import Mod import torch @@ -46,8 +45,7 @@ class MegatronTransformerEncoderDecoderModule(MegatronModule): - """Transformer encoder-decoder model. 
- """ + """Transformer encoder-decoder model.""" def __init__( self, @@ -85,6 +83,8 @@ def __init__( encoder_attn_mask_type = AttnMaskType.padding elif hasattr(encoder.model, 'self_attn_mask_type'): encoder_attn_mask_type = encoder.model.self_attn_mask_type + elif isinstance(encoder.model, torch.nn.ModuleList) and hasattr(encoder.model[0], 'self_attn_mask_type'): + encoder_attn_mask_type = encoder.model[0].self_attn_mask_type else: raise AttributeError( "Could not find an attribute for encoder self_attn_mask_type, make sure it is set when instatiating the encoder or pass it to the constructor of this class." @@ -142,7 +142,11 @@ def encode( # apply hidden transformations if needed if self.hiddens_module is not None: enc_output = self.hiddens_module.apply_hidden_transforms( - {"hiddens": enc_output, "hiddens_mask": self.get_hiddens_mask(enc_attn_mask),}, batch_data=batch_data, + { + "hiddens": enc_output, + "hiddens_mask": self.get_hiddens_mask(enc_attn_mask), + }, + batch_data=batch_data, ) return enc_output @@ -157,6 +161,11 @@ def decode( dec_get_key_value=False, dec_self_attention_relative_position_bias=None, dec_cross_attention_relative_position_bias=None, + return_all_crossattention_probs=False, + set_inference_key_value_memory=False, + decoder_max_sequence_len=None, + encoder_max_sequence_len=None, + enc_output_to_layers=None, ): if self.decoder is None: raise ValueError(f"Cannot call .decode(...) when self.decoder is None.") @@ -170,6 +179,11 @@ def decode( enc_attn_mask=enc_attn_mask, dec_self_attention_relative_position_bias=dec_self_attention_relative_position_bias, dec_cross_attention_relative_position_bias=dec_cross_attention_relative_position_bias, + return_all_crossattention_probs=return_all_crossattention_probs, + set_inference_key_value_memory=set_inference_key_value_memory, + decoder_max_sequence_len=decoder_max_sequence_len, + encoder_max_sequence_len=encoder_max_sequence_len, + enc_output_to_layers=enc_output_to_layers, ) return dec_output @@ -191,6 +205,11 @@ def forward( dec_self_attention_relative_position_bias=None, dec_cross_attention_relative_position_bias=None, batch_data=None, + return_all_crossattention_probs=False, + set_inference_key_value_memory=False, + decoder_max_sequence_len=None, + encoder_max_sequence_len=None, + enc_output_to_layers=None, ): # encoder if enc_output is None: @@ -207,7 +226,10 @@ def forward( assert self.encoder_hidden_state is not None enc_output = self.encoder_hidden_state else: - enc_attn_mask = enc_output_attn_mask.to(enc_attn_mask) + if isinstance(enc_output_attn_mask, list): + enc_attn_mask = [mask.to(enc_attn_mask[midx]) for midx, mask in enumerate(enc_output_attn_mask)] + else: + enc_attn_mask = enc_output_attn_mask.to(enc_attn_mask) if self.decoder is None or output_enc_hidden_only: return enc_output @@ -216,15 +238,22 @@ def forward( dec_output = self.decode( dec_input=dec_input, dec_attn_mask=dec_attn_mask, - enc_output=enc_output["enc_output"] # enc_output is a dict if we used hidden transformations - if self.hiddens_module is not None - else enc_output, + enc_output=( + enc_output["enc_output"] # enc_output is a dict if we used hidden transformations + if self.hiddens_module is not None + else enc_output + ), # Adjust encoder attention mask if encoder is a perceiver. 
enc_attn_mask=self.get_hiddens_mask(enc_attn_mask), dec_layer_past=dec_layer_past, dec_get_key_value=dec_get_key_value, dec_self_attention_relative_position_bias=dec_self_attention_relative_position_bias, dec_cross_attention_relative_position_bias=dec_cross_attention_relative_position_bias, + return_all_crossattention_probs=return_all_crossattention_probs, + set_inference_key_value_memory=set_inference_key_value_memory, + decoder_max_sequence_len=decoder_max_sequence_len, + encoder_max_sequence_len=encoder_max_sequence_len, + enc_output_to_layers=enc_output_to_layers, ) # if self.hiddens_module is not None enc_output is a dict, else it is a torch.tensor @@ -246,7 +275,10 @@ def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars= def load_state_dict(self, state_dict, strict=True): """Customized load.""" - self.encoder.load_state_dict(state_dict[self._encoder_key], strict=strict) - self.decoder.load_state_dict(state_dict[self._decoder_key], strict=strict) - if self.hiddens_module is not None: - self.hiddens_module.load_state_dict(state_dict[self._hiddens_module], strict=strict) + try: + self.encoder.load_state_dict(state_dict[self._encoder_key], strict=strict) + self.decoder.load_state_dict(state_dict[self._decoder_key], strict=strict) + if self.hiddens_module is not None: + self.hiddens_module.load_state_dict(state_dict[self._hiddens_module], strict=strict) + except KeyError as e: + super().load_state_dict(state_dict, strict=strict) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py index 601eb320e8fc..3d2b2c1ecc13 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py @@ -14,7 +14,10 @@ """Transformer based language model.""" from nemo.collections.nlp.modules.common.megatron.megatron_perceiver_encoders import MegatronPerceiverEncoderModule -from nemo.collections.nlp.modules.common.megatron.megatron_transformer_encoder import MegatronTransformerEncoderModule +from nemo.collections.nlp.modules.common.megatron.megatron_transformer_encoder import ( + MegatronTransformerEncoderModule, + MultiMegatronTransformerEncoderModule, +) from nemo.collections.nlp.modules.common.megatron.retrieval_transformer import ( MegatronRetrievalTransformerEncoderModule, ) @@ -108,6 +111,7 @@ def get_encoder_model( version=1, # model version position_embedding_type='learned_absolute', use_flash_attention=False, + n_transformers=1, ): """Build language model and return along with the key to save.""" @@ -167,6 +171,51 @@ def get_encoder_model( position_embedding_type=position_embedding_type, use_flash_attention=use_flash_attention, ) + elif arch == "multi_transformer": + encoder = MultiMegatronTransformerEncoderModule( + config=config, + n_transformers=n_transformers, + init_method=init_method, + output_layer_init_method=scaled_init_method, + hidden_size=hidden_size, + num_layers=num_layers, + num_attention_heads=num_attention_heads, + apply_query_key_layer_scaling=apply_query_key_layer_scaling, + kv_channels=kv_channels, + ffn_hidden_size=ffn_hidden_size, + encoder_attn_mask_type=encoder_attn_mask_type, + pre_process=pre_process, + post_process=post_process, + hidden_dropout=hidden_dropout, + attention_dropout=attention_dropout, + ffn_dropout=ffn_dropout, + precision=precision, + fp32_residual_connection=fp32_residual_connection, + activations_checkpoint_method=activations_checkpoint_method, + 
activations_checkpoint_num_layers=activations_checkpoint_num_layers, + activations_checkpoint_granularity=activations_checkpoint_granularity, + layernorm_epsilon=layernorm_epsilon, + bias_activation_fusion=bias_activation_fusion, + bias_dropout_add_fusion=bias_dropout_add_fusion, + masked_softmax_fusion=masked_softmax_fusion, + persist_layer_norm=persist_layer_norm, + openai_gelu=openai_gelu, + onnx_safe=onnx_safe, + activation=activation, + bias=bias, + normalization=normalization, + transformer_block_type=transformer_block_type, + headscale=headscale, + parent_model_type=parent_model_type, + megatron_legacy=megatron_legacy, + normalize_attention_scores=normalize_attention_scores, + num_moe_experts=num_moe_experts, + moe_frequency=moe_frequency, + moe_dropout=moe_dropout, + position_embedding_type=position_embedding_type, + use_flash_attention=use_flash_attention, + ) + elif arch == "retro": encoder = MegatronRetrievalTransformerEncoderModule( config=config, diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py index 4a05a08820e7..14677552492b 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py @@ -52,8 +52,7 @@ class MegatronTransformerDecoderModule(MegatronModule, Exportable, MegatronDecoderModule): - """Transformer decoder model. - """ + """Transformer decoder model.""" def __init__( self, @@ -97,6 +96,7 @@ def __init__( moe_dropout=0.0, position_embedding_type='learned_absolute', use_flash_attention=False, + layer_type=LayerType.decoder, ): super(MegatronTransformerDecoderModule, self).__init__(config=config) @@ -121,7 +121,7 @@ def __init__( # Transformer. 
self.model = ParallelTransformer( config=config, - layer_type=LayerType.decoder, + layer_type=layer_type, init_method=self.init_method, output_layer_init_method=self.output_layer_init_method, num_layers=self.num_layers, @@ -165,7 +165,7 @@ def __init__( self._model_key = 'model' def set_input_tensor(self, input_tensor): - """ See megatron.model.transformer.set_input_tensor()""" + """See megatron.model.transformer.set_input_tensor()""" self.model.set_input_tensor(input_tensor) def forward( @@ -178,15 +178,41 @@ def forward( get_key_value=False, dec_self_attention_relative_position_bias=None, dec_cross_attention_relative_position_bias=None, + return_all_crossattention_probs=False, + set_inference_key_value_memory=False, + decoder_max_sequence_len=None, + encoder_max_sequence_len=None, + enc_output_to_layers=None, ): # convert to Megatron mask dec_attn_mask_3d = build_attention_mask_3d( - source_mask=dec_attn_mask, target_mask=dec_attn_mask, attn_mask_type=self.model_attn_mask_type, - ) - enc_dec_attn_mask_3d = build_attention_mask_3d( - source_mask=dec_attn_mask, target_mask=enc_attn_mask, attn_mask_type=AttnMaskType.padding, + source_mask=dec_attn_mask, + target_mask=dec_attn_mask, + attn_mask_type=self.model_attn_mask_type, ) + if isinstance(enc_output, list): + assert len(enc_output) == len(enc_attn_mask) + enc_dec_attn_mask_3d = [] + for i in range(len(enc_output)): + enc_dec_attn_mask_3d.append( + attn_mask_postprocess( + build_attention_mask_3d( + source_mask=dec_attn_mask, + target_mask=enc_attn_mask[i], + attn_mask_type=AttnMaskType.padding, + ) + ) + ) + else: + enc_dec_attn_mask_3d = attn_mask_postprocess( + build_attention_mask_3d( + source_mask=dec_attn_mask, + target_mask=enc_attn_mask, + attn_mask_type=AttnMaskType.padding, + ) + ) + # transformer decoder dec_output = self.model( dec_input, @@ -194,9 +220,14 @@ def forward( layer_past=layer_past, get_key_value=get_key_value, encoder_output=enc_output, - enc_dec_attn_mask=attn_mask_postprocess(enc_dec_attn_mask_3d), + enc_dec_attn_mask=enc_dec_attn_mask_3d, self_attention_relative_position_bias=dec_self_attention_relative_position_bias, cross_attention_relative_position_bias=dec_cross_attention_relative_position_bias, + return_all_crossattention_probs=return_all_crossattention_probs, + set_inference_key_value_memory=set_inference_key_value_memory, + decoder_max_sequence_len=decoder_max_sequence_len, + encoder_max_sequence_len=encoder_max_sequence_len, + enc_output_to_layers=enc_output_to_layers, ) return dec_output diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py index 7a41e1300066..a9b80868558f 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py @@ -13,6 +13,8 @@ # limitations under the License. 
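The list branch added to MegatronTransformerDecoderModule.forward above builds one [b, dec_len, enc_len_i] cross-attention padding mask per encoder output. A minimal stand-alone sketch of that shape logic, using plain torch and hypothetical lengths in place of the Megatron build_attention_mask_3d / attn_mask_postprocess helpers (which additionally convert the result to the boolean "masked" form expected by the attention kernels):

import torch

# Two hypothetical encoder streams with different lengths; 1 = valid token, 0 = padding.
dec_mask = torch.tensor([[1, 1, 1, 0]])                 # [b, dec_len]
enc_masks = [torch.tensor([[1, 1, 0]]),                 # encoder 0: [b, enc_len0]
             torch.tensor([[1, 1, 1, 1, 0]])]           # encoder 1: [b, enc_len1]

# One padding mask per encoder, shaped [b, dec_len, enc_len_i].
enc_dec_masks = [dec_mask.unsqueeze(2) * m.unsqueeze(1) for m in enc_masks]
print([m.shape for m in enc_dec_masks])  # [torch.Size([1, 4, 3]), torch.Size([1, 4, 5])]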
"""Transformer based language model.""" +import torch + from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType from nemo.collections.nlp.modules.common.megatron.megatron_encoder_module import MegatronEncoderModule from nemo.collections.nlp.modules.common.megatron.module import MegatronModule @@ -163,7 +165,7 @@ def __init__( self._model_key = 'model' def set_input_tensor(self, input_tensor): - """ See megatron.model.transformer.set_input_tensor()""" + """See megatron.model.transformer.set_input_tensor()""" self.model.set_input_tensor(input_tensor) def forward( @@ -173,6 +175,7 @@ def forward( layer_past=None, get_key_value=False, enc_self_attention_relative_position_bias=None, + set_inference_key_value_memory=False, ): # convert to Megatron mask if self.use_flash_attention: @@ -180,7 +183,9 @@ def forward( else: enc_attn_mask_3d = attn_mask_postprocess( build_attention_mask_3d( - source_mask=enc_attn_mask, target_mask=enc_attn_mask, attn_mask_type=self.model_attn_mask_type, + source_mask=enc_attn_mask, + target_mask=enc_attn_mask, + attn_mask_type=self.model_attn_mask_type, ) ) @@ -192,6 +197,7 @@ def forward( get_key_value=get_key_value, self_attention_relative_position_bias=enc_self_attention_relative_position_bias, cross_attention_relative_position_bias=None, + set_inference_key_value_memory=set_inference_key_value_memory, ) return enc_output @@ -231,3 +237,214 @@ def load_state_dict(self, state_dict, strict=True): state_dict_ = state_dict_self_attention self.model.load_state_dict(state_dict_, strict=strict) + + +class MultiMegatronTransformerEncoderModule(MegatronModule, Exportable, MegatronEncoderModule): + """Transformer encoder model.""" + + def __init__( + self, + config: ModelParallelConfig, + n_transformers, + init_method, + output_layer_init_method, + hidden_size, + ffn_hidden_size, + num_layers, + num_attention_heads, + apply_query_key_layer_scaling=True, + kv_channels=None, + pre_process=True, + post_process=True, + encoder_attn_mask_type=AttnMaskType.padding, + hidden_dropout=0.1, + attention_dropout=0.1, + ffn_dropout=0.0, + precision=16, + fp32_residual_connection=False, + activations_checkpoint_method=None, + activations_checkpoint_num_layers=1, + activations_checkpoint_granularity=None, + layernorm_epsilon=1e-5, + bias_activation_fusion=True, + bias_dropout_add_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=False, + openai_gelu=False, + onnx_safe=False, + activation='gelu', + bias=True, + normalization='layernorm', + transformer_block_type='pre_ln', + headscale=False, + parent_model_type=ModelType.encoder_or_decoder, + megatron_legacy=False, + normalize_attention_scores=True, + num_moe_experts=1, + moe_frequency=1, + moe_dropout=0.0, + position_embedding_type='learned_absolute', + use_flash_attention=False, + ): + super(MultiMegatronTransformerEncoderModule, self).__init__(config=config) + + self.pre_process = pre_process + self.post_process = post_process + self.hidden_size = hidden_size + self.num_layers = num_layers + self.init_method = init_method + self.model_attn_mask_type = encoder_attn_mask_type + self.hidden_dropout = hidden_dropout + self.output_layer_init_method = output_layer_init_method + self.parent_model_type = parent_model_type + self.normalization = normalization + self.transformer_block_type = transformer_block_type + self.use_flash_attention = use_flash_attention + + if kv_channels is None: + + assert ( + hidden_size % num_attention_heads == 0 + ), 'hidden_size must be divisible by num_attention_heads if 
kv_channels is None' + kv_channels = hidden_size // num_attention_heads + + # Transformer List + self.model = [] + for i in range(n_transformers): + transformer = ParallelTransformer( + config=config, + layer_type=LayerType.encoder, + init_method=self.init_method, + output_layer_init_method=self.output_layer_init_method, + num_layers=self.num_layers, + hidden_size=self.hidden_size, + num_attention_heads=num_attention_heads, + apply_query_key_layer_scaling=apply_query_key_layer_scaling, + kv_channels=kv_channels, + ffn_hidden_size=ffn_hidden_size, + self_attn_mask_type=self.model_attn_mask_type, + pre_process=self.pre_process, + post_process=self.post_process, + precision=precision, + fp32_residual_connection=fp32_residual_connection, + activations_checkpoint_method=activations_checkpoint_method, + activations_checkpoint_num_layers=activations_checkpoint_num_layers, + activations_checkpoint_granularity=activations_checkpoint_granularity, + layernorm_epsilon=layernorm_epsilon, + hidden_dropout=hidden_dropout, + attention_dropout=attention_dropout, + ffn_dropout=ffn_dropout, + bias_activation_fusion=bias_activation_fusion, + bias_dropout_add_fusion=bias_dropout_add_fusion, + masked_softmax_fusion=masked_softmax_fusion, + persist_layer_norm=persist_layer_norm, + openai_gelu=openai_gelu, + onnx_safe=onnx_safe, + activation=activation, + bias=bias, + normalization=normalization, + transformer_block_type=transformer_block_type, + headscale=headscale, + model_type=parent_model_type, + megatron_legacy=megatron_legacy, + normalize_attention_scores=normalize_attention_scores, + num_moe_experts=num_moe_experts, + moe_frequency=moe_frequency, + moe_dropout=moe_dropout, + position_embedding_type=position_embedding_type, + use_flash_attention=use_flash_attention, + ) + self.model.append(transformer) + + self.model = torch.nn.ModuleList(self.model) + + self._model_key = 'model' + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + for mi in range(len(self.model)): + self.model[mi].set_input_tensor(input_tensor) + + def forward( + self, + enc_input, + enc_attn_mask, + layer_past=None, + get_key_value=False, + enc_self_attention_relative_position_bias=None, + set_inference_key_value_memory=False, + ): + + assert isinstance(enc_input, list) + assert len(enc_input) == len(self.model) + assert isinstance(enc_attn_mask, list) + assert len(enc_attn_mask) == len(self.model) + assert isinstance(enc_self_attention_relative_position_bias, list) + # convert to Megatron mask + enc_outputs = [] + for encoder_number in range(len(self.model)): + enc_input_ = enc_input[encoder_number] + enc_attn_mask_ = enc_attn_mask[encoder_number] + enc_self_attention_relative_position_bias_ = enc_self_attention_relative_position_bias[encoder_number] + + if self.use_flash_attention: + enc_attn_mask_3d = enc_attn_mask_ < 0.5 + else: + enc_attn_mask_3d = attn_mask_postprocess( + build_attention_mask_3d( + source_mask=enc_attn_mask_, + target_mask=enc_attn_mask_, + attn_mask_type=self.model_attn_mask_type, + ) + ) + + # transformer encoder + enc_output = self.model[encoder_number]( + enc_input_, + enc_attn_mask_3d, + layer_past=layer_past, + get_key_value=get_key_value, + self_attention_relative_position_bias=enc_self_attention_relative_position_bias_, + cross_attention_relative_position_bias=None, + set_inference_key_value_memory=set_inference_key_value_memory, + ) + + enc_outputs.append(enc_output) + + return enc_outputs + + def state_dict_for_save_checkpoint(self, destination=None, 
prefix='', keep_vars=False): + """For easy load.""" + + state_dict_ = {} + + state_dict_[self._model_key] = self.model.state_dict_for_save_checkpoint(destination, prefix, keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Encoder. + if self._model_key in state_dict: + state_dict_ = state_dict[self._model_key] + # for backward compatibility. + elif 'transformer' in state_dict: + state_dict_ = state_dict['transformer'] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'transformer.' in key: + state_dict_[key.split('transformer.')[1]] = state_dict[key] + + # for backward compatibility. + state_dict_self_attention = {} + for key in state_dict_.keys(): + if '.attention.' in key: + state_dict_self_attention[key.replace(".attention.", ".self_attention.")] = state_dict_[key] + else: + state_dict_self_attention[key] = state_dict_[key] + state_dict_ = state_dict_self_attention + + self.model.load_state_dict(state_dict_, strict=strict) diff --git a/nemo/collections/nlp/modules/common/megatron/module.py b/nemo/collections/nlp/modules/common/megatron/module.py index ccd485427c3c..a4efb2992166 100644 --- a/nemo/collections/nlp/modules/common/megatron/module.py +++ b/nemo/collections/nlp/modules/common/megatron/module.py @@ -113,7 +113,7 @@ def decoder_cross_attention_relative_position_embeddings_weight(self): def initialize_word_embeddings(self, init_method, vocab_size, hidden_size): if not self.share_token_embeddings: - raise Exception('initialize_word_embeddings() was called but ' 'share_token_embeddings is false') + raise Exception('initialize_word_embeddings() was called but share_token_embeddings is false') # This function just initializes the word embeddings in the final stage # when we are using pipeline parallelism. If we aren't using pipeline @@ -140,7 +140,10 @@ def initialize_word_embeddings(self, init_method, vocab_size, hidden_size): # set word_embeddings weights to 0 here, then copy first # stage's weights using all_reduce below. 
self.word_embeddings = tensor_parallel.VocabParallelEmbedding( - vocab_size, hidden_size, init_method=init_method, config=self.config, + vocab_size, + hidden_size, + init_method=init_method, + config=self.config, ) self.word_embeddings.weight.data.fill_(0) self.word_embeddings.weight.shared = True diff --git a/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py b/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py index b7b377940eb4..e68113949aa7 100644 --- a/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py +++ b/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py @@ -42,6 +42,7 @@ ) from nemo.collections.nlp.modules.common.megatron.vocab_parallel_cross_entropy import vocab_parallel_cross_entropy from nemo.core.classes.mixins import adapter_mixins +from nemo.utils import logging try: from apex.transformer.enums import AttnMaskType, ModelType @@ -67,7 +68,11 @@ HAVE_MEGATRON_CORE = False -__all__ = ["MegatronTokenLevelHead", "MegatronTokenLevelEncoderDecoderModule"] +__all__ = [ + "MegatronTokenLevelHead", + "MegatronTokenLevelEncoderDecoderModule", + "MegatronTokenLevelEncoderDecoderSpeechLLMModule", +] class MegatronTokenLevelHead(MegatronModule): @@ -252,6 +257,7 @@ def __init__( moe_dropout=encoder_cfg.get('moe_dropout', 0.0), position_embedding_type=encoder_cfg.get('position_embedding_type', 'learned_absolute'), use_flash_attention=encoder_cfg.get('use_flash_attention', False), + n_transformers=encoder_cfg.get('n_transformers', 1), ) if add_decoder: @@ -388,6 +394,7 @@ def __init__( moe_dropout=decoder_cfg.get('moe_dropout', 0.0), position_embedding_type=decoder_cfg.get('position_embedding_type', 'learned_absolute'), use_flash_attention=decoder_cfg.get('use_flash_attention', False), + layer_type=decoder_cfg.get('layer_type', LayerType.decoder), ) hiddens_module = get_hiddens_module(hiddens_cfg, model_parallel_cfg=config) @@ -410,6 +417,7 @@ def __init__( if add_decoder and post_process: if share_decoder_tokens_head_embeddings: + # parallel_output is True if TP > 1 (3b model) self.tokens_head = MegatronTokenLevelHead( self.word_embeddings_weight().size(0), parallel_output, bias=tokens_head_bias ) @@ -469,7 +477,7 @@ def _validate_config(self): return encoder_kv_channels, decoder_kv_channels def set_input_tensor(self, input_tensor): - """ See megatron.model.transformer.set_input_tensor()""" + """See megatron.model.transformer.set_input_tensor()""" # This is usually handled in schedules.py but some inference code still # gives us non-lists or None @@ -566,7 +574,8 @@ def forward( if self.add_encoder and self.encoder_relative_position_embedding is not None: encoder_self_attention_relative_position_bias = self.encoder_relative_position_embedding( - query_seq_length=enc_seq_length, key_seq_length=enc_seq_length, + query_seq_length=enc_seq_length, + key_seq_length=enc_seq_length, ) if output_enc_hidden_only: @@ -604,8 +613,11 @@ def forward( query_seq_length=dec_input_ids.size(1), key_seq_length=dec_input_ids.size(1) ) if not self.decoder_cfg.relative_position_bias_self_attention_only: - decoder_cross_attention_relative_position_bias = self.decoder_cross_attention_relative_position_embedding( - query_seq_length=dec_input_ids.size(1), key_seq_length=enc_seq_length, + decoder_cross_attention_relative_position_bias = ( + self.decoder_cross_attention_relative_position_embedding( + query_seq_length=dec_input_ids.size(1), + key_seq_length=enc_seq_length, + ) ) else: 
decoder_cross_attention_relative_position_bias = None @@ -656,7 +668,8 @@ def forward( # check if hiddens is used if self.hiddens_cfg is not None: loss_dict = self.enc_dec_model.hiddens_module.apply_loss_transforms( - outputs=enc_output, batch_data=batch_data, + outputs=enc_output, + batch_data=batch_data, ) loss_dict["tokens_loss"] = tokens_loss # We need to store default output in a known key, so that we can mimic default behaviour @@ -708,8 +721,437 @@ def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars= def load_state_dict(self, state_dict, strict=True): """Customized load.""" - - self.encoder_embedding.encoder_embeddingload_state_dict(state_dict[self._encoder_embedding_key], strict=strict) + self.encoder_embedding.load_state_dict(state_dict[self._encoder_embedding_key], strict=strict) self.decoder_embedding.load_state_dict(state_dict[self._decoder_embedding_key], strict=strict) self.enc_dec_model.load_state_dict(state_dict[self._enc_dec_model_key], strict=strict) self.tokens_head.load_state_dict(state_dict[self._tokens_head_key], strict=strict) + + +class MegatronTokenLevelEncoderDecoderSpeechLLMModule(MegatronTokenLevelEncoderDecoderModule): + def __init__(self, *args, **kwargs): + super(MegatronTokenLevelEncoderDecoderSpeechLLMModule, self).__init__(*args, **kwargs) + # Overridden in MegatronT5SpeechLMModel constructor + self.seq_pattern = "parallel" + self.speech_head_type = "token_level" + self.attn_prior_scaledown_start_step = 10000 + self.attn_prior_end_step = 11000 + self.use_alignment_loss = False + self.return_all_crossattention_probs = False + self.logging_step = False + self.num_cross_attention_heads = 12 # 12 for 220m T5, 16 for 11b T5 + self.enc_output_to_layers = None + + def get_decoder_embeddings(self, dec_input_ids, dec_position_ids, token_type_ids): + if dec_input_ids.dim() <= 2: + dec_input = self.decoder_embedding(dec_input_ids, dec_position_ids, token_type_ids=token_type_ids) + else: + dec_input = None + for i in range(dec_input_ids.size()[1]): + if i == 0: + # For the first channel (text + first layer of speech), use the decoder embedding layer + dec_input = self.decoder_embedding( + dec_input_ids[:, i, :], dec_position_ids, token_type_ids=token_type_ids + ) + else: + # For the rest of the channels (speech), use the speech embedding layer. No need for position, since already added in first layer. + current = self.speech_tokens_embeddings[i - 1](dec_input_ids[:, i, :]).permute(1, 0, 2) + # @pneekhara - Commenting the below because we always want to include all channels for speech. + # @pneekhara - include_channel_flag can become 0 when doing autoregressive inference and the first timestep is zeros + # For text inputs, only include 1st channel embeddings. Zero-out others. 
+ # include_channel_flag = (torch.sum(dec_input_ids[:, i, :], dim=1) > 0).float() # [B] + # current = current * include_channel_flag.unsqueeze(0).unsqueeze(2) + dec_input = dec_input + current + + return dec_input + + def forward( + self, + enc_input_ids=None, + enc_attn_mask=None, + dec_input_ids=None, + dec_attn_mask=None, + token_type_ids=None, + labels=None, + batch_data=None, # additional data to be passed to hiddens module + enc_output=None, # Result of running the entire encoder + enc_output_attn_mask=None, + enc_input=None, # Result of running encoder embedding only + output_enc_hidden_only=False, + speech_mask=None, + cross_attention_prior=None, + text_limits=None, + global_step=None, + set_inference_key_value_memory=False, + decoder_max_sequence_len=None, + encoder_max_sequence_len=None, + ): + """ + Return value is per token / per dimension (i.e., non collapsed loss value) + """ + ( + encoder_self_attention_relative_position_bias, + decoder_self_attention_relative_position_bias, + decoder_cross_attention_relative_position_bias, + ) = (None, None, None) + + if enc_input is not None and enc_output is not None: + raise ValueError( + """Both enc_input and enc_output are not None. + You should only be passing one of them. + enc_input is the result of the encoder embedding layer + enc_output is the result of running the entire transformer encoder.""" + ) + + # In order of precedence, we use enc_output, enc_input, and then enc_input_ids to determine the encoder sequence length. + if enc_output is not None: + # If enc_output is provided in `batch_for_pipeline`, we need to transpose it from [B x S x H] -> [S x B x H]. + if isinstance(enc_output, list): + encoder_self_attention_relative_position_bias = [None for _ in enc_output] + enc_output = [x.transpose(0, 1) for x in enc_output] + enc_seq_length = [x.size(0) for x in enc_output] + else: + enc_output = enc_output.transpose(0, 1) + enc_seq_length = enc_output.size(0) + elif enc_input is not None: + # If enc_input is provided, we need to transpose it from [B x S x H] -> [S x B x H]. + if isinstance(enc_input, list): + encoder_self_attention_relative_position_bias = [None for _ in enc_input] + enc_input = [x.transpose(0, 1) for x in enc_input] + enc_seq_length = [x.size(0) for x in enc_input] + else: + enc_input = enc_input.transpose(0, 1) + enc_seq_length = enc_input.size(0) + # Only need to run encoder embedding and position ids if enc_input or enc_output is not provided. + elif enc_input_ids is not None: + enc_seq_length = enc_input_ids.size(1) + if self.pre_process and self.add_encoder: + # We don't need position ids for RPE, because the embedding layer does not have position embeddings. + if self.encoder_relative_position_embedding is None: + enc_input_ids_p = enc_input_ids[:, 0, :] if enc_input_ids.dim() == 3 else enc_input_ids + enc_position_ids = build_position_ids(enc_input_ids_p) + else: + enc_position_ids = None + enc_input = self.encoder_embedding(enc_input_ids, enc_position_ids, token_type_ids=token_type_ids) + if self.is_adapter_available(): + _sq, _bs, _hs = enc_input.size() + ptuning_adapter = self.get_adapter_module(AdapterName.PTUNING_ADAPTER) + v = ptuning_adapter.virtual_tokens + if ( + ptuning_adapter and _sq >= v + ): # The sequence should be longer the v to insert virtual embeddings. + virtual_embeddings = ptuning_adapter(_bs) + enc_input = enc_input[ + v:, :, : + ] # the first v tokens are pads so that they can be swapped out with virtual embeddings. 
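+ # Shape note (illustrative): if enc_input is [S, B, H] and the p-tuning adapter
+ # provides v virtual tokens of shape [v, B, H], slicing off the first v padded
+ # positions above and prepending the virtual embeddings below keeps the encoder
+ # sequence length at S.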
+ enc_input = torch.concat([virtual_embeddings, enc_input], dim=0) + else: + enc_input = None + else: + # This should only happen with PP > 1 for enc-dec prompt learning models + enc_seq_length = enc_attn_mask.size(1) + + if self.add_encoder and self.encoder_relative_position_embedding is not None: + encoder_self_attention_relative_position_bias = self.encoder_relative_position_embedding( + query_seq_length=enc_seq_length, + key_seq_length=enc_seq_length, + ) + + if output_enc_hidden_only: + # When pipeline parallel > 1 we need to make sure encoder exist (will be missing in decoder) + # SpeechT5 should not go here for inference + if enc_output is None and self.enc_dec_model.encoder is not None: + enc_output = self.enc_dec_model.encode( + enc_input=enc_input, + enc_attn_mask=enc_attn_mask, + enc_layer_past=None, + enc_get_key_value=False, + enc_self_attention_relative_position_bias=encoder_self_attention_relative_position_bias, + batch_data=batch_data, + ) + else: + enc_output = self.enc_dec_model.encoder_hidden_state + + return enc_output + else: + if enc_output_attn_mask is None: + enc_output_attn_mask = enc_attn_mask + + if self.pre_process and self.add_decoder: + # We don't need position ids for RPE, because the embedding layer does not have position embeddings. + if self.decoder_relative_position_embedding is None: + dec_input_ids_p = dec_input_ids[:, 0, :] if dec_input_ids.dim() == 3 else dec_input_ids + dec_position_ids = build_position_ids(dec_input_ids_p) + else: + dec_position_ids = None + dec_input = self.get_decoder_embeddings(dec_input_ids, dec_position_ids, token_type_ids) + if not set_inference_key_value_memory and (decoder_max_sequence_len or encoder_max_sequence_len): + # In inference + # On step 0 when set_inference_key_value_memory is True, we need all inputs in case + # we are using decoder context + # Else on step >= 1, only need last input + logging.debug("Clipping dec_input and only keep the last input.") + dec_input = dec_input[-1, :, :].unsqueeze(0) # shape (b, embed_dim) + else: + # Note: This is when the decoder itself is split across PP ranks. 
+ dec_input = None
+
+ if self.add_decoder and self.decoder_relative_position_embedding is not None:
+ decoder_self_attention_relative_position_bias = self.decoder_relative_position_embedding(
+ query_seq_length=dec_input_ids.size(1), key_seq_length=dec_input_ids.size(1)
+ )
+ if not self.decoder_cfg.relative_position_bias_self_attention_only:
+ decoder_cross_attention_relative_position_bias = (
+ self.decoder_cross_attention_relative_position_embedding(
+ query_seq_length=dec_input_ids.size(1),
+ key_seq_length=enc_seq_length,
+ )
+ )
+ else:
+ decoder_cross_attention_relative_position_bias = None
+
+ return_all_crossattention_probs = self.return_all_crossattention_probs
+ single_encoder = False
+ if not isinstance(cross_attention_prior, list):
+ single_encoder = True
+ cross_attention_prior = [cross_attention_prior]
+
+ decoder_cross_attention_relative_position_bias = []
+ for _cross_attention_prior in cross_attention_prior:
+ _decoder_cross_attention_relative_position_bias = None
+ if _cross_attention_prior is not None:
+ # cross_attention_prior shape [B, dec_len, enc_len]
+ # Repeat it to make it [B, 12, dec_len, enc_len]
+ attn_prior_end_step = self.attn_prior_end_step
+ attn_prior_scaledown_start_step = self.attn_prior_scaledown_start_step
+ num_attention_heads = self.num_cross_attention_heads
+ assert attn_prior_scaledown_start_step <= attn_prior_end_step
+ logging.debug(
+ f"attn_prior_scaledown_start_step: {attn_prior_scaledown_start_step}, attn_prior_end_step: {attn_prior_end_step}"
+ )
+ if global_step >= attn_prior_end_step:
+ _decoder_cross_attention_relative_position_bias = None
+ elif global_step > attn_prior_scaledown_start_step and global_step < attn_prior_end_step:
+ total_annealing_steps = attn_prior_end_step - attn_prior_scaledown_start_step
+ curr_annealing_step = global_step - attn_prior_scaledown_start_step
+ curr_cross_attention_prior = _cross_attention_prior + (
+ (1.0 - _cross_attention_prior) * curr_annealing_step / total_annealing_steps
+ )
+ _decoder_cross_attention_relative_position_bias = curr_cross_attention_prior.unsqueeze(
+ 1
+ ).repeat(1, num_attention_heads, 1, 1)
+ _decoder_cross_attention_relative_position_bias = torch.log(
+ _decoder_cross_attention_relative_position_bias + 1e-8
+ )
+ else:
+ _decoder_cross_attention_relative_position_bias = _cross_attention_prior.unsqueeze(1).repeat(
+ 1, num_attention_heads, 1, 1
+ )
+ _decoder_cross_attention_relative_position_bias = torch.log(
+ _decoder_cross_attention_relative_position_bias + 1e-8
+ )
+ decoder_cross_attention_relative_position_bias.append(_decoder_cross_attention_relative_position_bias)
+
+ return_all_crossattention_probs = return_all_crossattention_probs or self.logging_step
+
+ if single_encoder:
+ decoder_cross_attention_relative_position_bias = decoder_cross_attention_relative_position_bias[0]
+
+ output = self.enc_dec_model(
+ enc_input=enc_input,
+ enc_attn_mask=enc_attn_mask,
+ dec_input=dec_input,
+ dec_attn_mask=dec_attn_mask,
+ enc_layer_past=None,
+ enc_get_key_value=False,
+ enc_output=enc_output,
+ enc_output_attn_mask=enc_output_attn_mask,
+ dec_layer_past=None,
+ dec_get_key_value=False,
+ enc_self_attention_relative_position_bias=encoder_self_attention_relative_position_bias,
+ dec_self_attention_relative_position_bias=decoder_self_attention_relative_position_bias,
+ dec_cross_attention_relative_position_bias=decoder_cross_attention_relative_position_bias,
+ return_all_crossattention_probs=return_all_crossattention_probs,
+ batch_data=batch_data,
+ 
set_inference_key_value_memory=set_inference_key_value_memory, + decoder_max_sequence_len=decoder_max_sequence_len, + encoder_max_sequence_len=encoder_max_sequence_len, + enc_output_to_layers=self.enc_output_to_layers, + ) + + alignment_loss = None + if self.post_process and self.add_decoder: + dec_output, enc_output = output # [s, b, h] + if return_all_crossattention_probs: + dec_output, attention_scores = dec_output + attention_probs = [ + torch.softmax(attention_score, dim=-1) + for lidx, attention_score in enumerate(attention_scores) + if lidx in self.alignment_decoder_layerids + ] + + if text_limits is not None and self.use_alignment_loss and hasattr(self, "forward_sum_loss"): + attention_scores_filtered = [ + attention_scores[lidx] for lidx in self.alignment_decoder_layerids + ] + attention_scores_combined = torch.cat(attention_scores_filtered, dim=1) + text_start_idx = text_limits[0, 0].item() + assert torch.all( + text_limits[:, 0] == text_start_idx + ) # all texts should start at the same index + end_offset = self.alignment_text_end_offset + # align_every_n_head: eg if set to 2, will skip every other head + # if set to 12, will select 1 head from every layer + align_every_n_head = self.align_every_n_head + dec_start_idx = self.decoder_context_len + 1 # +1 to remove bos + attention_scores_sliced = attention_scores_combined[ + :, ::align_every_n_head, dec_start_idx:, text_start_idx : -(2 + end_offset) + ] # -2 to remove eos and pad + attention_logprobs = ( + attention_scores_sliced # not taking log_softmax, since we will do that in loss function + ) + attention_logprobs = torch.mean(attention_logprobs, dim=1, keepdim=True) + dec_len = torch.sum(dec_attn_mask, dim=1) - dec_start_idx + enc_len = text_limits[:, 1] - text_limits[:, 0] - end_offset + alignment_loss = self.forward_sum_loss( + attn_logprob=attention_logprobs, in_lens=enc_len, out_lens=dec_len + ) + else: + attention_probs = None + # project decoder output to vocabulary-size dimensions + if self.share_decoder_tokens_head_embeddings: + first_layer_vocabsize = ( + self.speech_offset + self.speech_codebook_size + ) # variables set in __init__ of speechlm model + token_logits = self.tokens_head(dec_output, self.word_embeddings_weight()) # s, b, vocab + if self.seq_pattern in ["parallel", "delay_parallel"]: + # For flat seq_pattern we need all the logits + token_logits = token_logits[:, :, :first_layer_vocabsize] + speech_layers = self.num_speech_codebooks - 1 + + # speech_logits_list will be used in loss calculation (parallel output) + speech_logits_list = [] + if self.seq_pattern in ["parallel", "delay_parallel"] and torch.count_nonzero(speech_mask) > 0: + for i in range(speech_layers): + last_layer_logits = self.speech_tokens_heads[i](dec_output)[0] # T, B, 1024 + speech_logits_list.append(last_layer_logits) # T, B, 1024 + else: + token_logits = self.tokens_head(dec_output)[0] # T, B, WordEmbSize + + if labels is not None: + if labels.dim() == 2: + # [b, s] -> [s, b] + labels = labels.transpose(0, 1).contiguous() + elif labels.dim() == 3: + # [b, c, s] -> [c, s, b] + labels = labels.permute(1, 2, 0).contiguous() + + # Set label smoothing to 0 if in eval mode. 
+ label_smoothing = self.label_smoothing if self.training else 0.0
+
+ # tensor_parallel.vocab_parallel_cross_entropy performs log_softmax and returns log p(x_i|z) per token i
+ if self.fp16_cross_entropy:
+ assert token_logits.dtype == torch.half
+ if labels.dim() == 3:
+ raise NotImplementedError("fp16_cross_entropy is not supported for labels of dimension 3")
+ tokens_loss = vocab_parallel_cross_entropy(token_logits, labels, label_smoothing)
+ else:
+ if labels.dim() == 2:
+ tokens_loss = vocab_parallel_cross_entropy(token_logits.float(), labels, label_smoothing)
+ elif labels.dim() == 3:
+ if token_logits.size()[0] != labels[0, :, :].size()[0]:
+ raise Exception("TODO: add a permute")
+ tokens_loss = vocab_parallel_cross_entropy(
+ token_logits.float(), labels[0, :, :], label_smoothing
+ )
+ logging.debug(f"token_loss: {tokens_loss}")
+ logging.debug(f"token_loss is finite: {torch.all(torch.isfinite(tokens_loss))}")
+ if (
+ self.seq_pattern in ["parallel", "delay_parallel"]
+ and torch.count_nonzero(speech_mask) > 0
+ ):
+ for i in range(speech_layers):
+ if speech_logits_list[i].size()[0] != labels[i + 1, :, :].size()[0]:
+ raise Exception("TODO: add a permute")
+ curr_codebook_loss = (
+ vocab_parallel_cross_entropy(
+ speech_logits_list[i].float(), labels[i + 1, :, :], label_smoothing
+ )
+ * speech_mask.T
+ )
+ tokens_loss += curr_codebook_loss
+ logging.debug(f"token_loss_{i}: {tokens_loss}")
+ logging.debug(f"token_loss_{i} is finite: {torch.all(torch.isfinite(tokens_loss))}")
+
+ # [s, b] -> [b, s]
+ tokens_loss = tokens_loss.transpose(0, 1).contiguous()
+
+ # check if hiddens is used
+ if self.hiddens_cfg is not None:
+ raise NotImplementedError("Not currently implemented for speechllm")
+ else:
+ return tokens_loss, [token_logits, speech_logits_list, attention_probs, alignment_loss]
+ else:
+ # else return token logits (and hiddens if needed)
+ # [s, b, h] -> [b, s, h]
+ # If labels is None then we are in inference mode and we return the gathered logits
+ if self.parallel_output:
+ # Gather logits from tensor parallel if in parallel_output mode
+ token_logits = tensor_parallel.gather_from_tensor_model_parallel_region(
+ token_logits
+ ) # T, B, 30208
+ for _i in range(len(speech_logits_list)):
+ speech_logits_list[_i] = tensor_parallel.gather_from_tensor_model_parallel_region(
+ speech_logits_list[_i]
+ ) # T, B, 1024
+
+ token_logits = token_logits.transpose(0, 1).contiguous() # (B, T, 30208)
+ speech_logits = torch.stack(speech_logits_list, dim=-1) # T, B, 1024, 7
+ speech_logits = speech_logits.transpose(0, 1).contiguous() # (B, T, 1024, 7)
+
+ _si = self.speech_offset
+ _ei = _si + self.speech_codebook_size
+ first_layer_speech_logits = token_logits[:, :, _si:_ei].unsqueeze(-1) # (b, s, 1024, 1)
+
+ all_speech_logits = torch.cat(
+ [first_layer_speech_logits, speech_logits], dim=-1
+ ) # (b, s, 1024, 8)
+
+ if self.hiddens_cfg is not None:
+ raise NotImplementedError("Not currently implemented for speechllm")
+ else:
+ # all_speech_logits: tensor, (b, s, 1024, 8), all layers of speech.
+ # token_logits: tensor, (b, s, vocab_size), text token logits.
+ # speech_logits: tensor, (b, s, 1024, 7), 1-7 layers of speech. 
+ # attention_probs: tensor or None, (b, s, ) + # enc_output: tensor, (virtual_token_len+context_token_len+question_token_len+extra_id_0+[SEP], b, ) + return all_speech_logits, [token_logits, speech_logits, attention_probs, enc_output] + + elif self.add_decoder and not self.add_encoder: + decoder_output, _ = output + return decoder_output + else: + encoder_output = output + return encoder_output + + def state_dict(self): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._encoder_embedding_key] = self.encoder_embedding.state_dict() + state_dict_[self._decoder_embedding_key] = self.decoder_embedding.state_dict() + state_dict_[self._enc_dec_model_key] = self.enc_dec_model.state_dict() + state_dict_[self._tokens_head_key] = self.tokens_head.state_dict() + if hasattr(self, "speech_tokens_heads"): + state_dict_["speech_tokens_heads"] = self.speech_tokens_heads.state_dict() + if hasattr(self, "speech_tokens_embeddings"): + state_dict_["speech_tokens_embeddings"] = self.speech_tokens_embeddings.state_dict() + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + super().load_state_dict(state_dict, strict=strict) + if hasattr(self, "speech_tokens_heads"): + self.speech_tokens_heads.load_state_dict(state_dict["speech_tokens_heads"], strict=strict) + if hasattr(self, "speech_tokens_embeddings"): + self.speech_tokens_embeddings.load_state_dict(state_dict["speech_tokens_embeddings"], strict=strict) diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index ab10b0d0e8b3..c5108d8e3801 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -22,6 +22,7 @@ import torch import torch.nn as nn from einops import rearrange +from omegaconf.listconfig import ListConfig from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( @@ -479,6 +480,10 @@ def forward( self_attention_relative_position_bias=None, cross_attention_relative_position_bias=None, checkpoint_core_attention=False, + return_crossattention_scores=False, + return_selfattention_scores=False, + decoder_max_sequence_len=None, + encoder_max_sequence_len=None, ): # Self attention. if rotary_pos_emb is not None: @@ -489,6 +494,12 @@ def forward( self_attention_pos_emb = None cross_attention_pos_emb = None + if return_crossattention_scores and return_selfattention_scores: + raise NotImplementedError( + "We can only return 1 of cross attention scores or self attention scores. Not both yet." 
+ ) + attention_probs = None + if self.layer_type != LayerType.retrieval_decoder_after_self_attn: # hidden_states: [b, s, h] @@ -507,12 +518,16 @@ def forward( layer_past=layer_past, get_key_value=get_key_value, set_inference_key_value_memory=set_inference_key_value_memory, - inference_max_sequence_len=inference_max_sequence_len, + inference_max_sequence_len=inference_max_sequence_len or decoder_max_sequence_len, rotary_pos_emb=self_attention_pos_emb, relative_position_bias=self_attention_relative_position_bias, checkpoint_core_attention=checkpoint_core_attention, + return_scores=return_selfattention_scores, ) + if return_selfattention_scores: + attention_output, attention_probs = attention_output + if get_key_value: attention_output, presents = attention_output @@ -526,7 +541,7 @@ def forward( attention_bias = None # jit scripting for a nn.module (with dropout) is not - # trigerring the fusion kernel. For now, we use two + # triggering the fusion kernel. For now, we use two # different nn.functional routines to account for varying # dropout semantics during training and inference phases. @@ -553,6 +568,9 @@ def forward( elif self.transformer_block_type in ['pre_ln', 'normformer']: # Layer norm post the self attention. normalization_output = self.post_attention_layernorm(layernorm_input) + else: + normalization_output = None + logging.warning(f"This is a rare case since `normalization_output=None`") else: layernorm_input, normalization_output = hidden_states @@ -579,7 +597,7 @@ def forward( checkpoint_core_attention=checkpoint_core_attention, ) else: - + # Return Scores is being passed only for inter_attention and not self attention attention_output, attention_bias = self.inter_attention( normalization_output, enc_dec_attn_mask, @@ -587,7 +605,12 @@ def forward( rotary_pos_emb=cross_attention_pos_emb, relative_position_bias=cross_attention_relative_position_bias, checkpoint_core_attention=checkpoint_core_attention, + return_scores=return_crossattention_scores, + set_inference_key_value_memory=set_inference_key_value_memory, + inference_max_sequence_len=encoder_max_sequence_len, ) + if return_crossattention_scores: + attention_output, attention_probs = attention_output # If normformer, apply norm on the output of the self attention. 
if self.transformer_block_type == 'normformer': @@ -632,6 +655,9 @@ def forward( if get_key_value: output = [output, presents] + if attention_probs is not None: + output = [output, attention_probs] + return output @@ -735,6 +761,10 @@ def forward( self_attention_relative_position_bias=None, cross_attention_relative_position_bias=None, checkpoint_core_attention=False, + return_crossattention_scores=False, + return_selfattention_scores=False, + decoder_max_sequence_len=None, + encoder_max_sequence_len=None, ): if self.dtype == torch.float32: return super().forward( @@ -750,6 +780,10 @@ def forward( self_attention_relative_position_bias, cross_attention_relative_position_bias, checkpoint_core_attention, + return_crossattention_scores=return_crossattention_scores, + return_selfattention_scores=return_selfattention_scores, + decoder_max_sequence_len=decoder_max_sequence_len, + encoder_max_sequence_len=encoder_max_sequence_len, ) with torch.autocast(device_type="cuda", dtype=self.dtype): return super().forward( @@ -765,6 +799,10 @@ def forward( self_attention_relative_position_bias, cross_attention_relative_position_bias, checkpoint_core_attention, + return_crossattention_scores=return_crossattention_scores, + return_selfattention_scores=return_selfattention_scores, + decoder_max_sequence_len=decoder_max_sequence_len, + encoder_max_sequence_len=encoder_max_sequence_len, ) @@ -1072,10 +1110,12 @@ def __init__( # Transformer layers. def build_layer(layer_number): - if isinstance(layer_type, list): + if isinstance(layer_type, (list, ListConfig)): lt = layer_type[layer_number - 1] else: lt = layer_type + if isinstance(lt, int): + lt = LayerType(lt) if self.transformer_engine: transformer_layer_args = { @@ -1493,7 +1533,16 @@ def forward( self_attention_relative_position_bias=None, cross_attention_relative_position_bias=None, checkpoint_activations_all_layers=None, + return_all_crossattention_probs=False, + return_all_selfattention_probs=False, + decoder_max_sequence_len=None, + encoder_max_sequence_len=None, + enc_output_to_layers=None, ): + if return_all_crossattention_probs and return_all_selfattention_probs: + raise NotImplementedError( + "We can only return 1 of cross attention probs or self attention probs. Not both yet." + ) # Checks. 
if inference_max_sequence_len: assert self.activations_checkpoint_method is None, 'inference does not work with activation checkpointing' @@ -1580,6 +1629,7 @@ def forward( if self.inference_params != None: self.inference_params.sequence_len_offset = self.inference_current_sequence_len + attention_probs_list = [] if self.return_select_layer < 0: assert ( parallel_state.get_pipeline_model_parallel_world_size() == 1 @@ -1588,10 +1638,32 @@ def forward( logging.warning("Returning embeddings states only!") return hidden_states + layer_to_encoder_num_mapping = {} + if enc_output_to_layers is not None: + assert len(enc_output_to_layers) == len(encoder_output) + for encoder_idx in range(len(encoder_output)): + for layer_idx in enc_output_to_layers[encoder_idx]: + layer_to_encoder_num_mapping[layer_idx] = encoder_idx + for index in range(self.num_layers): layer = self._get_layer(index) past = None + _encoder_output = encoder_output + _enc_dec_attn_mask = enc_dec_attn_mask + _cross_attention_relative_position_bias = cross_attention_relative_position_bias + _encoder_max_sequence_len = encoder_max_sequence_len + if index in layer_to_encoder_num_mapping: + _encoder_output = encoder_output[layer_to_encoder_num_mapping[index]] + _enc_dec_attn_mask = enc_dec_attn_mask[layer_to_encoder_num_mapping[index]] + _cross_attention_relative_position_bias = cross_attention_relative_position_bias[ + layer_to_encoder_num_mapping[index] + ] + if encoder_max_sequence_len is not None: + _encoder_max_sequence_len = encoder_max_sequence_len[ + layer_to_encoder_num_mapping[index] + ] + if layer_past is not None: past = layer_past[index] @@ -1625,27 +1697,65 @@ def forward( hidden_states = layer( hidden_states, attention_mask, - encoder_output=encoder_output, - enc_dec_attn_mask=enc_dec_attn_mask, + encoder_output=_encoder_output, + enc_dec_attn_mask=_enc_dec_attn_mask, inference_params=self.inference_params, is_first_microbatch=is_first_microbatch, checkpoint_core_attention=checkpoint_core_attention, ) else: - hidden_states = layer( - hidden_states, - attention_mask, - encoder_output=encoder_output, - enc_dec_attn_mask=enc_dec_attn_mask, - layer_past=past, - get_key_value=get_key_value, - set_inference_key_value_memory=set_inference_key_value_memory, - inference_max_sequence_len=inference_max_sequence_len, - rotary_pos_emb=rotary_pos_emb, - self_attention_relative_position_bias=self_attention_relative_position_bias, - cross_attention_relative_position_bias=cross_attention_relative_position_bias, - checkpoint_core_attention=checkpoint_core_attention, - ) + if layer.layer_type == LayerType.decoder and return_all_crossattention_probs: + hidden_states, attention_probs = layer( + hidden_states, + attention_mask, + encoder_output=_encoder_output, + enc_dec_attn_mask=_enc_dec_attn_mask, + layer_past=past, + set_inference_key_value_memory=set_inference_key_value_memory, + inference_max_sequence_len=inference_max_sequence_len, + rotary_pos_emb=rotary_pos_emb, + self_attention_relative_position_bias=self_attention_relative_position_bias, + cross_attention_relative_position_bias=_cross_attention_relative_position_bias, + checkpoint_core_attention=checkpoint_core_attention, + return_crossattention_scores=return_all_crossattention_probs, + decoder_max_sequence_len=decoder_max_sequence_len, + encoder_max_sequence_len=_encoder_max_sequence_len, + ) + attention_probs_list.append(attention_probs) + elif layer.layer_type == LayerType.encoder and return_all_selfattention_probs: + hidden_states, attention_probs = layer( + hidden_states, + 
attention_mask, + encoder_output=_encoder_output, + enc_dec_attn_mask=_enc_dec_attn_mask, + layer_past=past, + get_key_value=get_key_value, + set_inference_key_value_memory=set_inference_key_value_memory, + inference_max_sequence_len=inference_max_sequence_len, + rotary_pos_emb=rotary_pos_emb, + self_attention_relative_position_bias=self_attention_relative_position_bias, + cross_attention_relative_position_bias=_cross_attention_relative_position_bias, + checkpoint_core_attention=checkpoint_core_attention, + return_selfattention_scores=return_all_selfattention_probs, + ) + attention_probs_list.append(attention_probs) + else: + hidden_states = layer( + hidden_states, + attention_mask, + encoder_output=_encoder_output, + enc_dec_attn_mask=_enc_dec_attn_mask, + layer_past=past, + get_key_value=get_key_value, + set_inference_key_value_memory=set_inference_key_value_memory, + inference_max_sequence_len=inference_max_sequence_len, + rotary_pos_emb=rotary_pos_emb, + self_attention_relative_position_bias=self_attention_relative_position_bias, + cross_attention_relative_position_bias=_cross_attention_relative_position_bias, + checkpoint_core_attention=checkpoint_core_attention, + decoder_max_sequence_len=decoder_max_sequence_len, + encoder_max_sequence_len=_encoder_max_sequence_len, + ) if self.return_select_layer < 0: assert ( @@ -1679,4 +1789,7 @@ def forward( if get_key_value: output = [output, presents] + if return_all_crossattention_probs or return_all_selfattention_probs: + output = [output, attention_probs_list] + return output diff --git a/nemo/collections/nlp/modules/common/megatron/utils.py b/nemo/collections/nlp/modules/common/megatron/utils.py index 601cb7a4d7e8..b0a6f755a9cc 100644 --- a/nemo/collections/nlp/modules/common/megatron/utils.py +++ b/nemo/collections/nlp/modules/common/megatron/utils.py @@ -18,7 +18,6 @@ from typing import Dict, Iterator, List, Optional, Tuple, Union import torch -import torch.nn as nn from torch import Tensor from nemo.utils import logging, logging_mode @@ -474,9 +473,25 @@ def get_iterator_k_split( else: # Split a list of torch tensors assert batch[0].shape[0] % num_microbatches == 0, "Issue with batch size configuration!" - split_batch = [ - torch.tensor_split(item, num_microbatches, dim=0) if torch.is_tensor(item) else item for item in batch - ] + split_batch = [] + for item in batch: + if torch.is_tensor(item): + split_batch.append(torch.tensor_split(item, num_microbatches, dim=0)) + elif isinstance(item, list): + if isinstance(item[0], torch.Tensor): + split_tensors = [torch.tensor_split(elem, num_microbatches, dim=0) for elem in item] + split_tuple = [] + for mbi in range(num_microbatches): + split_tuple.append([split_tensors[i][mbi] for i in range(len(split_tensors))]) + split_tuple = tuple(split_tuple) + split_batch.append(split_tuple) + else: + split_batch.append(split_list(item, num_microbatches)) + elif item is None: + split_batch.append(item) + else: + raise ValueError(f"Unsupported item type: {type(item)}") + microbatches = [ [elem[i] if elem is not None else elem for elem in split_batch] for i in range(num_microbatches) ] diff --git a/nemo/collections/tts/data/speechllm/__init__.py b/nemo/collections/tts/data/speechllm/__init__.py new file mode 100644 index 000000000000..9df65818d226 --- /dev/null +++ b/nemo/collections/tts/data/speechllm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/tts/data/speechllm/t5_speechllm_dataset.py b/nemo/collections/tts/data/speechllm/t5_speechllm_dataset.py new file mode 100644 index 000000000000..32f0a14f5e65 --- /dev/null +++ b/nemo/collections/tts/data/speechllm/t5_speechllm_dataset.py @@ -0,0 +1,1355 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum +import json +import random +from dataclasses import dataclass +from pathlib import Path +from typing import ClassVar, List, Optional, Union + +import numpy as np +import torch +from hydra.utils import instantiate +from omegaconf import OmegaConf +from tqdm.auto import tqdm + +from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer +from nemo.collections.asr.parts.preprocessing.segment import AudioSegment +from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import get_ipa_punctuation_list +from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import any_locale_text_preprocessing +from nemo.collections.nlp.data.language_modeling.megatron.base_prompt_learning_dataset import BasePromptLearningDataset +from nemo.collections.nlp.models.language_modeling.megatron_t5_model import T5Sentinel +from nemo.collections.nlp.modules.common import VirtualPromptSource +from nemo.collections.nlp.modules.common.megatron.utils import build_position_ids +from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths +from nemo.collections.tts.parts.utils.tts_dataset_utils import ( + BetaBinomialInterpolator, + beta_binomial_prior_distribution, + general_padding, + get_base_dir, +) +from nemo.utils import logging + +__all__ = ['T5SpeechLMDataset', "Lang"] + + +def get_full_list_puncts(): + punct_set = set() + for locale_id in ["en-US", "de-DE", "fr-FR"]: + punct_list = get_ipa_punctuation_list(locale=locale_id) + punct_set.update(punct_list) + return sorted(punct_set) + + +@dataclass +class G2PConfig: + _target_: str = "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p" + phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" + heteronyms: str = "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: float = 0.5 + + +@dataclass +class EnglishIpaG2pConfig: + _target_: str = "nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p" + phoneme_dict: str = "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + locale: str = "en-US" + heteronyms: str = 
"scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: float = 0.5 + grapheme_case: str = "upper" + use_stresses: bool = True + use_chars: bool = True + ignore_ambiguous_words: bool = False + + +@dataclass +class TextTokenizer: + _target_: str = "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" + punct: bool = True + stresses: bool = True + chars: bool = True + apostrophe: bool = True + pad_with_space: bool = True + add_blank_at: bool = True + g2p: G2PConfig = G2PConfig() + + +@dataclass +class EnglishIpaTextTokenizer: + _target_: str = "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer" + locale: str = "en-US" + punct: bool = True + # Define non_default_punct_list as a ClassVar to explicitly mark it as a class variable + non_default_punct_list: ClassVar[List[str]] = get_full_list_puncts() + apostrophe: bool = True + pad_with_space: bool = True + add_blank_at: bool = True + g2p: EnglishIpaG2pConfig = EnglishIpaG2pConfig() + + +@dataclass +class TextTokenizerConfig: + text_tokenizer: TextTokenizer = TextTokenizer() + + +@dataclass +class EnglishIpaTextTokenizerConfig: + text_tokenizer: EnglishIpaTextTokenizer = EnglishIpaTextTokenizer() + + +def _get_default_text_tokenizer_conf(phoneme_probability: float = 0.5, use_ipa: bool = False): + if use_ipa: + g2p = EnglishIpaG2pConfig(phoneme_probability=phoneme_probability) + _text_tokenizer = EnglishIpaTextTokenizer(g2p=g2p) + text_tokenizer: EnglishIpaTextTokenizerConfig = EnglishIpaTextTokenizerConfig(text_tokenizer=_text_tokenizer) + else: + g2p = G2PConfig(phoneme_probability=phoneme_probability) + _text_tokenizer = TextTokenizer(g2p=g2p) + text_tokenizer: TextTokenizerConfig = TextTokenizerConfig(text_tokenizer=_text_tokenizer) + return OmegaConf.create(OmegaConf.to_yaml(text_tokenizer)) + + +def pad_text_to_speech_dims(text_tensor, pad_id, pad_size=7): + token_len = text_tensor.shape[0] + empty_padding = torch.ones((pad_size, token_len), dtype=text_tensor.dtype, device=text_tensor.device) * pad_id + return torch.cat((text_tensor.unsqueeze(0), empty_padding), dim=0) + + +class Lang(enum.Enum): + en = 1 + es = 2 + fr = 3 + zh = 4 + de = 4 + + +class T5SpeechLMDataset(BasePromptLearningDataset): + """ + The dataset class for prompt-tuning or p-tuning pretrained T5 SpeechLM models. 
+ """ + + def __init__( + self, + datasets, + tokenizer, + virtual_prompt_source: VirtualPromptSource, + task_templates: dict, + pseudo_tokens, + pad_token_id: str, + max_seq_length: int, + sample_rate: int, + min_seq_length: int = 1, + add_bos: bool = False, + add_eos: bool = True, + for_train: bool = True, + decoder_starts_with_pad: bool = False, + add_eos_to_decoder_output: bool = True, + add_sentinel_to_input: bool = True, + ul2_prompt_token: str = None, + segment_max_duration: Optional[int] = None, + trim: bool = False, + trim_ref: Optional[float] = None, + trim_top_db: Optional[int] = None, + trim_frame_length: Optional[int] = None, + trim_hop_length: Optional[int] = None, + pad_multiple: int = 1, + pitch_augment: bool = False, + sup_data_path: Optional[Union[Path, str]] = None, + speech_offset: Optional[int] = None, + train_task: Optional[str] = None, + seq_pattern: Optional[str] = "parallel", + use_attention_prior: Optional[bool] = False, + attention_prior_scaling_factor: Optional[float] = 1.0, + spec_aug=False, + spec_aug_time_width=0.2, + spec_aug_time_masks=2, + cross_attention_epsilon: Optional[float] = 0.0, + lm_vocab_size: Optional[int] = None, + num_speech_codebooks: Optional[int] = 8, + codebook_fps: Optional[int] = 86, + add_special_tokens_to_only_first_codebook: Optional[bool] = False, + context_pattern: Optional[str] = "parallel", + context_duration_min: Optional[float] = 3.0, + context_duration_max: Optional[float] = 5.0, + skip_datasets: Optional[List[str]] = [], # substrings of dataset names to skip + english_only_model: Optional[bool] = False, + context_conditioning: Optional[str] = "decoder", # encoder or decoder + use_beta_binomial_interpolator: Optional[str] = False, # encoder or decoder + context_slice_method: Optional[str] = "random", # random or fixed + phoneme_probability: Optional[float] = 0.5, + encoder_type: Optional[str] = "single_transformer", + use_ipa: bool = False, + **kwargs, + ): + """ + Only speech parameters are explained here. + segment_max_duration: Optional[int] = None, - Speech max segment duration + trim: bool = False, - speech parameter + trim_ref: Optional[float] = None, - speech parameter + trim_top_db: Optional[int] = None, - speech parameter + trim_frame_length: Optional[int] = None, - speech parameter + trim_hop_length: Optional[int] = None, - speech parameter + pad_multiple: int = 1, - speech parameter + pitch_augment: bool = False, - speech parameter + sup_data_path: Optional[Union[Path, str]] = None, - Supplementary folder path where codecs are stored. + speech_offset: Optional[int] = None, - if speech tokens then add this offset to the token indices to distinguish between text and speech tokens. + lm_vocab_size: Optional[int] = None, - vocab size of the original language model (phoneme tokens start from this index) + english_only_model: Optional[bool] = False, specify if monolingual or multi-lingual modeling. + use_ipa: bool = False, specify if using IPA tokens or default ARPABET tokens. Either choice still mixes chars. + **kwargs, + """ + # These two variables need to be set before calling super().__init__() because the parent class calls `load_data()` which requires these attributes. 
+ self._rng = random.Random() + self.spec_aug = spec_aug if for_train else False + self.time_width = spec_aug_time_width + self.time_masks = spec_aug_time_masks + self.decoder_starts_with_pad = decoder_starts_with_pad + self.add_eos_to_decoder_output = add_eos_to_decoder_output + self.add_sentinel_to_input = add_sentinel_to_input + self.ul2_prompt_token = ul2_prompt_token + # Speech related variables + self.base_data_dir = None + self.segment_max_duration = segment_max_duration + self.sample_rate = sample_rate + self.featurizer = WaveformFeaturizer(sample_rate=self.sample_rate) + self.pad_multiple = pad_multiple + self.pitch_augment = pitch_augment + self.trim = trim + self.trim_ref = trim_ref if trim_ref is not None else np.max + self.trim_top_db = trim_top_db if trim_top_db is not None else 60 + self.trim_frame_length = trim_frame_length if trim_frame_length is not None else 2048 + self.trim_hop_length = trim_hop_length if trim_hop_length is not None else 512 + self.speech_offset = speech_offset if speech_offset is not None else 3 + self.seq_pattern = seq_pattern + self.use_attention_prior = use_attention_prior + self.attention_prior_scaling_factor = attention_prior_scaling_factor + self.cross_attention_epsilon = cross_attention_epsilon # value of prior for context tokens (b/w 0 and 1) + assert self.cross_attention_epsilon >= 0.0 and self.cross_attention_epsilon <= 1.0 + self.lm_vocab_size = tokenizer.vocab_size if lm_vocab_size is None else lm_vocab_size + self.num_speech_codebooks = num_speech_codebooks + self.codebook_fps = codebook_fps + self.add_special_tokens_to_only_first_codebook = add_special_tokens_to_only_first_codebook + # context_pattern and duration arguments are supported only if context_type is REFSPEAKERCODEC in the manifest + self.context_pattern = context_pattern + self.context_duration_min = context_duration_min + self.context_duration_max = context_duration_max + self.english_only_model = english_only_model + self.phoneme_tokenizer = None + if english_only_model: + self.phoneme_tokenizer = instantiate( + _get_default_text_tokenizer_conf(phoneme_probability=phoneme_probability, use_ipa=use_ipa) + ).text_tokenizer + else: + self.g2p = {"fr": lambda x: x} + if kwargs.get("g2p", None): + if "english" in kwargs["g2p"]: + english_g2p = instantiate(kwargs["g2p"]["english"]) + self.g2p["en"] = lambda x: english_g2p(x) + if "spanish" in kwargs["g2p"]: + spanish_g2p = instantiate(kwargs["g2p"]["spanish"]) + self.g2p["es"] = lambda x: spanish_g2p(x) + if "mandarin" in kwargs["g2p"]: + mandarin_g2p = instantiate(kwargs["g2p"]["mandarin"]) + self.g2p["zh"] = lambda x: mandarin_g2p(x) + if "german" in kwargs["g2p"]: + german_g2p = instantiate(kwargs["g2p"]["german"]) + self.g2p["de"] = lambda x: german_g2p(x) + + self.context_conditioning = context_conditioning + if self.context_conditioning == "decoder": + assert ( + self.context_duration_min == self.context_duration_max + ), "For decoder conditioning, context_duration_min and context_duration_max should be same" + self.decoder_context_len = int( + self.context_duration_min * self.codebook_fps + ) # TODO: Just take from model var? 
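# Illustrative sketch, not part of the patch: the arithmetic just above, with the
# dataset's defaults assumed (context_duration_min == context_duration_max == 3.0 s
# for decoder conditioning, codebook_fps == 86 codec frames per second). The fixed
# decoder-side context then spans int(3.0 * 86) = 258 codec frames.
context_duration_s = 3.0          # assumed fixed context duration for decoder conditioning
codebook_fps = 86                 # assumed codec frame rate
decoder_context_len = int(context_duration_s * codebook_fps)
print(decoder_context_len)        # 258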
+ + # Initialize sup_data_path, sup_data_types and run preprocessing methods for every supplementary data type\ + self.sup_data_path = None + if sup_data_path is not None: + Path(sup_data_path).mkdir(parents=True, exist_ok=True) + self.sup_data_path = sup_data_path + + self.codec_folder = kwargs.pop('codec_folder', None) + self.train_task = train_task + if self.codec_folder is None and self.sup_data_path is not None: + self.codec_folder = Path(self.sup_data_path) / "codec" + elif isinstance(self.codec_folder, str): + self.codec_folder = Path(self.codec_folder) + + self.codec_folder.mkdir(exist_ok=True, parents=True) + + self.context_length = kwargs.pop('context_length', None) # only used in gpt dataset atm + # self.attention_prior_strength = attention_prior_strength + self.transformer_type = kwargs.pop('transformer_type', 'T5') + self.skip_datasets = skip_datasets + + self.beta_binomial_interpolator = ( + BetaBinomialInterpolator(scaling_factor=self.attention_prior_scaling_factor) + if use_beta_binomial_interpolator + else None + ) + self.context_slice_method = context_slice_method + self.encoder_type = encoder_type + super().__init__( + datasets=datasets, + tokenizer=tokenizer, + virtual_prompt_source=virtual_prompt_source, + task_templates=task_templates, + pseudo_tokens=pseudo_tokens, + pad_token_id=pad_token_id, + max_seq_length=max_seq_length, + min_seq_length=min_seq_length, + add_bos=add_bos, + add_eos=add_eos, + for_train=for_train, + ) + + def load_data(self, dataset): + """ + Loads a dataset by filling in the task templates specified in the config file + with the information from each training/inference example. Converts all input + text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in + the task templates with the actual virtual prompt token ids. + + params: + dataset: A list of json objects or a dictionary objects each + containing the information needed for a training example + """ + copy_dataset = list(dataset) + audio_filelist = [] + # This loop is needed to calculate self.base_data_dir. 
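# Illustrative sketch, not part of the patch: the loop below gathers the SPEECH field
# paths and hands them to get_base_dir (from tts_dataset_utils), which is assumed here
# to behave like a common-parent-directory computation; os.path.commonpath is used
# only to illustrate the idea, not as the actual NeMo implementation. The resulting
# base_data_dir is later used to derive relative paths for cached codec files.
# The paths are hypothetical.
import os

audio_filelist = [
    "/data/speech/spk1/utt_001.wav",
    "/data/speech/spk1/utt_002.wav",
    "/data/speech/spk2/utt_010.wav",
]
base_data_dir = os.path.commonpath(audio_filelist)
print(base_data_dir)  # /data/speech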
+ for json_line in copy_dataset: + if type(json_line) == dict: + doc = json_line + else: + doc = json.loads(json_line) + taskname = doc["taskname"] + prompt_template_fields = self.task_templates[taskname]["prompt_template_fields"] + + for p in prompt_template_fields: + if f"{p}_type" in doc and doc[f"{p}_type"] == "SPEECH": + audio_filelist.append(doc[p]) + self.base_data_dir = get_base_dir(audio_filelist) + + skipped = 0 + tts = 0 + asr = 0 + i = 0 + logging.info(f"copy_dataset len === {len(copy_dataset)}") + examples = [] + for json_line in tqdm(copy_dataset): + i += 1 + + # Read example dict or load the information for a single example from .json file + if type(json_line) == dict: + doc = json_line + else: + doc = json.loads(json_line) + + if self.context_conditioning == "decoder": + # Modify doc to make combine context and anwer + assert ";" not in doc['context'], "Multiple contexts not supported in decoder conditioning" + doc['answer'] = "{};{}".format(doc['context'], doc['answer']) + doc['answer_duration'] = self.context_duration_min + doc['answer_duration'] + doc['answer_type'] = "CONTEXTANSWER" + doc['context_type'] = "DUMMYCONTEXT" + doc['context'] = "DUMMYCONTEXT" + + question_in_manifest = doc['question'] + + if "Text to speech this" in question_in_manifest or "Phoneme TTS" in question_in_manifest: + tts += 1 + if self.train_task not in ['tts', 'all']: + continue + elif "Next token prediction" in question_in_manifest: + if self.train_task != 'tts': + asr += 1 + else: + tts += 1 + continue + else: + if self.train_task == 'tts': + continue + asr += 1 + + if doc["context_type"] == "SPEECH": + assert "context_duration" in doc, f"context_duration key not in document {doc}" + approx_context_len = 3 * (self.codebook_fps + 1) # +1 just to be safe + if self.context_length is not None and doc["context_duration"] < self.context_length: + logging.debug( + f"skipped as context_length of {doc['context_duration']} is less than {self.context_length}" + ) + skipped += 1 + continue + elif "Remove Noise" in question_in_manifest: + approx_context_len = doc["answer_duration"] * (self.codebook_fps + 1) + elif "Extract Speaker Audio" in question_in_manifest: + approx_context_len = ( + doc["answer_duration"] * (self.codebook_fps + 1) + 400 + ) # 400 is the max ref speaker audio + elif ("Text to speech this" in question_in_manifest) or ('Phoneme TTS' in question_in_manifest): + # approx_context_len = 400 + approx_context_len = 5 * ( + self.codebook_fps + 1 + ) # better than 400. TODO: pneekhara: Need to change things for multi-encoder vs single encoder based filtering. 
+ elif "Edit Speech" in question_in_manifest: + approx_context_len = doc["answer_duration"] * (self.codebook_fps + 1) + else: + raise NotImplementedError(f"Unknown context type {doc['context_type']}") + + approx_question_len = len(doc["question"].split(' ')) + 3 + if 'Phoneme TTS' in question_in_manifest: + # approx len is equal to num of characters + approx_question_len = len(question_in_manifest) + + if doc["answer_type"] in ["SPEECH", "AUDIOCODEC", "CONTEXTANSWER"]: + assert "answer_duration" in doc, f"answer_duration key not in document {doc}" + approx_answer_len = doc["answer_duration"] * (self.codebook_fps + 1) + 3 # +3 for EOS, BOS padding + if self.seq_pattern == "delay_parallel": + # In delay parallel, there is padding so add 8 frames + approx_answer_len = approx_answer_len + self.num_speech_codebooks + else: + approx_answer_len = len(doc["answer"].split(' ')) + 3 + + skip_record = False + for skip_dataset in self.skip_datasets: + if skip_dataset in doc['answer']: + skip_record = True + + if not skip_record: + if (self.transformer_type == "GPT") and ( + self.min_seq_length + < approx_context_len + approx_question_len + approx_answer_len + < self.max_seq_length + ): + examples.append(doc) + elif (self.transformer_type == "T5") and ( + self.min_seq_length < approx_context_len + approx_question_len < self.max_seq_length + and self.min_seq_length < approx_answer_len < self.max_seq_length + ): + examples.append(doc) + else: + logging.debug(f"skipped for {approx_context_len + approx_question_len} {approx_answer_len} len") + skipped += 1 + else: + print("Skipping", doc['answer']) + logging.debug(f"skipped for {doc['answer']} as it is in skip_datasets") + skipped += 1 + + logging.info(f'Skipped {skipped} sentences, sequence length too short or too long even after truncation') + + return examples + + def __getitem__(self, idx): + doc = self.examples[idx] + taskname = doc["taskname"] + prompt_template = self.task_templates[taskname]["prompt_template"] + prompt_template_fields = self.task_templates[taskname]["prompt_template_fields"] + total_virtual_tokens = self.task_templates[taskname]["total_virtual_tokens"] + virtual_token_splits = self.task_templates[taskname]["virtual_token_splits"] + truncation_field = self.task_templates[taskname]['truncate_field'] + answer_field = self.task_templates[taskname]["answer_field"] + + input_example = prompt_template + + self._input_sanity_checks( + total_virtual_tokens=total_virtual_tokens, + virtual_token_splits=virtual_token_splits, + prompt_template=prompt_template, + prompt_template_fields=doc.keys(), # Skip this check as we don't need it for TTS + truncation_field=truncation_field, + answer_field=answer_field, + doc=doc, + ) + question_in_manifest = doc['question'] + + # Format the input example according to the template + # Get context, question and answer codes in a dict. + # TODO @xueyang: declare the instructions when initializing the dataset so that they can be re-used. Temporally + # hardcode them here. 
+ question_text = doc["question"].strip() + instructions = ["Phoneme TTS", "Text to speech this"] + for prefix in instructions: + if doc["question"].startswith(prefix): + question_text = doc["question"][len(prefix) :].strip() + break + + input_dict = self._insert_data_in_template(prompt_template_fields, doc, answer_field) + lang = Lang[doc.get("lang", "en")] + context_tokens = input_dict['context'] + question_tokens = input_dict['question'] + + # Logic to prune context + # In case of TTS task, the entire reference speech is not required, so we randomly select a portion + # of the reference audio. + # In case of Next token prediction, We want context[:T] to go in the encoder and context[T+1:] to be + # predicted by the decoder. + start_token_index = 0 + end_token_index = -1 + if ("Text to speech this" in question_in_manifest) and (doc["context_type"] == "SPEECH"): + total_context_len = context_tokens[0].size()[1] + reduced_len = min( + 400, + ( + int(total_context_len * 0.2) + if total_context_len > 600 + else int(total_context_len * random.uniform(0.2, 0.5)) + ), + ) + start_token_index = random.randint( + 0, total_context_len - reduced_len + ) # start index can be greater than 440 + context_tokens[0] = context_tokens[0][ + :, start_token_index : min(start_token_index + 440, start_token_index + reduced_len) + ] + elif "Next token prediction" in question_in_manifest: + total_context_len = context_tokens[0].size()[1] + end_token_index = int(total_context_len * random.uniform(0.01, 0.2)) + context_tokens[0] = context_tokens[0][:, :end_token_index] + + # Get virtual tokens + # `virtual_tokens` is "". + virtual_tokens = self._insert_virtual_token_placeholders(input_example.split(' ')[0], virtual_token_splits) + + # a trick to align with the data format in t5 pretraining + virtual_tokens = self.tokenizer.text_to_ids(virtual_tokens) + if self.add_sentinel_to_input: + question_tokens = question_tokens + self.tokenizer.text_to_ids(T5Sentinel.FIRST.value) + + # Add BOS/EOS to the input of encoder if desired, adds EOS by default + if self.ul2_prompt_token is not None: + ul2_prompt_token_id = self.tokenizer.text_to_ids(self.ul2_prompt_token) + assert len(ul2_prompt_token_id) == 1 + context_tokens = ul2_prompt_token_id + context_tokens + if self.add_bos: + context_tokens = [self.tokenizer.bos_id] + context_tokens + if self.add_eos: + question_tokens = question_tokens + [self.tokenizer.eos_id] + + # Try to truncate input text to fit into the max sequence length + if self._get_len(context_tokens, question_tokens, virtual_tokens) > self.max_seq_length: + context_tokens, question_tokens, virtual_tokens = self._truncate_input_speech( + context_tokens, question_tokens, virtual_tokens + ) + + virtual_tokens, virtual_tokens_len = self.list_to_tensor(virtual_tokens) + context_tokens, context_tokens_len = self.list_to_tensor(context_tokens) + question_tokens, question_tokens_len = self.list_to_tensor(question_tokens) + + if doc["question_type"] == "TEXT" and doc["context_type"] != "TEXT": + question_tokens = pad_text_to_speech_dims( + question_tokens, self.tokenizer.pad_id, self.num_speech_codebooks - 1 + ) + if doc["context_type"] == "TEXT" and doc["question_type"] != "TEXT": + context_tokens = pad_text_to_speech_dims( + context_tokens, self.tokenizer.pad_id, self.num_speech_codebooks - 1 + ) + if doc["context_type"] == "TEXT" and doc["question_type"] == "TEXT": + context_tokens = pad_text_to_speech_dims( + context_tokens, self.tokenizer.pad_id, self.num_speech_codebooks - 1 + ) + question_tokens = 
pad_text_to_speech_dims( + question_tokens, self.tokenizer.pad_id, self.num_speech_codebooks - 1 + ) + + # context_tokens: tensor, (num_speech_codebooks, audio_context_len) + # question_tokens: tensor, (num_speech_codebooks, instruction token len + question token len + 1 ( + 1 ([SEP])), only first row includes token ids while all other rows are all zeros (pad) + if self.encoder_type == "multi_transformer": + context_and_question_tokens = [context_tokens, question_tokens] + else: + context_and_question_tokens = torch.cat([context_tokens, question_tokens], dim=1) + + # get answer ids + if answer_field in doc.keys(): # training and validation + answer_ids = self._get_tokens(doc, answer_field, doc[answer_field]) + if end_token_index > -1: + answer_ids[0] = answer_ids[0][:, end_token_index:] + + if self.decoder_starts_with_pad: + answer_text_ids = [self.tokenizer.pad_id] + else: + answer_text_ids = [self.tokenizer.bos_id] + # a trick to align with the data format in t5 pretraining + # if self.add_sentinel_to_input: + # answer_text_ids += self.tokenizer.text_to_ids(T5Sentinel.FIRST.value) + answer_text_ids += answer_ids + + if self.add_eos_to_decoder_output: + answer_text_ids += [self.tokenizer.eos_id] + else: + answer_text_ids += self.tokenizer.text_to_ids(T5Sentinel.END.value) + + if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER: + taskname_id = self.tokenizer.text_to_ids(taskname) + elif ( + self.virtual_prompt_source == VirtualPromptSource.NO_PROMPT + ): # TODO (@adithyare) this class and GPTPromptLearningDataset should be merged. + taskname_id = -1 + else: + raise ValueError("Invalid virtual prompt source specified") + + dec_input = None + dec_labels = None + + # if single-encoder and context_condition is decoder, answer_text_ids = [CLS_id, context audio code tensors, zero-pad, answer audio code tensor, SEP_id] + # if multi-encoder, answer_text_ids = [CLS_id, answer audio codec tensor, SEP_id], so dec_input will not include audio context anymore. 
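# Illustrative sketch, not part of the patch: the standard next-step shift applied in
# the block below. The decoder input drops the final token and the labels drop the
# leading start token, so the decoder at step t is trained to predict token t + 1.
# Token ids here are made up for the illustration.
answer_text_ids = [100, 7, 8, 9, 101]   # [start token, answer tokens..., end token]
dec_input = answer_text_ids[:-1]        # [100, 7, 8, 9]
dec_labels = answer_text_ids[1:]        # [7, 8, 9, 101]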
+ if answer_field in doc.keys(): # training and validation + dec_input = answer_text_ids[:-1] + dec_labels = answer_text_ids[1:] + + # if single-encoder and context_condition is decoder: + # dec_input: shape=(self.num_speech_codebooks, 1([CLS]) + len(context audio frames) + 1([PAD]) + len(answer audio frames)) + # dec_labels: shape=(self.num_speech_codebooks, len(context audio frames) + 1([PAD]) + len(answer audio frames) + 1([SEP])) + # if multi-encoder: + # dec_input: (num_speech_codebooks, 1([CLS]) + len(answer audio frames)) + # dec_labels: (num_speech_codebooks, len(answer audio frames) + 1([SEP])) + dec_input, dec_input_len = self.list_to_tensor(dec_input, True) + dec_labels, dec_labels_len = self.list_to_tensor(dec_labels, True) + is_speech = True if doc["answer_type"] != "TEXT" else False + if is_speech: + assert dec_input.dim() == 2 and dec_labels.dim() == 2 + if self.seq_pattern == "delay_parallel": + num_codebooks = dec_input.shape[0] + dec_input_padded = torch.cat( + [ + torch.zeros_like(dec_input[:, 0:num_codebooks]), + dec_input, + torch.zeros_like(dec_input[:, 0:num_codebooks]), + ], + dim=1, + ) + dec_labels_padded = torch.cat( + [ + torch.zeros_like(dec_labels[:, 0:num_codebooks]), + dec_labels, + torch.zeros_like(dec_labels[:, 0:num_codebooks]), + ], + dim=1, + ) + dec_input_new = [] + dec_labels_new = [] + for _c in range(self.num_speech_codebooks): + st = num_codebooks - _c + et_decoder_input = dec_input_padded.shape[1] - _c + et_decoder_labels = dec_labels_padded.shape[1] - _c + dec_input_new.append(dec_input_padded[_c, st:et_decoder_input]) + dec_labels_new.append(dec_labels_padded[_c, st:et_decoder_labels]) + dec_input = torch.stack(dec_input_new, dim=0) + dec_labels = torch.stack(dec_labels_new, dim=0) + dec_input_len = torch.tensor(dec_input.shape[1]).long() + dec_labels_len = torch.tensor(dec_labels.shape[1]).long() + + if self.encoder_type == "multi_transformer": + enc_len = question_tokens_len + virtual_tokens_len + else: + enc_len = context_tokens_len + question_tokens_len + virtual_tokens_len + # TODO: Remove hardcoding + start_of_question_offset = 4 # For both "Text to Speech this" and "Phoneme TTS" + end_of_question_offset = 2 + cross_attention_prior = torch.zeros(dec_labels_len, enc_len) + self.cross_attention_epsilon + if self.use_attention_prior: + prior_dec_len = dec_labels_len.item() + prior_dec_start_idx = 0 + if self.context_conditioning == "decoder": + prior_dec_len = dec_labels_len.item() - (self.decoder_context_len + 1) + prior_dec_start_idx = self.decoder_context_len + 1 + text_len = question_tokens_len.item() - start_of_question_offset - end_of_question_offset + audio_len = prior_dec_len + if self.beta_binomial_interpolator is not None: + cross_attention_question_prior = torch.from_numpy(self.beta_binomial_interpolator(audio_len, text_len)) + else: + cross_attention_question_prior = torch.from_numpy( + beta_binomial_prior_distribution( + text_len, + audio_len, + scaling_factor=self.attention_prior_scaling_factor, + ) + ) + if self.encoder_type == "multi_transformer": + cross_attention_prior[ + prior_dec_start_idx:, virtual_tokens_len + start_of_question_offset : -end_of_question_offset + ] = cross_attention_question_prior + else: + cross_attention_prior[ + prior_dec_start_idx:, + virtual_tokens_len + context_tokens_len + start_of_question_offset : -end_of_question_offset, + ] = cross_attention_question_prior + + if self.encoder_type == "multi_transformer": + context_and_question_len = [context_tokens_len, question_tokens_len] + else: + 
context_and_question_len = context_tokens_len + question_tokens_len + return ( + taskname_id, # List, only one item. token id for "squad" + virtual_tokens, # Tensor, shape=(3,). token id for ['', '', ''] + virtual_tokens_len, # tensor, 3 + context_tokens_len, # tensor, 1 + # tensor if single encoder and context_condition is encoder, shape=(self.num_speech_codebooks, 1(context) + question len + 1() + 1([SEP])). only first row includes token ids while all other rows are all zeros (pad). + # list if multi-encoder and context_condition is encoder. + context_and_question_tokens, + # tensor scalar if single encoder and context_condition is decoder, 1 + (question len + 1 + 1). + # list if multi-encoder and context_condition is encoder. + context_and_question_len, + dec_input, # tensor, shape=(self.num_speech_codebooks, 1 CLS + context audio frame len + 1 pad + answer audio frame len), first column is [CLS_id, 0*7]^T + dec_input_len, # scalar tensor, 1 CLS + context audio frame len + 1 pad + answer audio frame len. 1 corresponds to CLS id + dec_labels, # tensor, shape=(self.num_speech_codebooks, context audio frame len + 1 pad + answer frame len + 1 SEP). + dec_labels_len, # tensor, context audio frame len + 1 PAD + answer frame len + 1 SEP. 1 corresponds to SEP id. + is_speech, # True + cross_attention_prior, # tensor, shape=(dec_labels_len, context_tokens_len + question_tokens_len + virtual_tokens_len). + lang.value, # int, + question_text, # str, answer transcript without question type (Phoneme TTS or Text to speech this). + ) + + def _truncate_input_speech(self, context_tokens, question_tokens, virtual_tokens): + total_len = self._get_len(context_tokens, question_tokens, virtual_tokens) + context_len = self._get_element_len(context_tokens) + truncation_length = total_len - self.max_seq_length + 1 + context_tokens[0] = context_tokens[0][:, min(truncation_length, context_len) :] + return context_tokens, question_tokens, virtual_tokens + + def list_to_tensor(self, element, fill=False): + """ + Convert list to tensor. The list might contain integers, 2D-tensors (speech tokens) and combination of two. + If all of them are ints, simply convert to tensor + If combination of 2D-tensor and ints. Convert int to the dimension of the tensor. 
+ example: [2, 4, 5] -> torch.tensor([2, 4, 5]) + example: [2, torch.tensor([[4, 5, 6], [6, 7, 8]])] -> torch.tensor( [[-1, 4, 5, 6], [2, 6, 7, 8]] ) + """ + ret, ln = None, None + if element is None: + return ret, ln + + max_len = max([1 if isinstance(item, int) else len(item) for item in element]) + if max_len == 1: + ret = torch.as_tensor(element).long() + ln = torch.tensor(ret.size()[0]).long() + else: + ret = [] + for e in element: + if isinstance(e, int): + tmp = torch.full((self.num_speech_codebooks, 1), e if fill else -1) + tmp[self.num_speech_codebooks - 1] = e + if self.add_special_tokens_to_only_first_codebook: + # Fill zeros in all other codebooks (to avoid out of range when getting embeddings) + tmp[1:] = 0 + else: + tmp = e + ret.append(tmp) + ret = torch.cat(ret, dim=1) + ln = torch.tensor(ret.size()[1]).long() + return ret, ln + + def _get_text_tokens(self, text): + input_ids = self.tokenizer.text_to_ids(text) + return input_ids + + def _get_phoneme_tokens(self, text, lang="en"): + if self.english_only_model: + input_ids = self.phoneme_tokenizer.encode(text) + input_ids_adjusted = [_id + self.lm_vocab_size for _id in input_ids] + return input_ids_adjusted + else: + text = any_locale_text_preprocessing(text) + input_ids = self.g2p[lang](text) + input_ids_adjusted = [] + for i in input_ids: + input_ids_adjusted.append(f"p{{{i}}}") + input_ids_adjusted = self.tokenizer.text_to_ids("".join(input_ids_adjusted)) + return input_ids_adjusted + + def _pad_wav_to_multiple(self, wav): + if self.pad_multiple > 1: + if wav.shape[0] % self.pad_multiple != 0: + wav = torch.cat( + [wav, torch.zeros(self.pad_multiple - wav.shape[0] % self.pad_multiple, dtype=torch.float)] + ) + return wav + + def _get_element_len(self, element): + length = 0 + if isinstance(element, list): + for e in element: + if isinstance(e, int): + length += 1 + else: + if e.dim() > 1: + length += e.size()[1] + else: + length += e.size()[0] + else: + if element.dim() > 1: + length += element.size()[1] + else: + length += element.size()[0] + return length + + def _get_len(self, context_tokens, question_tokens, virtual_tokens): + length = 0 + length += self._get_element_len(context_tokens) + length += self._get_element_len(question_tokens) + length += self._get_element_len(virtual_tokens) + return length + + def _load_audio(self, audio_filepath, dur=-1): + if self.segment_max_duration is not None and dur > 0 and dur > self.segment_max_duration: + # this case has been added for segmenting audio for speaker verification task of SSLDisentangler + n_segments = int(self.segment_max_duration * self.sample_rate) + features = AudioSegment.segment_from_file( + audio_filepath, target_sr=self.sample_rate, n_segments=n_segments, trim=self.trim + ) + + features = torch.tensor(features.samples) + if self.pad_multiple > 1: + features = self._pad_wav_to_multiple(features) + audio, audio_length = features, torch.tensor(features.shape[0]).long() + else: + features = self.featurizer.process( + audio_filepath, + trim=self.trim, + trim_ref=self.trim_ref, + trim_top_db=self.trim_top_db, + trim_frame_length=self.trim_frame_length, + trim_hop_length=self.trim_hop_length, + ) + + if self.pad_multiple > 1: + features = self._pad_wav_to_multiple(features) + + audio, audio_length = features, torch.tensor(features.shape[0]).long() + + return audio, audio_length + + def convert_audio(self, audio, sample_rate, target_sample_rate, target_channels): + if audio.dim() == 1: + audio = audio.unsqueeze(0).unsqueeze(0) + assert audio.shape[1] in [1, 2], "Audio 
must be mono or stereo." + # assert sample_rate == target_sample_rate, "sample rate of FastPitch and Encodec model has to be same" + if target_channels == 2: + *shape, _, length = audio.shape + audio = audio.expand(*shape, target_channels, length) + return audio + + def get_codec(self, audio): + wav1 = self.convert_audio(audio, self.sample_rate, self.encodec_model.sample_rate, self.encodec_model.channels) + encoded_frames = self.encodec_model.encode(wav1) + codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) + return codes.squeeze(0) + + def get_quantizer_codebook(self, reference_codec, reference_codec_length): + out = torch.zeros((1, 128, reference_codec_length.item())) + for i in range(reference_codec.size()[0]): + out += self.encodec_model.quantizer.vq.layers[i].decode(reference_codec[i, :].unsqueeze(0)) + return out.squeeze(0) + + def _get_speech_tokens(self, audio_filepath, dur=-1): + # Let's keep audio name and all internal directories in rel_audio_path_as_text_id to avoid any collisions + rel_audio_path = Path(audio_filepath).relative_to(self.base_data_dir).with_suffix("") + rel_audio_path_as_text_id = str(rel_audio_path).replace("/", "_") + + # Load audio features + audio, audio_length = self._load_audio(audio_filepath, dur) + + # Convert to codes + codec_path = self.codec_folder / f"{rel_audio_path_as_text_id}.pt" + + if codec_path.exists(): + try: + codec_codes = torch.load(codec_path).long() + except Exception as e: + print(f"[ERROR IN LOADING {codec_path}] e") + codec_codes = self.get_codec(audio).long() + torch.save(codec_codes, codec_path) + else: + codec_codes = self.get_codec(audio).long() + torch.save(codec_codes, codec_path) + + # Convert codes to codes corresponding to megatron embedding layer + codec_codes[0] = (codec_codes[0] + self.speech_offset).long() + + return codec_codes + + def _get_tokens(self, doc, field, field_data): + if self.context_slice_method == "random": + # During training, we want a random slice of the context + rng = random.Random() # Custom random generator (since random uses fixed seeds) + elif self.context_slice_method == "fixed": + # During inference, we want a fixed slice of the context + rng = random + else: + raise ValueError(f"Invalid context_slice_method {self.context_slice_method}") + if f"{field}_type" not in doc.keys(): + field_tokens = self._get_text_tokens(field_data.strip(" ")) # list of ids + elif doc[f"{field}_type"] == 'TEXT': + _text = field_data.strip(" ") + if _text.startswith("Phoneme TTS"): + lang = doc.get("lang", "en") + instruction_tokens = self._get_text_tokens("Phoneme TTS") + field_tokens = self._get_phoneme_tokens(_text[len("Phoneme TTS") :].strip(), lang=lang) + field_tokens = instruction_tokens + field_tokens + elif _text.startswith("Edit Speech"): + # Always use phoneme tokenizer for edit speech + instruction_tokens = self._get_text_tokens("Edit Speech") + field_tokens = self._get_phoneme_tokens(_text[len("Edit Speech") :].strip()) + field_tokens = instruction_tokens + field_tokens + elif _text.startswith("TEXT CONTEXT:"): + # Speaker id conditioning + field_tokens = self._get_text_tokens(_text) + # pad field tokens to fixed length + # assert self.context_duration_min == self.context_duration_max, "TEXT CONTEXT only supports fixed context duration" + # To keep context length the same for audio or tex context + # _fixed_context_len = int(self.context_duration_min * self.codebook_fps) + field_tokens = field_tokens + [self.tokenizer.eos_id] + else: + # if starts with Text to speech this + field_tokens = 
self._get_text_tokens(field_data.strip(" ")) # list of ids + elif doc[f"{field}_type"] == 'SPEECH': + dur = -1 + if f"{field}_duration" in doc: + dur = doc[f"{field}_duration"] + field_tokens = self._get_speech_tokens(field_data, dur) # list of ids + if not isinstance(field_tokens, list): + field_tokens = [field_tokens] + elif doc[f"{field}_type"] == 'AUDIOCODEC': + reference_codec_paths = field_data.split(";") + reference_codec_path = rng.choice(reference_codec_paths) + if self.codec_folder is not None: + reference_codec_path = self.codec_folder / reference_codec_path + field_tokens = torch.load(reference_codec_path).long() + field_tokens[0] = (field_tokens[0] + self.speech_offset).long() + field_tokens = [field_tokens] + # print("AUDIOCODEC", field_tokens.shape) + elif doc[f"{field}_type"] == 'REFSPEAKERCODEC': + reference_codec_paths = field_data.split(";") + reference_codec_path = rng.choice(reference_codec_paths) + if self.codec_folder is not None: + reference_codec_path = self.codec_folder / reference_codec_path + field_tokens = torch.load(reference_codec_path).long() + field_tokens[0] = (field_tokens[0] + self.speech_offset).long() + _min_len = int(self.context_duration_min * self.codebook_fps) + _max_len = int(self.context_duration_max * self.codebook_fps) + reference_codec_len = rng.randint(_min_len, _max_len) + reference_codec_len = min(reference_codec_len, field_tokens.shape[1]) + si = rng.randint(0, field_tokens.shape[1] - reference_codec_len) + field_tokens = field_tokens[:, si : si + reference_codec_len] + if self.context_pattern == "delay_parallel": + field_tokens = torch.cat( + [ + torch.zeros(self.num_speech_codebooks, self.num_speech_codebooks).long(), + field_tokens, + torch.zeros(self.num_speech_codebooks, self.num_speech_codebooks).long(), + ], + dim=1, + ) + new_field_tokens = [] + for _c in range(self.num_speech_codebooks): + st = self.num_speech_codebooks - _c + et = field_tokens.shape[1] - _c + new_field_tokens.append(field_tokens[_c, st:et]) + field_tokens = torch.stack(new_field_tokens, dim=0) + field_tokens = [field_tokens] + elif doc[f"{field}_type"] == 'DUMMYCONTEXT': + field_tokens = torch.zeros(self.num_speech_codebooks, 1).long() + return [field_tokens] + elif doc[f"{field}_type"] == 'CONTEXTANSWER': + # Both Context and Answer are in the field + context_info, answer_codec_path = field_data.split(";") + if self.codec_folder is not None: + context_codec_path = self.codec_folder / context_info + answer_codec_path = self.codec_folder / answer_codec_path + if context_info.startswith("TEXT CONTEXT:"): + context_tokens = self._get_text_tokens(context_info.strip(" ")) + # pad field tokens to fixed length + assert ( + self.context_duration_min == self.context_duration_max + ), "TEXT CONTEXT only supports fixed context duration" + _fixed_context_len = int(self.context_duration_min * self.codebook_fps) + context_tokens = context_tokens + [self.tokenizer.pad_id] * (_fixed_context_len - len(context_tokens)) + + answer_tokens = torch.load(answer_codec_path).long() + answer_tokens[0] = (answer_tokens[0] + self.speech_offset).long() + field_tokens = context_tokens + [self.tokenizer.pad_id] + [answer_tokens] + else: + context_tokens = torch.load(context_codec_path).long() + context_tokens[0] = (context_tokens[0] + self.speech_offset).long() + assert ( + self.context_duration_min == self.context_duration_max + ), "CONTEXTANSWER only supports fixed context duration" + reference_codec_len = int(self.context_duration_min * self.codebook_fps) + if context_tokens.shape[1] < 
reference_codec_len: + # Repeat the context to match the reference_codec_len + context_tokens = torch.cat( + [context_tokens] * (reference_codec_len // context_tokens.shape[1] + 1), dim=1 + ) + assert ( + context_tokens.shape[1] >= reference_codec_len + ), "CONTEXTANSWER context duration is less than min duration {} {} {}".format( + context_tokens.shape[1], reference_codec_len, context_codec_path + ) + si = rng.randint(0, context_tokens.shape[1] - reference_codec_len) + context_tokens = context_tokens[:, si : si + reference_codec_len] + + answer_tokens = torch.load(answer_codec_path).long() + answer_tokens[0] = (answer_tokens[0] + self.speech_offset).long() + pad_tokens = torch.zeros(self.num_speech_codebooks, 1).long() + # padding between context and answer + field_tokens = torch.cat([context_tokens, pad_tokens, answer_tokens], dim=1) + field_tokens = [field_tokens] + elif doc[f"{field}_type"] == 'SEPARATIONCODECS': + mixed_codec_path, reference_codec_paths = field_data.split(",") + reference_codec_paths = reference_codec_paths.split(";") + reference_codec_path = rng.choice(reference_codec_paths) + mixed_codec = torch.load(mixed_codec_path).long() + reference_codec = torch.load(reference_codec_path).long() + reference_codec_len = rng.randint(240, 400) + reference_codec = reference_codec[:, :reference_codec_len] + # MIXED AUDIO AND REF AUDIO ARE SEPARATED BY 8 TIMESTEPS OF 1023 TOKENS IN ALL CODEBOOKS + mask_tokens = (torch.ones(self.num_speech_codebooks, self.num_speech_codebooks) * 1023).long() + field_tokens = torch.cat([mixed_codec, mask_tokens, reference_codec], dim=1) + field_tokens[0] = (field_tokens[0] + self.speech_offset).long() + field_tokens = [field_tokens] + elif doc[f"{field}_type"] == 'EDITINGCODECS': + reference_audio_path = field_data + reference_codec = torch.load(reference_audio_path).long() + assert reference_codec.shape[1] > 80 # ensure reference audio is atleast 1 second + mask_len = rng.randint(40, 320) # ~0.5 second to 4 seconds + mask_len = min(mask_len, reference_codec.shape[1] - 80) + mask_start = rng.randint(0, reference_codec.shape[1] - mask_len) + mask_end = mask_start + mask_len + mask_tokens = (torch.ones(self.num_speech_codebooks, self.num_speech_codebooks) * 1023).long() + seg1 = reference_codec[:, :mask_start] + seg2 = reference_codec[:, mask_end:] + field_tokens = torch.cat([seg1, mask_tokens, seg2], dim=1) + # MISSING AUDIO IS REPLACED WITH 8 TIMESTEPS OF 1023 TOKENS IN ALL CODEBOOKS + field_tokens[0] = (field_tokens[0] + self.speech_offset).long() + field_tokens = [field_tokens] + else: + raise Exception(f"{field}_type not recognized") + return field_tokens + + def _insert_data_in_template(self, prompt_template_fields, doc, answer_field): + """Format the input example according to the template""" + out_dict = {} + for field in prompt_template_fields: + # discard the last one, {label} / {answer} + # Or if some fields from the template aren't present, e.g. 
{answer} during inference + # just remove that field from the template, leaving the space blank + if field == answer_field or field not in doc.keys(): + continue + # out_dict[field] = "" + + elif field in doc.keys(): + field_data = doc[field] + if f"{field}_type" not in doc.keys(): + doc[f"{field}_type"] = "TEXT" + raise Exception(f"{field}_type does not exist in doc") + else: + out_dict[field] = self._get_tokens(doc, field, field_data) + return out_dict + + def get_position_ids(self, virtual_token, context_and_qquestion): + enc_input = [] + enc_input.append(virtual_token) + if context_and_qquestion.dim() > 2: + enc_input.append(context_and_qquestion[:, 0, :]) + else: + enc_input.append(context_and_qquestion) + + enc_input = torch.cat(enc_input, dim=1) + + enc_input_p = enc_input[:, 0, :] if enc_input.dim() == 3 else enc_input + return build_position_ids(enc_input_p).contiguous() + + def collate_fn(self, batch): + """Prepares enc_input, dec_input, labels, loss_mask, enc_mask, dec_mask, position_ids, taskname_ids for global batch""" + + data_dict = self.pad_batch_and_build_loss_mask(batch) + + if self.encoder_type == "multi_transformer": + position_ids = [ + self.get_position_ids(data_dict['virtual_tokens'], data_dict['context_and_question_tokens'][0]), + self.get_position_ids(data_dict['virtual_tokens'], data_dict['context_and_question_tokens'][1]), + ] + else: + position_ids = self.get_position_ids(data_dict['virtual_tokens'], data_dict['context_and_question_tokens']) + + return ( + data_dict['virtual_tokens'], + data_dict['context_and_question_tokens'], + data_dict['enc_mask'], + data_dict['dec_input'], + data_dict['dec_input_mask'], + data_dict['dec_labels'], + data_dict['dec_labels_mask'], + position_ids, + data_dict['taskname_id'], + data_dict['speech_mask'], + data_dict['context_and_question_tokens_lens'], + data_dict['cross_attention_prior'], + data_dict['text_limits'], + data_dict['lang'], + data_dict['question_texts'], + ) + + def pad_batch_and_build_loss_mask(self, batch): + """Pad enc_input, dec_input, labels in batch to max batch length while building loss_mask, enc_mask, and dec_mask""" + ( + taskname_ids, + _, + virtual_tokens_len, + _, + _, + context_and_question_tokens_len, + _, + dec_input_len, + _, + dec_labels_len, + _, + _, + _, + question_texts, + ) = zip(*batch) + + taskname_ids = self.pad_taskname_ids(taskname_ids) + + max_virtual_tokens_len = max(virtual_tokens_len).item() if virtual_tokens_len is not None else 0 + if isinstance(virtual_tokens_len, tuple): + virtual_tokens_len = torch.stack(virtual_tokens_len) + virtual_mask = get_mask_from_lengths(virtual_tokens_len) + + if self.encoder_type == "multi_transformer": + max_context_len = ( + max(_c[0] for _c in context_and_question_tokens_len) + if context_and_question_tokens_len is not None + else 0 + ) + max_question_len = ( + max(_c[1] for _c in context_and_question_tokens_len) + if context_and_question_tokens_len is not None + else 0 + ) + max_context_and_question_tokens_len = [max_context_len, max_question_len] + context_len = torch.stack([_c[0] for _c in context_and_question_tokens_len]) + question_len = torch.stack([_c[1] for _c in context_and_question_tokens_len]) + context_mask = get_mask_from_lengths(context_len) + question_mask = get_mask_from_lengths(question_len) + context_and_question_tokens_len = [context_len, question_len] + context_and_question_mask = [context_mask, question_mask] + enc_mask = [ + torch.cat([virtual_mask, context_and_question_mask[0]], dim=1), + torch.cat([virtual_mask, 
context_and_question_mask[1]], dim=1), + ] + # import ipdb; ipdb.set_trace() + else: + max_context_and_question_tokens_len = ( + max(context_and_question_tokens_len).item() if context_and_question_tokens_len is not None else 0 + ) + if isinstance(context_and_question_tokens_len, tuple): + context_and_question_tokens_len = torch.stack(context_and_question_tokens_len) + context_and_question_mask = get_mask_from_lengths(context_and_question_tokens_len) + enc_mask = torch.cat([virtual_mask, context_and_question_mask], dim=1) + + max_dec_input_len = max(dec_input_len).item() if dec_input_len is not None else 0 + max_dec_labels_len = max(dec_labels_len).item() if dec_labels_len is not None else 0 + + ( + virtual_tokens_list, + context_question_tokens_list, + dec_input_list, + dec_input_mask_list, + dec_labels_list, + dec_labels_mask_list, + speech_mask_list, + cross_attention_prior_list, + text_limits, + lang_list, + ) = ( + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) + + for i, sample_tuple in enumerate(batch): + ( + _, + virtual_token, + virtual_token_len, + context_token_len, + context_and_question_token, + context_and_question_token_len, + dec_input, + dec_input_len, + dec_label, + dec_label_len, + is_speech, + cross_attention_prior, + lang, + _, + ) = sample_tuple + + virtual_tokens_list.append( + general_padding( + virtual_token, virtual_token_len.item(), max_virtual_tokens_len, pad_value=self.tokenizer.pad_id + ) + ) + + if self.encoder_type == "multi_transformer": + context_tokens_padded = general_padding( + context_and_question_token[0], + context_and_question_token_len[0].item(), + max_context_and_question_tokens_len[0], + pad_value=self.tokenizer.pad_id, + ) + if len(context_tokens_padded.shape) < 2: + context_tokens_padded = pad_text_to_speech_dims( + context_tokens_padded, self.tokenizer.pad_id, self.num_speech_codebooks - 1 + ) + question_tokens_padded = general_padding( + context_and_question_token[1], + context_and_question_token_len[1].item(), + max_context_and_question_tokens_len[1], + pad_value=self.tokenizer.pad_id, + ) + if len(question_tokens_padded.shape) < 2: + question_tokens_padded = pad_text_to_speech_dims( + question_tokens_padded, self.tokenizer.pad_id, self.num_speech_codebooks - 1 + ) + context_question_tokens_list.append([context_tokens_padded, question_tokens_padded]) + else: + # This means context and questions are concatenated together + context_tokens_padded = general_padding( + context_and_question_token, + context_and_question_token_len.item(), + max_context_and_question_tokens_len, + pad_value=self.tokenizer.pad_id, + ) + if len(context_tokens_padded.shape) < 2: + context_tokens_padded = pad_text_to_speech_dims( + context_tokens_padded, self.tokenizer.pad_id, self.num_speech_codebooks - 1 + ) + context_question_tokens_list.append(context_tokens_padded) + + if max_dec_input_len > 0: + dec_input_padded = general_padding( + dec_input, dec_input_len.item(), max_dec_input_len, pad_value=self.tokenizer.pad_id + ) + if len(dec_input_padded.shape) < 2: + dec_input_padded = pad_text_to_speech_dims( + dec_input_padded, self.tokenizer.pad_id, self.num_speech_codebooks - 1 + ) + dec_input_list.append(dec_input_padded) + dec_mask = ( + torch.as_tensor(([1] * dec_input_len) + ([0] * (max_dec_input_len - dec_input_len))) + .long() + .contiguous() + ) + dec_input_mask_list.append(dec_mask) + speech_mask = dec_mask if is_speech else torch.zeros(dec_mask.shape) + speech_mask_list.append(speech_mask) + + if max_dec_labels_len > 0: + loss_mask = ( + 
torch.as_tensor(([1] * dec_label_len) + ([0] * (max_dec_labels_len - dec_label_len))) + .long() + .contiguous() + ) + dec_label_padded = general_padding( + dec_label, dec_label_len.item(), max_dec_labels_len, pad_value=self.tokenizer.pad_id + ) + if len(dec_label_padded.shape) < 2: + dec_label_padded = pad_text_to_speech_dims( + dec_label_padded, self.tokenizer.pad_id, self.num_speech_codebooks - 1 + ) + dec_labels_list.append(dec_label_padded) + dec_labels_mask_list.append(loss_mask) + + _p0 = max_dec_labels_len - dec_label_len + if self.encoder_type == "multi_transformer": + _p1 = ( + max_virtual_tokens_len + + max_context_and_question_tokens_len[1] + - context_and_question_token_len[1] + - virtual_token_len + ) + else: + _p1 = ( + max_virtual_tokens_len + + max_context_and_question_tokens_len + - context_and_question_token_len + - virtual_token_len + ) + + cross_attention_prior_padded = torch.nn.functional.pad( + cross_attention_prior, + pad=(0, _p1, 0, _p0), + mode="constant", + value=1, + ) + cross_attention_prior_list.append(cross_attention_prior_padded) + + if self.encoder_type == "multi_transformer": + _start_of_text_id = virtual_token_len + 4 + _end_of_text_id = _start_of_text_id + ( + context_and_question_token_len[1] - 2 - 4 + ) # -2 for some end tokens + else: + _start_of_text_id = virtual_token_len + context_token_len + 4 + _end_of_text_id = _start_of_text_id + ( + context_and_question_token_len - context_token_len - 2 - 4 + ) # -2 for some end tokens + text_limits.append(torch.tensor([_start_of_text_id.item(), _end_of_text_id.item()])) + lang_list.append(torch.tensor(lang)) + + dec_labels_mask = torch.stack(dec_labels_mask_list) if len(dec_labels_mask_list) > 0 else None + if dec_labels_mask is not None and self.context_conditioning == 'decoder': + # Mask out context tokens from loss computation. +1 for bos/pad in the beginning + dec_labels_mask[:, : self.decoder_context_len + 1] = 0 + + if self.encoder_type == "multi_transformer": + context_batch = torch.stack([c[0] for c in context_question_tokens_list]) + question_batch = torch.stack([c[1] for c in context_question_tokens_list]) + context_and_question_tokens = [context_batch, question_batch] + else: + context_and_question_tokens = torch.stack(context_question_tokens_list) + + data_dict = { + "taskname_id": taskname_ids, + "virtual_tokens": torch.stack(virtual_tokens_list), + "context_and_question_tokens": context_and_question_tokens, + "enc_mask": enc_mask, + "dec_input": torch.stack(dec_input_list) if len(dec_input_list) > 0 else None, + "dec_input_mask": torch.stack(dec_input_mask_list) if len(dec_input_mask_list) > 0 else None, + "dec_labels": torch.stack(dec_labels_list) if len(dec_labels_list) > 0 else None, + "dec_labels_mask": dec_labels_mask, + "speech_mask": torch.stack(speech_mask_list) if len(speech_mask_list) > 0 else None, + "context_and_question_tokens_lens": context_and_question_tokens_len, + "cross_attention_prior": ( + torch.stack(cross_attention_prior_list) if len(cross_attention_prior_list) > 0 else None + ), + "text_limits": ( + torch.stack(text_limits) if len(text_limits) > 0 else None + ), # tensor, valid range of answer transcripts without virtual/instruction/end tokens. 
+ "lang": torch.stack(lang_list), + "question_texts": question_texts, + } + + return data_dict diff --git a/nemo/collections/tts/data/speechllm/t5_speechllm_tarred_dataset.py b/nemo/collections/tts/data/speechllm/t5_speechllm_tarred_dataset.py new file mode 100644 index 000000000000..9b0a4f8d06c2 --- /dev/null +++ b/nemo/collections/tts/data/speechllm/t5_speechllm_tarred_dataset.py @@ -0,0 +1,986 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +import os +import random +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import torch +import webdataset as wd +from omegaconf import OmegaConf + +from nemo.collections.asr.data.audio_to_text import ( + _speech_collate_fn, + cache_datastore_manifests, + expand_sharded_filepaths, + shard_manifests_if_needed, +) +from nemo.collections.common.parts.preprocessing import collections +from nemo.collections.nlp.models.language_modeling.megatron_t5_model import T5Sentinel +from nemo.collections.nlp.modules.common import VirtualPromptSource +from nemo.collections.nlp.modules.common.megatron.utils import build_position_ids +from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths +from nemo.collections.tts.parts.utils.tts_dataset_utils import beta_binomial_prior_distribution, general_padding +from nemo.core.classes import IterableDataset +from nemo.utils import logging + +__all__ = ['T5SpeechLMTarredDataset'] + + +@dataclass +class G2PConfig: + _target_: str = "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p" + phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" + heteronyms: str = "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: float = 0.5 + + +@dataclass +class TextTokenizer: + _target_: str = "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" + punct: bool = True + stresses: bool = True + chars: bool = True + apostrophe: bool = True + pad_with_space: bool = True + add_blank_at: bool = True + g2p: G2PConfig = G2PConfig() + + +@dataclass +class TextTokenizerConfig: + text_tokenizer: TextTokenizer = TextTokenizer() + + +def _get_default_text_tokenizer_conf(): + text_tokenizer: TextTokenizerConfig = TextTokenizerConfig() + return OmegaConf.create(OmegaConf.to_yaml(text_tokenizer)) + + +def pad_text_to_speech_dims(text_tensor, pad_id): + token_len = text_tensor.shape[0] + empty_padding = torch.ones((7, token_len), dtype=text_tensor.dtype, device=text_tensor.device) * pad_id + return torch.cat((text_tensor.unsqueeze(0), empty_padding), dim=0) + + +class InstructionTuningManifestProcessor: + """ + Class that processes a manifest json file containing paths to audio files, transcripts, and durations (in seconds). + Each new line is a different sample. Example below: + {"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147} + ... 
+ {"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": + "utterance_id", "ctm_utt": "en_4156", "side": "A"} + Args: + manifest_filepath: Path to manifest json as described above. Can be comma-separated paths. + parser: Str for a language specific preprocessor or a callable. + max_duration: If audio exceeds this length, do not include in dataset. + min_duration: If audio is less than this length, do not include in dataset. + max_utts: Limit number of utterances. + bos_id: Id of beginning of sequence symbol to append if not None. + eos_id: Id of end of sequence symbol to append if not None. + pad_id: Id of pad symbol. Defaults to 0. + """ + + def __init__( + self, + manifest_filepath: str, + max_duration: Optional[float] = None, + min_duration: Optional[float] = None, + max_seq_length: Optional[float] = None, + max_utts: int = 0, + index_by_file_id: bool = False, + decoder_only_model: bool = False, + use_phoneme_tokenizer: bool = False, + ): + + # ASRAudioText( + self.collection = collections.InstructionTuningAudioText( + manifests_files=manifest_filepath, + min_duration=min_duration, + max_duration=max_duration, + max_seq_length=max_seq_length, + max_number=max_utts, + index_by_file_id=index_by_file_id, + decoder_only_model=decoder_only_model, + use_phoneme_tokenizer=use_phoneme_tokenizer, + ) + + +class _TarredInstructionTuningDataset(IterableDataset): + """ + A similar Dataset to the AudioToCharDataset/AudioToBPEDataset, but which loads tarred audio files. + """ + + def __init__( + self, + audio_tar_filepaths: Union[str, List[str]], + manifest_filepath: str, + sample_rate: int, + shuffle_n: int = 0, + min_duration: Optional[float] = None, + max_duration: Optional[float] = None, + max_seq_length: Optional[float] = None, + shard_strategy: str = "scatter", + shard_manifests: bool = False, + global_rank: int = 0, + world_size: int = 0, + return_sample_id: bool = False, + decoder_only_model: bool = False, + use_phoneme_tokenizer: bool = False, + ): + self.shard_manifests = shard_manifests + + # Shard manifests if necessary and possible and then expand the paths + manifest_filepath = shard_manifests_if_needed( + shard_manifests=shard_manifests, + shard_strategy=shard_strategy, + manifest_filepaths=manifest_filepath, + world_size=world_size, + global_rank=global_rank, + ) + + # If necessary, cache manifests from object store + cache_datastore_manifests(manifest_filepaths=manifest_filepath) + + self.manifest_processor = InstructionTuningManifestProcessor( + manifest_filepath=manifest_filepath, + max_duration=max_duration, + min_duration=min_duration, + max_seq_length=max_seq_length, + max_utts=0, + index_by_file_id=True, # Must set this so the manifest lines can be indexed by file ID + decoder_only_model=decoder_only_model, + use_phoneme_tokenizer=use_phoneme_tokenizer, + ) + + self.len = self._compute_len() + self.return_sample_id = return_sample_id + + audio_tar_filepaths = expand_sharded_filepaths( + sharded_filepaths=audio_tar_filepaths, + shard_strategy=shard_strategy, + world_size=world_size, + global_rank=global_rank, + ) + + if shuffle_n > 0: + # Only shuffle training data tar files + logging.info("Shuffling Tar files") + custom_rng = random.Random() + custom_rng.shuffle(audio_tar_filepaths) + logging.info("Done shuffling Tar files") + logging.info(audio_tar_filepaths[:10]) + + self.sample_rate = sample_rate + + # Put together WebDataset + self._dataset = wd.WebDataset(urls=audio_tar_filepaths, nodesplitter=None) + + if shuffle_n 
> 0: + self._dataset = self._dataset.shuffle(shuffle_n) + else: + logging.info("WebDataset will not shuffle files within the tar files.") + + self._dataset = ( + self._dataset.rename(key='__key__', answer='pt', context='context.pt') + .to_tuple('key', 'answer', 'context') + .pipe(self._filter) + .pipe(self._loop_offsets) + .map(f=self._build_sample) + ) + + def _filter(self, iterator): + """This function is used to remove samples that have been filtered out by ASRAudioText already. + Otherwise, we would get a KeyError as _build_sample attempts to find the manifest entry for a sample + that was filtered out (e.g. for duration). + Note that if using multi-GPU training, filtering may lead to an imbalance in samples in each shard, + which may make your code hang as one process will finish before the other. + """ + + class TarredAudioFilter: + def __init__(self, collection): + self.iterator = iterator + self.collection = collection + + def __iter__(self): + return self + + def __next__(self): + while True: + audio_filename, answer_bytes, context_bytes = next(self.iterator) + file_id, _ = os.path.splitext(os.path.basename(audio_filename)) + if file_id in self.collection.mapping: + return audio_filename, answer_bytes, context_bytes + + return TarredAudioFilter(self.manifest_processor.collection) + + def _loop_offsets(self, iterator): + """This function is used to iterate through utterances with different offsets for each file.""" + + class TarredAudioLoopOffsets: + def __init__(self, collection): + self.iterator = iterator + self.collection = collection + self.current_fn = None + self.current_bytes = None + self.current_context_bytes = None + self.offset_id = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.current_fn is None: + self.current_fn, self.current_bytes, self.current_context_bytes = next(self.iterator) + self.offset_id = 0 + else: + offset_list = self.collection.mapping[self.current_fn] + if len(offset_list) == self.offset_id + 1: + self.current_fn, self.current_bytes, self.current_context_bytes = next(self.iterator) + self.offset_id = 0 + else: + self.offset_id += 1 + + return self.current_fn, self.current_bytes, self.current_context_bytes, self.offset_id + + return TarredAudioLoopOffsets(self.manifest_processor.collection) + + def _collate_fn(self, batch): + return _speech_collate_fn(batch) + + def _build_sample(self, tup): + """Builds the training sample by combining the data from the WebDataset with the manifest info.""" + audio_filename, encodec, ref_encodec, offset_id = tup + return audio_filename, encodec, ref_encodec, offset_id + + def get_manifest_sample(self, sample_id): + return self.manifest_processor.collection[sample_id] + + def __iter__(self): + return self._dataset.__iter__() + + def _compute_len(self): + if self.shard_manifests and torch.distributed.is_available() and torch.distributed.is_initialized(): + my_len = torch.tensor(len(self.manifest_processor.collection), dtype=torch.int32).cuda() + torch.distributed.all_reduce(my_len) + my_len = my_len.int() + logging.info(f'Sharded manifests: Total length: {my_len}') + else: + my_len = len(self.manifest_processor.collection) + + return my_len + + def __len__(self): + return self.len + + +class T5SpeechLMTarredDataset(_TarredInstructionTuningDataset): + """ + The dataset class for prompt-tuning or p-tuning pretrained T5 SpeechLM models. 
+ """ + + def __init__( + self, + audio_tar_filepaths: Union[str, List[str]], + manifest_filepath: str, + tokenizer, + virtual_prompt_source: VirtualPromptSource, + task_templates: dict, + pseudo_tokens, + pad_token_id: str, + max_seq_length: int, + sample_rate: int, + shuffle_n: int = 0, + min_seq_length: int = 1, + add_bos: bool = False, + add_eos: bool = True, + for_train: bool = True, + decoder_starts_with_pad: bool = False, + add_eos_to_decoder_output: bool = True, + add_sentinel_to_input: bool = True, + ul2_prompt_token: str = None, + segment_max_duration: Optional[int] = None, + trim: bool = False, + trim_ref: Optional[float] = None, + trim_top_db: Optional[int] = None, + trim_frame_length: Optional[int] = None, + trim_hop_length: Optional[int] = None, + pad_multiple: int = 1, + pitch_augment: bool = False, + speech_offset: Optional[int] = None, + train_task: Optional[str] = None, + seq_pattern: Optional[str] = "parallel", + shard_strategy: str = "scatter", + shard_manifests: bool = False, + global_rank: int = 0, + world_size: int = 0, + return_sample_id: bool = False, + decoder_only_model: bool = False, + use_phoneme_tokenizer: Optional[bool] = False, + lm_vocab_size: Optional[int] = None, + use_attention_prior: Optional[bool] = False, + attention_prior_scaling_factor: Optional[float] = 1.0, + cross_attention_epsilon: Optional[float] = 0.0, + num_speech_codebooks: Optional[int] = 8, + **kwargs, + ): + """ + Only speech parameters are explained here. + segment_max_duration: Optional[int] = None, - Speech max segment duration + trim: bool = False, - speech parameter + trim_ref: Optional[float] = None, - speech parameter + trim_top_db: Optional[int] = None, - speech parameter + trim_frame_length: Optional[int] = None, - speech parameter + trim_hop_length: Optional[int] = None, - speech parameter + pad_multiple: int = 1, - speech parameter + pitch_augment: bool = False, - speech parameter + speech_offset: Optional[int] = None, - if speech tokens then add this offset to the token indices to distinguish between text and speech tokens. + **kwargs, + """ + # These two variables need to be set before calling super().__init__() because the parent class calls `load_data()` which requires these attributes. 
+ self.decoder_starts_with_pad = decoder_starts_with_pad + self.add_eos_to_decoder_output = add_eos_to_decoder_output + self.add_sentinel_to_input = add_sentinel_to_input + self.ul2_prompt_token = ul2_prompt_token + # Speech related variables + # self.encodec_model = EncodecModel.encodec_model_24khz() + # self.encodec_model.set_target_bandwidth(6.0) + self.base_data_dir = None + self.segment_max_duration = segment_max_duration + self.sample_rate = sample_rate + # self.featurizer = WaveformFeaturizer(sample_rate=self.sample_rate) + self.pad_multiple = pad_multiple + self.pitch_augment = pitch_augment + self.trim = trim + self.trim_ref = trim_ref if trim_ref is not None else np.max + self.trim_top_db = trim_top_db if trim_top_db is not None else 60 + self.trim_frame_length = trim_frame_length if trim_frame_length is not None else 2048 + self.trim_hop_length = trim_hop_length if trim_hop_length is not None else 512 + self.speech_offset = speech_offset if speech_offset is not None else 3 + self.seq_pattern = seq_pattern + self.min_duration = kwargs.get('min_duration', 0.1) + self.max_duration = kwargs.get('max_duration', 20) + self.use_attention_prior = use_attention_prior + self.attention_prior_scaling_factor = attention_prior_scaling_factor + self.cross_attention_epsilon = cross_attention_epsilon # value of prior for context tokens (b/w 0 and 1) + assert self.cross_attention_epsilon >= 0.0 and self.cross_attention_epsilon <= 1.0 + + self.train_task = train_task + + # Initialized super part + self.tokenizer = tokenizer + self.virtual_prompt_source = virtual_prompt_source + self.task_templates = task_templates + self.pseudo_tokens = pseudo_tokens + self.pseudo_token_ids = set(self.tokenizer.tokens_to_ids(self.pseudo_tokens)) + self.pad_token_id = pad_token_id + self.max_seq_length = max_seq_length + self.min_seq_length = min_seq_length + self.add_bos = add_bos + self.add_eos = add_eos + self.for_train = for_train + self.use_phoneme_tokenizer = use_phoneme_tokenizer + self.examples = [] + self.lm_vocab_size = tokenizer.vocab_size if lm_vocab_size is None else lm_vocab_size + self.num_speech_codebooks = num_speech_codebooks + + assert self.min_seq_length <= max_seq_length, "Min sequence length should be less than or equal to max" + assert self.max_seq_length > 0, "Max sequence length should be greater than 0" + + self.context_length = kwargs.pop('context_length', None) # only used in gpt dataset atm + + logging.info("Loading and tokenizing dataset ... 
") + + super().__init__( + audio_tar_filepaths=audio_tar_filepaths, + manifest_filepath=manifest_filepath, + sample_rate=sample_rate, + shuffle_n=shuffle_n, + min_duration=self.min_duration, + max_duration=self.max_duration, + max_seq_length=max_seq_length, + shard_strategy=shard_strategy, + shard_manifests=shard_manifests, + global_rank=global_rank, + world_size=world_size, + return_sample_id=return_sample_id, + decoder_only_model=decoder_only_model, + use_phoneme_tokenizer=use_phoneme_tokenizer, + ) + + self.encodec, self.ref_encodec = None, None + + def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): + """Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers""" + total_inserted_tokens = 0 + + for idx in range(len(virtual_token_splits)): + split_start = total_inserted_tokens + split_end = total_inserted_tokens + virtual_token_splits[idx] + pseudo_tokens_for_split = "".join(self.pseudo_tokens[split_start:split_end]) + input_example = input_example.replace(f'<|VIRTUAL_PROMPT_{idx}|>', pseudo_tokens_for_split) + total_inserted_tokens = split_end + + return input_example + + def pad_taskname_ids(self, taskname_ids): + # Pad taskname_ids to be the same length for the prompt encoder + if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER: + max_taskname_length = max(len(ids) for ids in taskname_ids) + taskname_ids = [ids + [self.pad_token_id] * (max_taskname_length - len(ids)) for ids in taskname_ids] + taskname_ids = torch.tensor(taskname_ids) + + # Task ids are just used for a look up embeddings for prompt-table + elif self.virtual_prompt_source == VirtualPromptSource.NO_PROMPT: + taskname_ids = torch.tensor(taskname_ids) + + return taskname_ids + + def _build_sample(self, tup): + audio_filename, self.encodec, self.ref_encodec, offset_id = tup + + file_id, _ = os.path.splitext(os.path.basename(audio_filename)) + manifest_idx = self.manifest_processor.collection.mapping[file_id][offset_id] + manifest_entry = self.manifest_processor.collection[manifest_idx] + doc = {} + doc['context'] = manifest_entry.context + doc['context_type'] = manifest_entry.context_type + doc['context_duration'] = manifest_entry.context_duration + doc['answer'] = manifest_entry.answer + doc['answer_type'] = manifest_entry.answer_type + doc['answer_duration'] = manifest_entry.answer_duration + doc['question'] = manifest_entry.question + doc['question_type'] = manifest_entry.question_type + + taskname = "squad" + prompt_template = self.task_templates[taskname]["prompt_template"] + prompt_template_fields = self.task_templates[taskname]["prompt_template_fields"] + virtual_token_splits = self.task_templates[taskname]["virtual_token_splits"] + answer_field = self.task_templates[taskname]["answer_field"] + + input_example = prompt_template + + question_in_manifest = manifest_entry.question + + # Format the input example according to the template + # Get context, question and answer codes in a dict. + input_dict = self._insert_data_in_template(input_example, prompt_template_fields, doc, answer_field) + context_tokens = input_dict['context'] + question_tokens = input_dict['question'] + + # Logic to prune context + # In case of TTS task, the entire reference speech is not required, so we randomly select a portion + # of the reference audio. + # In case of Next token prediction, We want context[:T] to go in the encoder and context[T+1:] to be + # predicted by the decoder. 
+ start_token_index = 0 + end_token_index = -1 + if "Text to speech this" in question_in_manifest: + total_context_len = context_tokens[0].size()[1] + reduced_len = min( + 400, + ( + int(total_context_len * 0.2) + if total_context_len > 600 + else int(total_context_len * random.uniform(0.2, 0.5)) + ), + ) + start_token_index = random.randint( + 0, total_context_len - reduced_len + ) # start index can be greater than 440 + context_tokens[0] = context_tokens[0][ + :, start_token_index : min(start_token_index + 440, start_token_index + reduced_len) + ] + elif "Next token prediction" in question_in_manifest: + total_context_len = context_tokens[0].size()[1] + end_token_index = int(total_context_len * random.uniform(0.01, 0.2)) + context_tokens[0] = context_tokens[0][:, :end_token_index] + + # Get virtual tokens + virtual_tokens = self._insert_virtual_token_placeholders(input_example.split(' ')[0], virtual_token_splits) + + # a trick to align with the data format in t5 pretraining + # new + virtual_tokens = self.tokenizer.text_to_ids(virtual_tokens) + if self.add_sentinel_to_input: + question_tokens = question_tokens + self.tokenizer.text_to_ids(T5Sentinel.FIRST.value) + + # Add BOS/EOS to the input of encoder if desired, adds EOS by default + if self.ul2_prompt_token is not None: + ul2_prompt_token_id = self.tokenizer.text_to_ids(self.ul2_prompt_token) + assert len(ul2_prompt_token_id) == 1 + context_tokens = ul2_prompt_token_id + context_tokens + if self.add_bos: + context_tokens = [self.tokenizer.bos_id] + context_tokens + if self.add_eos: + question_tokens = question_tokens + [self.tokenizer.eos_id] + + # Try to truncate input text to fit into the max sequence length + if self._get_len(context_tokens, question_tokens, virtual_tokens) > self.max_seq_length: + context_tokens, question_tokens, virtual_tokens = self._truncate_input_speech( + context_tokens, question_tokens, virtual_tokens + ) + + virtual_tokens, virtual_tokens_len = self.list_to_tensor(virtual_tokens) + context_tokens, context_tokens_len = self.list_to_tensor(context_tokens) + question_tokens, question_tokens_len = self.list_to_tensor(question_tokens) + + if doc["question_type"] != "SPEECH" and doc["context_type"] == "SPEECH": + question_tokens = pad_text_to_speech_dims(question_tokens, self.tokenizer.pad_id) + if doc["context_type"] != "SPEECH" and doc["question_type"] == "SPEECH": + context_tokens = pad_text_to_speech_dims(context_tokens, self.tokenizer.pad_id) + context_tokens = context_tokens.to(question_tokens.device) + context_and_question_tokens = torch.cat([context_tokens, question_tokens], dim=1) + + # get answer ids + if answer_field in doc.keys(): # training and validation + answer_ids = self._get_tokens(doc, answer_field, doc[answer_field]) + if end_token_index > -1: + answer_ids[0] = answer_ids[0][:, end_token_index:] + + if self.decoder_starts_with_pad: + answer_text_ids = [self.tokenizer.pad_id] + else: + answer_text_ids = [self.tokenizer.bos_id] + + answer_text_ids += answer_ids + + if self.add_eos_to_decoder_output: + answer_text_ids += [self.tokenizer.eos_id] + else: + answer_text_ids += self.tokenizer.text_to_ids(T5Sentinel.END.value) + + # Skip example if the final length doesn't fit length requirements even after truncation + if ( + self.min_seq_length + <= self._get_element_len(context_and_question_tokens) + self._get_element_len(virtual_tokens) + <= self.max_seq_length + and self.min_seq_length <= self._get_element_len(answer_text_ids) <= self.max_seq_length + ): + if self.virtual_prompt_source == 
VirtualPromptSource.PROMPT_ENCODER: + taskname_id = self.tokenizer.text_to_ids(taskname) + elif ( + self.virtual_prompt_source == VirtualPromptSource.NO_PROMPT + ): # TODO (@adithyare) this class and GPTPromptLearningDataset should be merged. + taskname_id = -1 + else: + raise ValueError("Invalid virtual prompt source specified") + + dec_input = None + dec_labels = None + + if answer_field in doc.keys(): # training and validation + dec_input = answer_text_ids[:-1] + dec_labels = answer_text_ids[1:] + + dec_input, dec_input_len = self.list_to_tensor(dec_input, True) + dec_labels, dec_labels_len = self.list_to_tensor(dec_labels, True) + is_speech = True if doc["answer_type"] == "SPEECH" else False + if is_speech: + assert dec_input.dim() == 2 and dec_labels.dim() == 2 + if self.seq_pattern == "delay_parallel": + num_codebooks = dec_input.shape[0] + dec_input_padded = torch.cat( + [ + torch.zeros_like(dec_input[:, 0:num_codebooks]), + dec_input, + torch.zeros_like(dec_input[:, 0:num_codebooks]), + ], + dim=1, + ) + dec_labels_padded = torch.cat( + [ + torch.zeros_like(dec_labels[:, 0:num_codebooks]), + dec_labels, + torch.zeros_like(dec_labels[:, 0:num_codebooks]), + ], + dim=1, + ) + dec_input_new = [] + dec_labels_new = [] + for _c in range(self.num_speech_codebooks): + st = num_codebooks - _c + et_decoder_input = dec_input_padded.shape[1] - _c + et_decoder_labels = dec_labels_padded.shape[1] - _c + dec_input_new.append(dec_input_padded[_c, st:et_decoder_input]) + dec_labels_new.append(dec_labels_padded[_c, st:et_decoder_labels]) + dec_input = torch.stack(dec_input_new, dim=0) + dec_labels = torch.stack(dec_labels_new, dim=0) + dec_input_len = torch.tensor(dec_input.shape[1]).long() + dec_labels_len = torch.tensor(dec_labels.shape[1]).long() + + enc_len = context_tokens_len + question_tokens_len + virtual_tokens_len + # TODO: Remove hardcoding + num_question_offset = 4 # For "Text to Speech this" + + cross_attention_prior = torch.zeros(dec_labels_len, enc_len) + self.cross_attention_epsilon + if self.use_attention_prior: + cross_attention_question_prior = torch.from_numpy( + beta_binomial_prior_distribution( + question_tokens_len.item() - num_question_offset, + dec_labels_len.item(), + scaling_factor=self.attention_prior_scaling_factor, + ) + ) + cross_attention_prior[:, virtual_tokens_len + context_tokens_len + num_question_offset :] = ( + cross_attention_question_prior + ) + + return ( + taskname_id, + virtual_tokens, + virtual_tokens_len, + context_and_question_tokens, + context_tokens_len + question_tokens_len, + dec_input, + dec_input_len, + dec_labels, + dec_labels_len, + is_speech, + cross_attention_prior, + ) + else: + return None + + def _truncate_input_speech(self, context_tokens, question_tokens, virtual_tokens): + total_len = self._get_len(context_tokens, question_tokens, virtual_tokens) + context_len = self._get_element_len(context_tokens) + truncation_length = total_len - self.max_seq_length + 1 + context_tokens[0] = context_tokens[0][:, min(truncation_length, context_len) :] + return context_tokens, question_tokens, virtual_tokens + + def list_to_tensor(self, element, fill=False): + """ + Convert list to tensor. The list might contain integers, 2D-tensors (speech tokens) and combination of two. + If all of them are ints, simply convert to tensor + If combination of 2D-tensor and ints. Convert int to the dimension of the tensor. 
+ example: [2, 4, 5] -> torch.tensor([2, 4, 5]) + example: [2, torch.tensor([[4, 5, 6], [6, 7, 8]])] -> torch.tensor( [[-1, 4, 5, 6], [2, 6, 7, 8]] ) + """ + ret, ln = None, None + if element is None: + return ret, ln + + max_len = max([1 if isinstance(item, int) else len(item) for item in element]) + if max_len == 1: + ret = torch.as_tensor(element).long() + ln = torch.tensor(ret.size()[0]).long() + else: + ret = [] + for e in element: + if isinstance(e, int): + tmp = torch.full((8, 1), e if fill else -1) + tmp[7] = e + else: + tmp = e + ret.append(tmp) + ret = torch.cat(ret, dim=1) + ln = torch.tensor(ret.size()[1]).long() + return ret, ln + + def _get_text_tokens(self, text): + input_ids = self.tokenizer.text_to_ids(text) + return input_ids + + def _get_phoneme_tokens(self, text): + input_ids = phoneme_tokenizer.encode(text) + input_ids_adjusted = [_id + self.lm_vocab_size for _id in input_ids] + return input_ids_adjusted + + def _pad_wav_to_multiple(self, wav): + if self.pad_multiple > 1: + if wav.shape[0] % self.pad_multiple != 0: + wav = torch.cat( + [wav, torch.zeros(self.pad_multiple - wav.shape[0] % self.pad_multiple, dtype=torch.float)] + ) + return wav + + def _get_element_len(self, element): + length = 0 + if isinstance(element, list): + for e in element: + if isinstance(e, int): + length += 1 + else: + if e.dim() > 1: + length += e.size()[1] + else: + length += e.size()[0] + else: + if element.dim() > 1: + length += element.size()[1] + else: + length += element.size()[0] + return length + + def _get_len(self, context_tokens, question_tokens, virtual_tokens): + length = 0 + length += self._get_element_len(context_tokens) + length += self._get_element_len(question_tokens) + length += self._get_element_len(virtual_tokens) + return length + + def _get_speech_tokens(self, field): + + # Convert to codes + codec_codes, codec_codes_length = None, None # Codes + + if self.train_task == 'tts': + if field == 'context': + self.ref_encodec = torch.load(io.BytesIO(self.ref_encodec), map_location="cpu").long() + codec_codes = self.ref_encodec + elif field == 'answer': + self.encodec = torch.load(io.BytesIO(self.encodec), map_location="cpu").long() + codec_codes = self.encodec + elif self.train_task == 'asr': + if field == 'context': + self.ref_encodec = torch.load(io.BytesIO(self.ref_encodec), map_location="cpu").long() + codec_codes = self.ref_encodec + + # codec_codes_length = torch.tensor(codec_codes.shape[1]).long() + + # Convert codes to codes corresponding to megatron embedding layer + codec_codes[0] = (codec_codes[0] + self.speech_offset).long() + + return codec_codes + + def _get_tokens(self, doc, field, field_data): + if f"{field}_type" not in doc.keys(): + field_tokens = self._get_text_tokens(field_data.strip(" ")) # list of ids + elif doc[f"{field}_type"] == 'TEXT': + _text = field_data.strip(" ") + if self.use_phoneme_tokenizer: + instruction_tokens = self._get_text_tokens("Phoneme TTS") + field_tokens = self._get_phoneme_tokens(_text.replace("Text to speech this ", "")) + field_tokens = instruction_tokens + field_tokens + else: + field_tokens = self._get_text_tokens(_text) # list of ids + elif doc[f"{field}_type"] == 'SPEECH': + field_tokens = self._get_speech_tokens(field) # list of ids + if not isinstance(field_tokens, list): + field_tokens = [field_tokens] + elif doc[f"{field}_type"] == 'TOKENS': + # Do nothing; already tokenized + field_tokens = field_data + else: + raise Exception(f"{field}_type not recognized") + return field_tokens + + def _insert_data_in_template(self, 
input_example, prompt_template_fields, doc, answer_field): + """Format the input example according to the template""" + out_dict = {} + for field in prompt_template_fields: + # discard the last one, {label} / {answer} + # Or if some fields from the template aren't present, e.g. {answer} during inference + # just remove that field from the template, leaving the space blank + if field == answer_field or field not in doc.keys(): + continue + # out_dict[field] = "" + + elif field in doc.keys(): + field_data = doc[field] + if f"{field}_type" not in doc.keys(): + doc[f"{field}_type"] = "TEXT" + raise Exception(f"{field}_type does not exist in doc") + else: + out_dict[field] = self._get_tokens(doc, field, field_data) + return out_dict + + def get_position_ids(self, virtual_token, context_and_qquestion): + enc_input = [] + enc_input.append(virtual_token) + if context_and_qquestion.dim() > 2: + enc_input.append(context_and_qquestion[:, 0, :]) + else: + enc_input.append(context_and_qquestion) + + enc_input = torch.cat(enc_input, dim=1) + + enc_input_p = enc_input[:, 0, :] if enc_input.dim() == 3 else enc_input + return build_position_ids(enc_input_p).contiguous() + + def collate_fn(self, batch): + """Prepares enc_input, dec_input, labels, loss_mask, enc_mask, dec_mask, position_ids, taskname_ids for global batch""" + + data_dict = self.pad_batch_and_build_loss_mask(batch) + + position_ids = self.get_position_ids(data_dict['virtual_tokens'], data_dict['context_and_question_tokens']) + + return ( + data_dict['virtual_tokens'], + data_dict['context_and_question_tokens'], + data_dict['enc_mask'], + data_dict['dec_input'], + data_dict['dec_input_mask'], + data_dict['dec_labels'], + data_dict['dec_labels_mask'], + position_ids, + data_dict['taskname_id'], + data_dict['speech_mask'], + data_dict['context_and_question_tokens_lens'], + data_dict['cross_attention_prior'], + ) + + def pad_batch_and_build_loss_mask(self, batch): + """Pad enc_input, dec_input, labels in batch to max batch length while building loss_mask, enc_mask, and dec_mask""" + ( + taskname_ids, + _, + virtual_tokens_len, + _, + context_and_question_tokens_len, + _, + dec_input_len, + _, + dec_labels_len, + _, + _, + ) = zip(*batch) + + taskname_ids = self.pad_taskname_ids(taskname_ids) + + max_virtual_tokens_len = max(virtual_tokens_len).item() if virtual_tokens_len is not None else 0 + if isinstance(virtual_tokens_len, tuple): + virtual_tokens_len = torch.stack(virtual_tokens_len) + virtual_mask = get_mask_from_lengths(virtual_tokens_len) + + max_context_and_question_tokens_len = ( + max(context_and_question_tokens_len).item() if context_and_question_tokens_len is not None else 0 + ) + if isinstance(context_and_question_tokens_len, tuple): + context_and_question_tokens_len = torch.stack(context_and_question_tokens_len) + context_and_question_mask = get_mask_from_lengths(context_and_question_tokens_len) + + max_dec_input_len = max(dec_input_len).item() if dec_input_len is not None else 0 + max_dec_labels_len = max(dec_labels_len).item() if dec_labels_len is not None else 0 + enc_mask = torch.cat([virtual_mask, context_and_question_mask], dim=1) + + ( + virtual_tokens_list, + context_question_tokens_list, + dec_input_list, + dec_input_mask_list, + dec_labels_list, + dec_labels_mask_list, + speech_mask_list, + cross_attention_prior_list, + ) = ( + [], + [], + [], + [], + [], + [], + [], + [], + ) + + for i, sample_tuple in enumerate(batch): + ( + _, + virtual_token, + virtual_token_len, + context_and_question_token, + 
context_and_question_token_len, + dec_input, + dec_input_len, + dec_label, + dec_label_len, + is_speech, + cross_attention_prior, + ) = sample_tuple + + virtual_tokens_list.append( + general_padding( + virtual_token, virtual_token_len.item(), max_virtual_tokens_len, pad_value=self.tokenizer.pad_id + ) + ) + + context_tokens_padded = general_padding( + context_and_question_token, + context_and_question_token_len.item(), + max_context_and_question_tokens_len, + pad_value=self.tokenizer.pad_id, + ) + if len(context_tokens_padded.shape) < 2: + context_tokens_padded = pad_text_to_speech_dims(context_tokens_padded, self.tokenizer.pad_id) + context_question_tokens_list.append(context_tokens_padded) + + if max_dec_input_len > 0: + dec_input_padded = general_padding( + dec_input, dec_input_len.item(), max_dec_input_len, pad_value=self.tokenizer.pad_id + ) + if len(dec_input_padded.shape) < 2: + dec_input_padded = pad_text_to_speech_dims(dec_input_padded, self.tokenizer.pad_id) + dec_input_list.append(dec_input_padded) + dec_mask = ( + torch.as_tensor(([1] * dec_input_len) + ([0] * (max_dec_input_len - dec_input_len))) + .long() + .contiguous() + ) + dec_input_mask_list.append(dec_mask) + speech_mask = dec_mask if is_speech else torch.zeros(dec_mask.shape) + speech_mask_list.append(speech_mask) + + if max_dec_labels_len > 0: + loss_mask = ( + torch.as_tensor(([1] * dec_label_len) + ([0] * (max_dec_labels_len - dec_label_len))) + .long() + .contiguous() + ) + dec_label_padded = general_padding( + dec_label, dec_label_len.item(), max_dec_labels_len, pad_value=self.tokenizer.pad_id + ) + if len(dec_label_padded.shape) < 2: + dec_label_padded = pad_text_to_speech_dims(dec_label_padded, self.tokenizer.pad_id) + dec_labels_list.append(dec_label_padded) + dec_labels_mask_list.append(loss_mask) + + _p0 = max_dec_labels_len - dec_label_len + _p1 = ( + max_virtual_tokens_len + + max_context_and_question_tokens_len + - context_and_question_token_len + - virtual_token_len + ) + + cross_attention_prior_padded = torch.nn.functional.pad( + cross_attention_prior, + pad=(0, _p1, 0, _p0), + mode="constant", + value=1, + ) + cross_attention_prior_list.append(cross_attention_prior_padded) + + data_dict = { + "taskname_id": taskname_ids, + "virtual_tokens": torch.stack(virtual_tokens_list), + "context_and_question_tokens": torch.stack(context_question_tokens_list), + "enc_mask": enc_mask, + "dec_input": torch.stack(dec_input_list) if len(dec_input_list) > 0 else None, + "dec_input_mask": torch.stack(dec_input_mask_list) if len(dec_input_mask_list) > 0 else None, + "dec_labels": torch.stack(dec_labels_list) if len(dec_labels_list) > 0 else None, + "dec_labels_mask": torch.stack(dec_labels_mask_list) if len(dec_labels_mask_list) > 0 else None, + "speech_mask": torch.stack(speech_mask_list) if len(speech_mask_list) > 0 else None, + "context_and_question_tokens_lens": context_and_question_tokens_len, + "cross_attention_prior": ( + torch.stack(cross_attention_prior_list) if len(cross_attention_prior_list) > 0 else None + ), + } + + return data_dict diff --git a/nemo/collections/tts/g2p/models/zh_cn_pinyin.py b/nemo/collections/tts/g2p/models/zh_cn_pinyin.py index 985897d8df3f..2fe0ac3f6077 100644 --- a/nemo/collections/tts/g2p/models/zh_cn_pinyin.py +++ b/nemo/collections/tts/g2p/models/zh_cn_pinyin.py @@ -93,7 +93,7 @@ def __init__( self.ascii_letter_dict = { x: ascii_letter_prefix + x for x in get_grapheme_character_set(locale="en-US", case=ascii_letter_case) } - self.ascii_letter_list = sorted(self.ascii_letter_dict) + 
self.ascii_letter_list = sorted(self.ascii_letter_dict.values()) self.ascii_letter_case = ascii_letter_case if apply_to_oov_word is None: @@ -181,6 +181,7 @@ def __call__(self, text: str) -> List[str]: `['wo3', 'jin1', 'tian1', 'qu4', 'le5', 'A', 'p', 'p', 'l', 'e', ' ', 'S', 't', 'o', 'r', 'e', ',', ' ', 'mai3', 'le5', 'yi2', 'ge4', 'i', 'P', 'h', 'o', 'n', 'e', '。']` """ + err = False text = set_grapheme_case(text, case=self.ascii_letter_case) pinyin_seq = [] @@ -201,7 +202,15 @@ def __call__(self, text: str) -> List[str]: tone_hyp = pinyin[-1] if tone_hyp in self.tone_dict: syllable = pinyin[:-1] - assert syllable in self.phoneme_dict, f"Syllable <{syllable}> does not exist in the dictionary." + # TODO: skipping the syllable that does not exist in the dictionary will lead to deletion errors in the + # synthesized speech. Even though this case is uncommon, it should be fixed in future. + if syllable not in self.phoneme_dict: + err = True + logging.error( + f"Syllable <{syllable}> does not exist in the dictionary. You should expect symbol " + f"deletion risks!!" + ) + continue phoneme_seq += self.phoneme_dict[syllable] phoneme_seq.append(self.tone_dict[tone_hyp]) # All pinyin would end up with a number in 1-5, which represents tones of the pinyin. @@ -211,4 +220,6 @@ def __call__(self, text: str) -> List[str]: phoneme_seq.append(self.ascii_letter_dict[tone_hyp]) else: phoneme_seq.append(pinyin) + if err: + logging.error(f"|{text}| contained unknown syllables") return phoneme_seq diff --git a/nemo/collections/tts/models/speechllm/__init__.py b/nemo/collections/tts/models/speechllm/__init__.py new file mode 100644 index 000000000000..9df65818d226 --- /dev/null +++ b/nemo/collections/tts/models/speechllm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/tts/models/speechllm/megatron_base_speechllm_prompt_model.py b/nemo/collections/tts/models/speechllm/megatron_base_speechllm_prompt_model.py new file mode 100644 index 000000000000..f5c3ba720224 --- /dev/null +++ b/nemo/collections/tts/models/speechllm/megatron_base_speechllm_prompt_model.py @@ -0,0 +1,444 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
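+# This module defines MegatronBaseSpeechLM, the base class used for prompt-tuning / p-tuning a
+# frozen Megatron encoder-decoder model; MegatronT5SpeechLMModel builds on it.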
+
+import re
+
+import torch
+from omegaconf.dictconfig import DictConfig
+from pytorch_lightning.trainer.trainer import Trainer
+from torch import Tensor
+
+from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
+from nemo.collections.nlp.metrics.prompt_learning_metrics import AccuracyScore, BLEUScore, ROUGEScores
+from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel
+from nemo.collections.nlp.modules.common import (
+    PromptEncoder,
+    PromptEncoderType,
+    VirtualPromptPlaceholderToken,
+    VirtualPromptSource,
+    VirtualPromptStyle,
+)
+from nemo.collections.nlp.modules.common.transformer.text_generation import TextGeneration
+from nemo.collections.nlp.parts import utils_funcs
+from nemo.utils import AppState
+
+try:
+    from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator
+
+    HAVE_APEX = True
+
+except (ImportError, ModuleNotFoundError):
+    HAVE_APEX = False
+
+try:
+    from megatron.core import parallel_state
+
+    HAVE_MEGATRON_CORE = True
+
+except (ImportError, ModuleNotFoundError):
+
+    HAVE_MEGATRON_CORE = False
+
+
+__all__ = ['MegatronBaseSpeechLM']
+
+
+class MegatronBaseSpeechLM(MegatronBaseModel, TextGeneration):
+    """
+    Model class for prompt-tuning or p-tuning a pretrained Megatron model.
+
+    Prompt Tuning initializes virtual prompt embeddings directly from a copy of
+    certain token embeddings from the pretrained model's vocabulary
+    and directly tunes these embedding weights. The token embeddings used in
+    initialization are specified by the user in the config file. The model can
+    be prompt-tuned for multiple tasks at once. Virtual prompts are stored in a
+    prompt table and can be added or deleted without disrupting virtual prompts
+    for other tasks.
+
+    P-tuning initializes an LSTM encoder model that generates virtual prompt
+    embeddings for every task. Each task shares the same encoder. After p-tuning
+    is complete, the learned virtual prompts can be saved to the prompt table
+    using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a
+    new virtual prompt via p-tuning, they do not need to retrain on all previous
+    tasks. This gives p-tuning the same task flexibility as prompt-tuning.
+    """
+
+    def __init__(self, cfg: DictConfig, trainer: Trainer):
+        super().__init__(cfg, trainer)
+        self.init_model(cfg, trainer)
+        self.config = self.model_parallel_config
+
+    def init_model(self, cfg: DictConfig, trainer: Trainer):
+        self.cfg = cfg
+
+        self.load_frozen_model(cfg, trainer)
+        self.prompt_encoder = None
+        self.tokenizer = self.frozen_model.tokenizer
+
+        if hasattr(self.frozen_model.cfg, "encoder") and hasattr(self.frozen_model.cfg, "decoder"):
+            self.hidden_size = (
+                self.frozen_model.cfg.encoder.hidden_size
+            )  # Encoder and decoder need to have the same hidden size and we check for this in the frozen enc-dec model.
+        else:
+            self.hidden_size = self.frozen_model.cfg.hidden_size
+
+        self.existing_tasks = list(self.cfg.get('existing_tasks', []))
+        self.new_tasks = list(self.cfg.get('new_tasks', []))
+        self.virtual_prompt_style = VirtualPromptStyle(cfg.virtual_prompt_style)
+
+        # Load templates for assigning virtual prompt token positions
+        self.load_task_templates(self.cfg.task_templates)
+
+        if self.first_stage_of_pipeline() and self.virtual_prompt_style in [
+            VirtualPromptStyle.P_TUNING,
+        ]:
+            # TODO: Handle this when moving GPT prompt learning to the base class.
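+            # The frozen model's encoder word-embedding table is reused in embed_input() below to
+            # embed the ordinary (non-virtual) tokens; it is only grabbed on the first pipeline
+            # stage, hence the first_stage_of_pipeline() guard above.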
+                self.word_embeddings = self.frozen_model.enc_dec_model.encoder_embedding.word_embeddings
+
+        # P-Tuning uses an LSTM Encoder to produce virtual token embeddings
+        if self.virtual_prompt_style == VirtualPromptStyle.P_TUNING:
+            self.virtual_prompt_source = VirtualPromptSource.PROMPT_ENCODER
+        elif self.virtual_prompt_style == VirtualPromptStyle.NO_PROMPT:
+            self.virtual_prompt_source = VirtualPromptSource.NO_PROMPT
+        else:
+            raise ValueError(f"Virtual prompt style '{cfg.virtual_prompt_style}' is not supported.")
+
+        self._reduced_loss_buffer = []
+        self._inference_config = None
+
+        # Prepare pseudo token ids for virtual/virtual prompt tokens
+        self.pseudo_tokens = get_pseudo_tokens(self.max_virtual_tokens)
+        if isinstance(self.tokenizer, SentencePieceTokenizer):
+            self.tokenizer.add_special_tokens(self.pseudo_tokens)
+        else:
+            self.tokenizer.add_special_tokens({'additional_special_tokens': self.pseudo_tokens})
+        self.pseudo_token_ids = self.tokenizer.tokens_to_ids(self.pseudo_tokens)
+        self.pseudo_token_ids_start = self.pseudo_token_ids[0] if self.pseudo_token_ids else None
+        self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.unk_id
+        self.decoder_seq_length = cfg.get('decoder_seq_length', 40)
+
+        self.autocast_dtype = utils_funcs.torch_dtype_from_precision(self.cfg.precision)  # Mixed precision datatype
+        # Make sure the default PyTorch Lightning gradient clipping is used in the base model.
+        self.grad_clip_pl_default = True
+        self.lowest_val_loss = None
+        self.prompt_encoder = None
+
+        self.enable_autocast = not self.megatron_amp_O2 and self.autocast_dtype in [torch.float16, torch.bfloat16]
+
+        # define validation metric
+        if self.cfg.get('report_validation_metric', False):
+            validation_metric = self.cfg.get('validation_metric', 'accuracy')
+            if validation_metric == 'accuracy':
+                self.validation_metric = AccuracyScore()
+            elif validation_metric == 'bleu':
+                self.validation_metric = BLEUScore()
+            elif validation_metric == 'rouge':
+                self.validation_metric = ROUGEScores()
+
+    def load_task_templates(self, task_templates):
+        """
+        Takes in the task template portion of the config and turns
+        it into a table where each task's prompt template and
+        the number of virtual tokens to insert in a given part of
+        the prompt template are specified.
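+
+        A minimal, purely illustrative entry (the field values are assumptions, not a shipped
+        config) could look like:
+
+            task_templates:
+              - taskname: "squad"
+                prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}"
+                total_virtual_tokens: 3
+                virtual_token_splits: [3]
+                truncate_field: null
+                answer_field: "answer"
+
+        This would produce one table entry whose prompt_template_fields are
+        ['context', 'question', 'answer'] and whose three virtual tokens are all inserted at the
+        <|VIRTUAL_PROMPT_0|> marker.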
+ """ + self.task_templates = {} + self.task_id_num_to_name = {} + self.max_virtual_tokens = 0 + + task_id_num = 0 + for task in task_templates: + self.task_templates[task.taskname] = { + "prompt_template": task.prompt_template, + "prompt_template_fields": re.findall("\{(.*?)\}", task.prompt_template), + "answer_only_loss": task.get("answer_only_loss", False), + "answer_field": task.get("answer_field", None), + "truncate_field": task.truncate_field, + "total_virtual_tokens": task.total_virtual_tokens, + "virtual_token_splits": task.virtual_token_splits, + "task_id_num": task_id_num, + } + + self.max_virtual_tokens = max(self.max_virtual_tokens, task.total_virtual_tokens) + self.task_id_num_to_name[task_id_num] = task.taskname + task_id_num += 1 + + # Check that all new tasks have the same total num virtual tokens + # Num virtual tokens for new tasks don't need to match num used for previously tuned tasks + if self.new_tasks: + new_task_name = self.new_tasks[0] + self.total_new_task_virtual_tokens = self.task_templates[new_task_name]["total_virtual_tokens"] + + assert all( + self.task_templates[taskname]["total_virtual_tokens"] == self.total_new_task_virtual_tokens + for taskname in self.new_tasks + ), "Total virtual tokens for each task tuned simultaneously must match. If you want to use a different number of virtual tokens for different tasks, tune them separately." + + def init_prompt_encoder(self): + """ + Init the prompt encoder needed for p-tuning on a new task + """ + # Total virtual tokens should be the same across all new tasks, so just need one + new_task = self.new_tasks[0] + total_virtual_tokens = self.task_templates[new_task]["total_virtual_tokens"] + + encoder_type = PromptEncoderType(self.cfg.p_tuning.get("encoder_type", "tpmlp").lower()) + self.prompt_encoder = PromptEncoder( + config=self.model_parallel_config, + encoder_type=encoder_type, + total_virtual_tokens=total_virtual_tokens, + token_dim=self.hidden_size, + hidden_size=self.cfg.p_tuning.get("encoder_hidden", self.hidden_size // 2), + lstm_dropout=self.cfg.p_tuning.get("dropout", 0.0), + num_layers=self.cfg.p_tuning.get("num_layers", 2), + init_std=self.cfg.p_tuning.get("init_std", 0.023), + taskname=new_task, + ) + + def freeze_existing_word_embeddings(self): + """Freeze params of existing virtual prompts that should not be tuned further""" + # Make sure word embeddings are frozen + for params in self.word_embeddings.parameters(): + params.requires_grad = False + + def state_dict(self): + """ + Custom state dict that only contains prompt table and prompt encoder parameters. + No frozen model parameters are stored in the state dict. Prompt encoder parameters + are only in state dict for intermediate checkpoints saved during training. Final + nemo checkpoints at the end of training will contain prompt table parameters only. + """ + state_dict_ = {} + state_dict_["frozen_model_enc_dec_model"] = self.frozen_model.enc_dec_model.state_dict() + state_dict_["word_embeddings"] = self.word_embeddings.state_dict() + if self.prompt_encoder is not None: + state_dict_["prompt_encoder"] = self.prompt_encoder.state_dict() + + return state_dict_ + + def load_state_dict(self, state_dict, strict: bool = True): + """ + Custom load state dict method that only loads prompt table and prompt encoder + parameters. Matching load method for this class' custom state dict method. 
+ """ + self.init_prompt_encoder() + self.frozen_model.enc_dec_model.load_state_dict(state_dict["frozen_model_enc_dec_model"], strict) + self.word_embeddings.load_state_dict(state_dict["word_embeddings"], strict) + if 'prompt_encoder' in state_dict: + self.prompt_encoder.load_state_dict(state_dict["prompt_encoder"], strict) + + # Not sure why when we resume training the prompt encoder is on cpu + # Because it's not created on init - Should really be moved to init + self.prompt_encoder.to("cuda") + + def embed_input(self, input_ids: Tensor, taskname_ids: Tensor, use_cached_reps: bool): + """ + Replaces the virtual tokens in the input_ids with embeddings + calculated from either the 'prompt_table' or 'prompt_encoder'. + The virtual token placeholders have token_ids listed in + `self.pseudo_token_ids`. + + params: + input_ids: the input token ids + taskname_ids: the NLP task tag token ids + returns: + the token embedding for the LM model. + """ + # Replace virtual token ids with padding for forward pass through vocab embeddings + discrete_token_ids = input_ids.clone() + discrete_token_ids[(input_ids >= self.pseudo_token_ids_start)] = self.pad_token_id + discrete_token_embeds = self.word_embeddings(discrete_token_ids).clone() + + # Find the indicies where virtual tokens should be inserted + virtual_token_locations = input_ids >= self.pseudo_token_ids_start + + # If there are no virtual tokens, just return discrete token embeds + if not virtual_token_locations.any(): + return discrete_token_embeds + + if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER: + # taskname_embeddings = self.word_embeddings(taskname_ids) + batch_size, _ = taskname_ids.size() + virtual_token_embeds = self.prompt_encoder(batch_size=batch_size, use_cached_reps=use_cached_reps) + else: + raise ValueError("invalid VirtualPromptSource.") + + # Create index template specifying where virtual token embeddings should be placed + batch_size, _, embedding_size = discrete_token_embeds.shape + virtual_token_index = virtual_token_locations.nonzero().reshape((batch_size, -1, 2))[:, :, 1][:, :, None] + virtual_token_index = virtual_token_index.expand( + batch_size, self.total_new_task_virtual_tokens, embedding_size + ) + + # Make sure discrete_token_embeds and virtual_token_embeds share the same dtype + discrete_token_embeds = discrete_token_embeds.type(virtual_token_embeds.dtype) + + # Insert virtual token embeddings where they belong amoung the discrete token embeddings + discrete_token_embeds.scatter_(1, virtual_token_index, virtual_token_embeds) + input_embeds = discrete_token_embeds + + return input_embeds + + def on_train_end(self): + # Save p-tuned prompts to prompt table for inference or future task training + self.save_to(save_path=self.cfg.nemo_path) + + def setup(self, stage=None): + if stage == 'predict' and self.first_stage_of_pipeline(): + return + + self.setup_test_data() + if stage == 'test': + return + + if self.first_stage_of_pipeline(): + if self.virtual_prompt_style == VirtualPromptStyle.P_TUNING: + if self.prompt_encoder is None: + self.init_prompt_encoder() + + self.setup_training_data() + self.setup_validation_data() + + def setup_training_data(self, training_data_config=None): + if self.cfg.data.get('train_ds', None): + self._train_ds, self._train_dl = self.build_virtual_prompt_dataset( + dataset_paths=self.cfg.data.train_ds, + batch_size=self.cfg.global_batch_size, + for_train=True, + drop_last=True, + shuffle=True, + num_workers=self.cfg.data.num_workers, + pin_memory=True, + ) + elif 
self.cfg.data.get('train_manifest', None): + self._train_ds, self._train_dl = self.build_virtual_prompt_tarred_dataset( + dataset_paths=self.cfg.data.train_manifest, + audio_path=self.cfg.data.train_audio_path, + batch_size=self.cfg.global_batch_size, + for_train=True, + drop_last=True, + shuffle=self.cfg.data.shuffle, + num_workers=self.cfg.data.num_workers, + pin_memory=True, + ) + + def setup_validation_data(self, validation_data_config=None): + if self.cfg.data.get('validation_ds', None): + self._validation_ds, self._validation_dl = self.build_virtual_prompt_dataset( + dataset_paths=self.cfg.data.validation_ds, + batch_size=self.cfg.get("validation_global_batch_size", self.cfg.global_batch_size), + for_train=True, + drop_last=self.cfg.get("validation_drop_last", True), + shuffle=False, + num_workers=self.cfg.data.num_workers, + pin_memory=True, + ) + elif self.cfg.data.get('validation_manifest', None): + self._validation_ds, self._validation_dl = self.build_virtual_prompt_tarred_dataset( + dataset_paths=self.cfg.data.validation_manifest, + audio_path=self.cfg.data.validation_audio_path, + batch_size=self.cfg.get("validation_global_batch_size", self.cfg.global_batch_size), + for_train=True, + drop_last=self.cfg.get("validation_drop_last", True), + shuffle=0, + num_workers=self.cfg.data.num_workers, + pin_memory=True, + ) + + def setup_test_data(self, test_data_config=None): + if self.cfg.data.get('test_ds', None): + self._test_ds, self._test_dl = self.build_virtual_prompt_dataset( + dataset_paths=self.cfg.data.test_ds, + batch_size=self.cfg.get("validation_global_batch_size", self.cfg.global_batch_size), + for_train=False, + drop_last=False, + shuffle=False, + num_workers=self.cfg.data.num_workers, + pin_memory=True, + ) + elif self.cfg.data.get('test_manifest', None): + self._test_ds, self._test_dl = self.build_virtual_prompt_tarred_dataset( + dataset_paths=self.cfg.data.test_manifest, + audio_path=self.cfg.data.test_audio_path, + batch_size=self.cfg.global_batch_size, + for_train=False, + drop_last=False, + shuffle=0, + num_workers=self.cfg.data.num_workers, + pin_memory=True, + ) + + def _reconfigure_and_process_inference_batch(self, global_batch_size_per_gpu, gbs): + # This should happen only on the last batch of the dataset. + if global_batch_size_per_gpu != gbs // parallel_state.get_data_parallel_world_size(): + # NOTE: This is reconfiguring to make sure there is no grad-acc for validation batches. 
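+            # Example with assumed numbers: gbs=16 and a data-parallel world size of 2 give an
+            # expected 8 samples per GPU; if the last validation batch only has 5 per GPU, the
+            # calculator is rebuilt with global_batch_size=10 and micro_batch_size=5, i.e. a
+            # single microbatch per step and no gradient accumulation for that partial batch.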
+ app_state = AppState() + _reconfigure_microbatch_calculator( + rank=app_state.global_rank, + rampup_batch_size=None, + global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), + micro_batch_size=global_batch_size_per_gpu, + data_parallel_size=parallel_state.get_data_parallel_world_size(), + ) + + def _reconfigure_batch_sizes(self, gbs: int, mbs: int): + app_state = AppState() + _reconfigure_microbatch_calculator( + rank=app_state.global_rank, + rampup_batch_size=None, + global_batch_size=gbs, + micro_batch_size=mbs, + data_parallel_size=parallel_state.get_data_parallel_world_size(), + ) + + def set_inference_config(self, inference_config): + self._inference_config = inference_config + + def get_inference_config(self): + return self._inference_config + + def set_input_tensor(self, input_tensor): + pass + + def first_stage_of_pipeline(self): + pass + + @classmethod + def list_available_models(cls): + pass + + def load_frozen_model(self, cfg, trainer): + pass + + +def get_pseudo_tokens(num_virtual_tokens): + """ + Takes in an integer and returns a list of strings where each string + is a numbered virtual token placeholder. If + num_virtual_tokens = 3, then this function returns: + + ["", "", ""] + + Args: + num_virtual_tokens: (int) Number of virtual token strings you want to make + + returns a list of string. + + """ + pseudo_tokens = [ + VirtualPromptPlaceholderToken.BASE.value + str(i) + VirtualPromptPlaceholderToken.END.value + for i in range(num_virtual_tokens) + ] + + return pseudo_tokens diff --git a/nemo/collections/tts/models/speechllm/megatron_t5_speechllm_model.py b/nemo/collections/tts/models/speechllm/megatron_t5_speechllm_model.py new file mode 100644 index 000000000000..b3b5d857155d --- /dev/null +++ b/nemo/collections/tts/models/speechllm/megatron_t5_speechllm_model.py @@ -0,0 +1,2672 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import itertools +import json +import os +import random +import string +from functools import partial +from typing import Any, List + +import editdistance +import imageio +import numpy as np +import soundfile as sf +import torch +from omegaconf import OmegaConf +from omegaconf.dictconfig import DictConfig +from omegaconf.omegaconf import open_dict +from pytorch_lightning.trainer.trainer import Trainer + +import nemo.collections.asr as nemo_asr +from nemo.collections.asr.metrics.wer import word_error_rate +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceSpeechLLMTTSTokenizer +from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model +from nemo.collections.nlp.models.language_modeling.megatron_t5_sft_model import MegatronT5SFTModel +from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import ( + MegatronTokenLevelEncoderDecoderSpeechLLMModule, +) +from nemo.collections.nlp.modules.common.megatron.utils import ( + average_losses_across_data_parallel_group, + get_iterator_k_split, + init_method_normal, +) +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector +from nemo.collections.nlp.parts.utils_funcs import get_last_rank +from nemo.collections.tts.data.speechllm.t5_speechllm_dataset import Lang, T5SpeechLMDataset +from nemo.collections.tts.data.speechllm.t5_speechllm_tarred_dataset import T5SpeechLMTarredDataset +from nemo.collections.tts.losses.aligner_loss import ForwardSumLoss +from nemo.collections.tts.models import AudioCodecModel +from nemo.collections.tts.models.speechllm.megatron_base_speechllm_prompt_model import MegatronBaseSpeechLM +from nemo.collections.tts.parts.utils.helpers import plot_alignment_to_numpy_for_speechllm, plot_codec_to_numpy +from nemo.utils import AppState, logging + +try: + from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches + + HAVE_APEX = True + +except (ImportError, ModuleNotFoundError): + + HAVE_APEX = False + +try: + from megatron.core import parallel_state, tensor_parallel + from megatron.core.enums import ModelType + from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + + +import time + +import librosa +from torchaudio.pipelines import SQUIM_SUBJECTIVE +from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector + +__all__ = ['MegatronT5SpeechLMModel'] + + +class MegatronT5OverrideModel(MegatronT5Model): + def _build_tokenizer(self): + if self._cfg.tokenizer.library == "sentencepiece": + if hasattr(self._cfg.tokenizer, "sentencepiece_legacy"): + legacy = self._cfg.tokenizer.sentencepiece_legacy + else: + legacy = True if self._cfg.tokenizer.library == 'sentencepiece' else False + self.tokenizer = SentencePieceSpeechLLMTTSTokenizer( + model_path=self.register_artifact("tokenizer.model", self._cfg.tokenizer.get('model', None)), + legacy=legacy, + ) + + if self._cfg.tokenizer.get('additional_special_tokens', None) is not None: + tokens_list = OmegaConf.to_object(self._cfg.tokenizer.additional_special_tokens) + self.tokenizer.add_special_tokens(tokens_list) + else: + super()._build_tokenizer() + + def model_provider_func(self, pre_process, post_process, add_encoder, add_decoder): + if not hasattr(self.cfg, 'encoder') or not hasattr(self.cfg, 'decoder'): + logging.warning( + 'Could not find encoder or decoder in config. 
This is probably because of restoring an old checkpoint. Copying shared model configs to encoder and decoder configs.' + ) + # After the call below, self.cfg.encoder and self.cfg.decoder will be populated with the cfg.model configs from old checkpoints. + self._populate_encoder_decoder_configs_for_backward_compatibility(self.cfg) + + if parallel_state.get_pipeline_model_parallel_world_size() > 1 and self.cfg.encoder.arch == 'perceiver': + raise ValueError(f"Perceivers with pipeline parallel > 1 is not supported yet.") + + if not hasattr(self.cfg, 'embedding_init_method_std'): + embedding_init_method_std = self.cfg.encoder.init_method_std + else: + embedding_init_method_std = self.cfg.embedding_init_method_std + + if not hasattr(self.cfg, 'embedding_dropout'): + embedding_dropout = self.cfg.encoder.hidden_dropout + else: + embedding_dropout = self.cfg.embedding_dropout + + model = MegatronTokenLevelEncoderDecoderSpeechLLMModule( + config=self.model_parallel_config, + encoder_cfg=self.cfg.encoder, + decoder_cfg=self.cfg.decoder, + vocab_size=self.padded_vocab_size, + max_position_embeddings=self.cfg.max_position_embeddings, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + fp16_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False), + precision=self.cfg.get('precision', 16), + embedding_init_method_std=embedding_init_method_std, + embedding_dropout=embedding_dropout, + label_smoothing=self.cfg.get('label_smoothing', 0.0), + add_encoder=add_encoder, + add_decoder=add_decoder, + share_token_embeddings=self.cfg.get('share_token_embeddings', True), + share_decoder_tokens_head_embeddings=self.cfg.get('share_decoder_tokens_head_embeddings', True), + tokens_head_bias=self.cfg.get('tokens_head_bias', True), + hiddens_cfg=self.cfg.get('hiddens', None), + ) + return model + + +class MegatronT5SpeechLMModel(MegatronBaseSpeechLM): + """ + Model class for prompt-tuning or p-tuning a pretrained Megatron T5 model. + + Prompt Tuning initializes virtual prompt embeddings directly from a copy of + certain token embeddings from the pretrained T5 model's vocabulary + and directly tunes these embedding weights. The token embeddings used in + initialization are specified by the user in the config file. The model can + be prompt-tuned for multiple tasks at once. Virtual prompts are stored in a + prompt table and can be added or deleted without disrupting virtual prompts + for other tasks. + + P-tuning initializes an LSTM encoder model that generates virtual prompt + embeddings for every task. Each task shares the same encoder. After p-tuning + is complete, the learned virtual prompts can be saved to the prompt table + using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a + new virtual prompt via p-tuning, they do not need to retrain on all previous + tasks. This gives p-tuning the same task flexibility as prompt-tuning. 
+ """ + + def __init__(self, cfg: DictConfig, trainer: Trainer): + super().__init__(cfg, trainer) + self.model_type = ModelType.encoder_and_decoder + speech_codebook_size = cfg.data.get('speech_codebook_size', 1024) + num_speech_codebooks = cfg.data.get('num_speech_codebooks', 8) + speech_offset = cfg.data.get('speech_offset', 30000) + codecmodel_type = cfg.get('codecmodel_type', 'nemo_codec') + attn_prior_scaledown_start_step = cfg.get('attn_prior_scaledown_start_step', 10000) + attn_prior_end_step = cfg.get('attn_prior_end_step', 11000) + num_cross_attention_heads = cfg.get('num_cross_attention_heads', 12) + self.lm_vocab_size = cfg.get('lm_vocab_size', 30000) + self.context_pattern = cfg.data.get('context_pattern', 'parallel') + self.context_conditioning = cfg.get('context_conditioning', "decoder") + self.context_duration_min = cfg.data.get('context_duration_min', 2.9) + self.context_duration_max = cfg.data.get('context_duration_max', 2.9) + self.codebook_fps = cfg.data.get('codebook_fps', 86) + self.decoder_context_len = 0 + if self.context_conditioning == "decoder": + assert self.context_duration_min == self.context_duration_max, "Decoder context duration must be fixed" + self.decoder_context_len = int(self.codebook_fps * self.context_duration_min) + + self.speech_offset = speech_offset + self.speech_codebook_size = speech_codebook_size + self.num_speech_codebooks = num_speech_codebooks + self.codecmodel_type = codecmodel_type + self.enc_output_to_layers = cfg.get('enc_output_to_layers', None) + if self.enc_output_to_layers is not None: + # Convert from listconfig to list + self.enc_output_to_layers = [[l for l in encoder_layer] for encoder_layer in self.enc_output_to_layers] + + self.frozen_model.enc_dec_model.speech_offset = speech_offset + self.frozen_model.enc_dec_model.speech_codebook_size = speech_codebook_size + self.frozen_model.enc_dec_model.num_speech_codebooks = num_speech_codebooks + self.frozen_model.enc_dec_model.seq_pattern = cfg.get('seq_pattern', 'parallel') + self.frozen_model.enc_dec_model.attn_prior_scaledown_start_step = attn_prior_scaledown_start_step + self.frozen_model.enc_dec_model.attn_prior_end_step = attn_prior_end_step + self.frozen_model.enc_dec_model.alignment_decoder_layerids = cfg.get( + 'alignment_decoder_layerids', list(range(0, 12)) + ) + self.frozen_model.enc_dec_model.return_all_crossattention_probs = cfg.get( + 'return_all_crossattention_probs', False + ) + self.frozen_model.enc_dec_model.num_cross_attention_heads = num_cross_attention_heads + self.frozen_model.enc_dec_model.context_conditioning = self.context_conditioning + self.frozen_model.enc_dec_model.decoder_context_len = self.decoder_context_len + self.frozen_model.enc_dec_model.enc_output_to_layers = self.enc_output_to_layers + + self.alignment_loss_start_step = 0 + self.alignment_loss_end_step = float('inf') + self.use_alignment_loss = cfg.get('use_alignment_loss', False) + if self.use_alignment_loss: + alignment_loss_scale = cfg.get('alignment_loss_scale', 1.0) + self.frozen_model.enc_dec_model.use_alignment_loss = True + self.frozen_model.enc_dec_model.forward_sum_loss = ForwardSumLoss(loss_scale=alignment_loss_scale) + self.frozen_model.enc_dec_model.alignment_text_end_offset = cfg.get('alignment_text_end_offset', 0) + self.frozen_model.enc_dec_model.align_every_n_head = cfg.get('align_every_n_head', 1) + self.alignment_loss_start_step = cfg.get('alignment_loss_start_step', 0) + self.alignment_loss_end_step = cfg.get('alignment_loss_end_step', float('inf')) + + # Need to explicitly 
set this since it is already initialized + self.frozen_model.enc_dec_model.tokens_head.parallel_output = self.frozen_model.enc_dec_model.parallel_output + + list_of_speech_heads = [] + list_of_speech_tokens_embeddings = [] + for _ in range(self.num_speech_codebooks - 1): + # init is NOT used since we overwrite the weight below anyways + _speech_head_embedding = tensor_parallel.VocabParallelEmbedding( + speech_codebook_size, + embedding_dim=self.word_embeddings.embedding_dim, + init_method=lambda x: x.data.fill_(0), + config=self.model_parallel_config, + ) + _speech_head_embedding.weight.data.fill_(0) + _speech_head_embedding.shared = True + list_of_speech_tokens_embeddings.append(_speech_head_embedding) + # Linear layer that maps from hidden size to speech codebook size + hidden_size = self.frozen_model.enc_dec_model.decoder_cfg.hidden_size + init_method_std = self.frozen_model.enc_dec_model.decoder_cfg.init_method_std + # Changing to ColumnParallelLinear instead of Linear to support 3b Tensor Parallelism + _speech_head = tensor_parallel.ColumnParallelLinear( + input_size=hidden_size, + output_size=speech_codebook_size, + bias=True, + gather_output=not self.frozen_model.enc_dec_model.parallel_output, + init_method=init_method_normal(init_method_std), + config=self.model_parallel_config, + ) + list_of_speech_heads.append(_speech_head) + + self.frozen_model.enc_dec_model.speech_tokens_heads = torch.nn.ModuleList(list_of_speech_heads) + self.frozen_model.enc_dec_model.speech_tokens_embeddings = torch.nn.ModuleList( + list_of_speech_tokens_embeddings + ) + + self.sample_rate = 24000 + if codecmodel_type == 'nemo_codec': + codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path')) + codec_model.to('cuda') + codec_model.eval() + self.sample_rate = 22050 + else: + raise NotImplementedError() + + self.additional_models = {'codec': codec_model} + self.train_check_interval = self.cfg.get('train_check_interval', 500) + self.plot_alignments_sliced = self.cfg.get('plot_alignments_sliced', True) + app_state = AppState() + self.is_rank_zero = app_state.global_rank == 0 + self.predict_step_outputs = [] + self.phoneme_tokenizer = None + + # classifier-free guidance (CFG) option during training. The probability (0.0 <= ε <= 1.0) is used to trigger the action that the + # text or audio tokens in a batch are replaced by [UNK], such that mimicking the text- or audio-free scenario. + # If a random number is greater than ε, then keep text or audio tokens as-is, otherwise, the text or audio tokens are + # replaced by [UNK]. Default to 0.0, meaning CFG is disabled. + self.train_text_cfg_prob = cfg.get('train_text_cfg_prob', 0.0) + self.train_audio_cfg_prob = cfg.get('train_audio_cfg_prob', 0.0) + self._rng = random.Random() + + # control the strength of the classifier guidance during inference, Logits_cfg = w*Logits_cond + (1-w)*Logits_uncond, + # equivalent to Logits_cfg = Logits_cond + alpha*(Logits_cond - Logits_uncond) where alpha=w-1. + # Default w to 1.O, indicating no interpolation is applied. + self.inference_cfg_interpolation_scale = cfg.get('inference_cfg_interpolation_scale', 1.0) + self.inference_apply_text_cfg = cfg.get('inference_apply_text_cfg', False) + self.inference_apply_audio_cfg = cfg.get('inference_apply_audio_cfg', False) + if self.inference_cfg_interpolation_scale == 1.0: + self.inference_apply_text_cfg = False + self.inference_apply_audio_cfg = False + + # whether to apply cfg filter to address faster speech rate. 
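+        # For reference, the guidance interpolation described above amounts to the following (a minimal
+        # sketch with made-up tensors; `w` stands for `inference_cfg_interpolation_scale` and is an assumed value):
+        #   w = 1.5                                        # hypothetical guidance weight
+        #   logits_cond = torch.randn(2, 1024)             # logits from the conditioned pass
+        #   logits_uncond = torch.randn(2, 1024)           # logits from the unconditioned ([UNK]) pass
+        #   logits_cfg = w * logits_cond + (1 - w) * logits_uncond
+        #   # equivalently: logits_cond + (w - 1) * (logits_cond - logits_uncond)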
+ self.inference_apply_cfg_filter = cfg.get("inference_apply_cfg_filter", False) + + # this scale is suggested to be smaller than `self.question_guidance_scale` and it is used to balance the weights + # between the conditioned logits after applying cfg filter and the original unconditioned logits. Default to 1.0, + # indicating only conditioned logits are used. + if not self.inference_apply_cfg_filter: + self.inference_cfg_filter_interpolation_scale = None + else: + self.inference_cfg_filter_interpolation_scale = cfg.get('inference_cfg_filter_interpolation_scale', 1.0) + + # whether to estimate MOS in predict_step. + self.estimate_mos = cfg.get('estimate_mos', True) + if self.estimate_mos: + # requires to specify a non-matching high-quality and clean reference audio file. It is used to estimate MOS. + self.non_matching_ref_audio_filepath = cfg.get('non_matching_ref_audio_filepath', None) + if self.non_matching_ref_audio_filepath is None: + raise ValueError( + f"Please provide a high-quality reference audio to estimate the MOS. Alternatively, " + f"set `model.estimate_mos=False` to disable MOS estimation." + ) + if not os.path.exists(self.non_matching_ref_audio_filepath): + raise FileNotFoundError( + f"Please provide a valid file path for a high-quality reference audio to estimate" + f" the MOS. Alternatively, set `model.estimate_mos=False` to disable MOS estimation." + ) + + def decode_wav_from_codec_model(self, codes): + codec_model = self.additional_models['codec'] + if self.codecmodel_type == 'nemo_codec': + codec_len = torch.Tensor([codes.shape[1]]).long().cuda() + if codec_len < 10: + # return a one-second silence + return torch.zeros(24000).cuda() + wav, _ = codec_model.decode(tokens=codes.unsqueeze(0), tokens_len=codec_len) + wav = wav[0] + else: + raise NotImplementedError() + return wav + + def first_stage_of_pipeline(self): + if self.frozen_model.enc_dec_model.pre_process and parallel_state.get_pipeline_model_parallel_rank() == 0: + return True + return False + + def forward( + self, + virtual_tokens, + context_and_question_tokens, + enc_mask, + dec_input, + dec_mask, + position_ids, + taskname_ids, + labels=None, + speech_mask=None, + inference=False, + inference_step=0, + cross_attention_prior=None, + text_limits=None, + decoder_max_sequence_len=None, + encoder_max_sequence_len=None, + ): + """ + Special forward method for p-tuning/prompt-tuning pretrained + T5 style models. 
+        """
+        if isinstance(context_and_question_tokens, list):
+            multi_encoder = True
+            assert isinstance(enc_mask, list)
+            assert isinstance(position_ids, list)
+            if cross_attention_prior is None:
+                cross_attention_prior = [None for _ in range(len(context_and_question_tokens))]
+            assert isinstance(cross_attention_prior, list)
+            assert len(context_and_question_tokens) == len(enc_mask) == len(position_ids) == len(cross_attention_prior)
+        else:
+            multi_encoder = False
+            context_and_question_tokens = [context_and_question_tokens]
+            enc_mask = [enc_mask]
+            position_ids = [position_ids]
+            cross_attention_prior = [cross_attention_prior]
+
+        enc_output = None
+        logging.debug(
+            f"self.first_stage_of_pipeline()={self.first_stage_of_pipeline()}\tinference_step={inference_step}"
+        )
+        if self.first_stage_of_pipeline() and inference_step == 0:
+            # Get embeddings for text tokens and insert virtual token embeddings
+            encoder_input_list = []
+            for ei in range(len(context_and_question_tokens)):
+                input_embeds = self.get_embeddings_and_combine(
+                    [virtual_tokens, context_and_question_tokens[ei]], taskname_ids, inference
+                )
+                # TODO: This check needs to be revisited with PP support.
+                if hasattr(self.frozen_model.enc_dec_model.encoder_embedding, 'position_embeddings'):
+                    position_embeddings = self.frozen_model.enc_dec_model.encoder_embedding.position_embeddings(
+                        position_ids[ei]
+                    )
+                    encoder_input = input_embeds + position_embeddings
+                else:
+                    encoder_input = input_embeds
+                encoder_input_list.append(encoder_input)
+        else:
+            encoder_input_list = None
+            encoder_input = None
+            if inference_step != 0:
+                enc_output = context_and_question_tokens if multi_encoder else context_and_question_tokens[0]
+
+        # If the decoder input starts with <pad> instead of <bos>, which is the case for HuggingFace T5 models, we don't want to mask the first token.
+        # For NeMo-Megatron, the sequence starts with <bos>, which is never masked, so we can always set index 0 to be unmasked.
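+        # A minimal illustration of why index 0 is forced on (assumed shapes and pad id 0; the actual
+        # dec_mask comes from the dataloader):
+        #   dec_input = torch.tensor([[0, 331, 97, 12]])   # starts with <pad> when decoder_starts_with_pad=True
+        #   dec_mask = (dec_input != 0).long()             # -> [[0, 1, 1, 1]], first position masked out
+        #   dec_mask[:, 0] = 1                             # -> [[1, 1, 1, 1]], keep the first decoder step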
+ dec_mask[:, 0] = 1 + + if not self.cfg.data.get('use_attention_prior', False): + cross_attention_prior = [None for _ in range(len(cross_attention_prior))] + + _encoder_input = encoder_input_list + if not multi_encoder: + enc_mask = enc_mask[0] + cross_attention_prior = cross_attention_prior[0] + _encoder_input = encoder_input_list[0] if encoder_input_list is not None else None + + # Call forward on T5 model with preprocessed embeddings + if inference and inference_step == 0: + set_inference_key_value_memory = True + else: + set_inference_key_value_memory = False + + if self.autocast_dtype == torch.float32: + output, out_logits = self.frozen_model.enc_dec_model( + enc_input_ids=None, + enc_attn_mask=enc_mask, + dec_input_ids=dec_input, + dec_attn_mask=dec_mask, + token_type_ids=None, + labels=labels, + output_enc_hidden_only=False, + enc_input=_encoder_input, + enc_output=enc_output, + speech_mask=speech_mask, + cross_attention_prior=cross_attention_prior, + text_limits=text_limits, + global_step=self.global_step, + set_inference_key_value_memory=set_inference_key_value_memory, + decoder_max_sequence_len=decoder_max_sequence_len, + encoder_max_sequence_len=encoder_max_sequence_len, + ) + else: + with torch.autocast(device_type="cuda", dtype=self.autocast_dtype): + output, out_logits = self.frozen_model.enc_dec_model( + enc_input_ids=None, + enc_attn_mask=enc_mask, + dec_input_ids=dec_input, + dec_attn_mask=dec_mask, + token_type_ids=None, + labels=labels, + output_enc_hidden_only=False, + enc_input=_encoder_input, + enc_output=enc_output, + speech_mask=speech_mask, + cross_attention_prior=cross_attention_prior, + text_limits=text_limits, + global_step=self.global_step, + set_inference_key_value_memory=set_inference_key_value_memory, + decoder_max_sequence_len=decoder_max_sequence_len, + encoder_max_sequence_len=encoder_max_sequence_len, + ) + + return output, encoder_input, out_logits + + def load_frozen_model(self, cfg, trainer): + self.megatron_amp_O2 = cfg.get('megatron_amp_o2', False) + + # TODO: Fix this once apex patches FusedScaledMaskedSoftmax. + # This is a workaround for the fact that `masked_softmax_fusion` has issues with certain input sizes that may be present while finetuning. + cfg_language_model_path = cfg.get('language_model_path', None) + cfg_frozen_model = cfg.get('frozen_model', None) + if not (bool(cfg_language_model_path) ^ bool(cfg_frozen_model)): + raise ValueError( + "T5-TTS requires either 'language_model_path' or 'frozen_model' in its config, but not both." 
+            )
+
+        if cfg_language_model_path:
+            t5_cfg = MegatronT5Model.restore_from(cfg_language_model_path, trainer=trainer, return_config=True)
+        else:
+            t5_cfg = cfg_frozen_model
+
+        OmegaConf.set_struct(t5_cfg, True)
+        with open_dict(t5_cfg):
+            if hasattr(t5_cfg, 'encoder') and hasattr(t5_cfg, 'decoder'):
+                t5_cfg.encoder.masked_softmax_fusion = False
+                t5_cfg.decoder.masked_softmax_fusion = False
+            else:
+                t5_cfg.masked_softmax_fusion = False
+            t5_cfg.megatron_amp_O2 = self.megatron_amp_O2
+            # hack to make the _GLOBAL_NUM_MICROBATCHES_CALCULATOR initialize
+            t5_cfg.micro_batch_size = cfg.get('micro_batch_size', 4)
+            t5_cfg.global_batch_size = cfg.get('global_batch_size', 4)
+            t5_cfg.precision = trainer.precision
+            t5_cfg.tokenizer.num_sentinel_tokens = cfg.get('num_sentinel_tokens', 39184 - 29056)
+            t5_cfg.seq_length = cfg.data.max_seq_length
+            if cfg.get('max_position_embeddings', None) is None:
+                t5_cfg.max_position_embeddings = cfg.data.max_seq_length
+            else:
+                t5_cfg.max_position_embeddings = cfg.get('max_position_embeddings')
+            t5_cfg.use_flash_attention = cfg.get('use_flash_attention', False)
+            if cfg.get('override_token_model', None):
+                t5_cfg.tokenizer.model = cfg['override_token_model']
+            if cfg.get('override_tokenizer_vocab_file', None):
+                t5_cfg.tokenizer.vocab_file = cfg['override_tokenizer_vocab_file']
+
+        if cfg.get('train_from_scratch', False):
+            print("Training from scratch!")
+            # Defaults below are for the 220m model.
+            # To override any of these, add +model.override_<key>=<value> to the config file.
+            # E.g. +model.override_hidden_size=1024
+            override_keys = [
+                'hidden_size',  # 768
+                'num_layers',  # 12
+                'num_attention_heads',  # 12
+                'hidden_dropout',  # 0.1
+                'attention_dropout',  # 0.1
+                'kv_channels',  # 64
+                'ffn_hidden_size',  # 2048
+            ]
+            for k in override_keys:
+                if cfg.get(f'override_{k}') is not None:
+                    t5_cfg[k] = cfg.get(f'override_{k}')
+
+            self.frozen_model = MegatronT5OverrideModel(t5_cfg, trainer=trainer)
+            num_params = sum(p.numel() for p in self.frozen_model.parameters() if p.requires_grad)
+            print(f"Number of parameters: {num_params}")
+        else:
+            print(f"Loading from pretrained checkpoint: {cfg_language_model_path}")
+            if cfg_language_model_path is None:
+                raise ValueError(
+                    "T5-TTS SFT on a pretrained model checkpoint requires `language_model_path` in its config."
+                )
+
+            self.frozen_model = MegatronT5OverrideModel.restore_from(
+                cfg_language_model_path,
+                trainer=trainer,
+                override_config_path=t5_cfg,
+                save_restore_connector=NLPSaveRestoreConnector(),
+            )
+
+        if not cfg.get('english_only_model', False):
+            self.frozen_model.tokenizer.add_phone_tokens_to_special_tokens()
+
+        logging.info(f"self.frozen_model {self.frozen_model}")
+
+    def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
+        """
+        Dataloader produces a global batch which is turned into a list of microbatches.
+        The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions.
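+
+        A rough sketch of the split (the batch contents and sizes below are assumed for illustration only):
+            batch = [torch.zeros(4, 128), torch.zeros(4, 8, 256)]   # global batch of size 4
+            data_iter = get_iterator_k_split(batch, 2)              # two microbatches, each of size 2 along dim 0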
+ """ + # Get seq length of batch + batch = next(dataloader_iter) + _, seq_length = batch[0].shape + if batch[4].dim() > 2: + _, _, dec_seq_length = batch[4].shape + else: + _, dec_seq_length = batch[4].shape + data_iter = get_iterator_k_split(batch, get_num_microbatches()) + + fwd_bwd_function = get_forward_backward_func() + + losses_reduced_per_micro_batch = fwd_bwd_function( + forward_step_func=self.get_forward_output_and_loss_func(forward_only), + data_iterator=data_iter, + model=[self], + num_microbatches=get_num_microbatches(), + forward_only=forward_only, + seq_length=seq_length, + micro_batch_size=get_micro_batch_size(), + decoder_seq_length=dec_seq_length, + ) + + # only the last stages of the pipeline return losses + if losses_reduced_per_micro_batch: + # average loss across micro batches + loss_tensors_list = [loss_reduced['avg'] for loss_reduced in losses_reduced_per_micro_batch] + loss_tensor = torch.concat(loss_tensors_list) + loss_mean = loss_tensor.mean() + else: + # we're not on the last pipeline stage so no losses + loss_mean = torch.tensor(0.0).cuda() + + return loss_mean + + def convert_tokens_to_range(self, tokens, apply_offset_correction=True, pattern=None): + # convert tokens to range [0, 1024] + output_tokens = tokens.clone() + if apply_offset_correction: + output_tokens[0] = output_tokens[0] - self.speech_offset + output_tokens = torch.clamp(output_tokens, min=0, max=self.speech_codebook_size - 1) + if pattern is None: + pattern = self.cfg.get('seq_pattern', 'delay_parallel') + if pattern == "delay_parallel": + output_tokens_new = [] + for _c in range(output_tokens.shape[0]): + si = _c + ei = _c + output_tokens.shape[1] - self.num_speech_codebooks + output_tokens_new.append(output_tokens[_c, si:ei]) + output_tokens_new = torch.stack(output_tokens_new) + output_tokens = output_tokens_new + + return output_tokens + + def get_forward_output_and_loss_func(self, validation_step=False): + def fwd_output_and_loss_func(dataloader_iter, model): + batch = next(dataloader_iter) + _batch = [] + for x in batch: + if isinstance(x, torch.Tensor): + x = x.cuda(non_blocking=True) + elif isinstance(x, list): + if isinstance(x[0], torch.Tensor): + x = [y.cuda(non_blocking=True) for y in x] + _batch.append(x) + batch = _batch + # batch = [x.cuda(non_blocking=True) if isinstance(x, torch.Tensor) else x for x in batch] + ( + virtual_tokens, + context_and_question_tokens, + enc_mask, + dec_input, + dec_input_mask, + labels, + loss_mask, + position_ids, + taskname_ids, + speech_mask, + context_and_question_tokens_lens, + cross_attention_prior, + text_limits, + _, # TODO: text limit and lang not in tarred dataset + _, + ) = batch + + if self.trainer.global_step % self.train_check_interval == 0 and not validation_step and self.is_rank_zero: + self.frozen_model.enc_dec_model.logging_step = True + + _cross_attention_prior = cross_attention_prior + if isinstance(context_and_question_tokens, list): + # None for context and prior for question + _cross_attention_prior = [None, cross_attention_prior] + + output_tensor, encoder_input, out_logits = model( + virtual_tokens, + context_and_question_tokens, + enc_mask, + dec_input, + dec_input_mask, + position_ids, + taskname_ids, + labels=labels, + speech_mask=speech_mask, + cross_attention_prior=_cross_attention_prior, + text_limits=text_limits, + inference=False, + ) + output_tensor = output_tensor.contiguous() + + alignment_loss = out_logits[3] + if alignment_loss is not None: + self.logger.experiment.add_scalar('train_alignment_loss', 
alignment_loss, self.global_step) + + if self.trainer.global_step % self.train_check_interval == 0 and not validation_step and self.is_rank_zero: + self.frozen_model.enc_dec_model.logging_step = False + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=False): + if torch.count_nonzero(speech_mask) == 0: + text_labels = labels[:, 0, :] # [B, 8, T] -> [B, T] + token_logits = out_logits[0] * 1 # [T, B, V] + if self.frozen_model.enc_dec_model.parallel_output: + # Gather from tensor parallel region + token_logits = tensor_parallel.gather_from_tensor_model_parallel_region(token_logits) + token_logits = token_logits.argmax(dim=2) # [T, B] + token_logits = token_logits.t() # [B, T] + score = 0 + for i in range(text_labels.size()[0]): + r = text_labels[i].long() + nzm = r != 0 + r = r.tolist() + h = token_logits[i].long() * nzm + h = h.tolist() + score += editdistance.eval(r, h) + score /= text_labels.size()[0] + logging.info(f"wer score : {score}") + self.logger.experiment.add_scalar('WER', score, self.global_step) + else: + audio_len = ( + self.decoder_context_len + (labels[0][0][self.decoder_context_len :] != 0).sum().item() + ) + labels_to_1024 = self.convert_tokens_to_range(labels[0, :, 0:audio_len]) + label_wav = self.decode_wav_from_codec_model(labels_to_1024) + dec_input_to_1024 = self.convert_tokens_to_range(dec_input[0, :, 0:audio_len]) + dec_input_wav = self.decode_wav_from_codec_model(dec_input_to_1024) + self.logger.experiment.add_audio( + "train_label_wav", label_wav, self.global_step, self.sample_rate + ) + self.logger.experiment.add_audio( + "train_dec_input_wav", dec_input_wav, self.global_step, self.sample_rate + ) + if isinstance(context_and_question_tokens, list): + context_tokens = context_and_question_tokens[0] + question_tokens = context_and_question_tokens[1] + input_token_list_all = [ + question_tokens[0, 0, i].item() for i in range(question_tokens.shape[2]) + ] + input_token_list = [ + (ti, t) + for ti, t in enumerate(input_token_list_all) + if t != 0 and t < self.speech_offset + ] + context_end_step = context_and_question_tokens_lens[0][0].item() + _context_tokens = context_tokens[0, :, :context_end_step] + else: + input_token_list_all = [ + context_and_question_tokens[0, 0, i].item() + for i in range(context_and_question_tokens.shape[2]) + ] + input_token_list = [ + (ti, t) + for ti, t in enumerate(input_token_list_all) + if t != 0 and t < self.speech_offset + ] + context_end_step = input_token_list[0][0] + _context_tokens = context_and_question_tokens[0, :, :context_end_step] + + if context_end_step > 1: + is_speech_context = _context_tokens[1, :].sum().item() > 0 + if is_speech_context: + _context_tokens = self.convert_tokens_to_range( + _context_tokens, pattern=self.context_pattern + ) + _context_wav = self.decode_wav_from_codec_model(_context_tokens) + self.logger.experiment.add_audio( + "train_context_wav", _context_wav, self.global_step, self.sample_rate + ) + else: + _context_token_list = [v.item() for v in _context_tokens[0, :]] + _context_text = self.frozen_model.tokenizer.ids_to_text( + [v for v in _context_token_list if v < self.lm_vocab_size] + ) + self.logger.experiment.add_text( + "train_context_text", _context_text, self.global_step + ) + + question_si = text_limits[0, 0].item() - virtual_tokens.shape[1] + question_ei = text_limits[0, 1].item() - virtual_tokens.shape[1] + text_si = text_limits[0, 0].item() + text_ei = text_limits[0, 1].item() + input_text = self.frozen_model.tokenizer.ids_to_text( + [v for v in 
input_token_list_all[question_si:question_ei] if v < self.lm_vocab_size] + ) + self.logger.experiment.add_text("Train Input Text", input_text, self.global_step) + + input_phoneme_tokens = [ + v - self.lm_vocab_size + for v in input_token_list_all[question_si:question_ei] + if v >= self.lm_vocab_size + ] + + if len(input_phoneme_tokens) > 0: + phoneme_text = self.phoneme_tokenizer.decode(input_phoneme_tokens) + self.logger.experiment.add_text( + "Train Input Phoneme Text", phoneme_text, self.global_step + ) + + token_logits = out_logits[0] + speech_logits_list = out_logits[1] + + attention_probs_list = out_logits[2] # list of (BS, 12, out_length, in_length) + if attention_probs_list is not None: + attention_sliced_list = [] + for lidx in range(len(attention_probs_list)): + attention_probs = attention_probs_list[lidx] + for _i in range(attention_probs.shape[1]): + name = f"Attention Probs Layer {lidx} Head {_i}" + attention_to_plot = attention_probs[0, _i, :audio_len, :text_ei] + if self.plot_alignments_sliced: + attention_to_plot = attention_probs[0, _i, 0:audio_len, text_si:text_ei] + # 4 to offset "Text to Speech this" + name += " Sliced" + alignment_image = plot_alignment_to_numpy_for_speechllm( + attention_to_plot.cpu().float().numpy().T, + phoneme_ver=0 if self.plot_alignments_sliced else 1, + phoneme_seq=None if self.plot_alignments_sliced else [text_si], + ) + self.logger.experiment.add_image( + name, + alignment_image, + self.global_step, + dataformats="HWC", + ) + attention_sliced_list.append( + attention_probs[ + 0, _i, self.decoder_context_len : audio_len, text_si:text_ei + ] + ) + attention_sliced = torch.stack(attention_sliced_list) + attention_sliced = torch.mean(attention_sliced, 0) + text = None + if len(input_text) > 0: + text = self.frozen_model.tokenizer.ids_to_tokens( + [ + v + for v in input_token_list_all[question_si:question_ei] + if v < self.lm_vocab_size + ] + ) + if len(input_phoneme_tokens) > 0: + text = phoneme_text.split("|") + alignment_image_sliced = plot_alignment_to_numpy_for_speechllm( + attention_sliced.cpu().float().numpy().T, + phoneme_seq=text, + phoneme_ver=2, + vmin=0.0, + phone_offset=0, + h_offset=False, + ) + self.logger.experiment.add_image( + f"Attention Probs Average Sliced", + alignment_image_sliced, + self.global_step, + dataformats="HWC", + ) + if self.frozen_model.enc_dec_model.parallel_output: + # Gather from tensor parallel region + token_logits = tensor_parallel.gather_from_tensor_model_parallel_region(token_logits) + for _i in range(len(speech_logits_list)): + speech_logits_list[_i] = tensor_parallel.gather_from_tensor_model_parallel_region( + speech_logits_list[_i] + ) + speech_logits = torch.stack(speech_logits_list, dim=-1) # (t, b, 1024, 7) + token_logits_example = token_logits[:, 0, :] * 1 + speech_logits_example = speech_logits[:, 0, :, :] * 1 + first_layer_tokens = token_logits_example.argmax(dim=1) - self.speech_offset + other_layer_tokens = [] + for _i in range(speech_logits_example.shape[2]): + other_layer_tokens.append(speech_logits_example[:, :, _i].argmax(dim=1)) + + all_layer_tokens = torch.stack([first_layer_tokens] + other_layer_tokens) # (8, t) + all_layer_tokens = self.convert_tokens_to_range( + all_layer_tokens, apply_offset_correction=False + ) + # all_layer_tokens = torch.clip(all_layer_tokens, 0, 1023) + predicted_wav = self.decode_wav_from_codec_model(all_layer_tokens) + self.logger.experiment.add_audio( + "train_tf_pred_wav", predicted_wav, self.global_step, self.sample_rate + ) + + def loss_func(loss_args): + 
output_tensor, out_logits, curr_step = loss_args + alignment_loss = out_logits[3] + loss = self.frozen_model.loss_func(loss_mask, output_tensor) + if ( + (alignment_loss is not None) + and (curr_step > self.alignment_loss_start_step) + and (curr_step < self.alignment_loss_end_step) + ): + logging.debug(f"Adding alignment loss. cur:{curr_step} start:{self.alignment_loss_start_step}") + loss = loss + alignment_loss + reduced_loss = average_losses_across_data_parallel_group([loss]) + return loss, {'avg': reduced_loss} + + return [output_tensor, out_logits, self.global_step], loss_func + + return fwd_output_and_loss_func + + def get_forward_output_only_func(self): + """Used in inference / predict""" + + def fwd_output_only_func(dataloader_iter, model): + batch = next(dataloader_iter) + _batch = [] + for x in batch: + if isinstance(x, torch.Tensor): + x = x.cuda(non_blocking=True) + elif isinstance(x, list): + if isinstance(x[0], torch.Tensor): + x = [y.cuda(non_blocking=True) for y in x] + _batch.append(x) + batch = _batch + # batch = [x.cuda(non_blocking=True) if isinstance(x, torch.Tensor) else x for x in batch] + ( + decoder_max_sequence_len, + encoder_max_sequence_len, + context_and_question_tokens, + enc_mask, + dec_input, + dec_input_mask, + position_ids, + taskname_ids, + speech_mask, + ) = batch + + output_logits, _, token_and_speech_logits = model( + context_and_question_tokens, + context_and_question_tokens, + enc_mask, + dec_input, + dec_input_mask, + position_ids, + taskname_ids, + labels=None, + speech_mask=speech_mask, + inference=True, + inference_step=1, + decoder_max_sequence_len=decoder_max_sequence_len, + encoder_max_sequence_len=encoder_max_sequence_len, + ) + output_tensor = [output_logits, token_and_speech_logits] + + def id_func(output_tensor): + return 0, {'output_logits': output_tensor[0], 'token_and_speech_logits': output_tensor[1]} + + return output_tensor, id_func + + return fwd_output_only_func + + def backward(self, *args, **kwargs): + """LightningModule hook to do backward. + We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. + No need to call it here. + """ + return + + def optimizer_zero_grad(self, *args, **kwargs): + """LightningModule hook to zero grad. + We want this to do nothing as we are zeroing grads during the training_step. + """ + return + + def set_input_tensor(self, input_tensor): + """Set input tensor to be used instead of forward()'s input. + When using pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.frozen_model.enc_dec_model.set_input_tensor(input_tensor) + + def on_train_epoch_start(self) -> None: + gbs = self.cfg.global_batch_size + mbs = self.cfg.micro_batch_size + self._reconfigure_batch_sizes(gbs, mbs) + return super().on_train_epoch_start() + + def on_validation_epoch_start(self) -> None: + gbs = self.cfg.get('validation_global_batch_size', self.cfg.global_batch_size) + mbs = self.cfg.get('validation_micro_batch_size', self.cfg.micro_batch_size) + self._reconfigure_batch_sizes(gbs, mbs) + return super().on_validation_epoch_start() + + def training_step(self, dataloader_iter, batch_idx): + self._optimizer.zero_grad() + batch = next(dataloader_iter) + + # apply text classifier-free guidance by replacing input question tokens with [UNK]. 
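+        # A minimal sketch of the [UNK] replacement performed below (shapes and spans are made up;
+        # only the first codebook layer carries text IDs):
+        #   tokens = torch.randint(1, 100, (2, 8, 10))               # (batch, codebooks, T)
+        #   q_start = torch.tensor([[3], [2]])                       # (batch, 1) per-sample question start
+        #   q_end = torch.tensor([[6], [7]])                         # (batch, 1) per-sample question end
+        #   time_range = torch.arange(10).unsqueeze(0)               # (1, T)
+        #   q_mask = (time_range >= q_start) & (time_range < q_end)  # (batch, T) question-only positions
+        #   tokens[:, 0][q_mask] = self.tokenizer.unk_id             # blank out only the question text tokens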
+        if self.train_text_cfg_prob > 0.0:
+            if self._rng.random() < self.train_text_cfg_prob:
+                logging.info(f"Text Classifier-Free Guidance is triggered for the {batch_idx}-th batch.")
+
+                # temporarily disable computing the CTC alignment loss.
+                if self.use_alignment_loss:
+                    self.frozen_model.enc_dec_model.use_alignment_loss = False
+
+                # set the cross-attention prior to None to remove the prior.
+                batch[11] = None
+
+                # replace question token IDs with [UNK]'s id. No speech offset for Phoneme's [UNK]. Same op as train.
+                # instruction token IDs are bpe token IDs directly obtained from self.tokenizer without any offset.
+                # question token IDs are phoneme and grapheme token IDs and are offset by self.lm_vocab_size
+                # if under the "Phoneme TTS" instruction, so there are no overlaps between instruction and question token IDs.
+                # question token IDs are bpe token IDs without any offset
+                # if under the "Text to speech this" instruction, so there are overlaps between instruction and question token IDs.
+                context_and_question_tokens = batch[
+                    1
+                ]  # (batch_size, self.num_speech_codebooks, max_context_question_tokens_len)
+                text_limits = batch[12]
+                virtual_tokens = batch[0]
+                question_limits = text_limits - virtual_tokens.size(
+                    1
+                )  # (b, 2), reset question range to start from [pad] context, same start position as context_and_question_tokens.
+                question_start = question_limits[:, 0].unsqueeze(1)  # (b, 1)
+                question_end = question_limits[:, 1].unsqueeze(1)  # (b, 1)
+
+                if isinstance(context_and_question_tokens, list):  # indicates self.encoder_type=multi_transformers.
+                    context_tokens, question_tokens = context_and_question_tokens
+                    question_tokens_unconditioned = question_tokens.clone()
+                    time_range = torch.arange(
+                        question_tokens_unconditioned.size(2), device=question_tokens_unconditioned.device
+                    ).unsqueeze(0)
+                    question_mask = (time_range >= question_start) & (
+                        time_range < question_end
+                    )  # create a mask for question-only tokens.
+                    question_tokens_unconditioned[:, 0][
+                        question_mask
+                    ] = self.tokenizer.unk_id  # only the first layer has non-zero IDs.
+                    batch[1] = [context_tokens, question_tokens_unconditioned]
+                else:
+                    context_and_question_tokens_unconditioned = (
+                        context_and_question_tokens.clone()
+                    )  # (batch_size, self.num_speech_codebooks, max_context_question_tokens_len)
+                    time_range = torch.arange(
+                        context_and_question_tokens_unconditioned.size(2),
+                        device=context_and_question_tokens_unconditioned.device,
+                    ).unsqueeze(
+                        0
+                    )  # (1, max_context_question_tokens_len)
+                    question_mask = (time_range >= question_start) & (
+                        time_range < question_end
+                    )  # create a mask for question-only tokens.
+                    context_and_question_tokens_unconditioned[:, 0][
+                        question_mask
+                    ] = self.tokenizer.unk_id  # only the first layer has non-zero IDs.
+                    batch[1] = context_and_question_tokens_unconditioned
+
+                del question_limits, question_start, question_end, time_range, question_mask
+            else:
+                # restore the original alignment loss config.
+                self.frozen_model.enc_dec_model.use_alignment_loss = self.use_alignment_loss
+
+        # apply audio context classifier-free guidance by replacing the audio codec tokens with [UNK]
+        if self.train_audio_cfg_prob > 0.0:
+            if self._rng.random() < self.train_audio_cfg_prob:
+                logging.info(f"Audio Classifier-Free Guidance is triggered for the {batch_idx}-th batch.")
+
+                context_and_question_tokens = batch[
+                    1
+                ]  # (batch_size, self.num_speech_codebooks, max_context_question_tokens_len)
+
+                if isinstance(context_and_question_tokens, list):  # indicates self.encoder_type=multi_transformers.
+ context_tokens, question_tokens = context_and_question_tokens + context_tokens_unconditioned = context_tokens.clone() + context_tokens_unconditioned[:, :, :] = ( + self.tokenizer.unk_id + ) # TODO @xueyang: verify if extra tokens other than audio codec tokens are appended. + batch[1] = [context_tokens_unconditioned, question_tokens] + else: + # dec_input + dec_input = batch[3] + dec_input_unconditioned = dec_input.clone() + dec_input_unconditioned[:, :, 1 : self.decoder_context_len + 1] = ( + self.tokenizer.unk_id + ) # TODO @xueyang: switch to other token id if this one is conflict with text unk. + batch[3] = dec_input_unconditioned + + loss_mean = self.fwd_bwd_step(itertools.chain([batch]), batch_idx, forward_only=False) + self.allreduce_gradients() + + ## logging + # we can only log on one rank if it is rank zero so we broadcast from last rank + # we can avoid this broadcast by updating the PTL log function to accept specific ranks + torch.distributed.broadcast(loss_mean, get_last_rank()) + + if self.cfg.precision == 16 and hasattr(self.trainer.precision_plugin.scaler, "_scale"): + loss_scale = self.trainer.precision_plugin.scaler._scale + if loss_scale is not None: + self.log('loss_scale', loss_scale, batch_size=1) + + self.log('reduced_train_loss', loss_mean, prog_bar=True, rank_zero_only=True, batch_size=1) + lr = self._optimizer.param_groups[0]['lr'] + self.log('lr', lr, rank_zero_only=True, batch_size=1) + self.log('global_step', self.trainer.global_step, prog_bar=True, rank_zero_only=True, batch_size=1) + return loss_mean + + def get_predictions(self, input_ids, enc_mask, encoder_input, labels): + predicted_token_ids, log_probs = self.frozen_model.decode( + tokens_enc=input_ids, + enc_mask=enc_mask, + num_tokens_to_generate=self.decoder_seq_length, + encoder_input=encoder_input, + bos_id=( + self.tokenizer.pad_id if self.cfg.data.get('decoder_starts_with_pad', False) else self.tokenizer.bos_id + ), + ) + # Special ids to text function to handle stripping and special tokens with sentencepiece tokenizers. 
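+        # Roughly, the intent is something like the sketch below (not the exact helper implementation;
+        # the ids and batch contents are made up):
+        #   ids = [[59, 12, 7, self.tokenizer.eos_id, 0, 0]]          # padded batch of predicted ids
+        #   ids = [seq[: seq.index(self.tokenizer.eos_id)] if self.tokenizer.eos_id in seq else seq for seq in ids]
+        #   texts = [self.tokenizer.ids_to_text(seq) for seq in ids]  # detokenize after stripping specials/padding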
+ preds_text = MegatronT5SFTModel.ids_to_text(predicted_token_ids, self.tokenizer) + labels_text = MegatronT5SFTModel.ids_to_text(labels, self.tokenizer) + input_text = MegatronT5SFTModel.ids_to_text(input_ids, self.tokenizer) + return { + 'predicted_token_ids': preds_text, + 'labels': labels_text, + 'enc_inputs': input_text, + } + + def get_embeddings(self, tokens, taskname_ids, inference=False): + out = None + if tokens.dim() > 2: + for i in range(tokens.size()[1]): # for 8 channels + if i == 0: + # Embed first layer using word embeddings + out = self.embed_input(tokens[:, i, :], taskname_ids, inference) # (B, T, D) + else: + # Embed other layers using speech embeddings + cur = self.frozen_model.enc_dec_model.speech_tokens_embeddings[i - 1](tokens[:, i, :]) + # do not add embeddings of zero tokens of other channels (except the first channel) + non_zero_flag = tokens[:, i, :] != 0 # (B, T) + cur = cur * non_zero_flag.unsqueeze(2) + out = out + cur + else: + out = self.embed_input(tokens, taskname_ids, inference) + return out + + def get_embeddings_and_combine(self, token_list, taskname_ids, inference): + embedding_list = [] + for tokens in token_list: + embedding_list.append(self.get_embeddings(tokens, taskname_ids, inference)) + return torch.cat(embedding_list, dim=1) + + def validation_step(self, batch, batch_idx, dataloader_idx=0): + ( + virtual_tokens, + context_and_question_tokens, + enc_mask, + dec_input, + dec_input_mask, + labels, + loss_mask, + position_ids, + taskname_ids, + speech_mask, + context_and_question_tokens_lens, + cross_attention_prior, + text_limits, + _, + _, + ) = batch + # loss_mask (b, t) + # does not use dataloader_iter due to device placement issues arising from PTL + + mode = self.training + self.eval() + gbs = self.cfg.get('validation_global_batch_size', self.cfg.global_batch_size) + self._reconfigure_and_process_inference_batch(virtual_tokens.size(0), gbs) + + loss_mean = self.fwd_bwd_step( + itertools.chain([batch]), batch_idx, forward_only=True + ) # comment this out and add custom forward function to calculate WER + # # logging.info (f'loss_mean {loss_mean}') + + if batch_idx == 0 and self.is_rank_zero: + self.frozen_model.enc_dec_model.logging_step = True + self.predict_step_outputs = [] + # log_scalars=False avoids logging scalar TTS metrics in the predict_step + # Images, audio and texts will still be logged + self.predict_step(batch=batch, batch_idx=batch_idx, log_scalars=False, global_step=self.global_step) + for inf_key in self.predict_step_outputs[0]: + if self.predict_step_outputs[0][inf_key] is not None: + self.logger.experiment.add_scalar( + f'Val_{inf_key}', self.predict_step_outputs[0][inf_key], self.global_step + ) + + labels_original = labels.clone() # (b, 8, t) + + _cross_attention_prior = cross_attention_prior + if isinstance(context_and_question_tokens, list): + _cross_attention_prior = [None, cross_attention_prior] + + output_loss, _, output_logits = self.forward( + virtual_tokens, + context_and_question_tokens, + enc_mask, + dec_input, + dec_input_mask, + position_ids, + taskname_ids, + labels=labels, + speech_mask=speech_mask, + cross_attention_prior=_cross_attention_prior, + text_limits=text_limits, + inference=False, + ) + + if batch_idx == 0 and self.is_rank_zero: + self.frozen_model.enc_dec_model.logging_step = False + with torch.cuda.amp.autocast(enabled=False): + if torch.count_nonzero(speech_mask) == 0: + text_labels = labels[:, 0, :] # [B, 8, T] -> [B, T] + token_logits = output_logits[0] * 1 # [T, B, V] + if 
self.frozen_model.enc_dec_model.parallel_output: + # Gather from tensor parallel region + token_logits = tensor_parallel.gather_from_tensor_model_parallel_region(token_logits) + token_logits = token_logits.argmax(dim=2) # [T, B] + token_logits = token_logits.t() # [B, T] + score = 0 + for i in range(text_labels.size()[0]): + r = text_labels[i].long() + nzm = r != 0 + r = r.tolist() + h = token_logits[i].long() * nzm + h = h.tolist() + score += editdistance.eval(r, h) + score /= text_labels.size()[0] + logging.info(f"wer score : {score}") + self.logger.experiment.add_scalar('WER', score, self.global_step) + else: + audio_len = self.decoder_context_len + (labels[0][0][self.decoder_context_len :] != 0).sum().item() + labels_to_1024 = self.convert_tokens_to_range(labels[0, :, 0:audio_len]) + label_wav = self.decode_wav_from_codec_model(labels_to_1024) + dec_input_to_1024 = self.convert_tokens_to_range(dec_input[0, :, 0:audio_len]) + dec_input_wav = self.decode_wav_from_codec_model(dec_input_to_1024) + self.logger.experiment.add_audio("val_label_wav", label_wav, self.global_step, self.sample_rate) + self.logger.experiment.add_audio( + "val_dec_input_wav", dec_input_wav, self.global_step, self.sample_rate + ) + + if isinstance(context_and_question_tokens, list): + context_tokens = context_and_question_tokens[0] + question_tokens = context_and_question_tokens[1] + input_token_list_all = [ + question_tokens[0, 0, i].item() for i in range(question_tokens.shape[2]) + ] + input_token_list = [ + (ti, t) for ti, t in enumerate(input_token_list_all) if t != 0 and t < self.speech_offset + ] + context_end_step = context_and_question_tokens_lens[0][0].item() + _context_tokens = context_tokens[0, :, :context_end_step] + + else: + input_token_list_all = [ + context_and_question_tokens[0, 0, i].item() + for i in range(context_and_question_tokens.shape[2]) + ] + input_token_list = [ + (ti, t) for ti, t in enumerate(input_token_list_all) if t != 0 and t < self.speech_offset + ] + context_end_step = input_token_list[0][0] + _context_tokens = context_and_question_tokens[0, :, :context_end_step] + if context_end_step > 1: + is_speech_context = _context_tokens[1, :].sum().item() > 0 + if is_speech_context: + _context_tokens = self.convert_tokens_to_range( + _context_tokens, pattern=self.context_pattern + ) + _context_wav = self.decode_wav_from_codec_model(_context_tokens) + self.logger.experiment.add_audio( + "val_context_wav", _context_wav, self.global_step, self.sample_rate + ) + else: + _context_token_list = [v.item() for v in _context_tokens[0, :]] + _context_text = self.frozen_model.tokenizer.ids_to_text( + [v for v in _context_token_list if v < self.lm_vocab_size] + ) + self.logger.experiment.add_text("val_context_text", _context_text, self.global_step) + + question_si = text_limits[0, 0].item() - virtual_tokens.shape[1] + question_ei = text_limits[0, 1].item() - virtual_tokens.shape[1] + + text_si = text_limits[0, 0].item() + text_ei = text_limits[0, 1].item() + + input_text = self.frozen_model.tokenizer.ids_to_text( + [v for v in input_token_list_all[question_si:question_ei] if v < self.lm_vocab_size] + ) + self.logger.experiment.add_text("Val Input Text", input_text, self.global_step) + + input_phoneme_tokens = [ + v - self.lm_vocab_size + for v in input_token_list_all[question_si:question_ei] + if v >= self.lm_vocab_size + ] + if len(input_phoneme_tokens) > 0: + phoneme_text = self.phoneme_tokenizer.decode(input_phoneme_tokens) + self.logger.experiment.add_text("Val Input Phoneme Text", phoneme_text, 
self.global_step) + + token_logits = output_logits[0] + speech_logits_list = output_logits[1] + + # if self.trainer.global_step % 500 == 0: + attention_probs_list = output_logits[2] # list of (BS, 12, out_length, in_length) + if attention_probs_list is not None: + attention_sliced_list = [] + for lidx in range(len(attention_probs_list)): + attention_probs = attention_probs_list[lidx] + for _i in range(attention_probs.shape[1]): + attention_sliced_list.append( + attention_probs[0, _i, self.decoder_context_len : audio_len, text_si:text_ei] + ) + attention_sliced = torch.stack(attention_sliced_list) + attention_sliced = torch.mean(attention_sliced, 0) + text = None + if len(input_text) > 0: + text = self.frozen_model.tokenizer.ids_to_tokens( + [v for v in input_token_list_all[question_si:question_ei] if v < self.lm_vocab_size] + ) + if len(input_phoneme_tokens) > 0: + text = phoneme_text.split("|") + alignment_image_sliced = plot_alignment_to_numpy_for_speechllm( + attention_sliced.cpu().float().numpy().T, + phoneme_seq=text, + phoneme_ver=2, + vmin=0.0, + phone_offset=0, + h_offset=False, + ) + self.logger.experiment.add_image( + f"Val Attention Probs Average Sliced", + alignment_image_sliced, + self.global_step, + dataformats="HWC", + ) + if self.frozen_model.enc_dec_model.parallel_output: + # Gather from tensor parallel region + token_logits = tensor_parallel.gather_from_tensor_model_parallel_region(token_logits) + for _i in range(len(speech_logits_list)): + speech_logits_list[_i] = tensor_parallel.gather_from_tensor_model_parallel_region( + speech_logits_list[_i] + ) + speech_logits = torch.stack(speech_logits_list, dim=-1) # (t, b, 1024, 7) + token_logits_example = token_logits[:, 0, :] * 1 + speech_logits_example = speech_logits[:, 0, :, :] * 1 + first_layer_tokens = token_logits_example.argmax(dim=1) - self.speech_offset + other_layer_tokens = [] + for _i in range(speech_logits_example.shape[2]): + other_layer_tokens.append(speech_logits_example[:, :, _i].argmax(dim=1)) + + all_layer_tokens = torch.stack([first_layer_tokens] + other_layer_tokens) # (8, t) + all_layer_tokens = self.convert_tokens_to_range(all_layer_tokens, apply_offset_correction=False) + all_layer_tokens = torch.clip(all_layer_tokens, 0, self.speech_codebook_size - 1) + predicted_wav = self.decode_wav_from_codec_model(all_layer_tokens) + self.logger.experiment.add_audio( + "val_tf_pred_wav", predicted_wav, self.global_step, self.sample_rate + ) + + first_layer_logits = output_logits[0] + speech_logits_list = output_logits[1] + + if self.frozen_model.enc_dec_model.parallel_output: + # Gather from tensor parallel region + first_layer_logits = tensor_parallel.gather_from_tensor_model_parallel_region(first_layer_logits) + if torch.count_nonzero(speech_mask) > 0: + for _i in range(len(speech_logits_list)): + speech_logits_list[_i] = tensor_parallel.gather_from_tensor_model_parallel_region( + speech_logits_list[_i] + ) + speech_logits = torch.stack(speech_logits_list, dim=-1) # (t, b, 1024, 7) + first_layer_preds = first_layer_logits.argmax(dim=2) # (t,bs) + first_layer_preds = first_layer_preds.transpose(0, 1) # (bs,t) + labels_first_layer = labels_original[:, 0, :] # (bs,t) + correct_predictions = first_layer_preds == labels_first_layer # (bs,t) + correct_predictions = correct_predictions * loss_mask # (bs,t) + total_correct_predictions = torch.sum(correct_predictions) + total_predictions = torch.sum(loss_mask) + first_layer_accuracy = total_correct_predictions / total_predictions + first_layer_loss = 
torch.nn.functional.cross_entropy( + first_layer_logits.permute(1, 2, 0), labels_first_layer, reduction='none' + ) # (bs,t) + first_layer_loss = torch.sum(first_layer_loss * loss_mask) / total_predictions + + metrics = { + 'loss': loss_mean, + 'first_layer_accuracy': first_layer_accuracy, + 'first_layer_loss': first_layer_loss, + } + loss_total = first_layer_loss + for i in range(self.num_speech_codebooks - 1): + if torch.count_nonzero(speech_mask) > 0: + speech_logits_i = speech_logits[:, :, :, i] + speech_preds_i = speech_logits_i.argmax(dim=2) # (t,bs) + speech_preds_i = speech_preds_i.transpose(0, 1) # (bs,t) + labels_i = labels_original[:, i + 1, :] # (bs,t) + correct_predictions_i = speech_preds_i == labels_i # (bs,t) + correct_predictions_i = correct_predictions_i * loss_mask * speech_mask # (bs,t) + total_correct_predictions_i = torch.sum(correct_predictions_i) + total_predictions_i = torch.sum(loss_mask * speech_mask) + speech_accuracy_i = total_correct_predictions_i / total_predictions_i + loss_i = torch.nn.functional.cross_entropy( + speech_logits_i.permute(1, 2, 0), labels_i, reduction='none' + ) # (bs,t) + loss_i = torch.sum(loss_i * loss_mask * speech_mask) / total_predictions_i + else: + speech_accuracy_i = torch.tensor(0.0) + loss_i = torch.tensor(0.0) + metrics[f'speech_accuracy_{i+1}'] = speech_accuracy_i + metrics[f'speech_loss_{i+1}'] = loss_i + loss_total += loss_i + + metrics['loss_total_check'] = loss_total + self.validation_step_outputs.append(metrics) + self.train(mode=mode) + self.frozen_model.train() + return metrics['loss'] + + def on_validation_epoch_end(self): + outputs = self.validation_step_outputs + if self.cfg.get('pipeline_model_parallel_size', 1) > 1: + if parallel_state.is_pipeline_last_stage(): + # only the last pipeline parallel stages return loss + averaged_loss = torch.stack([item['loss'] for item in outputs]).mean() + averaged_loss_total_check = torch.stack([item['loss_total_check'] for item in outputs]).mean() + averaged_first_layer_accuracy = torch.stack([item['first_layer_accuracy'] for item in outputs]).mean() + + self.log( + 'val_loss_total_check', + averaged_loss_total_check, + prog_bar=False, + rank_zero_only=True, + batch_size=1, + ) + self.log( + 'val_first_layer_accuracy', + averaged_first_layer_accuracy, + prog_bar=True, + rank_zero_only=True, + batch_size=1, + ) + logging.info(f'Validation first_layer_accuracy: {averaged_first_layer_accuracy}') + logging.info(f'Validation loss_total_check: {averaged_loss_total_check}') + + for i in range(1, self.num_speech_codebooks): + averaged_speech_accuracy = torch.stack([item[f'speech_accuracy_{i}'] for item in outputs]).mean() + averaged_speech_loss = torch.stack([item[f'speech_loss_{i}'] for item in outputs]).mean() + self.log( + f'val_speech_accuracy_{i}', + averaged_speech_accuracy, + prog_bar=True, + rank_zero_only=True, + batch_size=1, + ) + self.log( + f'val_speech_loss_{i}', averaged_speech_loss, prog_bar=True, rank_zero_only=True, batch_size=1 + ) + logging.info(f'Validation speech_accuracy_{i}: {averaged_speech_accuracy}') + logging.info(f'Validation speech_loss_{i}: {averaged_speech_loss}') + else: + averaged_loss = torch.tensor(0.0).cuda() + + # we can only log on one rank if it is rank zero so we broadcast from last rank + torch.distributed.broadcast(averaged_loss, get_last_rank()) + + self.log('val_loss', averaged_loss, prog_bar=True, rank_zero_only=True, batch_size=1) + logging.info(f'Validation loss: {averaged_loss}') + + else: + if len(outputs) > 0: + averaged_loss = 
torch.stack([item['loss'] for item in outputs]).mean() + averaged_loss_total_check = torch.stack([item['loss_total_check'] for item in outputs]).mean() + logging.info(f'Validation loss: {averaged_loss}') + self.log('val_loss', averaged_loss, prog_bar=True, rank_zero_only=True, batch_size=1) + self.log( + 'val_loss_total_check', + averaged_loss_total_check, + prog_bar=False, + rank_zero_only=True, + batch_size=1, + ) + + averaged_first_layer_accuracy = torch.stack([item['first_layer_accuracy'] for item in outputs]).mean() + logging.info(f'Validation first_layer_accuracy: {averaged_first_layer_accuracy}') + self.log( + 'val_first_layer_accuracy', + averaged_first_layer_accuracy, + prog_bar=True, + rank_zero_only=True, + batch_size=1, + ) + + for i in range(1, self.num_speech_codebooks): + averaged_speech_accuracy = torch.stack([item[f'speech_accuracy_{i}'] for item in outputs]).mean() + averaged_speech_loss = torch.stack([item[f'speech_loss_{i}'] for item in outputs]).mean() + logging.info(f'Validation speech_accuracy_{i}: {averaged_speech_accuracy}') + logging.info(f'Validation speech_loss_{i}: {averaged_speech_loss}') + self.log( + f'val_speech_accuracy_{i}', + averaged_speech_accuracy, + prog_bar=True, + rank_zero_only=True, + batch_size=1, + ) + self.log( + f'val_speech_loss_{i}', averaged_speech_loss, prog_bar=True, rank_zero_only=True, batch_size=1 + ) + + if self.cfg.get("report_validation_metric", False): + gather_results = [None for _ in range(parallel_state.get_data_parallel_world_size())] + + all_preds = list(itertools.chain(*[item['predicted_token_ids'] for item in outputs])) + all_labels = list(itertools.chain(*[item['labels'] for item in outputs])) + all_inputs = list(itertools.chain(*[item['enc_inputs'] for item in outputs])) + + assert len(all_preds) == len(all_labels) + assert len(all_preds) == len(all_inputs) + + # Gather inputs, preds, labels from all workers + torch.distributed.all_gather_object( + gather_results, + [(input, pred, label) for (input, pred, label) in zip(all_inputs, all_preds, all_labels)], + group=parallel_state.get_data_parallel_group(), + ) + + # Deduplicate sentences that may have been distributed across multiple data parallel ranks. + if parallel_state.get_data_parallel_rank() == 0: + + gather_results_dedup = list(set(itertools.chain(*gather_results))) + + val_metric_dict = self.validation_metric.get_score( + [i[2] for i in gather_results_dedup], + [i[1] for i in gather_results_dedup], + ) + + for metric, val in val_metric_dict.items(): + logging.info(f'Validation {metric}: {val}') + val_metric = list(val_metric_dict.items())[0][1] + metric_name = list(val_metric_dict.items())[0][0] + else: + val_metric = torch.tensor(0.0).cuda() + metric_name = '' + + self.log(f'val_{metric_name}', val_metric, prog_bar=True, rank_zero_only=True, batch_size=1) + + gbs = self.cfg.global_batch_size + mbs = self.cfg.micro_batch_size + self._reconfigure_batch_sizes(gbs, mbs) + self.validation_step_outputs.clear() + + def test_step(self, batch, batch_idx): + result = self.predict_step(batch, batch_idx) + return result + + def on_test_epoch_end(self): + """ + This might still be broken for lightning 2.0. 
to fix: see + https://github.com/NVIDIA/NeMo/blob/9bdf4d12276ee8f95a340cf2f7f340e9b5b74a7e/docs/source/starthere/migration-guide.rst + """ + outputs = self.predict_step_outputs + average_metrics = {} + for output in outputs: + for key in output: + if key not in average_metrics: + average_metrics[key] = [] + if isinstance(output[key], torch.Tensor): + average_metrics[key].append(output[key].item()) + elif output[key] is None: + continue + else: + average_metrics[key].append(output[key]) + + for key in average_metrics: + average_metrics[key] = np.mean(average_metrics[key]).item() + logging.info(f'Test {key}: {average_metrics[key]}') + self.log(f'test_{key}', average_metrics[key], prog_bar=True, rank_zero_only=True, batch_size=1) + self.logger.experiment.add_scalar(f'Inf Cumulative {key}', average_metrics[key], 0) + + # save average metrics into json file + with open(os.path.join(self.logger.log_dir, 'output_metrics.json'), 'w') as f: + json.dump(average_metrics, f) + + def build_virtual_prompt_dataset( + self, dataset_paths, batch_size, for_train, drop_last, shuffle, num_workers, pin_memory + ): + dataset = T5SpeechLMDataset( + datasets=dataset_paths, + tokenizer=self.tokenizer, + sample_rate=self.cfg.data.get('sample_rate', 24000), + virtual_prompt_source=self.virtual_prompt_source, + task_templates=self.task_templates, + pseudo_tokens=self.pseudo_tokens, + pad_token_id=self.pad_token_id, + max_seq_length=self.cfg.data.get('max_seq_length', self.frozen_model.cfg.max_position_embeddings), + min_seq_length=self.cfg.data.get('min_seq_length', 1), + add_bos=self.cfg.data.get('add_bos', False), + add_eos=self.cfg.data.get('add_eos', True), + decoder_starts_with_pad=self.cfg.data.get('decoder_starts_with_pad', False), + add_eos_to_decoder_output=self.cfg.data.get('add_eos_to_decoder_output', True), + add_sentinel_to_input=self.cfg.data.get('add_sentinel_to_input', True), + ul2_prompt_token=self.cfg.data.get('ul2_prompt_token', None), + for_train=for_train, + segment_max_duration=self.cfg.data.get('segment_max_duration', None), + trim=self.cfg.data.get('trim', None), + trim_ref=self.cfg.data.get('trim_ref', None), + trim_top_db=self.cfg.data.get('trim_top_db', None), + trim_frame_length=self.cfg.data.get('trim_frame_length', None), + trim_hop_length=self.cfg.data.get('trim_hop_length', None), + pad_multiple=self.cfg.data.get('pad_multiple', 1), + pitch_augment=self.cfg.data.get('pitch_augment', None), + sup_data_path=self.cfg.data.get('sup_data_path', None), + codec_folder=self.cfg.data.get('codec_folder', None), + speech_offset=self.cfg.data.get('speech_offset', None), + train_task=self.cfg.data.get('train_task', "tts"), + seq_pattern=self.cfg.get('seq_pattern', 'delay_parallel'), + use_attention_prior=self.cfg.data.get('use_attention_prior', False), + attention_prior_scaling_factor=self.cfg.data.get('attention_prior_scaling_factor', 1.0), + cross_attention_epsilon=self.cfg.data.get('cross_attention_epsilon', 0.0), + lm_vocab_size=self.lm_vocab_size, + num_speech_codebooks=self.num_speech_codebooks, + codebook_fps=self.cfg.data.get('codebook_fps', 86), + add_special_tokens_to_only_first_codebook=self.cfg.data.get( + 'add_special_tokens_to_only_first_codebook', False + ), + context_pattern=self.cfg.data.get('context_pattern', 'parallel'), + context_duration_min=self.cfg.data.get('context_duration_min', 3.0), + context_duration_max=self.cfg.data.get('context_duration_max', 5.0), + g2p=self.cfg.data.get('g2p', None), + skip_datasets=self.cfg.data.get('skip_datasets', []), + 
english_only_model=self.cfg.get('english_only_model', False), + use_ipa=self.cfg.data.get('use_ipa', False), + context_conditioning=self.cfg.get('context_conditioning', "decoder"), + use_beta_binomial_interpolator=self.cfg.get('use_beta_binomial_interpolator', False), + context_slice_method=self.cfg.data.get('context_slice_method', 'random'), + phoneme_probability=self.cfg.data.get('phoneme_probability', 0.5), + encoder_type=self.cfg.data.get('encoder_type', 'single_transformer'), + ) + + rank = parallel_state.get_data_parallel_rank() + world_size = parallel_state.get_data_parallel_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.cfg.seed + ) + + dataloader = torch.utils.data.DataLoader( + dataset, + collate_fn=dataset.collate_fn, + sampler=sampler, + batch_size=batch_size // world_size, + drop_last=drop_last, + num_workers=num_workers, + pin_memory=pin_memory, + persistent_workers=( + True if num_workers > 0 else False + ), # (@adithyare and @eharper) We need to set this to True to get around issues with spawn=True + ) + logging.info(f'build success: {len(dataloader)} {dataset_paths}') + if self.phoneme_tokenizer is None: + self.phoneme_tokenizer = dataset.phoneme_tokenizer + return dataset, dataloader + + def build_virtual_prompt_tarred_dataset( + self, dataset_paths, audio_path, batch_size, for_train, drop_last, shuffle, num_workers, pin_memory + ): + dataset = T5SpeechLMTarredDataset( + audio_tar_filepaths=audio_path, + manifest_filepath=dataset_paths, + tokenizer=self.tokenizer, + sample_rate=self.cfg.data.get('sample_rate', 24000), + virtual_prompt_source=self.virtual_prompt_source, + task_templates=self.task_templates, + pseudo_tokens=self.pseudo_tokens, + pad_token_id=self.pad_token_id, + max_seq_length=self.cfg.data.get('max_seq_length', self.frozen_model.cfg.max_position_embeddings), + min_seq_length=self.cfg.data.get('min_seq_length', 1), + shuffle_n=shuffle, + add_bos=self.cfg.data.get('add_bos', False), + add_eos=self.cfg.data.get('add_eos', True), + decoder_starts_with_pad=self.cfg.data.get('decoder_starts_with_pad', False), + add_eos_to_decoder_output=self.cfg.data.get('add_eos_to_decoder_output', True), + add_sentinel_to_input=self.cfg.data.get('add_sentinel_to_input', True), + ul2_prompt_token=self.cfg.data.get('ul2_prompt_token', None), + for_train=for_train, + segment_max_duration=self.cfg.data.get('segment_max_duration', None), + trim=self.cfg.data.get('trim', None), + trim_ref=self.cfg.data.get('trim_ref', None), + trim_top_db=self.cfg.data.get('trim_top_db', None), + trim_frame_length=self.cfg.data.get('trim_frame_length', None), + trim_hop_length=self.cfg.data.get('trim_hop_length', None), + pad_multiple=self.cfg.data.get('pad_multiple', 1), + pitch_augment=self.cfg.data.get('pitch_augment', None), + speech_offset=self.cfg.data.get('speech_offset', None), + train_task=self.cfg.data.get('train_task', "tts"), + seq_pattern=self.cfg.get('seq_pattern', 'delay_parallel'), + use_attention_prior=self.cfg.data.get('use_attention_prior', False), + attention_prior_scaling_factor=self.cfg.data.get('attention_prior_scaling_factor', 1.0), + cross_attention_epsilon=self.cfg.data.get('cross_attention_epsilon', 0.0), + lm_vocab_size=self.lm_vocab_size, + num_speech_codebooks=self.num_speech_codebooks, + ) + rank = parallel_state.get_data_parallel_rank() + world_size = parallel_state.get_data_parallel_world_size() + dataloader = torch.utils.data.DataLoader( + dataset, + 
collate_fn=dataset.collate_fn, + batch_size=batch_size // world_size, + drop_last=drop_last, + num_workers=num_workers, + pin_memory=pin_memory, + persistent_workers=( + True if num_workers > 0 else False + ), # (@adithyare and @eharper) We need to set this to True to get around issues with spawn=True + ) + logging.info(f'build success: {len(dataloader)} {dataset_paths}') + + return dataset, dataloader + + def process_text(self, input_text): + """ + Normalizes text for CER/WER calculation. + Taken from hallucination_eval.py + """ + # Convert text to lowercase + lower_case_text = input_text.lower() + + # Remove commas from text + no_comma_text = lower_case_text.replace(",", "") + + # Replace "-" with spaces + no_dash_text = no_comma_text.replace("-", " ") + + # Replace double spaces with single space + single_space_text = " ".join(no_dash_text.split()) + + single_space_text = single_space_text.translate(str.maketrans('', '', string.punctuation)) + + return single_space_text + + def predict_step( + self, batch: Any, batch_idx: int, dataloader_idx: int = 0, log_scalars=True, global_step=None + ) -> Any: + + with torch.no_grad(): + ( + virtual_tokens, + context_and_question_tokens, + enc_mask, + dec_input_raw, + dec_input_mask_raw, + labels, + loss_mask, + position_ids, + taskname_ids, + speech_mask, + context_and_question_tokens_lens, + cross_attention_prior, + text_limits, # [start of question token, question token len) in [0, enc_mask.size(1)) + lang, + question_texts, + ) = batch + + batch_size = virtual_tokens.size(0) + dec_input = ( + dec_input_raw * 1 + ) # (B, 8, T) # TODO @xueyang: apply clone() method bypasses this unnecessary computation. + dec_input_mask = dec_input_mask_raw * 1 # (B, T) + dec_input_mask[:, :] = 1 # Does not really matter + output_token_list = [] + + end_indices = {} + # pad dec_input (B, 8, T) to 1000 timesteps + max_inference_timesteps = self.cfg.get('max_inference_timesteps', 2000) + # TODO @xueyang: potential bug when max_inference_timesteps < dec_input.shape[2], then dec_input is clipped. + dec_input = torch.nn.functional.pad(dec_input, (0, max_inference_timesteps - dec_input.shape[2]), value=0) + dec_input[:, :, self.decoder_context_len + 1 :].zero_() + # TODO @xueyang: why not just declare torch.ones(dec_input_raw.size(0), max_inference_timesteps)? + dec_input_mask = torch.nn.functional.pad( + dec_input_mask, (0, max_inference_timesteps - dec_input_mask.shape[1]), value=1 + ) + + if self.inference_apply_text_cfg and self.inference_apply_audio_cfg: + question_limits = text_limits - virtual_tokens.size( + 1 + ) # (b, 2), reset question range to start from [pad] context, same start position as context_and_question_tokens. + question_start = question_limits[:, 0].unsqueeze(1) # (b, 1) + question_end = question_limits[:, 1].unsqueeze(1) # (b, 1) + + # duplicate and glue two batches into a single one. + virtual_tokens = torch.cat((virtual_tokens, virtual_tokens), dim=0) + taskname_ids = torch.cat((taskname_ids, taskname_ids), dim=0) + speech_mask = torch.cat((speech_mask, speech_mask), dim=0) + dec_input_mask = torch.cat((dec_input_mask, dec_input_mask), dim=0) + + if isinstance(context_and_question_tokens, list): # indicate self.encoder_type = "multi_transformers". 
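+                    # Classifier-free guidance (CFG) for the multi-transformer encoder path: the batch was duplicated
+                    # above, so the first half stays conditioned on the real context/question tokens while the second
+                    # half (built below) masks the question text and the context audio with the tokenizer's unk id.
+                    # The conditioned and unconditioned logits are interpolated later in the decoding loop.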
+ context_tokens, question_tokens = context_and_question_tokens + + # text + question_tokens_unconditioned = question_tokens.clone() + time_range = torch.arange( + question_tokens_unconditioned.size(2), device=question_tokens_unconditioned.device + ).unsqueeze(0) + question_mask = (time_range >= question_start) & ( + time_range < question_end + ) # create a mask for question only tokens. + question_tokens_unconditioned[:, 0][ + question_mask + ] = self.tokenizer.unk_id # only the first layer has non-zero IDs. + + # audio + context_tokens_unconditioned = context_tokens.clone() + context_tokens_unconditioned[:, :, :] = self.tokenizer.unk_id + + # concatenate both conditioned and unconditioned batches as a single one. + context_and_question_tokens = [ + torch.cat((context_tokens, context_tokens_unconditioned), dim=0), + torch.cat((question_tokens, question_tokens_unconditioned), dim=0), + ] + enc_mask = [torch.cat((mask, mask), dim=0) for mask in enc_mask] + dec_input = torch.cat((dec_input, dec_input), dim=0) + position_ids = [torch.cat((pos_ids, pos_ids), dim=0) for pos_ids in position_ids] + else: + assert ( + self.context_conditioning == "decoder" + ), f"The encoder_type is single_transformer. We expect context_condition is decoder: context_condition={self.context_conditioning}" + + # text + context_and_question_tokens_unconditioned = context_and_question_tokens.clone() + time_range = torch.arange( + context_and_question_tokens_unconditioned.size(2), + device=context_and_question_tokens_unconditioned.device, + ).unsqueeze( + 0 + ) # (1, max_context_question_tokens_len) + question_mask = (time_range >= question_start) & ( + time_range < question_end + ) # create a mask for question only tokens. + context_and_question_tokens_unconditioned[:, 0][ + question_mask + ] = self.tokenizer.unk_id # only the first layer has non-zero IDs. + + # audio + dec_input_unconditioned = dec_input.clone() + dec_input_unconditioned[:, :, 1 : self.decoder_context_len + 1] = ( + self.tokenizer.unk_id + ) # TODO @xueyang: switch to other token id if this one is conflict with text unk. + + # concatenate both conditioned and unconditioned batches as a single one. + context_and_question_tokens = torch.cat( + (context_and_question_tokens, context_and_question_tokens_unconditioned), dim=0 + ) + enc_mask = torch.cat((enc_mask, enc_mask), dim=0) + dec_input = torch.cat((dec_input, dec_input_unconditioned), dim=0) + position_ids = torch.cat((position_ids, position_ids), dim=0) + + # clean up useless variables. + del question_limits, question_start, question_end, time_range, question_mask + elif self.inference_apply_text_cfg: + # replace question token IDs with [UNK]'s id. No speech offset for Phoneme's [UNK]. Same op as train. + # instruction token IDs are bpe token IDs directly obtained from self.tokenizer without any offset. + # question token IDs are phoneme and grapheme token IDs and are offset by self.lm_vocab_size + # if under "Phoneme TTS" instruction, so exising no overlaps between instruction and question token IDs. + # question token IDs are bpe token IDs without any offset + # if under "Text to speech this" instruction, so existing overlaps between instruction and question token IDs. + question_limits = text_limits - virtual_tokens.size( + 1 + ) # (b, 2), reset question range to start from [pad] context, same start position as context_and_question_tokens. 
+ question_start = question_limits[:, 0].unsqueeze(1) # (b, 1) + question_end = question_limits[:, 1].unsqueeze(1) # (b, 1) + + # duplicate and glue two batches into a single one. + virtual_tokens = torch.cat((virtual_tokens, virtual_tokens), dim=0) + taskname_ids = torch.cat((taskname_ids, taskname_ids), dim=0) + speech_mask = torch.cat((speech_mask, speech_mask), dim=0) + dec_input_mask = torch.cat((dec_input_mask, dec_input_mask), dim=0) + + if isinstance(context_and_question_tokens, list): # indicate self.encoder_type = "multi_transformers". + context_tokens, question_tokens = context_and_question_tokens + question_tokens_unconditioned = question_tokens.clone() + + time_range = torch.arange( + question_tokens_unconditioned.size(2), device=question_tokens_unconditioned.device + ).unsqueeze(0) + question_mask = (time_range >= question_start) & ( + time_range < question_end + ) # create a mask for question only tokens. + question_tokens_unconditioned[:, 0][ + question_mask + ] = self.tokenizer.unk_id # only the first layer has non-zero IDs. + + # concatenate both conditioned and unconditioned batches as a single one. + context_and_question_tokens = [ + torch.cat((context_tokens, context_tokens), dim=0), + torch.cat((question_tokens, question_tokens_unconditioned), dim=0), + ] + enc_mask = [torch.cat((mask, mask), dim=0) for mask in enc_mask] + dec_input = torch.cat((dec_input, dec_input), dim=0) + position_ids = [torch.cat((pos_ids, pos_ids), dim=0) for pos_ids in position_ids] + else: + assert ( + self.context_conditioning == "decoder" + ), f"The encoder_type is single_transformer. We expect context_condition is decoder: context_condition={self.context_conditioning}" + context_and_question_tokens_unconditioned = context_and_question_tokens.clone() + time_range = torch.arange( + context_and_question_tokens_unconditioned.size(2), + device=context_and_question_tokens_unconditioned.device, + ).unsqueeze( + 0 + ) # (1, max_context_question_tokens_len) + question_mask = (time_range >= question_start) & ( + time_range < question_end + ) # create a mask for question only tokens. + context_and_question_tokens_unconditioned[:, 0][ + question_mask + ] = self.tokenizer.unk_id # only the first layer has non-zero IDs. + + # concatenate both conditioned and unconditioned batches as a single one. + context_and_question_tokens = torch.cat( + (context_and_question_tokens, context_and_question_tokens_unconditioned), dim=0 + ) + enc_mask = torch.cat((enc_mask, enc_mask), dim=0) + dec_input = torch.cat((dec_input, dec_input), dim=0) + position_ids = torch.cat((position_ids, position_ids), dim=0) + + # clean up useless variables. + del question_limits, question_start, question_end, time_range, question_mask + elif self.inference_apply_audio_cfg: + # duplicate and glue two batches into a single one. + virtual_tokens = torch.cat((virtual_tokens, virtual_tokens), dim=0) + taskname_ids = torch.cat((taskname_ids, taskname_ids), dim=0) + speech_mask = torch.cat((speech_mask, speech_mask), dim=0) + dec_input_mask = torch.cat((dec_input_mask, dec_input_mask), dim=0) + + if isinstance( + context_and_question_tokens, list + ): # indicate that self.encoder_type = "multi_transformers" + context_tokens, question_tokens = context_and_question_tokens + context_tokens_unconditioned = context_tokens.clone() + context_tokens_unconditioned[:, :, :] = ( + self.tokenizer.unk_id + ) # TODO @xueyang: verify if extra tokens other than audio codec tokens are appended. 
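+                    # Audio-only CFG: only the context (reference audio) tokens are masked with the unk id for the
+                    # unconditioned half; the question text is left untouched, so guidance is applied to the audio
+                    # conditioning alone.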
+ + # concatenate both conditioned and unconditioned batches as a single one. + context_and_question_tokens = [ + torch.cat((context_tokens, context_tokens_unconditioned), dim=0), + torch.cat((question_tokens, question_tokens), dim=0), + ] + enc_mask = [torch.cat((mask, mask), dim=0) for mask in enc_mask] + dec_input = torch.cat((dec_input, dec_input), dim=0) + position_ids = [torch.cat((pos_ids, pos_ids), dim=0) for pos_ids in position_ids] + else: + assert ( + self.context_conditioning == "decoder" + ), f"The encoder_type is single_transformer. We expect context_condition is decoder: context_condition={self.context_conditioning}" + dec_input_unconditioned = dec_input.clone() + dec_input_unconditioned[:, :, 1 : self.decoder_context_len + 1] = ( + self.tokenizer.unk_id + ) # TODO @xueyang: switch to other token id if this one is conflict with text unk. + + # concatenate both conditioned and unconditioned batches as a single one. + context_and_question_tokens = torch.cat( + (context_and_question_tokens, context_and_question_tokens), dim=0 + ) + enc_mask = torch.cat((enc_mask, enc_mask), dim=0) + dec_input = torch.cat((dec_input, dec_input_unconditioned), dim=0) + position_ids = torch.cat((position_ids, position_ids), dim=0) + else: + logging.debug( + f"Neither text or audio cfg logits are applied:" + f" self.inference_apply_text_cfg={self.inference_apply_text_cfg}," + f" self.inference_apply_audio_cfg={self.inference_apply_audio_cfg}" + ) + + end_inference_loop_at = None + fwd_bwd_function = get_forward_backward_func() + encoder_output = None + attention_probs_all = [] + start_time = time.time() + for t in range(self.decoder_context_len + 1, dec_input.shape[2] - 1): + # Start at 0 if encoder context, else context_len + if t % 100 == 0: + logging.info("Timestep {}".format(t)) + if t == end_inference_loop_at: + print("All ends detected") + break + + if isinstance(enc_mask, list): + encoder_max_sequence_len = [e.size(1) for e in enc_mask] + else: + encoder_max_sequence_len = enc_mask.size(1) + + # if context_condition is decoder, then t starts at [PAD] token represented as [0] * 8. + # if context_condition is encoder, then t starts at [CLS]. + if t == self.decoder_context_len + 1: + # Run first step manually + output_logits, _, token_and_speech_logits = self.forward( + virtual_tokens, + context_and_question_tokens, + enc_mask, + dec_input[ + :, :, : t + 1 + ], # tensors representing [CLS] + context audio tokens + [PAD] if context_condition is decoder, otherwise, tensors representing [CLS]. + dec_input_mask[:, : t + 1], # doesn't matter because of all ones. 
+ position_ids, + taskname_ids, + labels=None, + speech_mask=speech_mask, + inference=True, + inference_step=0, + decoder_max_sequence_len=max_inference_timesteps, + encoder_max_sequence_len=encoder_max_sequence_len, + ) + encoder_output = token_and_speech_logits[-1] + + if isinstance(encoder_output, list): + encoder_output = [e.transpose(0, 1) for e in encoder_output] + else: + encoder_output = encoder_output.transpose(0, 1) + + else: + # Prepare batch + batch = [ + max_inference_timesteps, + encoder_max_sequence_len, + encoder_output, + enc_mask, + dec_input[:, :, : t + 1], + dec_input_mask[:, : t + 1], + position_ids, + taskname_ids, + speech_mask, + ] + + output_tensor = fwd_bwd_function( + forward_step_func=self.get_forward_output_only_func(), + data_iterator=iter( + [ + batch, + ] + ), + model=[self], + num_microbatches=get_num_microbatches(), + forward_only=True, + seq_length=t, + micro_batch_size=dec_input.shape[0], + ) + output_logits = output_tensor[0]['output_logits'] # (B, T, V, 8) or (2B, T, V, 8) + token_and_speech_logits = output_tensor[0]['token_and_speech_logits'] + + # when return_all_crossattention is False, attention_probs is None. + if self.frozen_model.enc_dec_model.return_all_crossattention_probs: + attention_probs = token_and_speech_logits[2] + attention_probs_mean = torch.stack(attention_probs).mean(dim=0) # B, 12, 1, enc_timesteps + attention_probs_all.append(attention_probs_mean) + + if self.inference_apply_text_cfg or self.inference_apply_audio_cfg: + # interpolate conditioned and unconditioned logits + token_logits = ( + self.inference_cfg_interpolation_scale * token_and_speech_logits[0][:batch_size] + + (1 - self.inference_cfg_interpolation_scale) * token_and_speech_logits[0][batch_size:] + ) + output_speech_logits = ( + self.inference_cfg_interpolation_scale * output_logits[:batch_size] + + (1 - self.inference_cfg_interpolation_scale) * output_logits[batch_size:] + ) + else: + token_logits = token_and_speech_logits[0] # (B, T, V) + output_speech_logits = output_logits + + token_logits_currtimestep = token_logits[:, -1, :] # (B, V) + token_preds = token_logits_currtimestep.argmax(dim=1) # (B,) + + if torch.count_nonzero(speech_mask) > 0: + output_logits_currtimestep = ( + output_speech_logits[:, -1, :, :] + .permute(0, 2, 1) + .contiguous() + .view(-1, self.speech_codebook_size) + ) # (B*8, V) + output_logits_currtimestep_conditioned = ( + output_logits[:batch_size][:, -1, :, :] + .permute(0, 2, 1) + .contiguous() + .view(-1, self.speech_codebook_size) + ) + output_logits_currtimestep_unconditioned = ( + output_logits[batch_size:][:, -1, :, :] + .permute(0, 2, 1) + .contiguous() + .view(-1, self.speech_codebook_size) + ) + else: + output_logits_currtimestep = token_logits_currtimestep # (B, V) + output_logits_currtimestep_conditioned = token_logits_currtimestep + output_logits_currtimestep_unconditioned = token_logits_currtimestep + + top_k = self.cfg.get('top_k', 80) + + # (B*8, 80) or (B, 80) + output_logits_currtimestep_topk = torch.topk(output_logits_currtimestep, top_k, dim=1)[0] + + # find indices which are not top k + indices_to_remove = output_logits_currtimestep < output_logits_currtimestep_topk[:, -1].unsqueeze(1) + # (B*8, 1024) or (B, 1024) + + if self.inference_apply_cfg_filter: + output_logits_currtimestep_rescored = output_logits_currtimestep_conditioned.clone() + else: + output_logits_currtimestep_rescored = output_logits_currtimestep.clone() + + output_logits_currtimestep_rescored[indices_to_remove] = -float('Inf') + + # logits interpolation 
between conditioned and unconditioned logits. + if ( + self.inference_apply_text_cfg or self.inference_apply_audio_cfg + ) and self.inference_apply_cfg_filter: + output_logits_currtimestep_rescored = ( + self.inference_cfg_filter_interpolation_scale * output_logits_currtimestep_rescored + + (1 - self.inference_cfg_filter_interpolation_scale) + * output_logits_currtimestep_unconditioned + ) + + temperature = self.cfg.get('temperature', 0.85) # Set temp 0.01 for greedy decoding + output_logits_currtimestep_rescored = output_logits_currtimestep_rescored / temperature + output_logits_currtimestep_rescored = torch.nn.functional.softmax( + output_logits_currtimestep_rescored, dim=1 + ) + + output_tokens_curr_timestep = torch.multinomial( + output_logits_currtimestep_rescored, num_samples=1 + ) # (B*8, 1) + + if torch.count_nonzero(speech_mask) > 0: + # Convert back to (B, 8) + output_tokens_curr_timestep = output_tokens_curr_timestep.view( + batch_size, self.num_speech_codebooks + ) + + for _b in range(token_preds.shape[0]): + if t > self.decoder_context_len + 10 and token_preds[_b] == self.tokenizer.eos_id: + if _b not in end_indices: + logging.info("End detected for item {}".format(_b) + " at timestep {}".format(t)) + end_indices[_b] = t + if len(end_indices) == token_preds.shape[0]: + end_inference_loop_at = t + self.num_speech_codebooks + + output_token_list.append(output_tokens_curr_timestep) + + # duplicate to 2b dim as input for the next iteration if enabling cfg. + if self.inference_apply_text_cfg or self.inference_apply_audio_cfg: + output_tokens_curr_timestep = torch.cat( + (output_tokens_curr_timestep, output_tokens_curr_timestep), dim=0 + ) + + if torch.count_nonzero(speech_mask) > 0: + dec_input_next_timestep = output_tokens_curr_timestep * 1 # (B,8) + dec_input_next_timestep[:, 0] = ( + dec_input_next_timestep[:, 0] + self.speech_offset + ) # add offset to first codebook + dec_input[:, :, t + 1] = dec_input_next_timestep * 1 + else: + dec_input[:, 0, t + 1] = output_tokens_curr_timestep.squeeze(1) + + # end of for loop + output_tokens_combined = torch.stack(output_token_list) # (T, B, 8) if speech else (T, B) + if torch.count_nonzero(speech_mask) > 0: + output_tokens_combined = output_tokens_combined.permute(1, 2, 0) # (B, 8, T) + else: + output_tokens_combined = output_tokens_combined.squeeze(2) + output_tokens_combined = output_tokens_combined.permute(1, 0) # (B, T) + + # consider only autoregressive time, disconsider loading eval models for RTF time + total_process_time = time.time() - start_time + + # Layerwise token error rate + ter_dict = {} + for i in range(self.num_speech_codebooks): + ter_dict[i] = {'hypothesis': [], 'gt': []} + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if 'nemo_sv_model' not in self.additional_models: + nemo_sv_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name='titanet_large') + nemo_sv_model = nemo_sv_model.to(device) + nemo_sv_model.encoder.disable_torch_distributed = True # For multi-gpu training validation + nemo_sv_model.eval() + self.additional_models['nemo_sv_model'] = nemo_sv_model + logging.info(f"Loaded SV Model: {nemo_sv_model}") + else: + nemo_sv_model = self.additional_models['nemo_sv_model'] + + if 'asr_model' not in self.additional_models: + asr_model = self.cfg.get("asr_model_name", "stt_multilingual_fastconformer_hybrid_large_pc_blend_eu") + + if "hybrid" in asr_model: + model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel + else: + model = nemo_asr.models.EncDecRNNTBPEModel + 
asr_model = model.from_pretrained(model_name=asr_model) + asr_model = asr_model.to(device) + asr_model.encoder.disable_torch_distributed = True # For multi-gpu training validation + asr_model.eval() + self.additional_models['asr_model'] = asr_model + logging.info(f"Loaded ASR Model: {asr_model}") + else: + asr_model = self.additional_models['asr_model'] + + asr_model_zh = None + if Lang.zh.value in lang: + if 'asr_model_zh' not in self.additional_models: + asr_model_zh = nemo_asr.models.EncDecRNNTModel.from_pretrained( + model_name="stt_zh_conformer_transducer_large" + ) + asr_model_zh = asr_model_zh.to(device) + asr_model_zh.eval() + self.additional_models['asr_model_zh'] = asr_model_zh + else: + asr_model_zh = self.additional_models['asr_model_zh'] + + if 'wavlm_sv_model' not in self.additional_models: + wavlm_sv_extractor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-base-plus-sv') + wavlm_sv_model = WavLMForXVector.from_pretrained('microsoft/wavlm-base-plus-sv') + wavlm_sv_model = wavlm_sv_model.to(device) + wavlm_sv_model = wavlm_sv_model.eval() + self.additional_models['wavlm_sv_model'] = wavlm_sv_model + self.additional_models['wavlm_sv_extractor'] = wavlm_sv_extractor + logging.info(f"Loaded SV Model: {wavlm_sv_model}") + else: + wavlm_sv_model = self.additional_models['wavlm_sv_model'] + wavlm_sv_extractor = self.additional_models['wavlm_sv_extractor'] + + # load MOS estimator model only if True. + if self.estimate_mos: + # load mos estimator. + if 'squim_mos_model' not in self.additional_models: + squim_mos_model_full = SQUIM_SUBJECTIVE.get_model().to(device) + self.additional_models['squim_mos_model'] = squim_mos_model_full + else: + squim_mos_model_full = self.additional_models['squim_mos_model'] + + # load non-matching reference clean audio. + ref_16khz_wav, _ = librosa.load(self.non_matching_ref_audio_filepath, sr=16000) + + # prepare MOS estimator by taking a single audio example as an input. + squim_mos_model = partial( + squim_mos_model_full, reference=torch.from_numpy(ref_16khz_wav).to(device).unsqueeze(0) + ) + + _exp_dir_path = self.logger.log_dir + _exp_dir_path = _exp_dir_path + '/Sample_Audios' + if not os.path.exists(_exp_dir_path): + os.mkdir(_exp_dir_path) + + squim_mos_list_pred = [] + squim_mos_list_context = [] + squim_mos_list_gt = [] + similarity_list = [] + similarity_list_wavlm = [] + pred_context_similarity_list = [] + pred_context_similarity_list_wavlm = [] + gt_context_similarity_list = [] + gt_context_similarity_list_wavlm = [] + question_type = [] + + # predicting audio + batch_size = output_tokens_combined.shape[0] + test_dataloader_batch_size = batch_size + # self.test_dataloader() is not defined during validation + if isinstance(self.test_dataloader(), torch.utils.data.DataLoader): + test_dataloader_batch_size = self.test_dataloader().batch_size + + # logging attention maps. + # empty attention_probs_all indicates self.frozen_model.enc_dec_model.return_all_crossattention_probs is False. 
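+            # attention_probs_all holds one attention_probs_mean tensor per decoding step, shaped
+            # (B, 12, 1, enc_timesteps); below they are concatenated along the decoder-time axis and averaged over
+            # dim=1 to obtain a (B, dec_timesteps, enc_timesteps) alignment map per sample for plotting.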
+ if len(attention_probs_all) != 0: + attention_probs_all = torch.cat(attention_probs_all, dim=2) # B, 12, dec_timesteps, enc_timesteps + attention_probs_all = attention_probs_all.mean(dim=1) # B, dec_timesteps, enc_timesteps + + for i in range(batch_size): + text_end_step = text_limits[i, 1].item() + text_start_step = text_limits[i, 0].item() + end_index = end_indices.get(i, output_tokens_combined.shape[2]) + if len(attention_probs_all) != 0: + attention_probs_example = attention_probs_all[i][ + : end_index - (1 + self.decoder_context_len), text_start_step:text_end_step + ] # T, enc_timesteps + attention_map = attention_probs_example.float().cpu().numpy().T + alignment_image = plot_alignment_to_numpy_for_speechllm( + attention_map, + phoneme_ver=1, + phoneme_seq=None, + ) + + if global_step is not None: + # During validation, step is simply global_step + i + step = global_step + i + else: + # During inference, step is the index of the sample + step = batch_idx * test_dataloader_batch_size + i + + self.logger.experiment.add_image( + "Inf Attention Map", + alignment_image, + step, + dataformats="HWC", + ) + # Save attention image to file + alignment_fp = os.path.join(_exp_dir_path, f'attention_map_{step}.png') + imageio.imwrite(alignment_fp, alignment_image) + + wer_score = 0 + audio_to_pred = [] + audio_to_pred_zh = [] + total_audio_seconds = 0 + for i in range(batch_size): + if global_step is not None: + # During validation, step is simply global_step + i + step = global_step + i + else: + # During inference, step is the index of the sample + step = batch_idx * test_dataloader_batch_size + i + + audio_len = self.decoder_context_len + (labels[i][0][self.decoder_context_len :] != 0).sum().item() + + if torch.count_nonzero(speech_mask) > 0: + dec_input_to_1024 = self.convert_tokens_to_range(dec_input_raw[i, :, 0:audio_len]) + dec_input_to_1024_answer = dec_input_to_1024[:, self.decoder_context_len + 1 :] + dec_input_wav = self.decode_wav_from_codec_model(dec_input_to_1024_answer) + self.logger.experiment.add_audio("Inf Dec Input Wav", dec_input_wav, step, self.sample_rate) + + predicted_tokens = output_tokens_combined[i] # Should not contain context even if decoder context + if i in end_indices: + logging.info(f"Clipping until end index for audio {i}") + if self.cfg.get('seq_pattern', 'parallel') == 'delay_parallel': + predicted_tokens = predicted_tokens[ + :, 0 : end_indices[i] - (1 + self.decoder_context_len) + self.num_speech_codebooks + ] # trim to audio length + else: + predicted_tokens = predicted_tokens[ + :, 0 : end_indices[i] - (1 + self.decoder_context_len) + ] # trim to audio length + + pred_img = predicted_tokens.data.cpu().float().numpy() + dec_inp_img = dec_input_to_1024.data.cpu().float().numpy() + start_time = time.time() + predicted_tokens = self.convert_tokens_to_range(predicted_tokens, apply_offset_correction=False) + predicted_wav = self.decode_wav_from_codec_model(predicted_tokens) + # accumulate audio length in seconds and process time in seconds to the RTF + total_process_time = total_process_time + (time.time() - start_time) + total_audio_seconds = total_audio_seconds + predicted_wav.size(-1) / self.sample_rate + + self.logger.experiment.add_audio("Inf Pred Wav", predicted_wav, step, self.sample_rate) + self.logger.experiment.add_image( + "Inf Pred Tokens", + plot_codec_to_numpy(pred_img), + step, + dataformats="HWC", + ) + self.logger.experiment.add_image( + "Inf Dec Input Tokens", + plot_codec_to_numpy(dec_inp_img), + step, + dataformats="HWC", + ) + + # save 
predicted_wav and gt_wav to a wav files in dir_path + if global_step is not None: + # During training, overwrite the wav file from the previous validation + wav_num = i + else: + wav_num = step + + audio_fp_pred = os.path.join(_exp_dir_path, f'predicted_wav_{wav_num}.wav') + sf.write(audio_fp_pred, predicted_wav.cpu().numpy(), self.sample_rate) + audio_fp_gt = os.path.join(_exp_dir_path, f'dec_input_wav_{wav_num}.wav') + sf.write(audio_fp_gt, dec_input_wav.cpu().numpy(), self.sample_rate) + + # speaker verification evaluation using nemo model + spk_embedding_pred = nemo_sv_model.get_embedding(audio_fp_pred) + spk_embedding_pred = spk_embedding_pred.cpu().detach().numpy().flatten() + spk_embedding_gt = nemo_sv_model.get_embedding(audio_fp_gt) + spk_embedding_gt = spk_embedding_gt.cpu().detach().numpy().flatten() + similarity = np.dot(spk_embedding_pred, spk_embedding_gt) / ( + np.linalg.norm(spk_embedding_pred) * np.linalg.norm(spk_embedding_gt) + ) + + if log_scalars: + self.logger.experiment.add_scalar(f'Inf SV Cossim Individual Sample', similarity, step) + similarity_list.append(similarity) + + # speaker verification evaluation using wavlm model + gt_16khz_wav, _ = librosa.load(audio_fp_gt, sr=16000) + pred_16khz_wav, _ = librosa.load(audio_fp_pred, sr=16000) + inputs_wavlm = wavlm_sv_extractor( + [pred_16khz_wav, gt_16khz_wav], padding=True, return_tensors="pt", sampling_rate=16000 + ) + for key in inputs_wavlm.keys(): + inputs_wavlm[key] = inputs_wavlm[key].to(device) + + with torch.no_grad(): + wavlm_embeddings = wavlm_sv_model(**inputs_wavlm).embeddings + wavlm_embeddings = torch.nn.functional.normalize(wavlm_embeddings, dim=-1).cpu() + + spk_embedding_pred_wavlm = wavlm_embeddings[0].cpu().detach().numpy().flatten() + spk_embedding_gt_wavlm = wavlm_embeddings[1].cpu().detach().numpy().flatten() + similarity_wavlm = np.dot(spk_embedding_pred_wavlm, spk_embedding_gt_wavlm) / ( + np.linalg.norm(spk_embedding_pred_wavlm) * np.linalg.norm(spk_embedding_gt_wavlm) + ) + similarity_list_wavlm.append(similarity_wavlm) + + if lang[i] == Lang.zh.value: + audio_to_pred_zh.append({"step": i, "audio": audio_fp_pred}) + audio_to_pred_zh.append({"step": i, "audio": audio_fp_gt}) + else: + audio_to_pred.append({"step": i, "audio": audio_fp_pred}) + audio_to_pred.append({"step": i, "audio": audio_fp_gt}) + + if isinstance(context_and_question_tokens, list): + context_tokens, question_tokens = context_and_question_tokens + input_token_list = [ + question_tokens[i, 0, j].item() + for j in range(context_and_question_tokens_lens[1][i].item()) + ] + input_token_list = [ + (ti, t) for ti, t in enumerate(input_token_list) if t != 0 and t < self.speech_offset + ] + context_end_step = context_and_question_tokens_lens[0][i] + context_tokens = context_tokens[i][:, :context_end_step] + else: + input_token_list = [ + context_and_question_tokens[i, 0, j].item() + for j in range(context_and_question_tokens.shape[2]) + ] + input_token_list = [ + (ti, t) for ti, t in enumerate(input_token_list) if t != 0 and t < self.speech_offset + ] + context_end_step = input_token_list[0][0] + context_tokens = context_and_question_tokens[i][:, :context_end_step] + + spk_embedding_context = spk_embedding_gt + spk_embedding_context_wavlm = spk_embedding_gt_wavlm + if self.decoder_context_len > 0: + context_tokens = dec_input_to_1024[:, : self.decoder_context_len + 1] + context_wav = self.decode_wav_from_codec_model(context_tokens) + elif context_end_step > 1: + is_speech_context = context_tokens[1, :].sum().item() > 0 + if 
is_speech_context: + context_tokens = self.convert_tokens_to_range(context_tokens, pattern=self.context_pattern) + context_wav = self.decode_wav_from_codec_model(context_tokens) + else: + context_wav = None + _context_token_list = [v.item() for v in context_tokens[0, :]] + _context_text = self.frozen_model.tokenizer.ids_to_text( + [v for v in _context_token_list if v < self.lm_vocab_size] + ) + self.logger.experiment.add_text("Context Text", _context_text, self.global_step) + + else: + context_wav = None + + if context_wav is not None: + self.logger.experiment.add_audio("Context Wav", context_wav, step, self.sample_rate) + context_wav_fp = os.path.join(_exp_dir_path, f'context_wav_{wav_num}.wav') + sf.write(context_wav_fp, context_wav.cpu().numpy(), self.sample_rate) + # titanet + spk_embedding_context = nemo_sv_model.get_embedding(context_wav_fp) + spk_embedding_context = spk_embedding_context.cpu().detach().numpy().flatten() + # wavlm + context_wavlm_wav, _ = librosa.load(context_wav_fp, sr=16000) + inputs_wavlm = wavlm_sv_extractor( + [context_wavlm_wav], padding=True, return_tensors="pt", sampling_rate=16000 + ) + for key in inputs_wavlm.keys(): + inputs_wavlm[key] = inputs_wavlm[key].to(device) + + with torch.no_grad(): + wavlm_embeddings = wavlm_sv_model(**inputs_wavlm).embeddings + wavlm_embeddings = torch.nn.functional.normalize(wavlm_embeddings, dim=-1).cpu() + + spk_embedding_context_wavlm = wavlm_embeddings[0].cpu().detach().numpy().flatten() + + pred_similarity_context = np.dot(spk_embedding_context, spk_embedding_pred) / ( + np.linalg.norm(spk_embedding_context) * np.linalg.norm(spk_embedding_pred) + ) + gt_similarity_context = np.dot(spk_embedding_context, spk_embedding_gt) / ( + np.linalg.norm(spk_embedding_context) * np.linalg.norm(spk_embedding_gt) + ) + + pred_similarity_context_wavlm = np.dot(spk_embedding_context_wavlm, spk_embedding_pred_wavlm) / ( + np.linalg.norm(spk_embedding_context_wavlm) * np.linalg.norm(spk_embedding_pred_wavlm) + ) + gt_similarity_context_wavlm = np.dot(spk_embedding_context_wavlm, spk_embedding_gt_wavlm) / ( + np.linalg.norm(spk_embedding_context_wavlm) * np.linalg.norm(spk_embedding_gt_wavlm) + ) + + if log_scalars: + self.logger.experiment.add_scalar(f'Inf SV Cossim Context Pred', pred_similarity_context, step) + self.logger.experiment.add_scalar(f'Inf SV Cossim Context GT', gt_similarity_context, step) + pred_context_similarity_list.append(pred_similarity_context) + gt_context_similarity_list.append(gt_similarity_context) + pred_context_similarity_list_wavlm.append(pred_similarity_context_wavlm) + gt_context_similarity_list_wavlm.append(gt_similarity_context_wavlm) + + task_question = self.frozen_model.tokenizer.ids_to_text( + [v[1] for v in input_token_list if v[1] < self.lm_vocab_size] + ) + self.logger.experiment.add_text("Inf Task Question", task_question, step) + if "Phoneme TTS" in task_question: + question_type.append("Phoneme TTS") + elif "Text to speech this" in task_question: + question_type.append("Text to speech this") + else: + question_type.append("Other") + + task_question_phoneme_tokens = [ + v[1] - self.lm_vocab_size for v in input_token_list if v[1] >= self.lm_vocab_size + ] + if len(task_question_phoneme_tokens) > 0: + phoneme_text = self.phoneme_tokenizer.decode(task_question_phoneme_tokens) + self.logger.experiment.add_text("Inf Task Question Phoneme Text", phoneme_text, step) + + # store predicted_tokens for each layer to compute token error rate + for layer_idx in range(self.num_speech_codebooks): + 
ter_dict[layer_idx]['hypothesis'].append(predicted_tokens[layer_idx].cpu().numpy().tolist()) + ter_dict[layer_idx]['gt'].append(dec_input_to_1024_answer[layer_idx].cpu().numpy().tolist()) + + # estimate MOS scores. + if self.estimate_mos: + squim_mos_score_pred = squim_mos_model( + torch.from_numpy(pred_16khz_wav).to(device).unsqueeze(0) + ).item() + squim_mos_score_gt = squim_mos_model( + torch.from_numpy(gt_16khz_wav).to(device).unsqueeze(0) + ).item() + if context_wav is not None: + squim_mos_score_context = squim_mos_model(context_wav.to(device).unsqueeze(0)).item() + squim_mos_list_context.append(squim_mos_score_context) + squim_mos_list_pred.append(squim_mos_score_pred) + squim_mos_list_gt.append(squim_mos_score_gt) + else: + r = labels[i, 0].long() + nzm = r != 0 + r = r.tolist()[:-1] + nzm = nzm[:-1] + h = output_tokens_combined[i].long() * nzm + h = h.tolist() + cur_wer_score = editdistance.eval(r, h) + if log_scalars: + self.logger.experiment.add_scalar('WER', cur_wer_score, step) + logging.info(f"current wer score : {cur_wer_score}") + wer_score += cur_wer_score + if wer_score > 0: + wer_score /= batch_size + if log_scalars: + self.logger.experiment.add_scalar('AVG WER', wer_score, step) + logging.info(f"average wer score : {wer_score}") + + # compute token error rate for each layer + if log_scalars: + for layer_idx in range(self.num_speech_codebooks): + wer = word_error_rate(ter_dict[layer_idx]['hypothesis'], ter_dict[layer_idx]['gt'], use_cer=True) + self.logger.experiment.add_scalar(f'Inf TER Layer {layer_idx}', wer, 0) + + greedy_transcripts = [] + if len(audio_to_pred) > 0: + greedy_transcripts.extend(asr_model.transcribe([i["audio"] for i in audio_to_pred])[0]) + if len(audio_to_pred_zh) > 0: + greedy_transcripts.extend(asr_model_zh.transcribe([i["audio"] for i in audio_to_pred_zh])[0]) + + all_audio_to_pred = audio_to_pred + audio_to_pred_zh + # Note WER over the batch is not equal to WER(sample) / batch_size, but approx. 
here + + # These are between ASR outputs of GT audio and predicted audio + wer_batch = [] + cer_batch = [] + cer_phoneme = [] + wer_phoneme = [] + cer_tts = [] + wer_tts = [] + + # These are between ASR output of Pred audio and GT text + wer_batch_gt = [] + cer_batch_gt = [] + cer_phoneme_gt = [] + wer_phoneme_gt = [] + cer_tts_gt = [] + wer_tts_gt = [] + + for i in range(0, len(greedy_transcripts) - 1, 2): + assert all_audio_to_pred[i]["step"] == all_audio_to_pred[i + 1]["step"] + step = batch_idx * test_dataloader_batch_size + all_audio_to_pred[i]["step"] + question_text = question_texts[i // 2] + + # No need to process text since both are ASR outputs + cer_sample = word_error_rate([greedy_transcripts[i]], [greedy_transcripts[i + 1]], use_cer=True) + wer_sample = word_error_rate([greedy_transcripts[i]], [greedy_transcripts[i + 1]], use_cer=False) + + # Processing text since one is ASR output and the other is the GT text + cer_gt = word_error_rate( + [self.process_text(greedy_transcripts[i])], [self.process_text(question_text)], use_cer=True + ) + wer_gt = word_error_rate( + [self.process_text(greedy_transcripts[i])], [self.process_text(question_text)], use_cer=False + ) + + self.logger.experiment.add_text("Inf Predicted Text", greedy_transcripts[i], step) + self.logger.experiment.add_text("Inf GT Text", greedy_transcripts[i + 1], step) + self.logger.experiment.add_text("Inf Question Text", question_text, step) + if log_scalars: + self.logger.experiment.add_scalar(f'Inf CER Transcript', cer_sample, step) + self.logger.experiment.add_scalar(f'Inf WER Transcript', wer_sample, step) + self.logger.experiment.add_scalar(f'Inf CER GT Transcript', cer_gt, step) + cer_batch.append(cer_sample) + wer_batch.append(wer_sample) + cer_batch_gt.append(cer_gt) + wer_batch_gt.append(wer_gt) + if question_type[all_audio_to_pred[i]["step"]] == "Phoneme TTS": + if log_scalars: + self.logger.experiment.add_scalar(f'Inf CER Phoneme Task', cer_sample, step) + self.logger.experiment.add_scalar(f'Inf WER Phoneme Task', wer_sample, step) + self.logger.experiment.add_scalar(f'Inf CER GT Phoneme Task', cer_gt, step) + cer_phoneme.append(cer_sample) + wer_phoneme.append(wer_sample) + cer_phoneme_gt.append(cer_gt) + wer_phoneme_gt.append(wer_gt) + elif question_type[all_audio_to_pred[i]["step"]] == "Text to speech this": + if log_scalars: + self.logger.experiment.add_scalar(f'Inf CER TTS Task', cer_sample, step) + self.logger.experiment.add_scalar(f'Inf WER TTS Task', wer_sample, step) + self.logger.experiment.add_scalar(f'Inf CER GT TTS Task', cer_gt, step) + cer_tts.append(cer_sample) + wer_tts.append(wer_sample) + cer_tts_gt.append(cer_gt) + wer_tts_gt.append(wer_gt) + + # compute average similarity + similarity_avg = np.mean(similarity_list) + pred_context_similarity_avg = np.mean(pred_context_similarity_list) + gt_context_similarity_avg = np.mean(gt_context_similarity_list) + similarity_avg_wavlm = np.mean(similarity_list_wavlm) + pred_context_similarity_avg_wavlm = np.mean(pred_context_similarity_list_wavlm) + gt_context_similarity_avg_wavlm = np.mean(gt_context_similarity_list_wavlm) + + if log_scalars: + self.logger.experiment.add_scalar(f'Inf SV Avg Cossim', similarity_avg, batch_idx) + self.predict_step_outputs.append( + { + 'titanet_avg_cossim': similarity_avg, + 'titanet_avg_cossim_context_pred': pred_context_similarity_avg, + 'titanet_avg_cossim_context_gt': gt_context_similarity_avg, + 'wavlm_avg_cossim': similarity_avg_wavlm, + 'wavlm_avg_cossim_context_pred': pred_context_similarity_avg_wavlm, + 
'wavlm_avg_cossim_context_gt': gt_context_similarity_avg_wavlm, + 'squim_mos_pred': np.mean(squim_mos_list_pred) if len(squim_mos_list_pred) > 0 else None, + 'squim_mos_context': np.mean(squim_mos_list_context) if len(squim_mos_list_context) > 0 else None, + 'squim_mos_gt': np.mean(squim_mos_list_gt) if len(squim_mos_list_gt) > 0 else None, + 'cer_transcript': np.mean(cer_batch), + 'wer_transcript': np.mean(wer_batch), + 'cer_phoneme': np.mean(cer_phoneme) if len(cer_phoneme) > 0 else None, + 'wer_phoneme': np.mean(wer_phoneme) if len(wer_phoneme) > 0 else None, + 'cer_tts': np.mean(cer_tts) if len(cer_tts) > 0 else None, + 'wer_tts': np.mean(wer_tts) if len(wer_tts) > 0 else None, + 'cer_transcript_gt': np.mean(cer_batch_gt), + 'wer_transcript_gt': np.mean(wer_batch_gt), + 'cer_phoneme_gt': np.mean(cer_phoneme_gt) if len(cer_phoneme_gt) > 0 else None, + 'wer_phoneme_gt': np.mean(wer_phoneme_gt) if len(wer_phoneme_gt) > 0 else None, + 'cer_tts_gt': np.mean(cer_tts_gt) if len(cer_tts_gt) > 0 else None, + 'wer_tts_gt': np.mean(wer_tts_gt) if len(wer_tts_gt) > 0 else None, + "RTF": total_process_time / total_audio_seconds, + } + ) + + # TODO @xueyang: PTL 2.0+ patch. Signature of method `on_predict_epoch_end` does not match signature of the base method in PTL class 'ModelHooks'. + # Remove the `outputs` param and choose `self.predict_step_output` instead. + def on_predict_epoch_end(self, outputs: List[Any]) -> None: + + gather_results = [None for _ in range(parallel_state.get_data_parallel_world_size())] + all_preds = list(itertools.chain(*[item['preds_text'] for item in outputs[0]])) + all_labels = list(itertools.chain(*[item['labels_text'] for item in outputs[0]])) + all_inputs = list(itertools.chain(*[item['input_text'] for item in outputs[0]])) + + assert len(all_preds) == len(all_labels) + assert len(all_preds) == len(all_inputs) + + # Gather inputs, predictions, and ground truths from all workers + torch.distributed.all_gather_object( + gather_results, + [(input, pred, label) for (input, pred, label) in zip(all_inputs, all_preds, all_labels)], + group=parallel_state.get_data_parallel_group(), + ) + + # Deduplicate sentences that may have been distributed across multiple data parallel ranks. 
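+        # Accuracy is computed on rank 0 only; duplicates introduced by the all-gather are removed by converting
+        # the gathered (input, pred, label) tuples to a set before comparing predictions against labels.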
+ if parallel_state.get_data_parallel_rank() == 0: + gather_results_dedup = list(set(itertools.chain(*gather_results))) + + input_prediction_pair = [] + correct = 0 + for input, pred, label in gather_results_dedup: + input_prediction_pair.append((input, pred)) + if label: + if pred == label: + correct += 1 + + acc = correct / len(gather_results_dedup) if all_labels[0] else None + logging.info(f'Prediction results: {acc}') + logging.info(f'Test finish') diff --git a/nemo/collections/tts/parts/utils/helpers.py b/nemo/collections/tts/parts/utils/helpers.py index a4c65f9ed0e5..85d12a4261da 100644 --- a/nemo/collections/tts/parts/utils/helpers.py +++ b/nemo/collections/tts/parts/utils/helpers.py @@ -48,8 +48,8 @@ import librosa import matplotlib.pylab as plt import numpy as np +import seaborn as sns import torch -from einops import rearrange from numba import jit, prange from nemo.collections.tts.torch.tts_data_types import DATA_STR2DATA_CLASS, MAIN_DATA_TYPES, WithLens @@ -468,6 +468,74 @@ def plot_alignment_to_numpy(alignment, title='', info=None, phoneme_seq=None, vm return data +def plot_alignment_to_numpy_for_speechllm( + alignment, + title='', + info=None, + phoneme_seq=None, + vmin=None, + vmax=None, + phoneme_ver=0, + phone_offset=2, + h_offset=True, +): + alignment = np.clip(alignment, a_min=0, a_max=None) + fig, ax = plt.subplots(figsize=(8, 6)) + im = ax.imshow(alignment, aspect='auto', origin='lower', interpolation='none', vmin=vmin, vmax=vmax) + ax.set_title(title) + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if info is not None: + xlabel += '\n\n' + info + plt.xlabel(xlabel) + plt.ylabel('Encoder timestep') + + if phoneme_seq is not None: + if phoneme_ver == 0: + # for debugging of phonemes and durs in maps. Not used by def in training code + ax.set_yticks(np.arange(len(phoneme_seq))) + ax.set_yticklabels(phoneme_seq) + ax.hlines(np.arange(len(phoneme_seq)), xmin=0.0, xmax=max(ax.get_xticks())) + elif phoneme_ver == 1: + yticks = ax.get_yticks() + new_yticks = [] + for tick in yticks: + if tick < 0 or tick > alignment.shape[0]: + continue + new_yticks.append(tick) + new_yticks += phoneme_seq + ax.set_yticks(new_yticks) + elif phoneme_ver == 2: + phones = phoneme_seq[phone_offset:] + ax.set_yticks(np.arange(len(phones))) + ax.set_yticklabels(phones) + ax.hlines(np.arange(0.5, len(phones) - 0.5, 1.0), xmin=0.0, xmax=alignment.shape[1] - 0.5, colors="black") + + if h_offset: + xticks = ax.get_xticks() + new_xticks = [] + for tick in xticks: + new_xticks.append(f"{tick+phoneme_seq[1]:.0f}") + ax.set_xticklabels(new_xticks) + + plt.tight_layout() + fig.canvas.draw() + data = save_figure_to_numpy(fig) + plt.close() + return data + + +def plot_codec_to_numpy(codes, title=''): + fig, ax = plt.subplots(figsize=(10, 3)) + sns.heatmap(codes, ax=ax) + + plt.tight_layout() + fig.canvas.draw() + data = save_figure_to_numpy(fig) + plt.close() + return data + + def plot_pitch_to_numpy(pitch, ylim_range=None): fig, ax = plt.subplots(figsize=(12, 3)) plt.plot(pitch) diff --git a/nemo/collections/tts/parts/utils/tts_dataset_utils.py b/nemo/collections/tts/parts/utils/tts_dataset_utils.py index 5f1185c2c399..96806f633a54 100644 --- a/nemo/collections/tts/parts/utils/tts_dataset_utils.py +++ b/nemo/collections/tts/parts/utils/tts_dataset_utils.py @@ -67,8 +67,7 @@ def get_audio_filepaths(manifest_entry: Dict[str, Any], audio_dir: Path) -> Tupl def normalize_volume(audio: np.array, volume_level: float = 0.95) -> np.array: - """Apply peak normalization to the input audio. 
- """ + """Apply peak normalization to the input audio.""" if not (0.0 <= volume_level <= 1.0): raise ValueError(f"Volume must be in range [0.0, 1.0], received {volume_level}") @@ -88,10 +87,11 @@ class BetaBinomialInterpolator: The implementation is taken from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py """ - def __init__(self, round_mel_len_to=50, round_text_len_to=10, cache_size=500): + def __init__(self, round_mel_len_to=50, round_text_len_to=10, cache_size=500, scaling_factor: float = 1.0): self.round_mel_len_to = round_mel_len_to self.round_text_len_to = round_text_len_to - self.bank = functools.lru_cache(maxsize=cache_size)(beta_binomial_prior_distribution) + cached_func = lambda x, y: beta_binomial_prior_distribution(x, y, scaling_factor=scaling_factor) + self.bank = functools.lru_cache(maxsize=cache_size)(cached_func) @staticmethod def round(val, to): @@ -315,7 +315,11 @@ def load_audio( def sample_audio( - manifest_entry: Dict[str, Any], audio_dir: Path, sample_rate: int, n_samples: int, volume_norm: bool = False, + manifest_entry: Dict[str, Any], + audio_dir: Path, + sample_rate: int, + n_samples: int, + volume_norm: bool = False, ) -> Tuple[np.ndarray, Path, Path]: """ Randomly sample an audio segment from a manifest entry. diff --git a/requirements/requirements_tts.txt b/requirements/requirements_tts.txt index 0d499feb3b1f..6d20e0f2250f 100644 --- a/requirements/requirements_tts.txt +++ b/requirements/requirements_tts.txt @@ -11,3 +11,5 @@ nltk pandas pypinyin pypinyin-dict +seaborn + From d1d22bb293d61eb9d5fad70d11ec59ee81744fe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 15 Nov 2024 23:37:27 +0100 Subject: [PATCH 05/48] ci: Exclude CPU machines from scan (#11300) Signed-off-by: Oliver Koenig --- .github/workflows/monitor-vms.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/monitor-vms.yml b/.github/workflows/monitor-vms.yml index 6795f87abf68..0bb54524847a 100644 --- a/.github/workflows/monitor-vms.yml +++ b/.github/workflows/monitor-vms.yml @@ -27,7 +27,7 @@ jobs: | jq -c '[ .runners[] | select(.status == "online") - | select(.name | contains("gpu")) + | select(.name | contains("cpu") | not) | { "vm": .name, "n_gpus": [ From d0004609c3be6422586417687ee33247b1f5788d Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 15 Nov 2024 15:04:14 -0800 Subject: [PATCH 06/48] Revert "fix(export): GPT models w/ bias=False convert properly (#11255)" (#11301) This reverts commit 2d4f4953881b9e2d118d3ffeba7e64625d827d11. --- nemo/export/trt_llm/tensorrt_llm_build.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 4be2d42ebe4d..cdf8eaac6b1c 100755 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -118,14 +118,6 @@ def build_and_save_engine( build_config.lora_config = lora_config model = model_cls.from_config(model_config) - if not model_config.bias and model_config.architecture == 'GPTForCausalLM': - # NOTE: GPT models in megatron-core that set bias=False sets the bias false globally - # whereas bias=False in TRTLLM GPT models sets it false everywhere except - # LayerNorm. This change makes TRTLLM's implementation match megatron-core. 
- for name, module in model.named_modules(): - if isinstance(module, tensorrt_llm.layers.normalization.LayerNorm): - module.bias = None - module.register_parameter('bias', None) model = optimize_model( model, use_parallel_embedding=model_config.use_parallel_embedding, From a8b830937381d0de6d0e2e7b1697a76aca3f2caf Mon Sep 17 00:00:00 2001 From: Sharath TS Date: Fri, 15 Nov 2024 15:38:21 -0800 Subject: [PATCH 07/48] remove redundant docs (#11302) --- docs/source/nlp/distillation.rst | 58 ---------------- .../model_distillation/drop_layers.rst | 67 ------------------- 2 files changed, 125 deletions(-) delete mode 100644 docs/source/nlp/distillation.rst delete mode 100644 docs/source/nlp/nemo_megatron/model_distillation/drop_layers.rst diff --git a/docs/source/nlp/distillation.rst b/docs/source/nlp/distillation.rst deleted file mode 100644 index 22b2f3dd8a1c..000000000000 --- a/docs/source/nlp/distillation.rst +++ /dev/null @@ -1,58 +0,0 @@ -.. _megatron_distillation: - -Distillation -========================== - -Knowledge Distillation (KD) --------------------------------- - -KD involves using information from an existing trained model to train a second (usually smaller, faster) model, thereby "distilling" knowledge from one to the other. - -Distillation has two primary benefits: faster convergence and higher end accuracy than traditional training. - -In NeMo, distillation is enabled by the `NVIDIA TensorRT Model Optimizer (ModelOpt) `_ library -- a library to optimize deep-learning models for inference on GPUs. - -The logits-distillation process consists of the following steps: - -1. Loading both student and teacher model checkpoints (must support same parallelism strategy, if any) -2. Training until convergence, where forward passes are run on both models (and backward only on student), performing a specific loss function between the logits. -3. Saving the final student model. - - -Example -^^^^^^^ -The example below shows how to run the distillation script for LLama models. - -The script must be launched correctly with the number of processes equal to tensor parallelism. This is achieved with the ``torchrun`` command below: - -.. code-block:: bash - - STUDENT_CKPT="path/to/student.nemo" # can also be None (will use default architecture found in examples/nlp/language_modeling/conf/megatron_llama_distill.yaml) - TEACHER_CKPT="path/to/teacher.nemo" - TOKENIZER="path/to/tokenizer.model" - DATA_PATHS="[1.0,path/to/tokenized/data]" - FINAL_SAVE_FILE="final_checkpoint.nemo" - TP=4 - - NPROC=$TP - launch_config="torchrun --nproc_per_node=$NPROC" - - ${launch_config} examples/nlp/language_modeling/megatron_gpt_distillation.py \ - model.restore_from_path=$STUDENT_CKPT \ - model.kd_teacher_restore_from_path=$TEACHER_CKPT \ - model.tensor_model_parallel_size=$TP \ - model.tokenizer.model=$TOKENIZER \ - model.data.data_prefix=$DATA_PATHS \ - model.nemo_path=$FINAL_SAVE_FILE \ - trainer.precision=bf16 \ - trainer.devices=$NPROC - -For large models, the command can be used in multi-node setting. For example, this can be done with `NeMo Framework Launcher `_ using Slurm. 
- - -Limitations -^^^^^^^^^^^ -* Only Megatron Core-based GPT models are supported -* Only logit-pair distillation is supported for now -* Pipeline parallelism not yet supported -* FSDP strategy not yet supported diff --git a/docs/source/nlp/nemo_megatron/model_distillation/drop_layers.rst b/docs/source/nlp/nemo_megatron/model_distillation/drop_layers.rst deleted file mode 100644 index 3dc008945cc9..000000000000 --- a/docs/source/nlp/nemo_megatron/model_distillation/drop_layers.rst +++ /dev/null @@ -1,67 +0,0 @@ -.. _drop_layers: - -Drop Model Layers ------------------ - -To trim the model layers, use the following script: - -.. code-block:: bash - - python -m torch.distributed.launch --nproc_per_node= * \ - /NeMo/examples/nlp/language_modeling/megatron_gpt_drop_layers.py \ - --path_to_nemo /path/to/model.nemo \ - --path_to_save /path/to/save/trimmed_model.nemo \ - --tensor_model_parallel_size \ - --pipeline_model_parallel_size \ - --gpus_per_node \ - --drop_layers 1 2 3 4 - -**Note:** layer indices start from 1. - -To save trimmed model in ``zarr`` checkpoint format, add the following flag to the command above: - -.. code-block:: bash - - --zarr - -**Note:** the ``zarr`` checkpoint format is deprecated. - -Validate Trimmed Model ----------------------- - -To validate the trimmed model, use the following script: - -.. code-block:: bash - - python /NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - --config-path=/path/to/folder/with/model/config \ - --config-name=model_config.yaml \ - trainer.limit_val_batches= \ - model.restore_from_path=/path/to/trimmed_model.nemo \ - model.skip_train=True \ - model.data.data_impl=mock \ - model.data.data_prefix=[] - -To use a specific dataset instead of a mock dataset, modify the ``model.data`` parameters as follows: - -.. code-block:: bash - - model.data.data_impl=mmap \ - model.data.data_prefix=["path/to/datafile1", "path/to/datafile2"] - -Validate Original Model ------------------------ - -To validate the original model without specific layers, use the following script: - -.. 
code-block:: bash - - python /NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - --config-path=/path/to/folder/with/model/config \ - --config-name=model_config.yaml \ - trainer.limit_val_batches= \ - model.restore_from_path=/path/to/original_model.nemo \ - model.skip_train=True \ - model.data.data_impl=mock \ - model.data.data_prefix=[] \ - model.drop_layers=[1,2,3,4] From a63ea84d217ef47e4d635abeda625796c41765b8 Mon Sep 17 00:00:00 2001 From: mayani-nv <67936769+mayani-nv@users.noreply.github.com> Date: Fri, 15 Nov 2024 17:22:33 -0800 Subject: [PATCH 08/48] Create phi3mini.py (#11281) * Create phi3mini.py Signed-off-by: mayani-nv <67936769+mayani-nv@users.noreply.github.com> Apply isort and black reformatting Signed-off-by: mayani-nv Update __init__.py Signed-off-by: mayani-nv <67936769+mayani-nv@users.noreply.github.com> Update __init__.py Signed-off-by: mayani-nv <67936769+mayani-nv@users.noreply.github.com> Apply isort and black reformatting Signed-off-by: mayani-nv * Create phi3_mini_4k_instruct.py for adding to recipe Signed-off-by: mayani-nv <67936769+mayani-nv@users.noreply.github.com> Apply isort and black reformatting Signed-off-by: mayani-nv Update phi3_mini_4k_instruct.py and removed Performant recipe Signed-off-by: mayani-nv <67936769+mayani-nv@users.noreply.github.com> Update phi3_mini_4k_instruct.py and removing performant condition Signed-off-by: mayani-nv <67936769+mayani-nv@users.noreply.github.com> Update phi3_mini_4k_instruct.py with docstring changes Signed-off-by: mayani-nv <67936769+mayani-nv@users.noreply.github.com> * Update __init__.py Signed-off-by: mayani-nv <67936769+mayani-nv@users.noreply.github.com> * fixing pylint warnings * Apply isort and black reformatting Signed-off-by: mayani-nv * correcting typos and adding working recipe files --------- Signed-off-by: mayani-nv Signed-off-by: mayani-nv <67936769+mayani-nv@users.noreply.github.com> Co-authored-by: Chen Cui Co-authored-by: mayani-nv --- nemo/collections/llm/__init__.py | 6 + nemo/collections/llm/gpt/model/__init__.py | 4 + nemo/collections/llm/gpt/model/phi3mini.py | 258 ++++++++++++++++ nemo/collections/llm/recipes/__init__.py | 2 + .../llm/recipes/phi3_mini_4k_instruct.py | 284 ++++++++++++++++++ 5 files changed, 554 insertions(+) create mode 100644 nemo/collections/llm/gpt/model/phi3mini.py create mode 100644 nemo/collections/llm/recipes/phi3_mini_4k_instruct.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 2051f844d888..c5bbbd04fe74 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -93,6 +93,9 @@ NemotronModel, NVIDIAMambaConfig8B, NVIDIAMambaHybridConfig8B, + Phi3Config, + Phi3ConfigMini, + Phi3Model, Qwen2Config, Qwen2Config1P5B, Qwen2Config7B, @@ -143,6 +146,9 @@ "Nemotron4Config15B", "Nemotron4Config340B", "NemotronConfig", + "Phi3Config", + "Phi3ConfigMini", + "Phi3Model", "SSMConfig", "BaseMambaConfig130M", "BaseMambaConfig370M", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index b42ceac564bc..152309536f5b 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -79,6 +79,7 @@ NemotronConfig, NemotronModel, ) +from nemo.collections.llm.gpt.model.phi3mini import Phi3Config, Phi3ConfigMini, Phi3Model from nemo.collections.llm.gpt.model.qwen2 import ( Qwen2Config, Qwen2Config1P5B, @@ -140,6 +141,9 @@ "Nemotron3Config22B", "Nemotron4Config340B", "NemotronModel", + "Phi3Config", + "Phi3ConfigMini", 
+ "Phi3Model", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", diff --git a/nemo/collections/llm/gpt/model/phi3mini.py b/nemo/collections/llm/gpt/model/phi3mini.py new file mode 100644 index 000000000000..eb0b9c758dd7 --- /dev/null +++ b/nemo/collections/llm/gpt/model/phi3mini.py @@ -0,0 +1,258 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.lightning import OptimizerModule, io, teardown +from nemo.lightning.pytorch.utils import dtype_from_hf + + +@dataclass +class Phi3Config(GPTConfig): + # pylint: disable=C0115,C0116 + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + seq_length: int = 4096 + attention_dropout: float = 0.0 + hidden_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = False + + +@dataclass +class Phi3ConfigMini(Phi3Config): + # pylint: disable=C0115,C0116 + num_layers: int = 32 + hidden_size: int = 3072 + ffn_hidden_size: int = 8192 + num_attention_heads: int = 32 + num_query_groups: int = 32 + rotary_base: float = 10000.0 + vocab_size: int = 32064 + + +class Phi3Model(GPTModel): + # pylint: disable=C0115,C0116 + def __init__( + self, + config: Optional[Phi3Config] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, + ): + super().__init__(config or Phi3Config(), optim=optim, tokenizer=tokenizer, model_transform=model_transform) + + +@io.model_importer(Phi3Model, "hf") +class HFPhi3Importer(io.ModelConnector["Phi3ForCausalLM", Phi3Model]): + # pylint: disable=C0115,C0116 + def init(self) -> Phi3Model: + return Phi3Model(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + from transformers import Phi3ForCausalLM + + # Check if the source is valid model identifier or path + try: + source = Phi3ForCausalLM.from_pretrained(str(self), torch_dtype='auto') + except Exception as e: + raise ValueError(f"Failed to load the model from source '{self}': {e}") + + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + print(f"Converted Phi3 model to Nemo, model saved to {output_path} in {source.dtype}.") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, source, target): + # pylint: disable=C0115,C0116 + # Define mapping for mini-4k-instruct + mapping = { + "model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + 
"model.layers.*.self_attn.qkv_proj.weight": "decoder.layers.*.self_attention.linear_qkv.weight", + "model.layers.*.mlp.gate_up_proj.weight": "decoder.layers.*.mlp.linear_fc1.weight", + "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.norm.weight": "decoder.final_layernorm.weight", + "lm_head.weight": "output_layer.weight", + } + + return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) + + @property + def tokenizer(self): + # pylint: disable=C0115,C0116 + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) + + @property + def config(self) -> Phi3Config: + # pylint: disable=C0115,C0116 + from transformers import Phi3Config as HFPhi3Config + + source = HFPhi3Config.from_pretrained(str(self)) + + def make_vocab_size_divisible_by(vocab_size): + base = 128 + while vocab_size % base != 0: + base //= 2 + return base + + output = Phi3Config( + num_layers=source.num_hidden_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.intermediate_size, + num_attention_heads=source.num_attention_heads, + init_method_std=source.initializer_range, + layernorm_epsilon=source.rms_norm_eps, + rotary_base=source.rope_theta, + gated_linear_unit=True, + make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), + share_embeddings_and_output_weights=False, + fp16=(dtype_from_hf(source) == torch.float16), + bf16=(dtype_from_hf(source) == torch.bfloat16), + params_dtype=dtype_from_hf(source), + ) + print("output:", output) + return output + + +@io.model_exporter(Phi3Model, "hf") +class HFPhi3Exporter(io.ModelConnector[Phi3Model, "Phi3ForCausalLM"]): + # pylint: disable=C0115,C0116 + def init(self) -> "Phi3ForCausalLM": + from transformers import AutoModelForCausalLM + + return AutoModelForCausalLM.from_config(self.config) + + def apply(self, output_path: Path) -> Path: + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + target.cpu().save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + # pylint: disable=C0115,C0116 + mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + "output_layer.weight": "lm_head.weight", + } + + # Convert source weights to target dtype if needed + for name, param in source.state_dict().items(): + if param.dtype != target.state_dict()[name].dtype: + param.data = param.data.to(target.state_dict()[name].dtype) + + return io.apply_transforms(source, target, mapping=mapping) + + @property + def tokenizer(self): + # pylint: disable=C0115,C0116 + return io.load_context(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "HFPhi3Config": 
+ # pylint: disable=C0115,C0116 + source: Phi3Config = io.load_context(str(self)).model.config + + from transformers import Phi3Config as HFPhi3Config + + return HFPhi3Config( + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + max_position_embeddings=source.seq_length, + initializer_range=0.02, + rms_norm_eps=1e-05, + num_key_value_heads=source.num_query_groups, + rope_theta=source.rotary_base, + vocab_size=self.tokenizer.vocab_size, + ) + + +@io.state_transform( + source_key="model.layers.*.self_attn.qkv_proj.weight", + target_key="decoder.layers.*.self_attention.linear_qkv.weight", +) +def _import_qkv(ctx: io.TransformCTX, qkv_weight): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_size = megatron_config.kv_channels + + old_tensor_shape = qkv_weight.size() + new_q_tensor_shape = (head_num, head_size, old_tensor_shape[1]) + new_kv_tensor_shape = (num_query_groups, head_size, old_tensor_shape[1]) + q, k, v = qkv_weight.split( + [head_num * head_size, num_query_groups * head_size, num_query_groups * head_size], dim=0 + ) + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])).type_as(qkv_weight) + for i in range(num_query_groups): + qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) + qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) + qkv_weights = torch.cat((qkv_weights, v[i : i + 1, :, :])) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape + assert qkv_weights.shape[1] == head_size, qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + + return qkv_weights + + +@io.state_transform( + source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), # phi-3-mini-4k-instruct + target_key="decoder.layers.*.mlp.linear_fc1.weight", +) +def _import_linear_fc1(down, gate): + return torch.cat((down, gate), axis=0) + + +__all__ = ["Phi3Config", "Phi3ConfigMini", "Phi3Model"] diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 8f772e3da5b7..e76729d5e31a 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -56,6 +56,7 @@ nemotron4_15b_16k, nemotron4_15b_64k, nemotron4_340b, + phi3_mini_4k_instruct, qwen2, qwen2_1p5b, qwen2_7b, @@ -111,6 +112,7 @@ "nemotron4_15b_16k", "nemotron4_15b_64k", "nemotron4_340b", + "phi3_mini_4k_instruct", "t5_220m", "t5_3b", "t5_11b", diff --git a/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py b/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py new file mode 100644 index 000000000000..d3bc64f54d91 --- /dev/null +++ b/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py @@ -0,0 +1,284 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
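The `_import_qkv` transform above re-packs the HuggingFace fused `qkv_proj` weight (all query rows, then all key rows, then all value rows) into Megatron's grouped layout, where each query group contributes its query heads followed by its single key and value head. A small self-contained sketch of the same packing with toy dimensions (the sizes below are illustrative only, not the Phi-3 config):

import torch

# Toy sizes: 4 query heads, 2 KV groups, head_size 3, hidden 12 (illustrative only).
head_num, num_query_groups, head_size, hidden_size = 4, 2, 3, 12
heads_per_group = head_num // num_query_groups

# Fake HF-style fused weight: query rows first, then key rows, then value rows.
qkv_weight = torch.arange((head_num + 2 * num_query_groups) * head_size * hidden_size).float()
qkv_weight = qkv_weight.reshape(-1, hidden_size)

q, k, v = qkv_weight.split(
    [head_num * head_size, num_query_groups * head_size, num_query_groups * head_size], dim=0
)
q = q.view(head_num, head_size, hidden_size)
k = k.view(num_query_groups, head_size, hidden_size)
v = v.view(num_query_groups, head_size, hidden_size)

# Megatron's fused linear_qkv expects, per group: that group's query heads, then its key head, then its value head.
chunks = []
for g in range(num_query_groups):
    chunks.append(q[g * heads_per_group : (g + 1) * heads_per_group])
    chunks.append(k[g : g + 1])
    chunks.append(v[g : g + 1])
packed = torch.cat(chunks).reshape(head_size * (head_num + 2 * num_query_groups), hidden_size)

print(packed.shape)  # torch.Size([24, 12])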
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs +from nemo.collections.llm.gpt.model.phi3mini import Phi3ConfigMini, Phi3Model +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.utils.exp_manager import TimingCallback + +NAME = "phi3_mini_4k_instruct" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Phi3 Mini 4k instruct model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Phi3 mini 4k instruct model. + + Examples: + CLI usage: + $ nemo llm pretrain model=phi3_mini_4k_instruct ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(Phi3Model, config=run.Config(Phi3ConfigMini)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 1, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Phi3 mini 4k instruct model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=phi3_mini_4k_instruct ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + tensor_parallelism: int = 1, + num_gpus_per_node: int = 1, + max_steps: int = 1168251, + performance_mode: bool = False, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for phi3_mini_4k_instruct model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory phi3_mini_4k_instruct + $ nemo llm pretrain --factory "phi3_mini_4k_instruct(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="phi3_mini_4k_instruct", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. 
+ """ + recipe = run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 1, + tensor_parallelism: int = 1, + max_steps: int = 116825, + peft_scheme: Optional[str] = 'lora', + seq_length: Optional[int] = None, + packed_sequence: Optional[bool] = None, + performance_mode: bool = False, +) -> run.Partial: + """ + Create a fine-tuning recipe for Phi3 mini-4k-instruct model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', + 'none'/None. + seq_length (int): Maximum number of tokens per microbatch. + packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given + maximum seq_length for better efficiency. By default, this value equals performance_mode. + performance_mode (bool): If true, enables optimizations for maximum performance. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory phi3_mini_4k_instruct + + Python API usage: + >>> recipe = finetune_recipe(name="phi3_mini_4k_instruct", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
+ """ + # Default to unpacked data in normal mode and packed data in performance mode + # once packing recipe is well tested, change this default to true + if packed_sequence is None: + packed_sequence = performance_mode + + # For unpacked sequence, most samples in SQuAD dataset are shorter than 2K + if seq_length is None: + seq_length = 4096 if packed_sequence else 2048 + + recipe = default_finetune_recipe( + model(), "microsoft/Phi-3-mini-4k-instruct", dir, name, num_nodes, num_gpus_per_node, packed_sequence + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.peft.dim = 8 + recipe.peft.alpha = 16 + recipe.peft.target_modules = ['linear_qkv'] + recipe.optim.config.use_distributed_optimizer = False + + # some settings currently do not function correctly with LoRA + recipe.model.config.cross_entropy_loss_fusion = False + + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + + # Sequence length settings in the model and dataset must agree + recipe.model.config.seq_length = seq_length + recipe.data.seq_length = seq_length + if packed_sequence: + recipe.data.pad_to_max_length = True + recipe.data.packed_sequence_specs = run.Config(PackedSequenceSpecs, packed_sequence_size=seq_length) + + return recipe From f67f9ef60d375a5b06494e28d6f050bf51fcefb0 Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Sat, 16 Nov 2024 12:07:48 -0800 Subject: [PATCH 09/48] Integrate lm-eval-harness for evaluations in NeMo (#10621) * Add evaluate method and other minor fixes Signed-off-by: Abhishree * Add inference params to evaluate method Signed-off-by: Abhishree * Add wait_for_rest_service fn to evaluate method Signed-off-by: Abhishree * Apply isort and black reformatting Signed-off-by: athitten * Add logprobs to be returned by Pytriton for trtllm models Signed-off-by: Abhishree * Increase max_retries in wait_for_rest_service method Signed-off-by: Abhishree * Apply isort and black reformatting Signed-off-by: athitten * Add unset slurm vars and use env vars for Triton args Signed-off-by: Abhishree * Add logic to get logProbs from logits Signed-off-by: Abhishree * Refactor, clean and organize the code 1) Refactors the code and creates an evaluation folder where all util methods live 2) Add doctsrings, comments 3) Expose gather_context_logits, gather_generation_logits in trtllm and add output_generation_logits flag to return generation logits and remove output_logporbs as its not getting used anymore Signed-off-by: Abhishree * Add copyright and initialize special_tokens_kwargs in eval_utils.py Signed-off-by: Abhishree * Add the following chanes 1) Move get_trtllm_deployable and unset_environment_variables to deploy base.py 2) Rename eval_utils.py to base.py 3) REstore scripts/export/convert_nemo2_for_export.py Signed-off-by: Abhishree * Fix a minor typo Signed-off-by: Abhishree * Revert output_log_probs and all_probs arg in tensorrt_llm_run.py Signed-off-by: Abhishree * Fix docstrings formatting Signed-off-by: Abhishree * Pylint and other minor fixes Signed-off-by: Abhishree * Fix pylint and typos Signed-off-by: Abhishree * Apply isort and black reformatting Signed-off-by: athitten * Avoid multiple calls for tokenizer_type Co-authored-by: Ananth Subramaniam Signed-off-by: Abhishree Thittenamane 
<47577437+athitten@users.noreply.github.com> * Replace print statements with logging statements Signed-off-by: Abhishree * Apply isort and black reformatting Signed-off-by: athitten --------- Signed-off-by: Abhishree Signed-off-by: athitten Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: athitten Co-authored-by: Ananth Subramaniam --- nemo/collections/llm/__init__.py | 7 + nemo/collections/llm/api.py | 207 +++++++++++-------- nemo/collections/llm/deploy/__init__.py | 3 + nemo/collections/llm/deploy/base.py | 117 +++++++++++ nemo/collections/llm/evaluation/__init__.py | 3 + nemo/collections/llm/evaluation/base.py | 210 ++++++++++++++++++++ nemo/deploy/nlp/query_llm.py | 9 + nemo/deploy/service/rest_model_api.py | 48 +++-- nemo/export/tensorrt_llm.py | 30 ++- nemo/export/trt_llm/tensorrt_llm_build.py | 6 +- nemo/export/trt_llm/tensorrt_llm_run.py | 6 +- scripts/deploy/nlp/deploy_triton.py | 3 +- 12 files changed, 541 insertions(+), 108 deletions(-) create mode 100644 nemo/collections/llm/deploy/__init__.py create mode 100644 nemo/collections/llm/deploy/base.py create mode 100644 nemo/collections/llm/evaluation/__init__.py create mode 100644 nemo/collections/llm/evaluation/base.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index c5bbbd04fe74..13f0ffe3db88 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -237,3 +237,10 @@ __all__.append("deploy") except ImportError as error: logging.warning(f"The deploy module could not be imported: {error}") + +try: + from nemo.collections.llm.api import evaluate + + __all__.append("evaluate") +except ImportError as error: + logging.warning(f"The evaluate module could not be imported: {error}") diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index fdceff5d959e..07899b2ee484 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import os from copy import deepcopy from pathlib import Path @@ -256,84 +255,13 @@ def validate( return app_state.exp_dir -def get_trtllm_deployable( - nemo_checkpoint, - model_type, - triton_model_repository, - num_gpus, - tensor_parallelism_size, - pipeline_parallelism_size, - max_input_len, - max_output_len, - max_batch_size, - dtype, -): - from nemo.export.tensorrt_llm import TensorRTLLM - - if triton_model_repository is None: - trt_llm_path = "/tmp/trt_llm_model_dir/" - Path(trt_llm_path).mkdir(parents=True, exist_ok=True) - else: - trt_llm_path = triton_model_repository - - if nemo_checkpoint is None and triton_model_repository is None: - raise ValueError( - "The provided model repository is not a valid TensorRT-LLM model " - "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine." - ) - - if nemo_checkpoint is None and not os.path.isdir(triton_model_repository): - raise ValueError( - "The provided model repository is not a valid TensorRT-LLM model " - "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine." 
- ) - - if nemo_checkpoint is not None and model_type is None: - raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") - - trt_llm_exporter = TensorRTLLM( - model_dir=trt_llm_path, - load_model=(nemo_checkpoint is None), - ) - - if nemo_checkpoint is not None: - try: - logging.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") - trt_llm_exporter.export( - nemo_checkpoint_path=nemo_checkpoint, - model_type=model_type, - n_gpus=num_gpus, - tensor_parallelism_size=tensor_parallelism_size, - pipeline_parallelism_size=pipeline_parallelism_size, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - dtype=dtype, - ) - except Exception as error: - raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) - - return trt_llm_exporter - - -def store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response): - args_dict = { - "triton_service_ip": triton_http_address, - "triton_service_port": triton_port, - "triton_request_timeout": triton_request_timeout, - "openai_format_response": openai_format_response, - } - with open("nemo/deploy/service/config.json", "w") as f: - json.dump(args_dict, f) - - @run.cli.entrypoint(namespace="llm") def deploy( nemo_checkpoint: Path = None, model_type: str = "llama", - triton_model_name: str = "xxx", + triton_model_name: str = 'triton_model', triton_model_version: Optional[int] = 1, - triton_port: int = 8080, + triton_port: int = 8000, triton_http_address: str = "0.0.0.0", triton_request_timeout: int = 60, triton_model_repository: Path = None, @@ -344,21 +272,61 @@ def deploy( max_input_len: int = 256, max_output_len: int = 256, max_batch_size: int = 8, - start_rest_service: bool = False, + start_rest_service: bool = True, rest_service_http_address: str = "0.0.0.0", - rest_service_port: int = 8000, - openai_format_response: bool = False, + rest_service_port: int = 8080, + openai_format_response: bool = True, + output_generation_logits: bool = True, ): + """ + Deploys nemo model on a PyTriton server by converting the nemo ckpt to trtllm. + Also starts rest service that is used to send OpenAI API compatible input request + to the PyTiton server. + + Args: + nemo_checkpoint (Path): Path for nemo checkpoint. + model_type (str): Type of the model. Choices: gpt, llama, falcon, starcoder. Default: llama. + triton_model_name (str): Name for the model that gets deployed on PyTriton. Please ensure that the same model + name is passed to the evalute method for the model to be accessible while sending evalution requests. + Default: 'triton_model'. + triton_model_version (Optional[int]): Version for the triton model. Default: 1. + triton_port (int): Port for the PyTriton server. Default: 8000. + triton_http_address (str): HTTP address for the PyTriton server. Default: "0.0.0.0". + triton_request_timeout (int): Timeout in seconds for Triton server. Default: 60. + triton_model_repository (Path): Folder for the trt-llm conversion, trt-llm engine gets saved in this specified + path. If None, saves it in /tmp dir. Default: None. + num_gpus (int): Number of GPUs for export to trtllm and deploy. Default: 1. + tensor_parallelism_size (int): Tensor parallelism size. Default: 1. + pipeline_parallelism_size (int): Pipeline parallelism size. Default: 1. + dtype (str): dtype of the TensorRT-LLM model. Default: "bfloat16". + max_input_len (int): Max input length of the model. Default: 256. 
+ max_output_len (int): Max output length of the model. Default: 256. + max_batch_size (int): Max batch size of the model. Default: 8. + start_rest_service (bool): Start rest service that is used to send evaluation requests to the PyTriton server. + Needs to be True to be able to run evaluation. Default: True. + rest_service_http_address (str): HTTP address for the rest service. Default: "0.0.0.0". + rest_service_port (int): Port for the rest service. Default: 8080. + openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to + be True while running evaluation. Default: True. + output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True. + generation_logits are used to compute the logProb of the output token. Default: True. + """ + from nemo.collections.llm import deploy from nemo.deploy import DeployPyTriton + deploy.unset_environment_variables() if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") return - # Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py - store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) - - triton_deployable = get_trtllm_deployable( + # Store triton ip, port and other args relevant for REST API as env vars to be accessible by rest_model_api.py + os.environ['TRITON_HTTP_ADDRESS'] = triton_http_address + os.environ['TRITON_PORT'] = str(triton_port) + os.environ['TRITON_REQUEST_TIMEOUT'] = str(triton_request_timeout) + os.environ['OPENAI_FORMAT_RESPONSE'] = str(openai_format_response) + os.environ['OUTPUT_GENERATION_LOGITS'] = str(output_generation_logits) + + triton_deployable = deploy.get_trtllm_deployable( nemo_checkpoint, model_type, triton_model_repository, @@ -369,6 +337,7 @@ def deploy( max_output_len, max_batch_size, dtype, + output_generation_logits, ) try: @@ -383,6 +352,7 @@ def deploy( logging.info("Triton deploy function will be called.") nm.deploy() + nm.run() except Exception as error: logging.error("Error message has occurred during deploy function. Error message: " + str(error)) return @@ -416,6 +386,81 @@ def deploy( nm.stop() +def evaluate( + nemo_checkpoint_path: Path, + url: str = "http://0.0.0.0:8080/v1", + model_name: str = "triton_model", + eval_task: str = "gsm8k", + num_fewshot: Optional[int] = None, + limit: Optional[Union[int, float]] = None, + bootstrap_iters: int = 100000, + # inference params + max_tokens_to_generate: Optional[int] = 256, + temperature: Optional[float] = 0.000000001, + top_p: Optional[float] = 0.0, + top_k: Optional[int] = 1, + add_bos: Optional[bool] = False, +): + """ + Evaluates nemo model deployed on PyTriton server (via trtllm) using lm-evaluation-harness + (https://github.com/EleutherAI/lm-evaluation-harness/tree/main). + + Args: + nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt + which is required to tokenize the evaluation input and output prompts. + url (str): rest service url and port that were used in the deploy method above in the format: + http://{rest_service_http}:{rest_service_port}. Post requests with evaluation input prompts + (from lm-eval-harness) are sent to this url which is then passed to the model deployed on PyTriton server. + The rest service url and port serve as the entry point to evaluate model deployed on PyTriton server. 
+ model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as + triton_model_name passed to the deploy method above to be able to launch evaluation. Deafult: "triton_model". + eval_task (str): task to be evaluated on. For ex: "gsm8k", "gsm8k_cot", "mmlu", "lambada". Default: "gsm8k". + These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from + lm-evaluation-harness can be run, but only the above mentioned ones are tested. Tasks of type + loglikelihood_rolling are not supported yet. + num_fewshot (int): number of examples in few-shot context. Default: None. + limit (Union[int, float]): Limit the number of examples per task. If <1 (i.e float val between 0 and 1), limit + is a percentage of the total number of examples. If int say x, then run evaluation only on x number of samples + from the eval dataset. Default: None, which means eval is run the entire dataset. + bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. Set to 0 + for no stderr calculations to be performed. Default: 100000. + # inference params + max_tokens_to_generate (int): max tokens to generate. Default: 256. + temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token + with highest prob is chosen. Temperature can't be set to 0.0 currently, due to a bug with TRTLLM + (# TODO to be investigated). Hence using a very samll value as the default. Default: 0.000000001. + top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. + top_p=0 means the model will only consider the single most likely token for the next prediction. Default: 0.0. + top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model + will only consider the single most likely token for the next prediction. Default: 1 + add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when + encoding a string. Default: False since typically for CausalLM its set to False. If needed set add_bos to True. + """ + try: + # lm-evaluation-harness import + from lm_eval import evaluator + except ImportError: + raise ImportError( + "Please ensure that lm-evaluation-harness is installed in your env as it is required " "to run evaluations" + ) + + from nemo.collections.llm import evaluation + + # Get tokenizer from nemo ckpt. This works only with NeMo 2.0 ckpt. 
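In practice `deploy` and `evaluate` are run as a pair: one process exports the checkpoint, starts PyTriton plus the REST gateway, and serves until stopped, while a second process points the evaluator at that endpoint. A hedged sketch of the flow, using a hypothetical NeMo 2.0 checkpoint path and otherwise the default ports and model name:

from nemo.collections import llm

CKPT = "/checkpoints/my_model_nemo2"  # hypothetical NeMo 2.0 checkpoint path

# Process 1: convert to TensorRT-LLM and serve on PyTriton + REST (blocks until stopped).
llm.deploy(nemo_checkpoint=CKPT, model_type="llama", triton_model_name="triton_model")

# Process 2: run lm-eval-harness through the REST endpoint started above.
llm.evaluate(
    nemo_checkpoint_path=CKPT,
    model_name="triton_model",  # must match triton_model_name used in deploy
    eval_task="gsm8k",
    limit=100,                  # evaluate on 100 samples only
)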
+ tokenizer = io.load_context(nemo_checkpoint_path + '/context', subpath="model").tokenizer + # Wait for rest service to be ready before starting evaluation + evaluation.wait_for_rest_service(rest_url=f"{url}/v1/health") + # Create an object of the NeMoFWLM which is passed as a model to evaluator.simple_evaluate + model = evaluation.NeMoFWLMEval( + model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos + ) + results = evaluator.simple_evaluate( + model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters + ) + + print("score", results['results'][eval_task]) + + @run.cli.entrypoint(name="import", namespace="llm") def import_ckpt( model: pl.LightningModule, diff --git a/nemo/collections/llm/deploy/__init__.py b/nemo/collections/llm/deploy/__init__.py new file mode 100644 index 000000000000..24c102bfa0d2 --- /dev/null +++ b/nemo/collections/llm/deploy/__init__.py @@ -0,0 +1,3 @@ +from nemo.collections.llm.deploy.base import get_trtllm_deployable, unset_environment_variables + +__all__ = ["unset_environment_variables", "get_trtllm_deployable"] diff --git a/nemo/collections/llm/deploy/base.py b/nemo/collections/llm/deploy/base.py new file mode 100644 index 000000000000..e21198f5884b --- /dev/null +++ b/nemo/collections/llm/deploy/base.py @@ -0,0 +1,117 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import subprocess +from pathlib import Path + +from nemo.utils import logging + + +def unset_environment_variables() -> None: + """ + SLURM_, PMI_, PMIX_ Variables are needed to be unset for trtllm export to work + on clusters. This method takes care of unsetting these env variables + """ + logging.info("Unsetting all SLURM_, PMI_, PMIX_ Variables") + + # Function to unset variables with a specific prefix + def unset_vars_with_prefix(prefix): + unset_vars = [] + cmd = f"env | grep ^{prefix} | cut -d= -f1" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + vars_to_unset = result.stdout.strip().split('\n') + for var in vars_to_unset: + if var: # Check if the variable name is not empty + os.environ.pop(var, None) + unset_vars.append(var) + return unset_vars + + # Collect all unset variables across all prefixes + all_unset_vars = [] + + # Unset variables for each prefix + for prefix in ['SLURM_', 'PMI_', 'PMIX_']: + unset_vars = unset_vars_with_prefix(prefix) + all_unset_vars.extend(unset_vars) + + if all_unset_vars: + logging.info(f"Unset env variables: {', '.join(all_unset_vars)}") + else: + logging.info("No env variables were unset.") + + +def get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, + output_generation_logits, +): + """ + Exports the nemo checkpoint to trtllm and returns trt_llm_exporter that is used to deploy on PyTriton. 
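`unset_environment_variables` above shells out to `env | grep` to find the prefixed variables; an equivalent pure-Python prefix scan (an alternative sketch, not the patch's implementation) would be:

import os

def unset_vars_with_prefix(prefix: str) -> list:
    """Remove every environment variable starting with `prefix` and return their names."""
    matched = [name for name in os.environ if name.startswith(prefix)]
    for name in matched:
        os.environ.pop(name, None)
    return matched

unset = [v for prefix in ("SLURM_", "PMI_", "PMIX_") for v in unset_vars_with_prefix(prefix)]
print(unset)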
+ """ + from nemo.export.tensorrt_llm import TensorRTLLM + + if triton_model_repository is None: + trt_llm_path = "/tmp/trt_llm_model_dir/" + Path(trt_llm_path).mkdir(parents=True, exist_ok=True) + else: + trt_llm_path = triton_model_repository + + if nemo_checkpoint is None and triton_model_repository is None: + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine." + ) + + if nemo_checkpoint is None and not os.path.isdir(triton_model_repository): + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine." + ) + + if nemo_checkpoint is not None and model_type is None: + raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") + + trt_llm_exporter = TensorRTLLM( + model_dir=trt_llm_path, + load_model=(nemo_checkpoint is None), + ) + + if nemo_checkpoint is not None: + try: + logging.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") + trt_llm_exporter.export( + nemo_checkpoint_path=nemo_checkpoint, + model_type=model_type, + n_gpus=num_gpus, + tensor_parallelism_size=tensor_parallelism_size, + pipeline_parallelism_size=pipeline_parallelism_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + dtype=dtype, + gather_generation_logits=output_generation_logits, + ) + except Exception as error: + raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) + + return trt_llm_exporter diff --git a/nemo/collections/llm/evaluation/__init__.py b/nemo/collections/llm/evaluation/__init__.py new file mode 100644 index 000000000000..3012689bb8da --- /dev/null +++ b/nemo/collections/llm/evaluation/__init__.py @@ -0,0 +1,3 @@ +from nemo.collections.llm.evaluation.base import NeMoFWLMEval, wait_for_rest_service + +__all__ = ["NeMoFWLMEval", "wait_for_rest_service"] diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py new file mode 100644 index 000000000000..b1734d6f4d43 --- /dev/null +++ b/nemo/collections/llm/evaluation/base.py @@ -0,0 +1,210 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time + +import requests +import torch +import torch.nn.functional as F +from lm_eval.api.instance import Instance +from lm_eval.api.model import LM +from requests.exceptions import RequestException + +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer +from nemo.utils import logging + + +class NeMoFWLMEval(LM): + """ + NeMoFWLMEval is a wrapper class subclassing lm_eval.api.model.LM class, that defines how lm_eval interfaces with + NeMo model deployed on PyTriton server. 
+ Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md + """ + + def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos): + self.model_name = model_name + self.api_url = api_url + self.tokenizer = tokenizer + self.max_tokens_to_generate = max_tokens_to_generate + self.temperature = temperature + self.top_p = top_p + self.top_k = top_k + self.add_bos = add_bos + super().__init__() + + def _generate_tokens_logits(self, payload, return_text: bool = False, return_logits: bool = False): + """ + A private method that sends post request to the model on PyTriton server and returns either generated text or + logits. + """ + # send a post request to /v1/completions/ endpoint with the payload + response = requests.post(f"{self.api_url}/v1/completions/", json=payload) + response_data = response.json() + + if 'error' in response_data: + raise Exception(f"API Error: {response_data['error']}") + + # Assuming the response is in OpenAI format + if return_text: + # in case of generate_until tasks return just the text + return response_data['choices'][0]['text'] + + if return_logits: + # in case of loglikelihood tasks return the logits + return response_data['choices'][0]['generation_logits'] + + def tokenizer_type(self, tokenizer): + """ + Returns the type of the tokenizer. + """ + if isinstance(tokenizer, AutoTokenizer): + return "AutoTokenizer" + elif isinstance(tokenizer, SentencePieceTokenizer): + return "SentencePieceTokenizer" + else: + raise ValueError( + "Tokenizer type is not one of SentencePieceTokenizer or HF's AutoTokenizer. Please check " + "how to handle special tokens for this tokenizer" + ) + + def loglikelihood(self, requests: list[Instance]): + """ + Defines the loglikelihood request. Takes input requests of type list[Instance] where Instance is a dataclass + defined in lm_eval.api.instance. Each Instance conists of the input prompt, output prompt, request type(here + loglikelihood) and other relevant args like few shot samples. + """ + special_tokens_kwargs = {} + tokenizer_type = self.tokenizer_type(self.tokenizer) + if tokenizer_type == "SentencePieceTokenizer": + special_tokens_kwargs['add_bos'] = self.add_bos + elif tokenizer_type == "AutoTokenizer": + special_tokens_kwargs['add_special_tokens'] = self.add_bos + + results = [] + for request in requests: + # get the input prompt from the request + context = request.arguments[0] + # get the output prompt from the request + continuation = request.arguments[1] + # get encoded tokens of continuation + continuation_enc = self.tokenizer.tokenizer.encode(continuation, **special_tokens_kwargs) + # for SentencePeice consider the encoded tokens from the 2nd token since first encoded token is space. 
+ if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": + continuation_enc = continuation_enc[1:] + num_cont_tokens = len(continuation_enc) + # Update self.max_tokens_to_generate with number of continuation tokens (or output tokens) in the request + self.max_tokens_to_generate = num_cont_tokens + # Create payload to query the model deployed on PyTriton server + payload = { + "model": self.model_name, + "prompt": context, + "max_tokens": self.max_tokens_to_generate, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + } + # Get the logits from the model + generation_logits = self._generate_tokens_logits(payload, return_logits=True) + # Convert generation_logits to torch tensor to easily get logprobs wo manual implementation of log_softmax + multi_logits = F.log_softmax(torch.tensor(generation_logits[0]), dim=-1) + # Convert encoded continuation tokens to torch tensor + cont_toks = torch.tensor(continuation_enc, dtype=torch.long).unsqueeze(0) + # Get the greedy token from the logits (i.e token with the highest prob) + greedy_tokens = multi_logits.argmax(dim=-1) + # Check if all greedy_tokens match the the actual continuation tokens + is_greedy = (greedy_tokens == cont_toks).all() + # Get the logits corresponding to the actual continuation tokens + logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) + # result is tuple of logProb of generating the continuation token and is_greedy + result = (float(logits.sum()), bool(is_greedy)) + + results.append(result) + + return results + + def loglikelihood_rolling(self, requests: list[Instance]): + """ + Defines the loglikelihood_rolling request type. Yet to be implemented. + """ + pass + + def generate_until(self, inputs: list[Instance]): + """ + Defines the generate_until request type. Takes input requests of type list[Instance] where Instance is a + dataclass defined in lm_eval.api.instance. Each Instance conists of the input prompt, output prompt, request + type(here loglikelihood) and other relevant args like few shot samples. + """ + results = [] + for instance in inputs: + # Access the 'arguments' attribute of the Instance which contains the input prompt string + prompt = instance.arguments[0] + # Create payload to query the model deployed on PyTriton server + payload = { + "model": self.model_name, + "prompt": prompt, + "max_tokens": self.max_tokens_to_generate, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + } + # Get the text generated by the model + generated_text = self._generate_tokens_logits(payload, return_text=True) + + results.append(generated_text) + + return results + + +def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2): + """ + Wait for REST service to be ready. + + Args: + rest_url (str): URL of the REST service's health endpoint + max_retries (int): Maximum number of retry attempts. Defaul: 60. + retry_interval (int): Time to wait between retries in seconds. Default: 2. + + Returns: + bool: True if rest service is ready, False otherwise + """ + + def check_service(url): + """ + Check if the service is ready by making a GET request to its health endpoint. 
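The log-likelihood arithmetic in `loglikelihood` above reduces to a log-softmax over the returned generation logits followed by a gather at the continuation token ids. A toy version with made-up numbers (a vocabulary of 5 and three continuation tokens):

import torch
import torch.nn.functional as F

generation_logits = torch.randn(1, 3, 5)      # fake (batch, steps, vocab) logits from the server
continuation_enc = [2, 0, 4]                  # fake continuation token ids

multi_logits = F.log_softmax(generation_logits, dim=-1)
cont_toks = torch.tensor(continuation_enc, dtype=torch.long).unsqueeze(0)            # (1, 3)
greedy_tokens = multi_logits.argmax(dim=-1)                                          # (1, 3)
is_greedy = bool((greedy_tokens == cont_toks).all())
token_logprobs = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1)  # (1, 3)
print(float(token_logprobs.sum()), is_greedy)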
+ + Args: + url (str): URL of the service's health endpoint + + Returns: + bool: True if the service is ready, False otherwise + """ + try: + response = requests.get(url, timeout=5) + return response.status_code == 200 + except RequestException: + return False + + for _ in range(max_retries): + rest_ready = check_service(rest_url) + + if rest_ready: + logging.info("REST service is ready.") + return True + + logging.info(f"REST Service not ready yet. Retrying in {retry_interval} seconds...") + time.sleep(retry_interval) + + logging.info("Timeout: REST service did not become ready.") + return False diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 7e873db6b5b1..e1d21bb54b76 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -174,6 +174,7 @@ def query_llm( end_strings=None, init_timeout=60.0, openai_format_response: bool = False, + output_generation_logits: bool = False, ): """ Query the Triton server synchronously and return a list of responses. @@ -190,6 +191,8 @@ def query_llm( no_repeat_ngram_size (int): no repeat ngram size. task_id (str): downstream task id if virtual tokens are used. init_timeout (flat): timeout for the connection. + openai_format_response: return response similar to OpenAI API format + output_generation_logits: return generation logits from model on PyTriton """ prompts = str_list2numpy(prompts) @@ -248,6 +251,9 @@ def query_llm( if end_strings is not None: inputs["end_strings"] = str_list2numpy(end_strings) + if output_generation_logits is not None: + inputs["output_generation_logits"] = np.full(prompts.shape, output_generation_logits, dtype=np.bool_) + with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client: result_dict = client.infer_batch(**inputs) output_type = client.model_config.outputs[0].dtype @@ -269,6 +275,9 @@ def query_llm( "model": self.model_name, "choices": [{"text": str(sentences)}], } + # Convert gneration logits to a list to make it json serializable and add it to openai_response dict + if output_generation_logits: + openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"].tolist() return openai_response else: return sentences diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index fbc774883faa..64afea167295 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -8,8 +8,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
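With the new `output_generation_logits` flag, a direct PyTriton query (bypassing the REST gateway) can request logits alongside the generated text. A hedged sketch; the constructor arguments are assumptions about `NemoQueryLLM`'s usual url/model_name signature, and the address assumes the default Triton port from the deploy step:

from nemo.deploy.nlp import NemoQueryLLM

nq = NemoQueryLLM(url="localhost:8000", model_name="triton_model")
result = nq.query_llm(
    prompts=["What is 2 + 2?"],
    max_output_len=16,
    top_k=1,
    top_p=0.0,
    temperature=1.0,
    openai_format_response=True,
    output_generation_logits=True,
)
# In OpenAI format the logits ride along under choices[0]["generation_logits"].
print(result["choices"][0]["text"])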
- -import json import os from pathlib import Path import requests @@ -19,6 +17,7 @@ from pydantic_settings import BaseSettings from nemo.deploy.nlp import NemoQueryLLM +from nemo.utils import logging class TritonSettings(BaseSettings): @@ -29,14 +28,13 @@ class TritonSettings(BaseSettings): def __init__(self): super(TritonSettings, self).__init__() try: - with open(os.path.join(Path.cwd(), 'nemo/deploy/service/config.json')) as config: - config_json = json.load(config) - self._triton_service_port = config_json["triton_service_port"] - self._triton_service_ip = config_json["triton_service_ip"] - self._triton_request_timeout = config_json["triton_request_timeout"] - self._openai_format_response = config_json["openai_format_response"] + self._triton_service_port = int(os.environ.get('TRITON_PORT', 8080)) + self._triton_service_ip = os.environ.get('TRITON_HTTP_ADDRESS', '0.0.0.0') + self._triton_request_timeout = int(os.environ.get('TRITON_REQUEST_TIMEOUT', 60)) + self._openai_format_response = os.environ.get('OPENAI_FORMAT_RESPONSE', 'False').lower() == 'true' + self._output_generation_logits = os.environ.get('OUTPUT_GENERATION_LOGITS', 'False').lower() == 'true' except Exception as error: - print("An exception occurred:", error) + logging.error("An exception occurred trying to retrieve set args in TritonSettings class. Error:", error) return @property @@ -54,11 +52,17 @@ def triton_request_timeout(self): @property def openai_format_response(self): """ - Retuns the response from Triton server in OpenAI compatible formar if set to True, - default set in config.json is false. + Retuns the response from Triton server in OpenAI compatible format if set to True. """ return self._openai_format_response + @property + def output_generation_logits(self): + """ + Retuns the generation logits along with text in Triton server output if set to True. + """ + return self._output_generation_logits + app = FastAPI() triton_settings = TritonSettings() @@ -70,19 +74,27 @@ class CompletionRequest(BaseModel): max_tokens: int = 512 temperature: float = 1.0 top_p: float = 0.0 - n: int = 1 + top_k: int = 1 stream: bool = False stop: str | None = None frequency_penalty: float = 1.0 -@app.get("/triton_health") +@app.get("/v1/health") +def health_check(): + return {"status": "ok"} + + +@app.get("/v1/triton_health") async def check_triton_health(): """ This method exposes endpoint "/triton_health" which can be used to verify if Triton server is accessible while running the REST or FastAPI application. - Verify by running: curl http://service_http_address:service_port/triton_health and the returned status should inform if the server is accessible. + Verify by running: curl http://service_http_address:service_port/v1/triton_health and the returned status should inform if the server is accessible. 
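The REST gateway accepts OpenAI-style completion requests, which is exactly what `NeMoFWLMEval` posts during evaluation. A minimal request against the default REST address and port (the values mirror the defaults used above):

import requests

payload = {
    "model": "triton_model",
    "prompt": "What is the capital of France?",
    "max_tokens": 32,
    "temperature": 1.0,
    "top_p": 0.0,
    "top_k": 1,
}
response = requests.post("http://0.0.0.0:8080/v1/completions/", json=payload)
print(response.json())

# Liveness checks against the gateway itself and the underlying Triton server:
print(requests.get("http://0.0.0.0:8080/v1/health").json())
print(requests.get("http://0.0.0.0:8080/v1/triton_health").json())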
""" - triton_url = f"triton_settings.triton_service_ip:str(triton_settings.triton_service_port)/v2/health/ready" + triton_url = ( + f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" + ) + logging.info(f"Attempting to connect to Triton server at: {triton_url}") try: response = requests.get(triton_url, timeout=5) if response.status_code == 200: @@ -101,11 +113,13 @@ def completions_v1(request: CompletionRequest): output = nq.query_llm( prompts=[request.prompt], max_output_len=request.max_tokens, - top_k=request.n, + # when these below params are passed as None + top_k=request.top_k, top_p=request.top_p, temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, + output_generation_logits=triton_settings.output_generation_logits, ) if triton_settings.openai_format_response: return output @@ -114,5 +128,5 @@ def completions_v1(request: CompletionRequest): "output": output[0][0], } except Exception as error: - print("An exception occurred:", error) + logging.error("An exception occurred with the post request to /v1/completions/ endpoint:", error) return {"error": "An exception occurred"} diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 08b0b822cad4..a1e6cb0e03c4 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -180,6 +180,8 @@ def export( reduce_fusion: bool = True, fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None, + gather_context_logits: Optional[bool] = False, + gather_generation_logits: Optional[bool] = False, ): """ Exports nemo checkpoints to TensorRT-LLM. @@ -218,6 +220,8 @@ def export( reduce_fusion (bool): enables fusing extra kernels after custom TRT-LLM allReduce fp8_quantized (Optional[bool]): enables exporting to FP8 TRT-LLM checkpoints. If not set, autodetects the type. fp8_kvcache (Optional[bool]): enables FP8 KV-cache quantization. If not set, autodetects the type. + gather_context_logits (Optional[bool]): if True, enables gather_context_logits while building trtllm engine. Default: False + gather_generation_logits (Optional[bool]): if True, enables gather_generation_logits while building trtllm engine. Default: False """ if n_gpus is not None: warnings.warn( @@ -495,6 +499,8 @@ def get_transformer_config(nemo_model_config): multiple_profiles=multiple_profiles, gpt_attention_plugin=gpt_attention_plugin, gemm_plugin=gemm_plugin, + gather_context_logits=gather_context_logits, + gather_generation_logits=gather_generation_logits, ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") @@ -688,6 +694,7 @@ def forward( prompt_embeddings_checkpoint_path: str = None, streaming: bool = False, output_log_probs: bool = False, + output_generation_logits: bool = False, **sampling_kwargs, ): """ @@ -706,6 +713,7 @@ def forward( task_ids (List(str)): list of the task ids for the prompt tables. prompt_embeddings_table (List(float)): prompt embeddings table. prompt_embeddings_checkpoint_path (str): path for the nemo checkpoint for the prompt embedding table. + output_generation_logits (bool): if True returns generation_logits in the outout of generate method. sampling_kwargs: Additional kwargs to set in the SamplingConfig. 
""" @@ -784,6 +792,7 @@ def forward( no_repeat_ngram_size=no_repeat_ngram_size, output_log_probs=output_log_probs, multiprocessed_env=multiprocessed_env, + output_generation_logits=output_generation_logits, **sampling_kwargs, ) else: @@ -862,16 +871,21 @@ def get_triton_input(self): Tensor(name="no_repeat_ngram_size", shape=(-1,), dtype=np.single, optional=True), Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), + Tensor(name="output_generation_logits", shape=(-1,), dtype=np.bool_, optional=False), ) return inputs @property def get_triton_output(self): - outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes),) + outputs = ( + Tensor(name="outputs", shape=(-1,), dtype=bytes), + Tensor(name="generation_logits", shape=(-1,), dtype=np.single), + ) return outputs @batch def triton_infer_fn(self, **inputs: np.ndarray): + output_dict = {} try: infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))} if "max_output_len" in inputs: @@ -898,14 +912,20 @@ def triton_infer_fn(self, **inputs: np.ndarray): if "lora_uids" in inputs: lora_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8") infer_input["lora_uids"] = lora_uids[0].tolist() + if "output_generation_logits" in inputs: + infer_input["output_generation_logits"] = inputs.pop("output_generation_logits")[0][0] - output_texts = self.forward(**infer_input) - output = cast_output(output_texts, np.bytes_) + if infer_input["output_generation_logits"]: + output_texts, generation_logits = self.forward(**infer_input) + output_dict["generation_logits"] = np.array(generation_logits.cpu().numpy()) + else: + output_texts = self.forward(**infer_input) + output_dict["outputs"] = cast_output(output_texts, np.bytes_) except Exception as error: err_msg = "An error occurred: {0}".format(str(error)) - output = cast_output([err_msg], np.bytes_) + output_dict["outputs"] = cast_output([err_msg], np.bytes_) - return {"outputs": output} + return output_dict @batch def triton_infer_fn_streaming(self, **inputs: np.ndarray): diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index cdf8eaac6b1c..b2b761483700 100755 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -54,6 +54,8 @@ def build_and_save_engine( gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", reduce_fusion: bool = False, + gather_context_logits: bool = False, + gather_generation_logits: bool = False, ): architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture try: @@ -96,8 +98,8 @@ def build_and_save_engine( 'max_num_tokens': max_num_tokens, 'opt_num_tokens': opt_num_tokens, 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, - 'gather_context_logits': False, - 'gather_generation_logits': False, + 'gather_context_logits': gather_context_logits, + 'gather_generation_logits': gather_generation_logits, 'strongly_typed': False, 'builder_opt': None, 'use_refit': use_refit, diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index bd7b8abd5f9e..ef67c918290f 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -647,6 +647,7 @@ def generate( streaming: bool = False, output_log_probs=False, multiprocessed_env=False, + output_generation_logits=False, **sampling_kwargs, ) -> Optional[List[List[str]]]: """Generate the output 
diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py
index cdf8eaac6b1c..b2b761483700 100755
--- a/nemo/export/trt_llm/tensorrt_llm_build.py
+++ b/nemo/export/trt_llm/tensorrt_llm_build.py
@@ -54,6 +54,8 @@ def build_and_save_engine(
     gpt_attention_plugin: str = "auto",
     gemm_plugin: str = "auto",
     reduce_fusion: bool = False,
+    gather_context_logits: bool = False,
+    gather_generation_logits: bool = False,
 ):
     architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture
     try:
@@ -96,8 +98,8 @@ def build_and_save_engine(
         'max_num_tokens': max_num_tokens,
         'opt_num_tokens': opt_num_tokens,
         'max_prompt_embedding_table_size': max_prompt_embedding_table_size,
-        'gather_context_logits': False,
-        'gather_generation_logits': False,
+        'gather_context_logits': gather_context_logits,
+        'gather_generation_logits': gather_generation_logits,
         'strongly_typed': False,
         'builder_opt': None,
         'use_refit': use_refit,
diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py
index bd7b8abd5f9e..ef67c918290f 100644
--- a/nemo/export/trt_llm/tensorrt_llm_run.py
+++ b/nemo/export/trt_llm/tensorrt_llm_run.py
@@ -647,6 +647,7 @@ def generate(
     streaming: bool = False,
     output_log_probs=False,
     multiprocessed_env=False,
+    output_generation_logits=False,
     **sampling_kwargs,
 ) -> Optional[List[List[str]]]:
     """Generate the output sequence from the input sequence.
@@ -692,6 +693,7 @@
         multiprocessed_env=multiprocessed_env,
         **sampling_kwargs,
     )
+    assert outputs is not None
 
     if tensorrt_llm.mpi_rank() != 0:
         return None
@@ -705,8 +707,8 @@
         for b in range(output_ids.shape[0])
     ]
 
-    if output_log_probs:
-        return output_lines_list, log_probs
+    if output_generation_logits:
+        return output_lines_list, outputs['generation_logits']
 
     return output_lines_list
diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py
index e3394726fa1c..154ffc90dc9c 100755
--- a/scripts/deploy/nlp/deploy_triton.py
+++ b/scripts/deploy/nlp/deploy_triton.py
@@ -419,13 +419,14 @@ def nemo_deploy(argv):
         LOGGER.info("Triton deploy function will be called.")
         nm.deploy()
+        nm.run()
     except Exception as error:
         LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
         return
 
     try:
         LOGGER.info("Model serving on Triton is will be started.")
-        if args.start_rest_service == "True":
+        if args.start_rest_service:
             try:
                 LOGGER.info("REST service will be started.")
                 uvicorn.run(

From 7d03e1f149ca666233b9ef0a23308654425b6aff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?=
Date: Sun, 17 Nov 2024 16:55:05 +0100
Subject: [PATCH 10/48] ci: Fix release workflow (#11286)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* ci: Fix release workflow
Signed-off-by: Oliver Koenig

* fix
Signed-off-by: Oliver Koenig

* fix
Signed-off-by: Oliver Koenig

* fix
Signed-off-by: Oliver Koenig

* fix
Signed-off-by: Oliver Koenig

* Update .github/workflows/release.yml
Signed-off-by: oliver könig

---------

Signed-off-by: Oliver Koenig
Signed-off-by: oliver könig
---
 .github/workflows/release.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index c11629776c40..03474251f995 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -23,7 +23,7 @@ on:
 
 jobs:
   release:
-    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.10.0
+    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.12.3
     with:
       release-ref: ${{ inputs.release-ref }}
       image-name: nemo_container
@@ -39,3 +39,4 @@ jobs:
       TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
       TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
       SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
+      PAT: ${{ secrets.PAT }}

From 25445927d77f0cf5ef897826fc99c22d9b2308ee Mon Sep 17 00:00:00 2001
From: Maanu Grover <109391026+maanug-nv@users.noreply.github.com>
Date: Sun, 17 Nov 2024 18:27:56 -0800
Subject: [PATCH 11/48] Update import 'pytorch_lightning' -> 'lightning.pytorch' (#11252)

* update import in collections/llm
Signed-off-by: Maanu Grover

* update import in lightning
Signed-off-by: Maanu Grover

* update fabric import in lightning
Signed-off-by: Maanu Grover

* Apply isort and black reformatting
Signed-off-by: maanug-nv

* update import in collections/asr
Signed-off-by: Maanu Grover

* update import in collections/tts
Signed-off-by: Maanu Grover

* Apply isort and black reformatting
Signed-off-by: maanug-nv

* update requirements
Signed-off-by: Maanu Grover

* unused imports
Signed-off-by: Maanu Grover

* update import in tests
Signed-off-by: Maanu Grover

* Apply isort and black reformatting
Signed-off-by: maanug-nv

* update import in collections/common
Signed-off-by: Maanu Grover

* update import in core
Signed-off-by: Maanu Grover

* Apply isort and black reformatting
Signed-off-by: maanug-nv * update import in utils Signed-off-by: Maanu Grover * Apply isort and black reformatting Signed-off-by: maanug-nv * update import in collections/nlp Signed-off-by: Maanu Grover * update fabric import in collections/nlp Signed-off-by: Maanu Grover * Apply isort and black reformatting Signed-off-by: maanug-nv * update fabric import in utils Signed-off-by: Maanu Grover * Apply isort and black reformatting Signed-off-by: maanug-nv * update import in nlp examples Signed-off-by: Maanu Grover * update import in asr examples Signed-off-by: Maanu Grover * update import in llm examples Signed-off-by: Maanu Grover * update import in tts examples Signed-off-by: Maanu Grover * update fabric import in nlp examples Signed-off-by: Maanu Grover * update import in deploy Signed-off-by: Maanu Grover * Apply isort and black reformatting Signed-off-by: maanug-nv * update import in slu examples Signed-off-by: Maanu Grover * update import in speaker_tasks examples Signed-off-by: Maanu Grover * update import in collections/audio Signed-off-by: Maanu Grover * update import in audio examples Signed-off-by: Maanu Grover * update import in collections/llm Signed-off-by: Maanu Grover * Apply isort and black reformatting Signed-off-by: maanug-nv * update import in collections/vlm Signed-off-by: Maanu Grover * update import in collections/diffusion Signed-off-by: Maanu Grover * update import in collections/vision Signed-off-by: Maanu Grover * update import in collections/multimodal Signed-off-by: Maanu Grover * update import in multimodal examples Signed-off-by: Maanu Grover * update import in vision examples Signed-off-by: Maanu Grover * Apply isort and black reformatting Signed-off-by: maanug-nv * update import in scripts Signed-off-by: Maanu Grover * Update baseline Signed-off-by: maanug-nv * Apply isort and black reformatting Signed-off-by: artbataev * revert bad change Signed-off-by: Maanu Grover --------- Signed-off-by: Maanu Grover Signed-off-by: maanug-nv Signed-off-by: artbataev Co-authored-by: maanug-nv Co-authored-by: artbataev --- .secrets.baseline | 8 +- examples/asr/asr_adapters/eval_asr_adapter.py | 2 +- .../asr/asr_adapters/train_asr_adapter.py | 2 +- .../aed/speech_to_text_aed_chunked_infer.py | 2 +- .../ctc/speech_to_text_buffered_infer_ctc.py | 2 +- .../speech_to_text_buffered_infer_rnnt.py | 2 +- examples/asr/asr_ctc/speech_to_text_ctc.py | 2 +- .../asr/asr_ctc/speech_to_text_ctc_bpe.py | 2 +- .../speech_to_text_hybrid_rnnt_ctc_bpe.py | 2 +- .../speech_to_text_hybrid_rnnt_ctc_char.py | 2 +- .../asr/asr_transducer/speech_to_text_rnnt.py | 2 +- .../asr_transducer/speech_to_text_rnnt_bpe.py | 2 +- .../speech_to_text_bpe_with_text.py | 2 +- .../speech_to_text_bpe_with_text_finetune.py | 2 +- .../asr/conf/asr_adapters/asr_adaptation.yaml | 2 +- .../conf/asr_adapters/asr_adaptation_hp.yaml | 2 +- .../asr_finetune/speech_to_text_finetune.yaml | 2 +- .../speech_to_text_hf_finetune.yaml | 2 +- .../fastconformer_ctc_bpe_streaming.yaml | 2 +- .../fastconformer_ctc_char_streaming.yaml | 2 +- ...astconformer_transducer_bpe_streaming.yaml | 2 +- ...stconformer_transducer_char_streaming.yaml | 2 +- .../fastconformer/fast-conformer_ctc_bpe.yaml | 2 +- .../fast-conformer_transducer_bpe.yaml | 2 +- ...r_hybrid_transducer_ctc_bpe_streaming.yaml | 2 +- ..._hybrid_transducer_ctc_char_streaming.yaml | 2 +- ...stconformer_hybrid_transducer_ctc_bpe.yaml | 2 +- ...tconformer_hybrid_transducer_ctc_char.yaml | 2 +- .../fast-conformer-long_ctc_bpe.yaml | 4 +- .../fast-conformer-long_transducer_bpe.yaml 
| 2 +- .../speech_multitask/fast-conformer_aed.yaml | 2 +- .../experimental/k2/align_speech_parallel.py | 2 +- .../asr/experimental/k2/speech_to_text_bpe.py | 2 +- .../k2/speech_to_text_rnnt_bpe.py | 2 +- .../structured/speech_to_text_hybrid.py | 2 +- .../structured/speech_to_text_structured.py | 10 +- .../speech_to_text_structured_v2.py | 2 +- .../speech_to_frame_label.py | 2 +- .../speech_classification/speech_to_label.py | 2 +- .../speech_multitask/speech_to_text_aed.py | 2 +- .../speech_classification_mfa_train.py | 2 +- .../masked_token_pred_pretrain.py | 2 +- .../speech_pretraining/speech_pre_training.py | 2 +- examples/asr/speech_to_text_finetune.py | 2 +- .../speech_to_text_transformer.py | 2 +- .../speech_translation/translate_speech.py | 2 +- examples/asr/transcribe_speech.py | 2 +- examples/asr/transcribe_speech_parallel.py | 2 +- examples/audio/audio_to_audio_train.py | 2 +- examples/audio/process_audio.py | 2 +- examples/llm/peft/hf.py | 2 +- examples/llm/sft/hf.py | 4 +- .../neva/convert_llava_to_neva.py | 4 +- .../speech_llm/export/extract_salm_weights.py | 2 +- .../controlnet/controlnet_train.py | 2 +- .../text_to_image/convert_hf_ckpt_to_nemo.py | 6 +- .../dreambooth/dreambooth_lora_infer.py | 9 +- .../imagen/generate_fid_images.py | 7 +- .../imagen/imagen_generate_images.py | 8 +- .../text_to_image/imagen/imagen_infer.py | 2 +- .../text_to_image/imagen/imagen_training.py | 2 +- .../stable_diffusion/sd_lora_infer.py | 9 +- .../stable_diffusion/sd_xl_train.py | 3 +- .../clip/convert_external_clip_to_nemo.py | 4 +- .../x_to_nerf/benchmark_callback.py | 2 +- examples/multimodal/x_to_nerf/data.py | 2 +- examples/multimodal/x_to_nerf/main.py | 2 +- examples/nlp/dialogue/dialogue.py | 2 +- .../nlp/duplex_text_normalization/helpers.py | 4 +- .../self_alignment_pretraining.py | 2 +- examples/nlp/glue_benchmark/glue_benchmark.py | 2 +- .../nlp/information_retrieval/bert_dpr.py | 2 +- .../information_retrieval/bert_joint_ir.py | 2 +- .../megatron_gpt_embedding_finetuning.py | 2 +- .../megatron_gpt_reranker_finetuning.py | 2 +- .../intent_slot_classification.py | 2 +- .../multi_label_intent_slot_classification.py | 2 +- .../nlp/language_modeling/bert_pretraining.py | 4 +- .../mamba_change_num_partition.py | 2 +- .../megatron_bart_pretraining.py | 10 +- .../megatron_change_num_partitions.py | 13 +- .../megatron_ckpt_to_nemo.py | 4 +- .../nlp/language_modeling/megatron_export.py | 2 +- .../megatron_gpt_distillation.py | 2 +- .../language_modeling/megatron_gpt_eval.py | 2 +- .../megatron_gpt_mcore_batch_eval.py | 2 +- .../language_modeling/megatron_gpt_prune.py | 2 +- .../nlp/language_modeling/megatron_gpt_ptq.py | 2 +- .../language_modeling/megatron_gpt_test.py | 16 +- .../megatron_gpt_validate.py | 6 +- .../megatron_lm_ckpt_to_nemo.py | 10 +- .../language_modeling/megatron_mamba_eval.py | 2 +- .../megatron_retro_cal_shape.py | 8 +- .../language_modeling/megatron_retro_eval.py | 6 +- .../megatron_retro_eval_legacy.py | 12 +- .../megatron_retro_fine_tune.py | 16 +- .../megatron_retro_mutransfer_pretrain.py | 10 +- .../megatron_retro_pretraining_legacy.py | 10 +- .../megatron_retro_qatask_eval.py | 6 +- .../nlp/language_modeling/megatron_t5_eval.py | 17 +- .../megatron_t5_lm_adaptation_finetune.py | 10 +- .../megatron_t5_seq2seq_eval.py | 8 +- .../megatron_t5_seq2seq_finetune.py | 6 +- .../nlp/language_modeling/transformer_lm.py | 2 +- .../language_modeling/upcycle_dense_to_moe.py | 2 +- .../enc_dec_nmt-bottleneck.py | 3 +- .../nlp/machine_translation/enc_dec_nmt.py | 3 +- 
.../enc_dec_nmt_finetune.py | 3 +- .../megatron_nmt_training.py | 10 +- .../nmt_transformer_infer_megatron.py | 2 +- .../question_answering/question_answering.py | 2 +- .../helpers.py | 4 +- .../nlp/text2sparql/evaluate_text2sparql.py | 2 +- examples/nlp/text2sparql/text2sparql.py | 2 +- ...parallel_text_classification_evaluation.py | 2 +- .../text_classification_with_bert.py | 2 +- .../text_normalization_as_tagging/helpers.py | 4 +- ...talization_lexical_audio_train_evaluate.py | 2 +- ...nctuation_capitalization_train_evaluate.py | 2 +- .../token_classification_evaluate.py | 2 +- .../token_classification_train.py | 2 +- .../zero_shot_intent_train.py | 2 +- .../eval_utils/inference.py | 2 +- .../speech_intent_slot_train.py | 2 +- .../clustering_diarizer/offline_diar_infer.py | 2 +- .../multiscale_diar_decoder.py | 4 +- .../speaker_identification_infer.py | 21 +- .../speaker_tasks/recognition/speaker_reco.py | 4 +- .../recognition/speaker_reco_finetune.py | 4 +- examples/tts/aligner.py | 2 +- examples/tts/audio_codec.py | 2 +- examples/tts/fastpitch.py | 2 +- examples/tts/fastpitch_finetune.py | 2 +- examples/tts/fastpitch_finetune_adapters.py | 2 +- examples/tts/fastpitch_ssl.py | 2 +- .../g2p_heteronym_classification_inference.py | 15 +- ...ronym_classification_train_and_evaluate.py | 2 +- examples/tts/g2p/g2p_inference.py | 20 +- examples/tts/g2p/g2p_train_and_evaluate.py | 2 +- examples/tts/hifigan.py | 2 +- examples/tts/hifigan_finetune.py | 2 +- examples/tts/mixer_tts.py | 2 +- examples/tts/radtts.py | 2 +- examples/tts/spectrogram_enhancer.py | 2 +- examples/tts/ssl_tts.py | 2 +- examples/tts/tacotron2.py | 2 +- examples/tts/tacotron2_finetune.py | 2 +- examples/tts/univnet.py | 2 +- examples/tts/vits.py | 2 +- examples/tts/waveglow.py | 2 +- examples/vision/convert_ckpt_to_nemo.py | 4 +- .../megatron_vit_classification_evaluate.py | 26 ++- .../megatron_vit_classification_infer.py | 20 +- .../asr/data/audio_to_text_dataset.py | 2 +- .../asr/models/aed_multitask_models.py | 3 +- .../asr/models/classification_models.py | 2 +- .../asr/models/clustering_diarizer.py | 3 +- .../asr/models/confidence_ensemble.py | 2 +- nemo/collections/asr/models/ctc_models.py | 3 +- .../asr/models/hybrid_asr_tts_models.py | 9 +- .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 2 +- .../asr/models/hybrid_rnnt_ctc_models.py | 2 +- .../asr/models/k2_sequence_models.py | 14 +- nemo/collections/asr/models/label_models.py | 2 +- nemo/collections/asr/models/msdd_models.py | 4 +- .../collections/asr/models/rnnt_bpe_models.py | 2 +- nemo/collections/asr/models/rnnt_models.py | 2 +- nemo/collections/asr/models/ssl_models.py | 2 +- .../asr/models/transformer_bpe_models.py | 2 +- .../collections/asr/parts/utils/wfst_utils.py | 2 +- .../audio/models/audio_to_audio.py | 2 +- nemo/collections/audio/models/enhancement.py | 2 +- .../audio/parts/utils/callbacks.py | 8 +- .../collections/common/callbacks/callbacks.py | 7 +- nemo/collections/common/callbacks/ema.py | 26 ++- .../common/metrics/perf_metrics.py | 2 +- .../collections/common/parts/ptl_overrides.py | 4 +- .../data/diffusion_energon_datamodule.py | 2 +- .../data/diffusion_fake_datamodule.py | 4 +- nemo/collections/diffusion/train.py | 4 +- nemo/collections/llm/api.py | 2 +- nemo/collections/llm/gpt/data/api.py | 2 +- nemo/collections/llm/gpt/data/fine_tuning.py | 2 +- nemo/collections/llm/gpt/data/hf_dataset.py | 2 +- nemo/collections/llm/gpt/data/mock.py | 4 +- nemo/collections/llm/gpt/data/pre_training.py | 4 +- nemo/collections/llm/gpt/model/base.py | 2 +- 
.../gpt/model/hf_auto_model_for_causal_lm.py | 2 +- nemo/collections/llm/gpt/model/mistral.py | 1 - nemo/collections/llm/inference/base.py | 4 +- nemo/collections/llm/recipes/baichuan2_7b.py | 4 +- nemo/collections/llm/recipes/chatglm3_6b.py | 4 +- .../llm/recipes/finetune_default.py | 2 +- nemo/collections/llm/recipes/gemma2.py | 4 +- nemo/collections/llm/recipes/gemma2_27b.py | 2 +- nemo/collections/llm/recipes/gemma2_2b.py | 2 +- nemo/collections/llm/recipes/gemma2_9b.py | 2 +- nemo/collections/llm/recipes/gemma_2b.py | 4 +- nemo/collections/llm/recipes/gemma_7b.py | 4 +- nemo/collections/llm/recipes/gpt3_175b.py | 4 +- .../recipes/hf_auto_model_for_causal_lm.py | 4 +- nemo/collections/llm/recipes/llama31_405b.py | 4 +- nemo/collections/llm/recipes/llama31_70b.py | 4 +- nemo/collections/llm/recipes/llama31_8b.py | 4 +- nemo/collections/llm/recipes/llama3_70b.py | 4 +- .../collections/llm/recipes/llama3_70b_16k.py | 2 +- .../collections/llm/recipes/llama3_70b_64k.py | 2 +- nemo/collections/llm/recipes/llama3_8b.py | 4 +- nemo/collections/llm/recipes/llama3_8b_16k.py | 2 +- nemo/collections/llm/recipes/llama3_8b_64k.py | 2 +- nemo/collections/llm/recipes/log/default.py | 2 +- nemo/collections/llm/recipes/mamba2_130m.py | 4 +- nemo/collections/llm/recipes/mamba2_1_3b.py | 4 +- nemo/collections/llm/recipes/mamba2_2_7b.py | 4 +- nemo/collections/llm/recipes/mamba2_370m.py | 4 +- nemo/collections/llm/recipes/mamba2_780m.py | 4 +- nemo/collections/llm/recipes/mamba2_8b.py | 4 +- .../llm/recipes/mamba2_hybrid_8b.py | 4 +- nemo/collections/llm/recipes/mistral_7b.py | 4 +- .../llm/recipes/mistral_nemo_12b.py | 4 +- nemo/collections/llm/recipes/mixtral_8x22b.py | 4 +- nemo/collections/llm/recipes/mixtral_8x7b.py | 4 +- .../llm/recipes/mixtral_8x7b_16k.py | 2 +- .../llm/recipes/mixtral_8x7b_64k.py | 2 +- nemo/collections/llm/recipes/nemotron.py | 4 +- nemo/collections/llm/recipes/nemotron3_22b.py | 2 +- .../llm/recipes/nemotron3_22b_16k.py | 2 +- .../llm/recipes/nemotron3_22b_64k.py | 2 +- nemo/collections/llm/recipes/nemotron3_4b.py | 2 +- nemo/collections/llm/recipes/nemotron3_8b.py | 2 +- nemo/collections/llm/recipes/nemotron4_15b.py | 2 +- .../llm/recipes/nemotron4_15b_16k.py | 2 +- .../llm/recipes/nemotron4_15b_64k.py | 2 +- .../collections/llm/recipes/nemotron4_340b.py | 2 +- nemo/collections/llm/recipes/qwen2.py | 4 +- nemo/collections/llm/recipes/qwen2_1p5b.py | 2 +- nemo/collections/llm/recipes/qwen2_500m.py | 2 +- nemo/collections/llm/recipes/qwen2_72b.py | 2 +- nemo/collections/llm/recipes/qwen2_7b.py | 2 +- nemo/collections/llm/recipes/starcoder2.py | 5 +- .../collections/llm/recipes/starcoder2_15b.py | 2 +- nemo/collections/llm/recipes/starcoder2_3b.py | 2 +- nemo/collections/llm/recipes/starcoder2_7b.py | 2 +- nemo/collections/llm/recipes/starcoder_15b.py | 4 +- nemo/collections/llm/recipes/t5_11b.py | 4 +- nemo/collections/llm/recipes/t5_220m.py | 4 +- nemo/collections/llm/recipes/t5_3b.py | 4 +- nemo/collections/llm/t5/data/fine_tuning.py | 2 +- nemo/collections/llm/t5/data/mock.py | 4 +- nemo/collections/llm/t5/data/pre_training.py | 4 +- nemo/collections/llm/t5/model/t5.py | 2 +- .../data/dreambooth/dreambooth_dataset.py | 2 +- .../multimodal/data/energon/base.py | 4 +- .../models/multimodal_llm/neva/neva_model.py | 2 +- .../text_to_image/controlnet/controlnet.py | 4 +- .../models/text_to_image/controlnet/util.py | 4 +- .../text_to_image/dreambooth/dreambooth.py | 2 +- .../models/text_to_image/imagen/imagen.py | 2 +- .../text_to_image/imagen/imagen_pipeline.py | 10 +- 
.../stable_diffusion/diffusion_engine.py | 6 +- .../stable_diffusion/ldm/autoencoder.py | 2 +- .../stable_diffusion/ldm/ddpm.py | 14 +- .../clip/megatron_clip_models.py | 4 +- .../megatron_nsfw_clip_models.py | 5 +- nemo/collections/multimodal/parts/utils.py | 4 +- .../speech_cv/models/visual_ctc_models.py | 2 +- .../visual_hybrid_rnnt_ctc_bpe_models.py | 17 +- .../models/visual_hybrid_rnnt_ctc_models.py | 2 +- .../models/visual_rnnt_bpe_models.py | 17 +- .../speech_cv/models/visual_rnnt_models.py | 2 +- .../speech_llm/models/modular_models.py | 4 +- .../speech_llm/models/modular_t5_models.py | 2 +- .../machine_translation/preproc_mt_data.py | 89 +++++---- .../dialogue_gpt_classification_model.py | 2 +- .../dialogue/dialogue_gpt_generation_model.py | 2 +- .../dialogue_nearest_neighbour_model.py | 2 +- .../dialogue/dialogue_s2s_generation_model.py | 2 +- .../dialogue_zero_shot_intent_model.py | 2 +- .../intent_slot_classification_model.py | 2 +- .../nlp/models/dialogue/sgdqa_model.py | 2 +- .../duplex_decoder.py | 10 +- .../duplex_tagger.py | 8 +- .../nlp/models/enc_dec_nlp_model.py | 5 +- .../entity_linking/entity_linking_model.py | 2 +- .../glue_benchmark/glue_benchmark_model.py | 2 +- .../information_retrieval/base_ir_model.py | 2 +- .../information_retrieval/bert_dpr_model.py | 44 ++++- .../bert_joint_ir_model.py | 10 +- .../megatron_bert_embedding_model.py | 5 +- .../megatron_gpt_embedding_model.py | 2 +- .../megatron_gpt_reranker_model.py | 2 +- .../intent_slot_classification_model.py | 11 +- ..._label_intent_slot_classification_model.py | 54 +++-- .../models/language_modeling/bert_lm_model.py | 16 +- .../language_modeling/megatron_bart_model.py | 6 +- .../language_modeling/megatron_base_model.py | 8 +- .../megatron_base_prompt_learning_model.py | 2 +- .../language_modeling/megatron_bert_model.py | 2 +- .../language_modeling/megatron_glue_model.py | 2 +- .../megatron_gpt_adapter_model.py | 23 +-- .../language_modeling/megatron_gpt_model.py | 6 +- .../megatron_gpt_prompt_learning_model.py | 2 +- .../megatron_gpt_sft_model.py | 4 +- .../megatron_griffin_model.py | 2 +- .../megatron_griffin_sft_model.py | 8 +- .../megatron_lm_encoder_decoder_model.py | 6 +- .../language_modeling/megatron_mamba_model.py | 2 +- .../megatron_mamba_sft_model.py | 3 +- .../megatron_retrieval_model.py | 12 +- .../megatron_retro_fine_tune_model.py | 22 ++- .../language_modeling/megatron_retro_model.py | 4 +- .../language_modeling/megatron_t0_model.py | 2 +- .../megatron_t5_adapter_model.py | 30 +-- .../language_modeling/megatron_t5_model.py | 12 +- .../megatron_t5_prompt_learning_model.py | 2 +- .../megatron_t5_sft_model.py | 4 +- .../language_modeling/transformer_lm_model.py | 19 +- .../machine_translation/megatron_nmt_model.py | 4 +- .../mt_enc_dec_bottleneck_model.py | 42 +++- .../machine_translation/mt_enc_dec_model.py | 74 ++++--- nemo/collections/nlp/models/nlp_model.py | 12 +- .../question_answering/qa_base_model.py | 2 +- .../question_answering/qa_bert_model.py | 2 +- .../models/question_answering/qa_gpt_model.py | 2 +- .../nlp/models/question_answering/qa_model.py | 2 +- .../models/question_answering/qa_s2s_model.py | 2 +- .../nlp/models/rag/custom_bert_embedder.py | 2 +- .../nlp/models/rag/custom_gpt_llm.py | 2 +- .../spellchecking_model.py | 2 +- .../models/text2sparql/text2sparql_model.py | 12 +- .../text_classification_model.py | 2 +- .../thutmose_tagger.py | 4 +- ...tion_capitalization_lexical_audio_model.py | 187 +++++++++--------- .../punctuation_capitalization_model.py | 21 +- 
.../token_classification_model.py | 2 +- .../zero_shot_intent_model.py | 3 +- .../nlp/modules/common/lm_utils.py | 2 +- .../modules/common/text_generation_utils.py | 2 +- .../nlp/parts/megatron_trainer_builder.py | 8 +- nemo/collections/nlp/parts/nlp_overrides.py | 33 ++-- nemo/collections/nlp/parts/utils_funcs.py | 2 +- nemo/collections/tts/g2p/models/ctc.py | 23 ++- .../g2p/models/heteronym_classification.py | 14 +- nemo/collections/tts/g2p/models/t5.py | 2 +- nemo/collections/tts/models/aligner.py | 4 +- nemo/collections/tts/models/audio_codec.py | 2 +- nemo/collections/tts/models/fastpitch.py | 4 +- nemo/collections/tts/models/fastpitch_ssl.py | 42 +++- nemo/collections/tts/models/hifigan.py | 4 +- nemo/collections/tts/models/mixer_tts.py | 4 +- nemo/collections/tts/models/radtts.py | 4 +- .../tts/models/spectrogram_enhancer.py | 4 +- nemo/collections/tts/models/ssl_tts.py | 31 +-- nemo/collections/tts/models/tacotron2.py | 2 +- nemo/collections/tts/models/univnet.py | 14 +- nemo/collections/tts/models/vits.py | 4 +- nemo/collections/tts/models/waveglow.py | 2 +- nemo/collections/tts/parts/utils/callbacks.py | 65 ++++-- nemo/collections/tts/parts/utils/helpers.py | 2 +- .../megatron_vit_classification_models.py | 4 +- nemo/collections/vlm/mllama/data/lazy.py | 4 +- nemo/collections/vlm/mllama/data/mock.py | 4 +- nemo/collections/vlm/mllama/model/base.py | 2 +- nemo/collections/vlm/neva/data/api.py | 2 +- nemo/collections/vlm/neva/data/lazy.py | 4 +- nemo/collections/vlm/neva/data/mock.py | 4 +- nemo/collections/vlm/neva/model/api.py | 2 +- nemo/collections/vlm/neva/model/base.py | 2 +- nemo/collections/vlm/recipes/mllama_11b.py | 2 +- nemo/collections/vlm/recipes/mllama_90b.py | 2 +- nemo/core/classes/__init__.py | 2 +- nemo/core/classes/exportable.py | 2 +- nemo/core/classes/modelPT.py | 6 +- .../core/connectors/save_restore_connector.py | 2 +- nemo/core/utils/k2_guard.py | 3 +- nemo/deploy/deploy_base.py | 2 +- nemo/deploy/nlp/megatronllm_deployable.py | 2 +- nemo/lightning/__init__.py | 4 +- nemo/lightning/_strategy_lib.py | 2 +- nemo/lightning/base.py | 2 +- nemo/lightning/data.py | 2 +- nemo/lightning/fabric/conversion.py | 14 +- nemo/lightning/fabric/fabric.py | 4 +- nemo/lightning/fabric/plugins.py | 2 +- nemo/lightning/fabric/strategies.py | 26 +-- nemo/lightning/io/api.py | 2 +- nemo/lightning/io/connector.py | 4 +- nemo/lightning/io/pl.py | 10 +- nemo/lightning/megatron_parallel.py | 7 +- nemo/lightning/nemo_logger.py | 8 +- .../pytorch/callbacks/ddp_parity_checker.py | 2 +- nemo/lightning/pytorch/callbacks/debugging.py | 4 +- .../pytorch/callbacks/garbage_collection.py | 2 +- .../callbacks/megatron_comm_overlap.py | 4 +- .../pytorch/callbacks/memory_profiler.py | 2 +- .../pytorch/callbacks/model_checkpoint.py | 18 +- .../pytorch/callbacks/model_transform.py | 4 +- .../pytorch/callbacks/moe_token_drop.py | 4 +- nemo/lightning/pytorch/callbacks/nsys.py | 2 +- nemo/lightning/pytorch/callbacks/peft.py | 8 +- .../lightning/pytorch/callbacks/preemption.py | 4 +- .../pytorch/callbacks/progress_bar.py | 4 +- .../pytorch/callbacks/progress_printer.py | 4 +- nemo/lightning/pytorch/optim/base.py | 4 +- nemo/lightning/pytorch/optim/megatron.py | 2 +- nemo/lightning/pytorch/optim/pytorch.py | 4 +- .../lightning/pytorch/plugins/data_sampler.py | 2 +- .../pytorch/plugins/mixed_precision.py | 3 +- .../pytorch/strategies/fsdp_strategy.py | 12 +- .../pytorch/strategies/megatron_strategy.py | 20 +- nemo/lightning/pytorch/strategies/utils.py | 7 +- nemo/lightning/pytorch/trainer.py | 6 +- 
nemo/lightning/resume.py | 4 +- nemo/lightning/run/plugins.py | 4 +- nemo/utils/callbacks/cuda_graph.py | 20 +- nemo/utils/callbacks/dist_ckpt_io.py | 12 +- nemo/utils/callbacks/nemo_model_checkpoint.py | 21 +- nemo/utils/callbacks/preemption.py | 6 +- nemo/utils/callbacks/s3_checkpoint_io.py | 2 +- nemo/utils/cloud.py | 9 +- nemo/utils/exp_manager.py | 36 ++-- nemo/utils/lightning_logger_patch.py | 2 +- nemo/utils/loggers/clearml_logger.py | 6 +- nemo/utils/loggers/dllogger.py | 8 +- requirements/requirements_lightning.txt | 1 + .../average_model_checkpoints.py | 2 +- .../megatron_checkpoint_averaging.py | 7 +- .../convert_baichuan2_hf_to_nemo.py | 6 +- .../convert_baichuan2_nemo_to_hf.py | 2 +- .../convert_bert_nemo_to_hf.py | 12 +- .../convert_chatglm_hf_to_nemo.py | 10 +- .../convert_chatglm_nemo_to_hf.py | 2 +- .../convert_clip_hf_to_nemo.py | 4 +- .../convert_falcon_hf_to_nemo.py | 6 +- .../convert_falcon_nemo_to_hf.py | 2 +- .../convert_gpt_nemo_to_mcore.py | 2 +- .../convert_llama_hf_to_nemo.py | 2 +- .../convert_llama_hf_to_nemo_load.py | 2 +- .../convert_llama_hf_to_nemo_save_dict.py | 2 +- .../convert_llama_nemo_to_hf.py | 2 +- .../convert_mistral_7b_hf_to_nemo.py | 4 +- .../convert_mistral_7b_nemo_to_hf.py | 2 +- .../convert_mixtral_hf_to_nemo.py | 4 +- .../convert_mixtral_nemo_to_hf.py | 2 +- .../convert_mpt_hf_to_nemo.py | 8 +- .../convert_nemotron_nemo_to_hf.py | 2 +- .../convert_qwen2_hf_to_nemo.py | 2 +- .../convert_qwen2_nemo_to_hf.py | 2 +- .../convert_starcoder2_hf_to_nemo.py | 6 +- .../convert_starcoder2_nemo_to_hf.py | 2 +- .../convert_starcoder_hf_to_nemo.py | 2 +- .../quantize_model_to_nf4.py | 2 +- .../confidence_ensembles/build_ensemble.py | 18 +- .../merge_lora_weights_into_base_model.py | 2 +- scripts/export.py | 2 +- .../nemo_legacy_import/nlp_checkpoint_port.py | 2 +- .../convert_prompt_learning_ckpt_to_nemo.py | 2 +- .../hf_t5-v1_1_to_nemo.py | 2 +- .../merge_lora_weights/merge.py | 2 +- .../start_retro_model_service.py | 12 +- .../confidence/benchmark_asr_confidence.py | 2 +- scripts/speech_recognition/oomptimizer.py | 2 +- .../asr/confidence/test_asr_confidence.py | 2 +- .../asr/test_asr_context_biasing.py | 30 ++- .../asr/test_asr_interctc_models.py | 14 +- tests/collections/asr/test_asr_local_attn.py | 17 +- tests/collections/common/test_ema.py | 15 +- tests/collections/llm/common.py | 2 +- .../llm/gpt/model/megatron_ssm_pretraining.py | 4 +- tests/collections/llm/lora_mistralai.py | 2 +- .../llm/megatron_gpt_pretraining.py | 2 +- .../collections/llm/megatron_t5_finetuning.py | 2 +- .../llm/megatron_t5_pretraining.py | 2 +- .../collections/llm/test_mnist_model_nemo2.py | 4 +- .../llm/test_mnist_model_nemo2_fsdp.py | 4 +- .../multimodal/test_speechllm_models.py | 4 +- tests/collections/nlp/test_falcon_model.py | 2 +- tests/collections/nlp/test_flash_attention.py | 2 +- tests/collections/nlp/test_gpt_eval.py | 2 +- tests/collections/nlp/test_gpt_model.py | 2 +- tests/collections/nlp/test_nlp_exportables.py | 2 +- .../nlp/test_pretrained_models_performance.py | 2 +- .../collections/nlp/test_rampup_batch_size.py | 2 +- .../collections/nlp/test_retrieval_module.py | 14 +- .../nlp/test_retrieval_module_inference.py | 62 +++++- tests/collections/nlp/test_retro_model.py | 2 +- tests/core/test_config_utils.py | 8 +- tests/core/test_dist_ckpt.py | 6 +- tests/core/test_exp_manager.py | 6 +- tests/core/test_fault_tolerance.py | 2 +- tests/core/test_optimizers_schedulers.py | 20 +- tests/core/test_straggler_det.py | 2 +- tests/core_ptl/check_for_ranks.py | 4 +- 
.../core_ptl/check_manual_upload_to_hf_hub.py | 14 +- tests/core_ptl/test_ptl_stateless_timer.py | 2 +- tests/lightning/_fabric/test_conversion.py | 8 +- tests/lightning/_io/test_api.py | 2 +- .../callbacks/test_model_checkpoint.py | 4 +- .../pytorch/callbacks/test_model_transform.py | 2 +- .../lightning/pytorch/callbacks/test_peft.py | 2 +- .../pytorch/callbacks/test_preemption.py | 2 +- tests/lightning/test_dist_ckpt.py | 2 +- tests/lightning/test_nemo_logger.py | 4 +- tests/lightning/test_precision_plugin.py | 2 +- tests/lightning/test_state_restoration.py | 2 +- tests/utils/test_trainer_utils.py | 4 +- 506 files changed, 1756 insertions(+), 1241 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index c26f70775c5a..1e4832b40075 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -90,6 +90,10 @@ { "path": "detect_secrets.filters.allowlist.is_line_allowlisted" }, + { + "path": "detect_secrets.filters.common.is_baseline_file", + "filename": ".secrets.baseline" + }, { "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", "min_level": 2 @@ -273,7 +277,7 @@ "filename": "scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py", "hashed_secret": "e0308bd21bffc156d79208f9ecf130370a015002", "is_verified": false, - "line_number": 460 + "line_number": 471 } ], "scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py": [ @@ -2083,5 +2087,5 @@ } ] }, - "generated_at": "2024-10-25T13:43:17Z" + "generated_at": "2024-11-14T09:37:19Z" } diff --git a/examples/asr/asr_adapters/eval_asr_adapter.py b/examples/asr/asr_adapters/eval_asr_adapter.py index bc5947f26aaf..b35cf33a6c0e 100644 --- a/examples/asr/asr_adapters/eval_asr_adapter.py +++ b/examples/asr/asr_adapters/eval_asr_adapter.py @@ -36,7 +36,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf, open_dict from nemo.collections.asr.models import ASRModel diff --git a/examples/asr/asr_adapters/train_asr_adapter.py b/examples/asr/asr_adapters/train_asr_adapter.py index 3f82ef8fe554..253672e3eb89 100644 --- a/examples/asr/asr_adapters/train_asr_adapter.py +++ b/examples/asr/asr_adapters/train_asr_adapter.py @@ -84,7 +84,7 @@ import os from dataclasses import is_dataclass -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf, open_dict from nemo.collections.asr.models import ASRModel diff --git a/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py b/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py index 8188bcced14d..1e63a9d820be 100644 --- a/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py +++ b/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py @@ -49,7 +49,7 @@ from dataclasses import dataclass from typing import Optional -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf diff --git a/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py b/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py index 87370d278f98..ccea94f41f83 100644 --- a/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py +++ b/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py @@ -42,7 +42,7 @@ from dataclasses import dataclass from typing import Optional -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf diff --git 
a/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py b/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py index e6e84cdfa6c4..c31fa2b9d812 100644 --- a/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py +++ b/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py @@ -64,7 +64,7 @@ from dataclasses import dataclass from typing import Optional -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf, open_dict diff --git a/examples/asr/asr_ctc/speech_to_text_ctc.py b/examples/asr/asr_ctc/speech_to_text_ctc.py index 87b1b11633f7..ccdf3a5e09ea 100644 --- a/examples/asr/asr_ctc/speech_to_text_ctc.py +++ b/examples/asr/asr_ctc/speech_to_text_ctc.py @@ -68,7 +68,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models import EncDecCTCModel diff --git a/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py b/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py index b4e3be5f650a..997cd6e52d5b 100644 --- a/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py +++ b/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py @@ -64,7 +64,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE diff --git a/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py b/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py index 796005a8fcee..ffda4c554a49 100644 --- a/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py +++ b/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py @@ -58,7 +58,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel diff --git a/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_char.py b/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_char.py index 423e005d8f02..02f43f93e2c7 100644 --- a/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_char.py +++ b/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_char.py @@ -69,7 +69,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models import EncDecHybridRNNTCTCModel diff --git a/examples/asr/asr_transducer/speech_to_text_rnnt.py b/examples/asr/asr_transducer/speech_to_text_rnnt.py index 5b4f1e8a985d..2fab3ac137e6 100644 --- a/examples/asr/asr_transducer/speech_to_text_rnnt.py +++ b/examples/asr/asr_transducer/speech_to_text_rnnt.py @@ -67,7 +67,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models import EncDecRNNTModel diff --git a/examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py b/examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py index 1fffea55686f..d18313acc9a6 100644 --- a/examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py +++ b/examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py @@ -59,7 +59,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models import EncDecRNNTBPEModel diff --git a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py 
b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py index b435d418fda2..acd7a8632822 100644 --- a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py +++ b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py @@ -49,7 +49,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models.hybrid_asr_tts_models import ASRWithTTSModel diff --git a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py index 99bc41ba966b..c1692cf6234f 100644 --- a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py +++ b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py @@ -45,7 +45,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models.hybrid_asr_tts_models import ASRWithTTSModel diff --git a/examples/asr/conf/asr_adapters/asr_adaptation.yaml b/examples/asr/conf/asr_adapters/asr_adaptation.yaml index b9a2a003217e..bae166d18782 100644 --- a/examples/asr/conf/asr_adapters/asr_adaptation.yaml +++ b/examples/asr/conf/asr_adapters/asr_adaptation.yaml @@ -182,7 +182,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: null diff --git a/examples/asr/conf/asr_adapters/asr_adaptation_hp.yaml b/examples/asr/conf/asr_adapters/asr_adaptation_hp.yaml index 958e6d23375c..d03b2eacfec4 100644 --- a/examples/asr/conf/asr_adapters/asr_adaptation_hp.yaml +++ b/examples/asr/conf/asr_adapters/asr_adaptation_hp.yaml @@ -182,7 +182,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: null diff --git a/examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml b/examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml index 3b5717efddf9..1ae64a341e16 100644 --- a/examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml +++ b/examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml @@ -81,7 +81,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 diff --git a/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml b/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml index f111573f21eb..c044d3c8d7a8 100644 --- a/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml +++ b/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml @@ -145,7 +145,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 diff --git 
a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml index 4c80d2f2e9d4..564f4b176e64 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml @@ -172,7 +172,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml index 0796a60260a1..6962c03ebe60 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml @@ -177,7 +177,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml index 4edcc38396fa..1531bf380b6d 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml @@ -228,7 +228,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml index 97b64ef93402..4cb508b0aff3 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml @@ -234,7 +234,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 diff --git a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml index ea6094380856..fd5f34aa43cb 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml @@ -198,7 +198,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, 
or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 diff --git a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml index 9e2c1a876864..deb7b7ca613a 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml @@ -251,7 +251,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml index daef1ed67a9f..6d89a6a52dfb 100644 --- a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml @@ -245,7 +245,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml index 96aee4af1803..7e6b9c4aa7b4 100644 --- a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml @@ -250,7 +250,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 diff --git a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml index 4ba55e368bb9..12a21c6fba6c 100644 --- a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml @@ -224,7 +224,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 diff --git 
a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml index ed2ad8ca9c0d..65f657b5416e 100644 --- a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml +++ b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml @@ -229,7 +229,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 diff --git a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml index 773a500ef2db..df511883ce80 100644 --- a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml @@ -169,7 +169,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 @@ -204,4 +204,4 @@ exp_manager: create_wandb_logger: false wandb_logger_kwargs: name: null - project: null \ No newline at end of file + project: null diff --git a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml index fec2a2839efa..0218136cbdbd 100644 --- a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml @@ -223,7 +223,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 diff --git a/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml b/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml index 3d1a8c8bdf47..50446dfd9467 100644 --- a/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml +++ b/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml @@ -249,7 +249,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: - _target_: pytorch_lightning.strategies.DDPStrategy + _target_: lightning.pytorch.strategies.DDPStrategy gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 diff --git a/examples/asr/experimental/k2/align_speech_parallel.py b/examples/asr/experimental/k2/align_speech_parallel.py index abfffa0cdfdb..cf07fb998e95 100644 --- a/examples/asr/experimental/k2/align_speech_parallel.py +++ b/examples/asr/experimental/k2/align_speech_parallel.py @@ -77,7 +77,7 @@ from dataclasses import dataclass, field, is_dataclass from typing import Optional -import pytorch_lightning as ptl +import 
lightning.pytorch as ptl import torch from omegaconf import MISSING, OmegaConf diff --git a/examples/asr/experimental/k2/speech_to_text_bpe.py b/examples/asr/experimental/k2/speech_to_text_bpe.py index ee3924c7b8ac..8a941200770f 100644 --- a/examples/asr/experimental/k2/speech_to_text_bpe.py +++ b/examples/asr/experimental/k2/speech_to_text_bpe.py @@ -74,7 +74,7 @@ model.graph_module_cfg.background_cfg.intersect_pruned=False \ model.graph_module_cfg.background_cfg.boost_coeff=0.0 """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models.configs.k2_sequence_models_config import EncDecK2SeqModelConfig diff --git a/examples/asr/experimental/k2/speech_to_text_rnnt_bpe.py b/examples/asr/experimental/k2/speech_to_text_rnnt_bpe.py index a0031fba082d..973be0cbd477 100644 --- a/examples/asr/experimental/k2/speech_to_text_rnnt_bpe.py +++ b/examples/asr/experimental/k2/speech_to_text_rnnt_bpe.py @@ -63,7 +63,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models import EncDecK2RnntSeqModelBPE diff --git a/examples/asr/experimental/structured/speech_to_text_hybrid.py b/examples/asr/experimental/structured/speech_to_text_hybrid.py index 26530631498f..e6126c47305f 100644 --- a/examples/asr/experimental/structured/speech_to_text_hybrid.py +++ b/examples/asr/experimental/structured/speech_to_text_hybrid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.asr.models import EncDecCTCModel, configs from nemo.core.config import hydra_runner diff --git a/examples/asr/experimental/structured/speech_to_text_structured.py b/examples/asr/experimental/structured/speech_to_text_structured.py index 366c6d831a7d..55934c00322e 100644 --- a/examples/asr/experimental/structured/speech_to_text_structured.py +++ b/examples/asr/experimental/structured/speech_to_text_structured.py @@ -14,7 +14,7 @@ from dataclasses import asdict -import pytorch_lightning as pl +import lightning.pytorch as pl import nemo.collections.asr as nemo_asr from nemo.collections.asr.models import EncDecCTCModel, configs @@ -64,7 +64,13 @@ ), # ... 
repeat 14 more times nemo_asr.modules.conv_asr.JasperEncoderConfig( - filters=1024, repeat=1, kernel=[1], stride=[1], dilation=[1], dropout=cfg.model.dropout, residual=False, + filters=1024, + repeat=1, + kernel=[1], + stride=[1], + dilation=[1], + dropout=cfg.model.dropout, + residual=False, ), ] diff --git a/examples/asr/experimental/structured/speech_to_text_structured_v2.py b/examples/asr/experimental/structured/speech_to_text_structured_v2.py index e8a865a9877a..146da425fb9b 100644 --- a/examples/asr/experimental/structured/speech_to_text_structured_v2.py +++ b/examples/asr/experimental/structured/speech_to_text_structured_v2.py @@ -14,7 +14,7 @@ from dataclasses import asdict -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.asr.models import EncDecCTCModel, configs from nemo.core.config import modelPT, optimizers, schedulers diff --git a/examples/asr/speech_classification/speech_to_frame_label.py b/examples/asr/speech_classification/speech_to_frame_label.py index 04fcbdd1b61c..39a8e4415de5 100644 --- a/examples/asr/speech_classification/speech_to_frame_label.py +++ b/examples/asr/speech_classification/speech_to_frame_label.py @@ -39,7 +39,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models.classification_models import EncDecFrameClassificationModel diff --git a/examples/asr/speech_classification/speech_to_label.py b/examples/asr/speech_classification/speech_to_label.py index b3deb5a4e7e5..810d2b5e7bdf 100644 --- a/examples/asr/speech_classification/speech_to_label.py +++ b/examples/asr/speech_classification/speech_to_label.py @@ -143,7 +143,7 @@ https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speech_classification/results.html# """ -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf diff --git a/examples/asr/speech_multitask/speech_to_text_aed.py b/examples/asr/speech_multitask/speech_to_text_aed.py index 0c13e5289d86..943ecee59bfc 100644 --- a/examples/asr/speech_multitask/speech_to_text_aed.py +++ b/examples/asr/speech_multitask/speech_to_text_aed.py @@ -50,7 +50,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models import EncDecMultiTaskModel diff --git a/examples/asr/speech_pretraining/downstream/speech_classification_mfa_train.py b/examples/asr/speech_pretraining/downstream/speech_classification_mfa_train.py index 3a256c7ab2d3..8bd56aa63450 100644 --- a/examples/asr/speech_pretraining/downstream/speech_classification_mfa_train.py +++ b/examples/asr/speech_pretraining/downstream/speech_classification_mfa_train.py @@ -14,7 +14,7 @@ from collections import OrderedDict -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf diff --git a/examples/asr/speech_pretraining/masked_token_pred_pretrain.py b/examples/asr/speech_pretraining/masked_token_pred_pretrain.py index 1ea88d696643..e1c740e66412 100644 --- a/examples/asr/speech_pretraining/masked_token_pred_pretrain.py +++ b/examples/asr/speech_pretraining/masked_token_pred_pretrain.py @@ -13,7 +13,7 @@ # limitations under the License. 
-import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models.ssl_models import EncDecDenoiseMaskedTokenPredModel diff --git a/examples/asr/speech_pretraining/speech_pre_training.py b/examples/asr/speech_pretraining/speech_pre_training.py index cec9444096c3..0c94099442a6 100644 --- a/examples/asr/speech_pretraining/speech_pre_training.py +++ b/examples/asr/speech_pretraining/speech_pre_training.py @@ -13,7 +13,7 @@ # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models.ssl_models import SpeechEncDecSelfSupervisedModel diff --git a/examples/asr/speech_to_text_finetune.py b/examples/asr/speech_to_text_finetune.py index 36a7bdc3bbdc..6b53446622ee 100644 --- a/examples/asr/speech_to_text_finetune.py +++ b/examples/asr/speech_to_text_finetune.py @@ -54,7 +54,7 @@ https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations """ import time -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models import ASRModel diff --git a/examples/asr/speech_translation/speech_to_text_transformer.py b/examples/asr/speech_translation/speech_to_text_transformer.py index ac4dc4334164..bb7e0b3e4461 100644 --- a/examples/asr/speech_translation/speech_to_text_transformer.py +++ b/examples/asr/speech_translation/speech_to_text_transformer.py @@ -40,7 +40,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.asr.models import EncDecTransfModelBPE diff --git a/examples/asr/speech_translation/translate_speech.py b/examples/asr/speech_translation/translate_speech.py index 53599e1b3511..76c8c096527f 100644 --- a/examples/asr/speech_translation/translate_speech.py +++ b/examples/asr/speech_translation/translate_speech.py @@ -17,7 +17,7 @@ from dataclasses import dataclass, is_dataclass from typing import List, Optional, Union -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index a543fcf5e252..f1d61edc990e 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -18,7 +18,7 @@ from dataclasses import dataclass, field, is_dataclass from typing import List, Optional, Union -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf, open_dict diff --git a/examples/asr/transcribe_speech_parallel.py b/examples/asr/transcribe_speech_parallel.py index eb905d3e91b0..bdf54ea67f7d 100644 --- a/examples/asr/transcribe_speech_parallel.py +++ b/examples/asr/transcribe_speech_parallel.py @@ -75,7 +75,7 @@ from dataclasses import dataclass, is_dataclass from typing import Optional -import pytorch_lightning as ptl +import lightning.pytorch as ptl import torch from omegaconf import MISSING, OmegaConf diff --git a/examples/audio/audio_to_audio_train.py b/examples/audio/audio_to_audio_train.py index cef46dcf20b6..4d71e75176c9 100644 --- a/examples/audio/audio_to_audio_train.py +++ b/examples/audio/audio_to_audio_train.py @@ -28,7 +28,7 @@ """ from enum import Enum -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf diff --git a/examples/audio/process_audio.py b/examples/audio/process_audio.py index 
ec88bda34954..8657d53ef957 100644 --- a/examples/audio/process_audio.py +++ b/examples/audio/process_audio.py @@ -20,7 +20,7 @@ from pathlib import Path from typing import List, Optional -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf diff --git a/examples/llm/peft/hf.py b/examples/llm/peft/hf.py index 97f21d6c253e..5b24c22ab79d 100644 --- a/examples/llm/peft/hf.py +++ b/examples/llm/peft/hf.py @@ -13,7 +13,7 @@ # limitations under the License. import fiddle as fdl -from pytorch_lightning.loggers import WandbLogger +from lightning.pytorch.loggers import WandbLogger from nemo import lightning as nl from nemo.collections import llm diff --git a/examples/llm/sft/hf.py b/examples/llm/sft/hf.py index 7d4cde7866a2..20f915f87287 100644 --- a/examples/llm/sft/hf.py +++ b/examples/llm/sft/hf.py @@ -13,8 +13,8 @@ # limitations under the License. import fiddle as fdl -import pytorch_lightning as pl -from pytorch_lightning.loggers import WandbLogger +import lightning.pytorch as pl +from lightning.pytorch.loggers import WandbLogger from torch.utils.data import DataLoader from nemo import lightning as nl diff --git a/examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py b/examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py index d02b737c750a..874d62dc63c9 100644 --- a/examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py +++ b/examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py @@ -34,10 +34,10 @@ from collections import OrderedDict import torch +from lightning.pytorch.core.saving import _load_state as ptl_load_state +from lightning.pytorch.trainer.trainer import Trainer from llava import LlavaLlamaForCausalLM from omegaconf import OmegaConf -from pytorch_lightning.core.saving import _load_state as ptl_load_state -from pytorch_lightning.trainer.trainer import Trainer from safetensors import safe_open from transformers import LlamaTokenizer diff --git a/examples/multimodal/speech_llm/export/extract_salm_weights.py b/examples/multimodal/speech_llm/export/extract_salm_weights.py index 0698a411110e..24c7aec3bb4d 100644 --- a/examples/multimodal/speech_llm/export/extract_salm_weights.py +++ b/examples/multimodal/speech_llm/export/extract_salm_weights.py @@ -18,9 +18,9 @@ import tempfile import torch +from lightning.pytorch.trainer.trainer import Trainer from megatron.core import dist_checkpointing from omegaconf import OmegaConf -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.multimodal.speech_llm.modules.perception_modules import AudioPerceptionModule from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/examples/multimodal/text_to_image/controlnet/controlnet_train.py b/examples/multimodal/text_to_image/controlnet/controlnet_train.py index 2bb8b66cac1a..14e7e62a1cc7 100644 --- a/examples/multimodal/text_to_image/controlnet/controlnet_train.py +++ b/examples/multimodal/text_to_image/controlnet/controlnet_train.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from nemo.collections.multimodal.models.text_to_image.controlnet.controlnet import MegatronControlNet from nemo.collections.multimodal.models.text_to_image.controlnet.util import ImageLogger diff --git a/examples/multimodal/text_to_image/convert_hf_ckpt_to_nemo.py b/examples/multimodal/text_to_image/convert_hf_ckpt_to_nemo.py index cebf159eb870..c50ad439eaec 100644 --- a/examples/multimodal/text_to_image/convert_hf_ckpt_to_nemo.py +++ b/examples/multimodal/text_to_image/convert_hf_ckpt_to_nemo.py @@ -27,10 +27,10 @@ from argparse import ArgumentParser import torch -from lightning_fabric.utilities.cloud_io import _load as pl_load +from lightning.fabric.utilities.cloud_io import _load as pl_load +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.multimodal.models.text_to_image.controlnet.controlnet import MegatronControlNet from nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine import MegatronDiffusionEngine diff --git a/examples/multimodal/text_to_image/dreambooth/dreambooth_lora_infer.py b/examples/multimodal/text_to_image/dreambooth/dreambooth_lora_infer.py index 52f0aa2940d2..e1d050f83939 100644 --- a/examples/multimodal/text_to_image/dreambooth/dreambooth_lora_infer.py +++ b/examples/multimodal/text_to_image/dreambooth/dreambooth_lora_infer.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. import torch +from lightning.pytorch import Trainer +from lightning.pytorch.plugins.environments import TorchElasticEnvironment from omegaconf import open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment from nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm import MegatronLatentDiffusion from nemo.collections.multimodal.parts.stable_diffusion.pipeline import pipeline @@ -48,7 +48,10 @@ def model_cfg_modifier(model_cfg): plugins = [] plugins.append(TorchElasticEnvironment()) - strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False,) + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, + find_unused_parameters=False, + ) trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) model = MegatronLatentDiffusion(model_cfg, trainer=trainer) diff --git a/examples/multimodal/text_to_image/imagen/generate_fid_images.py b/examples/multimodal/text_to_image/imagen/generate_fid_images.py index ea743e3e1d06..7d2df372b545 100644 --- a/examples/multimodal/text_to_image/imagen/generate_fid_images.py +++ b/examples/multimodal/text_to_image/imagen/generate_fid_images.py @@ -15,7 +15,7 @@ import os import torch -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from nemo.collections.multimodal.models.text_to_image.imagen.imagen_pipeline import ImagenPipeline from nemo.core.config import hydra_runner @@ -79,7 +79,10 @@ def main(cfg): seeds = [local_task_id * chunk_size + batch_idx * batch_size + idx for idx in range(len(batch_captions))] with torch.no_grad(): images, all_res_images, *_ = pipeline( - prompts=batch_captions, seed=seeds, single_batch_mode=True, classifier_free_guidance=current_node_cfg, + 
prompts=batch_captions, + seed=seeds, + single_batch_mode=True, + classifier_free_guidance=current_node_cfg, ) if cfg.fid.save_all_res: diff --git a/examples/multimodal/text_to_image/imagen/imagen_generate_images.py b/examples/multimodal/text_to_image/imagen/imagen_generate_images.py index bc002052a989..06b324367a52 100644 --- a/examples/multimodal/text_to_image/imagen/imagen_generate_images.py +++ b/examples/multimodal/text_to_image/imagen/imagen_generate_images.py @@ -16,8 +16,8 @@ import pickle import torch +from lightning.pytorch import Trainer from omegaconf import OmegaConf -from pytorch_lightning import Trainer from nemo.collections.multimodal.models.text_to_image.imagen.imagen_pipeline import ( ImagenPipeline, @@ -65,7 +65,11 @@ def main(inference_config): seed = batch_idx + chuncksize with torch.no_grad(): - images, all_res_images, throughput = pipeline(prompts=batch_captions, seed=seeds, single_batch_mode=True,) + images, all_res_images, throughput = pipeline( + prompts=batch_captions, + seed=seeds, + single_batch_mode=True, + ) for outpath, one_res in zip(outpaths, all_res_images): for idx, (caption, image) in enumerate(zip(batch_captions, one_res[0])): diff --git a/examples/multimodal/text_to_image/imagen/imagen_infer.py b/examples/multimodal/text_to_image/imagen/imagen_infer.py index 0fb291729596..9ce680cf4b09 100644 --- a/examples/multimodal/text_to_image/imagen/imagen_infer.py +++ b/examples/multimodal/text_to_image/imagen/imagen_infer.py @@ -14,8 +14,8 @@ import os +from lightning.pytorch import Trainer from omegaconf import OmegaConf -from pytorch_lightning import Trainer from nemo.collections.multimodal.models.text_to_image.imagen.imagen_pipeline import ( ImagenPipeline, diff --git a/examples/multimodal/text_to_image/imagen/imagen_training.py b/examples/multimodal/text_to_image/imagen/imagen_training.py index 23c1c9c1a1d7..211299156b69 100644 --- a/examples/multimodal/text_to_image/imagen/imagen_training.py +++ b/examples/multimodal/text_to_image/imagen/imagen_training.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf.omegaconf import OmegaConf, open_dict from torch._dynamo import disable diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_lora_infer.py b/examples/multimodal/text_to_image/stable_diffusion/sd_lora_infer.py index 0877d4eb4b2f..0d83a8daab9f 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_lora_infer.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_lora_infer.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import torch +from lightning.pytorch import Trainer +from lightning.pytorch.plugins.environments import TorchElasticEnvironment from omegaconf import open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment from nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm import MegatronLatentDiffusion from nemo.collections.multimodal.parts.stable_diffusion.pipeline import pipeline @@ -45,7 +45,10 @@ def model_cfg_modifier(model_cfg): plugins = [] plugins.append(TorchElasticEnvironment()) - strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False,) + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, + find_unused_parameters=False, + ) trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) model = MegatronLatentDiffusion(model_cfg, trainer=trainer) diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_train.py b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_train.py index 44412aee0d14..4ef22b69aa64 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_train.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_train.py @@ -13,10 +13,11 @@ # limitations under the License. import sys + import torch import torch._dynamo.config as dynamo_config +from lightning.pytorch import Trainer from omegaconf.omegaconf import OmegaConf -from pytorch_lightning import Trainer from nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine import MegatronDiffusionEngine from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder diff --git a/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py b/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py index 178140aac828..abc987e07097 100644 --- a/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py +++ b/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py @@ -45,9 +45,9 @@ import einops import open_clip import torch +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.trainer import Trainer from transformers import CLIPModel from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import MegatronCLIPModel diff --git a/examples/multimodal/x_to_nerf/benchmark_callback.py b/examples/multimodal/x_to_nerf/benchmark_callback.py index fd7d5afdc5bc..2db78d1d385a 100644 --- a/examples/multimodal/x_to_nerf/benchmark_callback.py +++ b/examples/multimodal/x_to_nerf/benchmark_callback.py @@ -15,7 +15,7 @@ import time from typing import Optional -from pytorch_lightning import Callback, LightningModule, Trainer +from lightning.pytorch import Callback, LightningModule, Trainer from nemo.utils import logging diff --git a/examples/multimodal/x_to_nerf/data.py b/examples/multimodal/x_to_nerf/data.py index fe7c47abc64b..b8dfd3aa536b 100644 --- a/examples/multimodal/x_to_nerf/data.py +++ b/examples/multimodal/x_to_nerf/data.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf.omegaconf import DictConfig from torch.utils.data import DataLoader diff --git a/examples/multimodal/x_to_nerf/main.py b/examples/multimodal/x_to_nerf/main.py index 5d7f616a3165..f3c8a6949867 100644 --- a/examples/multimodal/x_to_nerf/main.py +++ b/examples/multimodal/x_to_nerf/main.py @@ -13,8 +13,8 @@ # limitations under the License. from hydra.utils import get_class, instantiate +from lightning.pytorch import Trainer, seed_everything from omegaconf.omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer, seed_everything from nemo.core.config import hydra_runner from nemo.utils import logging diff --git a/examples/nlp/dialogue/dialogue.py b/examples/nlp/dialogue/dialogue.py index 578895a2ad43..3f4c5581eb5a 100644 --- a/examples/nlp/dialogue/dialogue.py +++ b/examples/nlp/dialogue/dialogue.py @@ -42,7 +42,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models.dialogue.dialogue_gpt_classification_model import DialogueGPTClassificationModel diff --git a/examples/nlp/duplex_text_normalization/helpers.py b/examples/nlp/duplex_text_normalization/helpers.py index 6c1cfe37b90d..d9b8780fd787 100644 --- a/examples/nlp/duplex_text_normalization/helpers.py +++ b/examples/nlp/duplex_text_normalization/helpers.py @@ -14,7 +14,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig from nemo.collections.nlp.data.text_normalization import constants @@ -29,7 +29,7 @@ def instantiate_model_and_trainer(cfg: DictConfig, model_name: str, do_training: bool): - """ Function for instantiating a model and a trainer + """Function for instantiating a model and a trainer Args: cfg: The config used to instantiate the model and the trainer. model_name: A str indicates whether the model to be instantiated is a tagger or a decoder (i.e., model_name should be either TAGGER_MODEL or DECODER_MODEL). diff --git a/examples/nlp/entity_linking/self_alignment_pretraining.py b/examples/nlp/entity_linking/self_alignment_pretraining.py index a1ac1ac327cb..58b20f384d04 100644 --- a/examples/nlp/entity_linking/self_alignment_pretraining.py +++ b/examples/nlp/entity_linking/self_alignment_pretraining.py @@ -16,8 +16,8 @@ # Please see tutorial at Nemo/tutorials/nlp/Entity_Linking_Medical.ipynb for # more information on entity linking and self alignment pretraining. +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from nemo.collections.nlp.models import EntityLinkingModel from nemo.core.config import hydra_runner diff --git a/examples/nlp/glue_benchmark/glue_benchmark.py b/examples/nlp/glue_benchmark/glue_benchmark.py index 3cb5f8e4af3e..28efb9520fbd 100644 --- a/examples/nlp/glue_benchmark/glue_benchmark.py +++ b/examples/nlp/glue_benchmark/glue_benchmark.py @@ -35,7 +35,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models import GLUEModel diff --git a/examples/nlp/information_retrieval/bert_dpr.py b/examples/nlp/information_retrieval/bert_dpr.py index 2d9cd962ff34..4fc791da04fd 100644 --- a/examples/nlp/information_retrieval/bert_dpr.py +++ b/examples/nlp/information_retrieval/bert_dpr.py @@ -13,7 +13,7 @@ # limitations under the License. 
-import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models import BertDPRModel diff --git a/examples/nlp/information_retrieval/bert_joint_ir.py b/examples/nlp/information_retrieval/bert_joint_ir.py index 1bb164e580d1..f95cdd04e036 100644 --- a/examples/nlp/information_retrieval/bert_joint_ir.py +++ b/examples/nlp/information_retrieval/bert_joint_ir.py @@ -13,7 +13,7 @@ # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models import BertJointIRModel diff --git a/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py b/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py index e1fe28cc892f..9cb5cb5d3d19 100644 --- a/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py +++ b/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py @@ -15,8 +15,8 @@ from collections.abc import MutableMapping import torch.multiprocessing as mp +from lightning.pytorch.loggers import WandbLogger from omegaconf.omegaconf import OmegaConf -from pytorch_lightning.loggers import WandbLogger from nemo.collections.nlp.models.information_retrieval.megatron_gpt_embedding_model import MegatronGPTEmbeddingModel from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder diff --git a/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py b/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py index cf65840bb843..be89e5bf5c43 100644 --- a/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py +++ b/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py @@ -15,8 +15,8 @@ from collections.abc import MutableMapping import torch.multiprocessing as mp +from lightning.pytorch.loggers import WandbLogger from omegaconf.omegaconf import OmegaConf -from pytorch_lightning.loggers import WandbLogger from nemo.collections.nlp.models.information_retrieval.megatron_gpt_reranker_model import MegatronGPTRerankerModel from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder diff --git a/examples/nlp/intent_slot_classification/intent_slot_classification.py b/examples/nlp/intent_slot_classification/intent_slot_classification.py index a112ea7785f5..2025f48f330f 100644 --- a/examples/nlp/intent_slot_classification/intent_slot_classification.py +++ b/examples/nlp/intent_slot_classification/intent_slot_classification.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models import IntentSlotClassificationModel diff --git a/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py b/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py index 2441885e2ed2..232aa7d4d230 100644 --- a/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py +++ b/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py @@ -27,7 +27,7 @@ """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models import MultiLabelIntentSlotClassificationModel diff --git a/examples/nlp/language_modeling/bert_pretraining.py b/examples/nlp/language_modeling/bert_pretraining.py index 75d0a1072e69..7cff43f7fc73 100644 --- a/examples/nlp/language_modeling/bert_pretraining.py +++ b/examples/nlp/language_modeling/bert_pretraining.py @@ -13,9 +13,9 @@ # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl +from lightning.pytorch.strategies import DDPStrategy from omegaconf import DictConfig, OmegaConf -from pytorch_lightning.strategies import DDPStrategy from nemo.collections.nlp.models.language_modeling import BERTLMModel from nemo.core.config import hydra_runner diff --git a/examples/nlp/language_modeling/mamba_change_num_partition.py b/examples/nlp/language_modeling/mamba_change_num_partition.py index ced2b43cd312..349543de8e59 100644 --- a/examples/nlp/language_modeling/mamba_change_num_partition.py +++ b/examples/nlp/language_modeling/mamba_change_num_partition.py @@ -19,8 +19,8 @@ from argparse import ArgumentParser import torch +from lightning.pytorch import Trainer from omegaconf import open_dict -from pytorch_lightning import Trainer from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel from nemo.collections.nlp.parts.nlp_overrides import ( diff --git a/examples/nlp/language_modeling/megatron_bart_pretraining.py b/examples/nlp/language_modeling/megatron_bart_pretraining.py index e45b5e04ca45..a6dd6f183d72 100644 --- a/examples/nlp/language_modeling/megatron_bart_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bart_pretraining.py @@ -13,11 +13,11 @@ # limitations under the License. 
+from lightning.pytorch import Trainer +from lightning.pytorch.callbacks import ModelSummary +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.trainer.connectors.checkpoint_connector import _CheckpointConnector from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelSummary -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel from nemo.collections.nlp.parts.nlp_overrides import ( @@ -48,7 +48,7 @@ def main(cfg) -> None: scaler = None if cfg.trainer.precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) diff --git a/examples/nlp/language_modeling/megatron_change_num_partitions.py b/examples/nlp/language_modeling/megatron_change_num_partitions.py index c035346e3bf1..49d1ef0dcb57 100644 --- a/examples/nlp/language_modeling/megatron_change_num_partitions.py +++ b/examples/nlp/language_modeling/megatron_change_num_partitions.py @@ -21,8 +21,8 @@ import torch import torch.nn as nn +from lightning.pytorch import Trainer from omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer from nemo.collections.nlp.parts.nlp_overrides import ( NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE, @@ -922,7 +922,7 @@ def main(): scaler = None if precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=tmp_cfg.get('native_amp_init_scale', 2 ** 32), + init_scale=tmp_cfg.get('native_amp_init_scale', 2**32), growth_interval=tmp_cfg.get('native_amp_growth_interval', 1000), hysteresis=tmp_cfg.get('hysteresis', 2), ) @@ -943,7 +943,10 @@ def main(): if tp_size < 0 or pp_size < 0: logging.info(f"Loading model config from {args.model_file} to get TP and PP size") model_config_internal = cls.restore_from( - restore_path=args.model_file, trainer=trainer, map_location=torch.device("cpu"), return_config=True, + restore_path=args.model_file, + trainer=trainer, + map_location=torch.device("cpu"), + return_config=True, ) tp_size = model_config_internal.get('tensor_model_parallel_size', 1) @@ -1137,7 +1140,9 @@ def main(): else: model = cls.load_from_checkpoint( - checkpoint_path=checkpoint_path, trainer=trainer, map_location=torch.device("cpu"), + checkpoint_path=checkpoint_path, + trainer=trainer, + map_location=torch.device("cpu"), ) model.freeze() diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index c81119489582..b46f8f459ff0 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -32,10 +32,10 @@ import torch from genericpath import isdir +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.trainer.trainer import Trainer from megatron.core import parallel_state from omegaconf import OmegaConf, open_dict -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_bart_model import 
MegatronBARTModel from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel diff --git a/examples/nlp/language_modeling/megatron_export.py b/examples/nlp/language_modeling/megatron_export.py index bf9157884bfc..b511a415d9b1 100644 --- a/examples/nlp/language_modeling/megatron_export.py +++ b/examples/nlp/language_modeling/megatron_export.py @@ -28,8 +28,8 @@ import os +from lightning.pytorch import Trainer from omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel diff --git a/examples/nlp/language_modeling/megatron_gpt_distillation.py b/examples/nlp/language_modeling/megatron_gpt_distillation.py index dc8614be23b2..c00470c5c81e 100644 --- a/examples/nlp/language_modeling/megatron_gpt_distillation.py +++ b/examples/nlp/language_modeling/megatron_gpt_distillation.py @@ -19,8 +19,8 @@ import modelopt.torch.distill as mtd import modelopt.torch.opt as mto import torch.multiprocessing as mp +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer try: from megatron.core import parallel_state, tensor_parallel diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index b9b0d2973094..4dbbee78e898 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -20,8 +20,8 @@ from functools import partial import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer from torch.utils.data import DataLoader, Dataset from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py b/examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py index 988a5f8588ff..ceb32d75f495 100644 --- a/examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py @@ -16,6 +16,7 @@ import os from argparse import Namespace +from lightning.pytorch.trainer.trainer import Trainer from megatron.core.inference.common_inference_params import CommonInferenceParams from megatron.core.inference.engines.mcore_engine import MCoreEngine from megatron.core.inference.inference_model_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper @@ -23,7 +24,6 @@ SimpleTextGenerationController, ) from omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel diff --git a/examples/nlp/language_modeling/megatron_gpt_prune.py b/examples/nlp/language_modeling/megatron_gpt_prune.py index de12b861a1c0..44992873f362 100644 --- a/examples/nlp/language_modeling/megatron_gpt_prune.py +++ b/examples/nlp/language_modeling/megatron_gpt_prune.py @@ -16,8 +16,8 @@ import torch import torch.multiprocessing as mp from datasets import load_dataset +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from 
pytorch_lightning.trainer.trainer import Trainer from tqdm import tqdm from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/examples/nlp/language_modeling/megatron_gpt_ptq.py b/examples/nlp/language_modeling/megatron_gpt_ptq.py index e41becc2d8e0..0ac0822c5fbe 100644 --- a/examples/nlp/language_modeling/megatron_gpt_ptq.py +++ b/examples/nlp/language_modeling/megatron_gpt_ptq.py @@ -15,8 +15,8 @@ import torch import torch.multiprocessing as mp from datasets import load_dataset +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.trainer.trainer import Trainer from tqdm import tqdm from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/examples/nlp/language_modeling/megatron_gpt_test.py b/examples/nlp/language_modeling/megatron_gpt_test.py index 62a1d40dbaed..03bc6735e891 100644 --- a/examples/nlp/language_modeling/megatron_gpt_test.py +++ b/examples/nlp/language_modeling/megatron_gpt_test.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning.pytorch import Trainer from omegaconf.omegaconf import OmegaConf -from pytorch_lightning import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank @@ -38,7 +38,7 @@ def main(cfg) -> None: trainer = Trainer( plugins=[ NLPMixedPrecisionPlugin( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), ), ], @@ -46,7 +46,13 @@ def main(cfg) -> None: **cfg.trainer, ) elif cfg.trainer.precision in ['bf16', 'bf16-mixed']: - trainer = Trainer(plugins=[NLPNativeBfloat16PrecisionPlugin(),], strategy=NLPDDPStrategy(), **cfg.trainer,) + trainer = Trainer( + plugins=[ + NLPNativeBfloat16PrecisionPlugin(), + ], + strategy=NLPDDPStrategy(), + **cfg.trainer, + ) else: trainer = Trainer(plugins=[NLPPrecisionPlugin()], strategy=NLPDDPStrategy(), **cfg.trainer) @@ -55,7 +61,9 @@ def main(cfg) -> None: app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size) model = MegatronGPTModel.restore_from( - cfg.restore_from_path, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector(), + cfg.restore_from_path, + trainer=trainer, + save_restore_connector=NLPSaveRestoreConnector(), ) # Note: most nemo models must have the data paths configured before instantiating the model diff --git a/examples/nlp/language_modeling/megatron_gpt_validate.py b/examples/nlp/language_modeling/megatron_gpt_validate.py index b5a61e627a14..fa0abb89421c 100644 --- a/examples/nlp/language_modeling/megatron_gpt_validate.py +++ b/examples/nlp/language_modeling/megatron_gpt_validate.py @@ -15,8 +15,8 @@ import os import tempfile +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel @@ -140,7 +140,9 @@ def main(cfg) -> None: with tempfile.NamedTemporaryFile(suffix='.yaml') as f: OmegaConf.save(config=pretrained_cfg, f=f.name) model = 
MegatronGPTModel.load_from_checkpoint( - checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name, + checkpoint_path=checkpoint_path, + trainer=trainer, + hparams_file=f.name, ) else: raise ValueError("need at least a nemo file or checkpoint dir") diff --git a/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py index 72252a03d5be..64ba2a51bb71 100644 --- a/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py @@ -42,12 +42,12 @@ from typing import Any, Optional import torch -from lightning_fabric.utilities.cloud_io import _load as pl_load +from lightning.fabric.utilities.cloud_io import _load as pl_load +from lightning.pytorch.core.saving import _load_state as ptl_load_state +from lightning.pytorch.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml +from lightning.pytorch.trainer.trainer import Trainer +from lightning.pytorch.utilities.migration import pl_legacy_patch from megatron.core import parallel_state -from pytorch_lightning.core.saving import _load_state as ptl_load_state -from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml -from pytorch_lightning.trainer.trainer import Trainer -from pytorch_lightning.utilities.migration import pl_legacy_patch from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/examples/nlp/language_modeling/megatron_mamba_eval.py b/examples/nlp/language_modeling/megatron_mamba_eval.py index ed12e4b904ac..ba000e6bef63 100644 --- a/examples/nlp/language_modeling/megatron_mamba_eval.py +++ b/examples/nlp/language_modeling/megatron_mamba_eval.py @@ -20,8 +20,8 @@ from functools import partial import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer from torch.utils.data import DataLoader, Dataset from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel diff --git a/examples/nlp/language_modeling/megatron_retro_cal_shape.py b/examples/nlp/language_modeling/megatron_retro_cal_shape.py index a57a927d2a36..f790d9471964 100644 --- a/examples/nlp/language_modeling/megatron_retro_cal_shape.py +++ b/examples/nlp/language_modeling/megatron_retro_cal_shape.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from lightning.pytorch import Trainer +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.plugins.precision import MixedPrecisionPlugin from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel from nemo.collections.nlp.modules.common.megatron.mup.shape import make_base_shapes @@ -46,7 +46,7 @@ def main(cfg) -> None: scaler = None if cfg.trainer.precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) diff --git a/examples/nlp/language_modeling/megatron_retro_eval.py b/examples/nlp/language_modeling/megatron_retro_eval.py index 89e3fe9c3ddb..ac946b2adf42 100644 --- a/examples/nlp/language_modeling/megatron_retro_eval.py +++ b/examples/nlp/language_modeling/megatron_retro_eval.py @@ -16,8 +16,8 @@ import os import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.trainer.trainer import Trainer from torch.utils.data import DataLoader, Dataset from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel @@ -60,7 +60,9 @@ def __init__(self, sentences, neighbors): self.sentences = sentences self.neighbors = neighbors - def __len__(self,): + def __len__( + self, + ): return len(self.sentences) def __getitem__(self, idx): diff --git a/examples/nlp/language_modeling/megatron_retro_eval_legacy.py b/examples/nlp/language_modeling/megatron_retro_eval_legacy.py index 69222acedd34..c51a8f536cc1 100644 --- a/examples/nlp/language_modeling/megatron_retro_eval_legacy.py +++ b/examples/nlp/language_modeling/megatron_retro_eval_legacy.py @@ -15,8 +15,8 @@ import os from examples.nlp.language_modeling.megatron_gpt_eval import RequestDataSet +from lightning.pytorch import Trainer from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer from torch.utils.data import DataLoader from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel @@ -69,7 +69,10 @@ def main(cfg) -> None: save_restore_connector.model_extracted_dir = model_path model_cfg = MegatronRetrievalModel.restore_from( - model_path, trainer=trainer, return_config=True, save_restore_connector=save_restore_connector, + model_path, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, ) with open_dict(model_cfg): @@ -89,7 +92,10 @@ def main(cfg) -> None: cfg.pipeline_model_parallel_split_rank = model_cfg.get('pipeline_model_parallel_split_rank', 0) model = MegatronRetrievalModel.restore_from( - model_path, trainer=trainer, save_restore_connector=save_restore_connector, override_config_path=model_cfg, + model_path, + trainer=trainer, + save_restore_connector=save_restore_connector, + override_config_path=model_cfg, ) length_params: LengthParam = { diff --git a/examples/nlp/language_modeling/megatron_retro_fine_tune.py b/examples/nlp/language_modeling/megatron_retro_fine_tune.py index 3fcaec156d9c..153a4b581135 100644 --- 
a/examples/nlp/language_modeling/megatron_retro_fine_tune.py +++ b/examples/nlp/language_modeling/megatron_retro_fine_tune.py @@ -15,12 +15,12 @@ import datetime import os +from lightning.pytorch import Trainer +from lightning.pytorch.callbacks.timer import Timer +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.plugins.precision import MixedPrecisionPlugin +from lightning.pytorch.trainer.connectors.checkpoint_connector import _CheckpointConnector from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_retro_fine_tune_model import MegatronRetroFinetuneModel from nemo.collections.nlp.parts.nlp_overrides import ( @@ -87,7 +87,7 @@ def main(cfg) -> None: scaler = None if cfg.trainer.precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) @@ -118,7 +118,9 @@ def main(cfg) -> None: # Override timer callback to a stateless one for idx, callback in enumerate(trainer.callbacks): if isinstance(callback, Timer): - trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) + trainer.callbacks[idx] = StatelessTimer( + cfg.trainer.max_time, + ) # load existing or init new soft prompt GPT model if cfg.model.get("restore_path", None): diff --git a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py index af6e22035def..775b75680ee9 100644 --- a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py +++ b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from lightning.pytorch import Trainer +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.plugins.precision import MixedPrecisionPlugin +from lightning.pytorch.trainer.connectors.checkpoint_connector import _CheckpointConnector from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel from nemo.collections.nlp.modules.common.megatron.mup.optim import MuAdam, MuAdamW @@ -52,7 +52,7 @@ def main(cfg) -> None: scaler = None if cfg.trainer.precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) diff --git a/examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py b/examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py index 4653222b3438..298deafabc1c 100644 --- a/examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py +++ b/examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py @@ -14,11 +14,11 @@ import os +from lightning.pytorch import Trainer +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.plugins.precision import MixedPrecisionPlugin +from lightning.pytorch.trainer.connectors.checkpoint_connector import _CheckpointConnector from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo @@ -51,7 +51,7 @@ def main(cfg) -> None: scaler = None if cfg.trainer.precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) diff --git a/examples/nlp/language_modeling/megatron_retro_qatask_eval.py b/examples/nlp/language_modeling/megatron_retro_qatask_eval.py index b99bcafbab02..4e47157d5150 100644 --- a/examples/nlp/language_modeling/megatron_retro_qatask_eval.py +++ b/examples/nlp/language_modeling/megatron_retro_qatask_eval.py @@ -17,8 +17,8 @@ import os import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.trainer.trainer import Trainer from torch.utils.data import DataLoader, Dataset from nemo.collections.nlp.data.question_answering.input_example.qa_input_example import QAExample @@ -63,7 +63,9 @@ def __init__(self, sentences, neighbors): self.sentences = sentences self.neighbors = neighbors - def __len__(self,): + def __len__( + self, + ): return len(self.sentences) def 
__getitem__(self, idx): diff --git a/examples/nlp/language_modeling/megatron_t5_eval.py b/examples/nlp/language_modeling/megatron_t5_eval.py index 0b6ea54b6b99..57b48134101f 100644 --- a/examples/nlp/language_modeling/megatron_t5_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_eval.py @@ -17,8 +17,8 @@ from argparse import ArgumentParser import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer from torch.utils.data import DataLoader from nemo.collections.nlp.data.language_modeling.megatron.request_dataset import T5RequestDataset @@ -40,13 +40,22 @@ def main(): "--tokens_to_generate", type=int, default="16", required=False, help="How many tokens to add to prompt" ) parser.add_argument( - "--tensor_model_parallel_size", type=int, default=-1, required=False, + "--tensor_model_parallel_size", + type=int, + default=-1, + required=False, ) parser.add_argument( - "--pipeline_model_parallel_size", type=int, default=-1, required=False, + "--pipeline_model_parallel_size", + type=int, + default=-1, + required=False, ) parser.add_argument( - "--pipeline_model_parallel_split_rank", type=int, default=-1, required=False, + "--pipeline_model_parallel_split_rank", + type=int, + default=-1, + required=False, ) parser.add_argument("--precision", default="16", type=str, help="PyTorch Lightning Trainer precision flag") parser.add_argument("--decoder_starts_with_pad", action="store_true", help="Decoder starts with pad token") diff --git a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py index 9e392d913171..4137213023ee 100644 --- a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py @@ -13,11 +13,11 @@ # limitations under the License. +from lightning.pytorch import Trainer +from lightning.pytorch.callbacks import ModelSummary +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.trainer.connectors.checkpoint_connector import _CheckpointConnector from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelSummary -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model from nemo.collections.nlp.parts.nlp_overrides import ( @@ -49,7 +49,7 @@ def main(cfg) -> None: scaler = None if cfg.trainer.precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index ba8ea6492da3..ae6e1744395d 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from lightning.pytorch import Trainer +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.plugins.precision import MixedPrecisionPlugin from megatron_t5_seq2seq_finetune import load_from_checkpoint_dir, load_from_nemo, validate_checkpoint_loading_args from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model @@ -82,7 +82,7 @@ def main(cfg) -> None: scaler = None if cfg.trainer.precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 2409e99ad951..5f63289be27a 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -16,10 +16,10 @@ import tempfile import torch.multiprocessing as mp +from lightning.pytorch import Trainer +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.trainer.connectors.checkpoint_connector import _CheckpointConnector from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model diff --git a/examples/nlp/language_modeling/transformer_lm.py b/examples/nlp/language_modeling/transformer_lm.py index caaa0e0d2935..3e97e28bb35e 100644 --- a/examples/nlp/language_modeling/transformer_lm.py +++ b/examples/nlp/language_modeling/transformer_lm.py @@ -13,7 +13,7 @@ # limitations under the License. 
-import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models.language_modeling import TransformerLMModel diff --git a/examples/nlp/language_modeling/upcycle_dense_to_moe.py b/examples/nlp/language_modeling/upcycle_dense_to_moe.py index a1f4b6000b6f..f4a5fc017d97 100644 --- a/examples/nlp/language_modeling/upcycle_dense_to_moe.py +++ b/examples/nlp/language_modeling/upcycle_dense_to_moe.py @@ -26,7 +26,7 @@ import torch import torch.nn -from pytorch_lightning.trainer.trainer import Trainer +from lightning.pytorch.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector diff --git a/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py b/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py index b1743e03188e..58c948f11458 100644 --- a/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py +++ b/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py @@ -15,8 +15,8 @@ from dataclasses import dataclass from typing import Optional +from lightning.pytorch import Trainer from omegaconf import OmegaConf -from pytorch_lightning import Trainer from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc from nemo.collections.nlp.models.machine_translation.mt_enc_dec_bottleneck_model import MTBottleneckModel @@ -29,7 +29,6 @@ from nemo.utils.config_utils import update_model_config from nemo.utils.exp_manager import ExpManagerConfig, exp_manager - """ Usage: 1. If you need to start docker and install NeMo, otherwise skip this step: diff --git a/examples/nlp/machine_translation/enc_dec_nmt.py b/examples/nlp/machine_translation/enc_dec_nmt.py index 57b9f84c39ce..b901ba28a4db 100644 --- a/examples/nlp/machine_translation/enc_dec_nmt.py +++ b/examples/nlp/machine_translation/enc_dec_nmt.py @@ -15,8 +15,8 @@ from dataclasses import dataclass from typing import Optional +from lightning.pytorch import Trainer from omegaconf import OmegaConf -from pytorch_lightning import Trainer from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecModelConfig @@ -29,7 +29,6 @@ from nemo.utils.config_utils import update_model_config from nemo.utils.exp_manager import ExpManagerConfig, exp_manager - """ Usage: 1. 
If you need to start docker and install NeMo, otherwise skip this step: diff --git a/examples/nlp/machine_translation/enc_dec_nmt_finetune.py b/examples/nlp/machine_translation/enc_dec_nmt_finetune.py index 16a635d09dee..688461a7b491 100644 --- a/examples/nlp/machine_translation/enc_dec_nmt_finetune.py +++ b/examples/nlp/machine_translation/enc_dec_nmt_finetune.py @@ -15,9 +15,9 @@ from dataclasses import dataclass from typing import Optional +from lightning.pytorch import Trainer from omegaconf import OmegaConf from omegaconf.omegaconf import MISSING -from pytorch_lightning import Trainer from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecModelConfig from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel @@ -29,7 +29,6 @@ from nemo.utils.config_utils import update_model_config from nemo.utils.exp_manager import ExpManagerConfig, exp_manager - """ Usage: python enc_dec_nmt_finetune.py \ diff --git a/examples/nlp/machine_translation/megatron_nmt_training.py b/examples/nlp/machine_translation/megatron_nmt_training.py index 7946500f92e9..5ff70a7a863c 100644 --- a/examples/nlp/machine_translation/megatron_nmt_training.py +++ b/examples/nlp/machine_translation/megatron_nmt_training.py @@ -14,11 +14,11 @@ import torch.multiprocessing as mp +from lightning.pytorch import Trainer +from lightning.pytorch.callbacks import ModelSummary +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.trainer.connectors.checkpoint_connector import _CheckpointConnector from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelSummary -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model @@ -53,7 +53,7 @@ def main(cfg) -> None: scaler = None if cfg.trainer.precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) diff --git a/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py b/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py index fcf1fb8d1796..349155101a5d 100644 --- a/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py +++ b/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py @@ -24,8 +24,8 @@ import os +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel diff --git a/examples/nlp/question_answering/question_answering.py b/examples/nlp/question_answering/question_answering.py index fcde03582e5c..37bd43a4b0fb 100644 --- a/examples/nlp/question_answering/question_answering.py +++ b/examples/nlp/question_answering/question_answering.py @@ -14,7 +14,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl from 
omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models.question_answering.qa_bert_model import BERTQAModel diff --git a/examples/nlp/spellchecking_asr_customization/helpers.py b/examples/nlp/spellchecking_asr_customization/helpers.py index 2db11b0e7d96..8e3957d34cc1 100644 --- a/examples/nlp/spellchecking_asr_customization/helpers.py +++ b/examples/nlp/spellchecking_asr_customization/helpers.py @@ -16,7 +16,7 @@ import os from typing import Tuple -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig from nemo.collections.nlp.models import SpellcheckingAsrCustomizationModel @@ -32,7 +32,7 @@ def instantiate_model_and_trainer( cfg: DictConfig, model_name: str, do_training: bool ) -> Tuple[pl.Trainer, SpellcheckingAsrCustomizationModel]: - """ Function for instantiating a model and a trainer + """Function for instantiating a model and a trainer Args: cfg: The config used to instantiate the model and the trainer. model_name: A str indicates the model direction, currently only 'itn'. diff --git a/examples/nlp/text2sparql/evaluate_text2sparql.py b/examples/nlp/text2sparql/evaluate_text2sparql.py index 52baa2a7e78c..774ced98e8ec 100644 --- a/examples/nlp/text2sparql/evaluate_text2sparql.py +++ b/examples/nlp/text2sparql/evaluate_text2sparql.py @@ -39,7 +39,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models.text2sparql import Text2SparqlModel diff --git a/examples/nlp/text2sparql/text2sparql.py b/examples/nlp/text2sparql/text2sparql.py index 1353a3967735..d70a7e616950 100644 --- a/examples/nlp/text2sparql/text2sparql.py +++ b/examples/nlp/text2sparql/text2sparql.py @@ -88,7 +88,7 @@ exp_manager.exp_dir=./NeMo_logs """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models.text2sparql import Text2SparqlModel diff --git a/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py b/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py index ab3322f552c1..cf9b6d8dd2e4 100644 --- a/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py +++ b/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py @@ -15,7 +15,7 @@ """ This script runs model parallel text classification evaluation. 
""" -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models.text_classification import TextClassificationModel diff --git a/examples/nlp/text_classification/text_classification_with_bert.py b/examples/nlp/text_classification/text_classification_with_bert.py index 01e8fae9bba5..a6c84b4e337a 100644 --- a/examples/nlp/text_classification/text_classification_with_bert.py +++ b/examples/nlp/text_classification/text_classification_with_bert.py @@ -95,7 +95,7 @@ eval_model.set_trainer(eval_trainer) eval_trainer.test(model=eval_model, verbose=False) """ -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models.text_classification import TextClassificationModel diff --git a/examples/nlp/text_normalization_as_tagging/helpers.py b/examples/nlp/text_normalization_as_tagging/helpers.py index 347b05b25fba..de74794f8f40 100644 --- a/examples/nlp/text_normalization_as_tagging/helpers.py +++ b/examples/nlp/text_normalization_as_tagging/helpers.py @@ -16,7 +16,7 @@ import os from typing import Tuple -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig from nemo.collections.nlp.models import ThutmoseTaggerModel @@ -31,7 +31,7 @@ def instantiate_model_and_trainer( cfg: DictConfig, model_name: str, do_training: bool ) -> Tuple[pl.Trainer, ThutmoseTaggerModel]: - """ Function for instantiating a model and a trainer + """Function for instantiating a model and a trainer Args: cfg: The config used to instantiate the model and the trainer. model_name: A str indicates the model direction, currently only 'itn'. diff --git a/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py b/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py index 149a9a4515e2..508e434bb598 100644 --- a/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py +++ b/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py @@ -14,7 +14,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import DictConfig, OmegaConf diff --git a/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py b/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py index e983540a68b2..b16e1ecd0bdc 100644 --- a/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py +++ b/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py @@ -14,7 +14,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import DictConfig, OmegaConf diff --git a/examples/nlp/token_classification/token_classification_evaluate.py b/examples/nlp/token_classification/token_classification_evaluate.py index b69212f59de4..764aa90c8593 100644 --- a/examples/nlp/token_classification/token_classification_evaluate.py +++ b/examples/nlp/token_classification/token_classification_evaluate.py @@ -14,7 +14,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig from nemo.collections.nlp.models import TokenClassificationModel diff --git a/examples/nlp/token_classification/token_classification_train.py b/examples/nlp/token_classification/token_classification_train.py index 56c1487cf9c5..536327aff6da 100644 --- 
a/examples/nlp/token_classification/token_classification_train.py +++ b/examples/nlp/token_classification/token_classification_train.py @@ -14,7 +14,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models import TokenClassificationModel diff --git a/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py b/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py index 5b91049e965d..4dbbf01c935e 100644 --- a/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py +++ b/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models import ZeroShotIntentModel diff --git a/examples/slu/speech_intent_slot/eval_utils/inference.py b/examples/slu/speech_intent_slot/eval_utils/inference.py index 9bd76c76822d..241f6463ed76 100644 --- a/examples/slu/speech_intent_slot/eval_utils/inference.py +++ b/examples/slu/speech_intent_slot/eval_utils/inference.py @@ -21,7 +21,7 @@ from pathlib import Path from typing import List, Optional -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf from tqdm.auto import tqdm diff --git a/examples/slu/speech_intent_slot/speech_intent_slot_train.py b/examples/slu/speech_intent_slot/speech_intent_slot_train.py index a9999d4d4682..f8732ec757e1 100644 --- a/examples/slu/speech_intent_slot/speech_intent_slot_train.py +++ b/examples/slu/speech_intent_slot/speech_intent_slot_train.py @@ -66,7 +66,7 @@ from pathlib import Path -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf diff --git a/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py b/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py index 35077a5fe415..5c0f956c2e3c 100644 --- a/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py +++ b/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning.pytorch import seed_everything from omegaconf import OmegaConf -from pytorch_lightning import seed_everything from nemo.collections.asr.models import ClusteringDiarizer from nemo.core.config import hydra_runner diff --git a/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py b/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py index 984b5ce93464..bc1db4dc1126 100644 --- a/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py +++ b/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytorch_lightning as pl +import lightning.pytorch as pl +from lightning.pytorch import seed_everything from omegaconf import OmegaConf -from pytorch_lightning import seed_everything from nemo.collections.asr.models import EncDecDiarLabelModel from nemo.core.config import hydra_runner diff --git a/examples/speaker_tasks/recognition/speaker_identification_infer.py b/examples/speaker_tasks/recognition/speaker_identification_infer.py index 90f930fcbfa6..7075a9f1f92a 100644 --- a/examples/speaker_tasks/recognition/speaker_identification_infer.py +++ b/examples/speaker_tasks/recognition/speaker_identification_infer.py @@ -16,8 +16,8 @@ import numpy as np import torch +from lightning.pytorch import seed_everything from omegaconf import OmegaConf -from pytorch_lightning import seed_everything from nemo.collections.asr.data.audio_to_label import AudioToSpeechLabelDataset from nemo.collections.asr.models import EncDecSpeakerLabelModel @@ -55,10 +55,18 @@ def main(cfg): speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_path) enroll_embs, _, enroll_truelabels, _ = speaker_model.batch_inference( - enrollment_manifest, batch_size, sample_rate, device=device, + enrollment_manifest, + batch_size, + sample_rate, + device=device, ) - test_embs, _, _, _ = speaker_model.batch_inference(test_manifest, batch_size, sample_rate, device=device,) + test_embs, _, _, _ = speaker_model.batch_inference( + test_manifest, + batch_size, + sample_rate, + device=device, + ) # length normalize enroll_embs = enroll_embs / (np.linalg.norm(enroll_embs, ord=2, axis=-1, keepdims=True)) @@ -91,7 +99,12 @@ def main(cfg): "number of labels mis match. Make sure you trained or finetuned neural classifier with labels from enrollement manifest_filepath" ) - _, test_logits, _, _ = speaker_model.batch_inference(test_manifest, batch_size, sample_rate, device=device,) + _, test_logits, _, _ = speaker_model.batch_inference( + test_manifest, + batch_size, + sample_rate, + device=device, + ) matched_labels = test_logits.argmax(axis=-1) with open(test_manifest, 'rb') as f1, open(out_manifest, 'w', encoding='utf-8') as f2: diff --git a/examples/speaker_tasks/recognition/speaker_reco.py b/examples/speaker_tasks/recognition/speaker_reco.py index a8acd4de4a3f..ac5cb12ac836 100644 --- a/examples/speaker_tasks/recognition/speaker_reco.py +++ b/examples/speaker_tasks/recognition/speaker_reco.py @@ -14,10 +14,10 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl import torch +from lightning.pytorch import seed_everything from omegaconf import OmegaConf -from pytorch_lightning import seed_everything from nemo.collections.asr.models import EncDecSpeakerLabelModel from nemo.core.config import hydra_runner diff --git a/examples/speaker_tasks/recognition/speaker_reco_finetune.py b/examples/speaker_tasks/recognition/speaker_reco_finetune.py index 884e5a60bc59..502d016a920d 100644 --- a/examples/speaker_tasks/recognition/speaker_reco_finetune.py +++ b/examples/speaker_tasks/recognition/speaker_reco_finetune.py @@ -14,10 +14,10 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl import torch +from lightning.pytorch import seed_everything from omegaconf import OmegaConf -from pytorch_lightning import seed_everything from nemo.collections.asr.models import EncDecSpeakerLabelModel from nemo.core.config import hydra_runner diff --git a/examples/tts/aligner.py b/examples/tts/aligner.py index e32c0444ca68..939b8dbcf11f 100644 --- a/examples/tts/aligner.py +++ b/examples/tts/aligner.py @@ -12,7 
+12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.common.callbacks import LogEpochTimeCallback from nemo.collections.tts.models import AlignerModel diff --git a/examples/tts/audio_codec.py b/examples/tts/audio_codec.py index 5fc4b6fd0afd..d875a3037ba3 100644 --- a/examples/tts/audio_codec.py +++ b/examples/tts/audio_codec.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf from nemo.collections.tts.models import AudioCodecModel diff --git a/examples/tts/fastpitch.py b/examples/tts/fastpitch.py index a8e6ecdc902d..7fd584b773e4 100644 --- a/examples/tts/fastpitch.py +++ b/examples/tts/fastpitch.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.common.callbacks import LogEpochTimeCallback from nemo.collections.tts.models import FastPitchModel diff --git a/examples/tts/fastpitch_finetune.py b/examples/tts/fastpitch_finetune.py index 64b5e8b90625..9bdf704c514c 100644 --- a/examples/tts/fastpitch_finetune.py +++ b/examples/tts/fastpitch_finetune.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.common.callbacks import LogEpochTimeCallback from nemo.collections.tts.models import FastPitchModel diff --git a/examples/tts/fastpitch_finetune_adapters.py b/examples/tts/fastpitch_finetune_adapters.py index 1361d63fb4cf..9b50d70ab15e 100644 --- a/examples/tts/fastpitch_finetune_adapters.py +++ b/examples/tts/fastpitch_finetune_adapters.py @@ -15,7 +15,7 @@ import os from dataclasses import is_dataclass -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import DictConfig, OmegaConf, open_dict from nemo.collections.common.callbacks import LogEpochTimeCallback diff --git a/examples/tts/fastpitch_ssl.py b/examples/tts/fastpitch_ssl.py index 1101ac1eeaf7..b92983a4bfb1 100644 --- a/examples/tts/fastpitch_ssl.py +++ b/examples/tts/fastpitch_ssl.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.common.callbacks import LogEpochTimeCallback from nemo.collections.tts.models import fastpitch_ssl, hifigan diff --git a/examples/tts/g2p/g2p_heteronym_classification_inference.py b/examples/tts/g2p/g2p_heteronym_classification_inference.py index 61262c41a340..89a563e9b683 100644 --- a/examples/tts/g2p/g2p_heteronym_classification_inference.py +++ b/examples/tts/g2p/g2p_heteronym_classification_inference.py @@ -18,7 +18,7 @@ from dataclasses import dataclass, is_dataclass from typing import Optional -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf @@ -56,9 +56,9 @@ class TranscriptionConfig: # path to .json manifest inference, if not provided, interactive mode will be enabled manifest: Optional[str] = None # Path to .json manifest - output_manifest: Optional[ - str - ] = "predictions.json" # Path to .json manifest to save prediction, will be saved in "pred_text" field + output_manifest: Optional[str] = ( + "predictions.json" # Path to .json manifest to save prediction, will be saved in "pred_text" field + ) grapheme_field: str = "text_graphemes" # name of the field in .json manifest for input grapheme text # mapping from wordid predicted by the model to phonemes, e.g., @@ -132,9 +132,10 @@ def main(cfg): save_errors = True correct = 0 total = 0 - with open(cfg.output_manifest, "r", encoding="utf-8") as f_preds, open( - cfg.errors_file, "w", encoding="utf-8" - ) as f_errors: + with ( + open(cfg.output_manifest, "r", encoding="utf-8") as f_preds, + open(cfg.errors_file, "w", encoding="utf-8") as f_errors, + ): for line in f_preds: line = json.loads(line) predictions = line["pred_wordid"] diff --git a/examples/tts/g2p/g2p_heteronym_classification_train_and_evaluate.py b/examples/tts/g2p/g2p_heteronym_classification_train_and_evaluate.py index 613865618501..f86a0a3934e4 100644 --- a/examples/tts/g2p/g2p_heteronym_classification_train_and_evaluate.py +++ b/examples/tts/g2p/g2p_heteronym_classification_train_and_evaluate.py @@ -14,7 +14,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from nemo.collections.common.callbacks import LogEpochTimeCallback diff --git a/examples/tts/g2p/g2p_inference.py b/examples/tts/g2p/g2p_inference.py index e7bffa888653..a9da11fcffdb 100644 --- a/examples/tts/g2p/g2p_inference.py +++ b/examples/tts/g2p/g2p_inference.py @@ -16,7 +16,7 @@ from dataclasses import dataclass, is_dataclass from typing import Optional -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf from utils import get_metrics @@ -41,23 +41,23 @@ class TranscriptionConfig: # Required configs pretrained_model: str # Path to a .nemo file or Name of a pretrained model manifest_filepath: str # Path to .json manifest file - phoneme_field: Optional[ - str - ] = None # name of the field in manifest_filepath for ground truth phonemes, default during training "text" + phoneme_field: Optional[str] = ( + None # name of the field in manifest_filepath for ground truth phonemes, default during training "text" + ) grapheme_field: Optional[str] = "text_graphemes" # name of the field in manifest_filepath for input grapheme text # General configs - output_file: Optional[ - str - ] = None # Path to .json manifest file to save predictions, will be saved in "target_field" + output_file: Optional[str] = ( + None # Path to .json manifest file to save predictions, will be saved in 
"target_field" + ) pred_field: Optional[str] = "pred_text" # name of the field in the output_file to save predictions batch_size: int = 32 # Batch size to use for inference num_workers: int = 0 # Number of workers to use for DataLoader during inference # Config for heteronyms correction - pretrained_heteronyms_model: Optional[ - str - ] = None # Path to a .nemo file or a Name of a pretrained model to disambiguate heteronyms (Optional) + pretrained_heteronyms_model: Optional[str] = ( + None # Path to a .nemo file or a Name of a pretrained model to disambiguate heteronyms (Optional) + ) @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig) diff --git a/examples/tts/g2p/g2p_train_and_evaluate.py b/examples/tts/g2p/g2p_train_and_evaluate.py index ff7b2b0675ea..319e1fb6a776 100644 --- a/examples/tts/g2p/g2p_train_and_evaluate.py +++ b/examples/tts/g2p/g2p_train_and_evaluate.py @@ -14,7 +14,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from utils import get_model diff --git a/examples/tts/hifigan.py b/examples/tts/hifigan.py index 5c3406a2f24c..6cf5c7a5aac4 100644 --- a/examples/tts/hifigan.py +++ b/examples/tts/hifigan.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.tts.models import HifiGanModel from nemo.core.config import hydra_runner diff --git a/examples/tts/hifigan_finetune.py b/examples/tts/hifigan_finetune.py index f0e2513404fd..328e1f423903 100644 --- a/examples/tts/hifigan_finetune.py +++ b/examples/tts/hifigan_finetune.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.tts.models import HifiGanModel from nemo.core.config import hydra_runner diff --git a/examples/tts/mixer_tts.py b/examples/tts/mixer_tts.py index 61a188f53969..53f55d93bcda 100644 --- a/examples/tts/mixer_tts.py +++ b/examples/tts/mixer_tts.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.common.callbacks import LogEpochTimeCallback from nemo.collections.tts.models import MixerTTSModel diff --git a/examples/tts/radtts.py b/examples/tts/radtts.py index 09bf69a2d6e5..4b3b0e62da87 100644 --- a/examples/tts/radtts.py +++ b/examples/tts/radtts.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.common.callbacks import LogEpochTimeCallback from nemo.collections.tts.models.radtts import RadTTSModel diff --git a/examples/tts/spectrogram_enhancer.py b/examples/tts/spectrogram_enhancer.py index 336729236d74..cd91ef3cb815 100644 --- a/examples/tts/spectrogram_enhancer.py +++ b/examples/tts/spectrogram_enhancer.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.tts.models import SpectrogramEnhancerModel from nemo.core.config import hydra_runner diff --git a/examples/tts/ssl_tts.py b/examples/tts/ssl_tts.py index a96dccb930ab..a50997a8f432 100644 --- a/examples/tts/ssl_tts.py +++ b/examples/tts/ssl_tts.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.common.callbacks import LogEpochTimeCallback from nemo.collections.tts.models import ssl_tts diff --git a/examples/tts/tacotron2.py b/examples/tts/tacotron2.py index a5446c35f775..6c4a15d98ef2 100755 --- a/examples/tts/tacotron2.py +++ b/examples/tts/tacotron2.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.common.callbacks import LogEpochTimeCallback from nemo.collections.tts.models import Tacotron2Model diff --git a/examples/tts/tacotron2_finetune.py b/examples/tts/tacotron2_finetune.py index a0531f1f2801..f8d4d1dcaad0 100644 --- a/examples/tts/tacotron2_finetune.py +++ b/examples/tts/tacotron2_finetune.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.common.callbacks import LogEpochTimeCallback from nemo.collections.tts.models import Tacotron2Model diff --git a/examples/tts/univnet.py b/examples/tts/univnet.py index 91aafa661842..ac6949405fd5 100644 --- a/examples/tts/univnet.py +++ b/examples/tts/univnet.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.common.callbacks import LogEpochTimeCallback from nemo.collections.tts.models import UnivNetModel diff --git a/examples/tts/vits.py b/examples/tts/vits.py index 75e0d827018a..6eeebd3ea15a 100644 --- a/examples/tts/vits.py +++ b/examples/tts/vits.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.tts.models.vits import VitsModel from nemo.core.config import hydra_runner diff --git a/examples/tts/waveglow.py b/examples/tts/waveglow.py index 66b13491abd4..3bcd008ab5e0 100755 --- a/examples/tts/waveglow.py +++ b/examples/tts/waveglow.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.common.callbacks import LogEpochTimeCallback from nemo.collections.tts.models import WaveGlowModel diff --git a/examples/vision/convert_ckpt_to_nemo.py b/examples/vision/convert_ckpt_to_nemo.py index 14876f6931f9..e0cf773f98c2 100644 --- a/examples/vision/convert_ckpt_to_nemo.py +++ b/examples/vision/convert_ckpt_to_nemo.py @@ -28,8 +28,8 @@ from argparse import ArgumentParser import torch -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.trainer import Trainer +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.trainer.trainer import Trainer from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector from nemo.collections.vision.models.megatron_vit_classification_models import MegatronVitClassificationModel diff --git a/examples/vision/vision_transformer/megatron_vit_classification_evaluate.py b/examples/vision/vision_transformer/megatron_vit_classification_evaluate.py index e827e4db73c7..f7c384809702 100644 --- a/examples/vision/vision_transformer/megatron_vit_classification_evaluate.py +++ b/examples/vision/vision_transformer/megatron_vit_classification_evaluate.py @@ -15,9 +15,9 @@ import os import torch +from lightning.pytorch import Trainer +from lightning.pytorch.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment from torch.utils.data import DataLoader from tqdm import tqdm @@ -38,7 +38,8 @@ def main(cfg) -> None: plugins = [] strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, find_unused_parameters=False, # we don't use DDP for async grad allreduce + no_ddp_communication_hook=True, + find_unused_parameters=False, # we don't use DDP for async grad allreduce ) if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) @@ -82,7 +83,10 @@ def main(cfg) -> None: model.eval() val_transform = ClassificationTransform(model.cfg, (model.cfg.img_h, model.cfg.img_w), train=False) - val_data = ImageFolder(root=cfg.model.data.imagenet_val, transform=val_transform,) + val_data = ImageFolder( + root=cfg.model.data.imagenet_val, + transform=val_transform, + ) def dummy(): return @@ -91,12 +95,20 @@ def dummy(): trainer.strategy.launcher.launch(dummy, trainer=trainer) trainer.strategy.setup_environment() - test_loader = DataLoader(val_data, batch_size=cfg.model.micro_batch_size, num_workers=cfg.model.data.num_workers,) + test_loader = DataLoader( + val_data, + batch_size=cfg.model.micro_batch_size, + num_workers=cfg.model.data.num_workers, + ) autocast_dtype = torch_dtype_from_precision(trainer.precision) - with torch.no_grad(), torch.cuda.amp.autocast( - enabled=autocast_dtype in (torch.half, torch.bfloat16), dtype=autocast_dtype, + with ( + torch.no_grad(), + torch.cuda.amp.autocast( + enabled=autocast_dtype in (torch.half, torch.bfloat16), + dtype=autocast_dtype, + ), ): total = correct = 0.0 for tokens, labels in tqdm(test_loader): diff --git a/examples/vision/vision_transformer/megatron_vit_classification_infer.py b/examples/vision/vision_transformer/megatron_vit_classification_infer.py index a757eb7a1c1f..f50ccf1c325c 100644 --- a/examples/vision/vision_transformer/megatron_vit_classification_infer.py +++ b/examples/vision/vision_transformer/megatron_vit_classification_infer.py @@ -16,10 
+16,10 @@ import os import torch +from lightning.pytorch import Trainer +from lightning.pytorch.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from PIL import Image -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment from torch.utils.data import DataLoader, Dataset from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector @@ -63,7 +63,8 @@ def main(cfg) -> None: plugins = [] strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, find_unused_parameters=False, # we don't use DDP for async grad allreduce + no_ddp_communication_hook=True, + find_unused_parameters=False, # we don't use DDP for async grad allreduce ) if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) @@ -107,7 +108,10 @@ def main(cfg) -> None: model.eval() test_transform = ClassificationTransform(cfg.model, (model_cfg.img_h, model_cfg.img_w), train=False) - test_data = ImageFolderDataset(folder_path=cfg.data_path, transform=test_transform,) + test_data = ImageFolderDataset( + folder_path=cfg.data_path, + transform=test_transform, + ) test_loader = DataLoader(test_data, batch_size=8) def dummy(): @@ -119,8 +123,12 @@ def dummy(): autocast_dtype = torch_dtype_from_precision(trainer.precision) - with torch.no_grad(), torch.cuda.amp.autocast( - enabled=autocast_dtype in (torch.half, torch.bfloat16), dtype=autocast_dtype, + with ( + torch.no_grad(), + torch.cuda.amp.autocast( + enabled=autocast_dtype in (torch.half, torch.bfloat16), + dtype=autocast_dtype, + ), ): class_names = [] for tokens in test_loader: diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index c63c73323797..76537a8b2b78 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -19,9 +19,9 @@ from typing import Any, List, Optional, Union import torch +from lightning.pytorch.callbacks import BasePredictionWriter from omegaconf import DictConfig, OmegaConf, open_dict from omegaconf.listconfig import ListConfig -from pytorch_lightning.callbacks import BasePredictionWriter from torch.utils.data import ChainDataset from nemo.collections.asr.data import audio_to_text, audio_to_text_dali diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index f18fe02d2ed8..969966839dde 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -21,8 +21,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from torch.utils.data import DataLoader from nemo.collections.asr.data.audio_to_text_lhotse_prompted import ( @@ -62,7 +62,6 @@ from nemo.utils import logging, model_utils from nemo.utils.decorators import deprecated - __all__ = ['EncDecMultiTaskModel'] diff --git a/nemo/collections/asr/models/classification_models.py b/nemo/collections/asr/models/classification_models.py index b49ef50583a7..f84ece6d24ce 100644 --- a/nemo/collections/asr/models/classification_models.py +++ b/nemo/collections/asr/models/classification_models.py @@ -21,8 +21,8 @@ from typing import Any, Dict, List, Optional, Union import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, ListConfig, OmegaConf 
-from pytorch_lightning import Trainer from torch.utils.data import DataLoader from torchmetrics import Accuracy from torchmetrics.regression import MeanAbsoluteError, MeanSquaredError diff --git a/nemo/collections/asr/models/clustering_diarizer.py b/nemo/collections/asr/models/clustering_diarizer.py index ddcc269bedcc..1f03cec59af7 100644 --- a/nemo/collections/asr/models/clustering_diarizer.py +++ b/nemo/collections/asr/models/clustering_diarizer.py @@ -22,8 +22,8 @@ from typing import Any, List, Optional, Union import torch +from lightning.pytorch.utilities import rank_zero_only from omegaconf import DictConfig, OmegaConf -from pytorch_lightning.utilities import rank_zero_only from tqdm import tqdm from nemo.collections.asr.metrics.der import score_labels @@ -49,7 +49,6 @@ from nemo.core.classes import Model from nemo.utils import logging, model_utils - __all__ = ['ClusteringDiarizer'] _MODEL_CONFIG_YAML = "model_config.yaml" diff --git a/nemo/collections/asr/models/confidence_ensemble.py b/nemo/collections/asr/models/confidence_ensemble.py index c6b2846085af..932d221be0f8 100644 --- a/nemo/collections/asr/models/confidence_ensemble.py +++ b/nemo/collections/asr/models/confidence_ensemble.py @@ -18,8 +18,8 @@ import joblib import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, open_dict -from pytorch_lightning import Trainer from nemo.collections.asr.models.asr_model import ASRModel from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 993c7dc6b298..3df6a7352c4d 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -18,8 +18,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from torch.utils.data import DataLoader from nemo.collections.asr.data import audio_to_text_dataset @@ -43,7 +43,6 @@ from nemo.utils import logging from nemo.utils.decorators import deprecated - __all__ = ['EncDecCTCModel'] diff --git a/nemo/collections/asr/models/hybrid_asr_tts_models.py b/nemo/collections/asr/models/hybrid_asr_tts_models.py index 628395e04f94..89a7e1289675 100644 --- a/nemo/collections/asr/models/hybrid_asr_tts_models.py +++ b/nemo/collections/asr/models/hybrid_asr_tts_models.py @@ -19,8 +19,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast import torch +from lightning.pytorch import Trainer from omegaconf import MISSING, DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from torch.nn.utils.rnn import pad_sequence from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs @@ -324,7 +324,9 @@ def __setattr__(self, name, value): return super().__setattr__(name, value) def setup_optimization( - self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None, + self, + optim_config: Optional[Union[DictConfig, Dict]] = None, + optim_kwargs: Optional[Dict[str, Any]] = None, ): """ Setup optimizer and scheduler. Ensure tts model is frozen. 
@@ -430,7 +432,8 @@ def _get_batch_spect(self, batch: Union[TextToTextBatch, TextOrAudioToTextBatch, elif isinstance(batch, TextOrAudioToTextBatch): tts_spectrogram, tts_spectrogram_len = self._get_tts_spectrogram(batch.tts_texts, batch.speakers) asr_spectrogram, asr_spectrogram_len = self.asr_model.preprocessor( - input_signal=batch.audio_signals, length=batch.audio_signal_lengths, + input_signal=batch.audio_signals, + length=batch.audio_signal_lengths, ) spectrogram = pad_sequence( diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index 1d437a19a86b..7e8720ee3ad8 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -17,8 +17,8 @@ from typing import Dict, List, Optional, Union import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from nemo.collections.asr.data import audio_to_text_dataset from nemo.collections.asr.data.audio_to_text import _AudioTextDataset diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py index 028073d7ca7f..34dd9aae5711 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py @@ -19,8 +19,8 @@ from typing import Any, List, Optional, Tuple import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from tqdm.auto import tqdm from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs diff --git a/nemo/collections/asr/models/k2_sequence_models.py b/nemo/collections/asr/models/k2_sequence_models.py index 087e9e41b85d..b60d08afe635 100644 --- a/nemo/collections/asr/models/k2_sequence_models.py +++ b/nemo/collections/asr/models/k2_sequence_models.py @@ -14,8 +14,8 @@ from typing import List, Optional +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.models.ctc_models import EncDecCTCModel @@ -76,7 +76,11 @@ def change_vocabulary(self, new_vocabulary: List[str]): @typecheck() def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None, + self, + input_signal=None, + input_signal_length=None, + processed_signal=None, + processed_signal_length=None, ): """ Forward pass of the model. @@ -159,7 +163,11 @@ def change_vocabulary(self, new_tokenizer_dir: str, new_tokenizer_type: str): @typecheck() def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None, + self, + input_signal=None, + input_signal_length=None, + processed_signal=None, + processed_signal_length=None, ): """ Forward pass of the model. 
diff --git a/nemo/collections/asr/models/label_models.py b/nemo/collections/asr/models/label_models.py index 08c304e4c52c..37391879547b 100644 --- a/nemo/collections/asr/models/label_models.py +++ b/nemo/collections/asr/models/label_models.py @@ -24,8 +24,8 @@ import soundfile as sf import torch from hydra.utils import instantiate +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from sklearn.metrics import roc_curve from torchmetrics import Accuracy from tqdm import tqdm diff --git a/nemo/collections/asr/models/msdd_models.py b/nemo/collections/asr/models/msdd_models.py index c88275dcacd3..d30411f01bcc 100644 --- a/nemo/collections/asr/models/msdd_models.py +++ b/nemo/collections/asr/models/msdd_models.py @@ -25,11 +25,11 @@ import numpy as np import torch from hydra.utils import instantiate +from lightning.pytorch import LightningModule, Trainer +from lightning.pytorch.utilities import rank_zero_only from omegaconf import DictConfig, open_dict from pyannote.core import Annotation from pyannote.metrics.diarization import DiarizationErrorRate -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.utilities import rank_zero_only from tqdm import tqdm from nemo.collections.asr.data.audio_to_diar_label import AudioToSpeechMSDDInferDataset, AudioToSpeechMSDDTrainDataset diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index 25890ec716c8..c92bcfaaef7a 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -17,8 +17,8 @@ from typing import Dict, List, Optional, Union import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from nemo.collections.asr.data import audio_to_text_dataset from nemo.collections.asr.data.audio_to_text import _AudioTextDataset diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index ce3b6bc89bce..a6408b5e935e 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -19,8 +19,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from torch.utils.data import DataLoader from nemo.collections.asr.data import audio_to_text_dataset diff --git a/nemo/collections/asr/models/ssl_models.py b/nemo/collections/asr/models/ssl_models.py index 633a00d73f5e..9150da7bf7c2 100644 --- a/nemo/collections/asr/models/ssl_models.py +++ b/nemo/collections/asr/models/ssl_models.py @@ -17,8 +17,8 @@ import torch import torch.nn as nn +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.asr.data import audio_to_text_dataset, ssl_dataset from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 089186e142bf..8d0f2b2223a3 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -22,8 +22,8 @@ import editdistance import torch import torch.distributed as dist +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning 
import Trainer from torch.utils.data import DataLoader from torchmetrics.text import SacreBLEUScore from tqdm.auto import tqdm diff --git a/nemo/collections/asr/parts/utils/wfst_utils.py b/nemo/collections/asr/parts/utils/wfst_utils.py index 31f394fb60ac..9dbb9fc751b2 100644 --- a/nemo/collections/asr/parts/utils/wfst_utils.py +++ b/nemo/collections/asr/parts/utils/wfst_utils.py @@ -32,7 +32,7 @@ import kaldifst # check that kaldifst package is not empty - # Note: pytorch_lightning.utilities.imports.package_available may not help here + # Note: lightning.pytorch.utilities.imports.package_available may not help here kaldifst.StdVectorFst() _KALDIFST_AVAILABLE = True except (ImportError, ModuleNotFoundError, AttributeError): diff --git a/nemo/collections/audio/models/audio_to_audio.py b/nemo/collections/audio/models/audio_to_audio.py index e1732c1658b7..60c16f756f58 100644 --- a/nemo/collections/audio/models/audio_to_audio.py +++ b/nemo/collections/audio/models/audio_to_audio.py @@ -22,8 +22,8 @@ import librosa import soundfile as sf import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from tqdm import tqdm from nemo.collections.asr.data.audio_to_text_dataset import inject_dataloader_value_from_model_config diff --git a/nemo/collections/audio/models/enhancement.py b/nemo/collections/audio/models/enhancement.py index cd9f47b98096..8e2206afcef1 100644 --- a/nemo/collections/audio/models/enhancement.py +++ b/nemo/collections/audio/models/enhancement.py @@ -17,8 +17,8 @@ import einops import hydra import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.audio.models.audio_to_audio import AudioToAudioModel from nemo.core.classes.common import PretrainedModelInfo, typecheck diff --git a/nemo/collections/audio/parts/utils/callbacks.py b/nemo/collections/audio/parts/utils/callbacks.py index 093d5a11f419..ff975c93ecc7 100644 --- a/nemo/collections/audio/parts/utils/callbacks.py +++ b/nemo/collections/audio/parts/utils/callbacks.py @@ -16,10 +16,10 @@ import einops import torch -from pytorch_lightning import Callback, LightningModule, Trainer -from pytorch_lightning.loggers import TensorBoardLogger -from pytorch_lightning.loggers.logger import Logger -from pytorch_lightning.loggers.wandb import WandbLogger +from lightning.pytorch import Callback, LightningModule, Trainer +from lightning.pytorch.loggers import TensorBoardLogger +from lightning.pytorch.loggers.logger import Logger +from lightning.pytorch.loggers.wandb import WandbLogger from nemo.utils import logging from nemo.utils.decorators import experimental diff --git a/nemo/collections/common/callbacks/callbacks.py b/nemo/collections/common/callbacks/callbacks.py index 1a6c011c38df..754b33726faf 100644 --- a/nemo/collections/common/callbacks/callbacks.py +++ b/nemo/collections/common/callbacks/callbacks.py @@ -13,15 +13,14 @@ # limitations under the License. 
import time -from pytorch_lightning.callbacks import Callback -from pytorch_lightning.utilities import rank_zero_only +from lightning.pytorch.callbacks import Callback +from lightning.pytorch.utilities import rank_zero_only # from sacrebleu import corpus_bleu class LogEpochTimeCallback(Callback): - """Simple callback that logs how long each epoch takes, in seconds, to a pytorch lightning log - """ + """Simple callback that logs how long each epoch takes, in seconds, to a pytorch lightning log""" @rank_zero_only def on_train_epoch_start(self, trainer, pl_module): diff --git a/nemo/collections/common/callbacks/ema.py b/nemo/collections/common/callbacks/ema.py index 2f295bf67354..f866a2639d63 100644 --- a/nemo/collections/common/callbacks/ema.py +++ b/nemo/collections/common/callbacks/ema.py @@ -17,11 +17,11 @@ import threading from typing import Any, Dict, Iterable -import pytorch_lightning as pl +import lightning.pytorch as pl import torch -from pytorch_lightning import Callback -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_info +from lightning.pytorch import Callback +from lightning.pytorch.utilities.exceptions import MisconfigurationException +from lightning.pytorch.utilities.rank_zero import rank_zero_info class EMA(Callback): @@ -40,7 +40,11 @@ class EMA(Callback): """ def __init__( - self, decay: float, validate_original_weights: bool = False, every_n_steps: int = 1, cpu_offload: bool = False, + self, + decay: float, + validate_original_weights: bool = False, + every_n_steps: int = 1, + cpu_offload: bool = False, ): if not (0 <= decay <= 1): raise MisconfigurationException("EMA decay value must be between 0 and 1") @@ -149,7 +153,9 @@ def on_load_checkpoint( def ema_update(ema_model_tuple, current_model_tuple, decay): torch._foreach_mul_(ema_model_tuple, decay) torch._foreach_add_( - ema_model_tuple, current_model_tuple, alpha=(1.0 - decay), + ema_model_tuple, + current_model_tuple, + alpha=(1.0 - decay), ) @@ -272,7 +278,13 @@ def update(self): if self.device.type == 'cpu': self.thread = threading.Thread( - target=run_ema_update_cpu, args=(self.ema_params, current_model_state, self.decay, self.stream,), + target=run_ema_update_cpu, + args=( + self.ema_params, + current_model_state, + self.decay, + self.stream, + ), ) self.thread.start() diff --git a/nemo/collections/common/metrics/perf_metrics.py b/nemo/collections/common/metrics/perf_metrics.py index d668d29c42ff..daad92ce95ea 100644 --- a/nemo/collections/common/metrics/perf_metrics.py +++ b/nemo/collections/common/metrics/perf_metrics.py @@ -15,7 +15,7 @@ from typing import Any, Dict, List, Optional import numpy as np -from pytorch_lightning.callbacks import Callback +from lightning.pytorch.callbacks import Callback from nemo.collections.common.parts.perf_metrics_utils import LLM_VOCAB_SIZE_MAP, read_tb_log from nemo.utils import logging diff --git a/nemo/collections/common/parts/ptl_overrides.py b/nemo/collections/common/parts/ptl_overrides.py index 0225ecd50fee..263c865f8270 100644 --- a/nemo/collections/common/parts/ptl_overrides.py +++ b/nemo/collections/common/parts/ptl_overrides.py @@ -13,11 +13,11 @@ # limitations under the License. 
import torch -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin +from lightning.pytorch.plugins.precision import MixedPrecisionPlugin class NeMoMixedPrecisionPlugin(MixedPrecisionPlugin): - def __init__(self, init_scale: float = 2 ** 32, growth_interval: int = 1000) -> None: + def __init__(self, init_scale: float = 2**32, growth_interval: int = 1000) -> None: super().__init__(precision=16) self.scaler = torch.cuda.amp.GradScaler(init_scale=init_scale, growth_interval=growth_interval) diff --git a/nemo/collections/diffusion/data/diffusion_energon_datamodule.py b/nemo/collections/diffusion/data/diffusion_energon_datamodule.py index 67a26609dd51..07747528363a 100644 --- a/nemo/collections/diffusion/data/diffusion_energon_datamodule.py +++ b/nemo/collections/diffusion/data/diffusion_energon_datamodule.py @@ -15,9 +15,9 @@ import logging from typing import Any, Dict, Literal +from lightning.pytorch.utilities.types import EVAL_DATALOADERS from megatron.core import parallel_state from megatron.energon import DefaultTaskEncoder, WorkerConfig, get_savable_loader, get_train_dataset -from pytorch_lightning.utilities.types import EVAL_DATALOADERS from nemo.collections.multimodal.data.energon.base import SimpleMultiModalDataModule diff --git a/nemo/collections/diffusion/data/diffusion_fake_datamodule.py b/nemo/collections/diffusion/data/diffusion_fake_datamodule.py index 6cb686c1c305..a9fc7ad5b484 100644 --- a/nemo/collections/diffusion/data/diffusion_fake_datamodule.py +++ b/nemo/collections/diffusion/data/diffusion_fake_datamodule.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl import torch -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils.data import DataLoader from nemo.collections.diffusion.models.model import DiTConfig diff --git a/nemo/collections/diffusion/train.py b/nemo/collections/diffusion/train.py index 5428e0eeefa2..404602084b85 100644 --- a/nemo/collections/diffusion/train.py +++ b/nemo/collections/diffusion/train.py @@ -14,13 +14,13 @@ import os +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.loggers import WandbLogger from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig from megatron.core.transformer.enums import AttnMaskType -from pytorch_lightning.loggers import WandbLogger from nemo import lightning as nl from nemo.collections import llm diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 07899b2ee484..d41bad067476 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -17,8 +17,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Optional, Union +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from rich.console import Console from typing_extensions import Annotated diff --git a/nemo/collections/llm/gpt/data/api.py b/nemo/collections/llm/gpt/data/api.py index 74ecb5272ac2..2ebb30e781d1 100644 --- a/nemo/collections/llm/gpt/data/api.py +++ b/nemo/collections/llm/gpt/data/api.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl from nemo.collections.llm.gpt.data.dolly import DollyDataModule from nemo.collections.llm.gpt.data.hf_dataset import HfDatasetDataModule diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 9d16ea8aa021..8fcef72f3bd9 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -17,7 +17,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union -import pytorch_lightning as pl +import lightning.pytorch as pl from torch.utils.data import DataLoader from nemo.collections.common.tokenizers import AutoTokenizer diff --git a/nemo/collections/llm/gpt/data/hf_dataset.py b/nemo/collections/llm/gpt/data/hf_dataset.py index 5c6b71c74797..46562b6e72c8 100644 --- a/nemo/collections/llm/gpt/data/hf_dataset.py +++ b/nemo/collections/llm/gpt/data/hf_dataset.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from torch.utils.data import DataLoader from nemo.lightning.pytorch.plugins import MegatronDataSampler diff --git a/nemo/collections/llm/gpt/data/mock.py b/nemo/collections/llm/gpt/data/mock.py index 5678597eda0b..f6b4e26ca355 100644 --- a/nemo/collections/llm/gpt/data/mock.py +++ b/nemo/collections/llm/gpt/data/mock.py @@ -14,10 +14,10 @@ from typing import TYPE_CHECKING, Dict, List, Optional +import lightning.pytorch as pl import numpy as np -import pytorch_lightning as pl import torch -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils import data from torch.utils.data import DataLoader, Dataset diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py index cfacde118b89..f659ce72796c 100644 --- a/nemo/collections/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -18,8 +18,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union -import pytorch_lightning as pl -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +import lightning.pytorch as pl +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils import data from nemo.lightning.data import WrappedDataLoader diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 6b158a33b226..8c3b47835ab1 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -15,7 +15,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional, Union -import pytorch_lightning as L +import lightning.pytorch as L import torch import torch.distributed from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py index eada3f4c3eb8..4d9e7ae026d5 100644 --- a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py +++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and 
# limitations under the License. -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import torch.nn.functional as F from transformers import AutoModelForCausalLM diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index a71042e2ba6f..0aa611b4454e 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -16,7 +16,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Callable, List, Optional -import pytorch_lightning as pl import torch import torch.nn.functional as F from torch import nn diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py index 55d865ec238b..8a3cbc925dad 100644 --- a/nemo/collections/llm/inference/base.py +++ b/nemo/collections/llm/inference/base.py @@ -16,9 +16,10 @@ from pathlib import Path from typing import Optional, Union -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import torch.distributed +from lightning.pytorch.trainer.states import TrainerFn from megatron.core.inference.common_inference_params import CommonInferenceParams from megatron.core.inference.engines.mcore_engine import MCoreEngine from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( @@ -31,7 +32,6 @@ SimpleTextGenerationController, ) from megatron.core.transformer.module import MegatronModule -from pytorch_lightning.trainer.states import TrainerFn import nemo.lightning as nl from nemo.collections.llm.peft import LoRA diff --git a/nemo/collections/llm/recipes/baichuan2_7b.py b/nemo/collections/llm/recipes/baichuan2_7b.py index 20de2c73f9dd..823f6e07cd57 100644 --- a/nemo/collections/llm/recipes/baichuan2_7b.py +++ b/nemo/collections/llm/recipes/baichuan2_7b.py @@ -15,11 +15,11 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm import Baichuan2Config7B, Baichuan2Model diff --git a/nemo/collections/llm/recipes/chatglm3_6b.py b/nemo/collections/llm/recipes/chatglm3_6b.py index ef815a0851fc..b6c640372074 100644 --- a/nemo/collections/llm/recipes/chatglm3_6b.py +++ b/nemo/collections/llm/recipes/chatglm3_6b.py @@ -15,11 +15,11 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm import ChatGLM3Config6B, ChatGLMModel diff --git a/nemo/collections/llm/recipes/finetune_default.py b/nemo/collections/llm/recipes/finetune_default.py index a060046a8bdf..f05fd7cb2d13 100644 --- a/nemo/collections/llm/recipes/finetune_default.py +++ b/nemo/collections/llm/recipes/finetune_default.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch import nemo.lightning as nl diff --git a/nemo/collections/llm/recipes/gemma2.py b/nemo/collections/llm/recipes/gemma2.py index 6fd1be83c183..2a690dc556d8 100644 --- 
a/nemo/collections/llm/recipes/gemma2.py +++ b/nemo/collections/llm/recipes/gemma2.py @@ -14,11 +14,11 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.gpt.model.gemma2 import Gemma2Config2B, Gemma2Config9B, Gemma2Config27B, Gemma2Model diff --git a/nemo/collections/llm/recipes/gemma2_27b.py b/nemo/collections/llm/recipes/gemma2_27b.py index 6f852f0fe6cf..2025bd570503 100644 --- a/nemo/collections/llm/recipes/gemma2_27b.py +++ b/nemo/collections/llm/recipes/gemma2_27b.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/gemma2_2b.py b/nemo/collections/llm/recipes/gemma2_2b.py index 98c795591774..e1aa3ad4be86 100644 --- a/nemo/collections/llm/recipes/gemma2_2b.py +++ b/nemo/collections/llm/recipes/gemma2_2b.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/gemma2_9b.py b/nemo/collections/llm/recipes/gemma2_9b.py index a211d8cfa838..8117102f1b75 100644 --- a/nemo/collections/llm/recipes/gemma2_9b.py +++ b/nemo/collections/llm/recipes/gemma2_9b.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/gemma_2b.py b/nemo/collections/llm/recipes/gemma_2b.py index 8b2111e9f7c4..8798af436a9c 100644 --- a/nemo/collections/llm/recipes/gemma_2b.py +++ b/nemo/collections/llm/recipes/gemma_2b.py @@ -14,11 +14,11 @@ import os from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm import GemmaConfig2B, GemmaModel diff --git a/nemo/collections/llm/recipes/gemma_7b.py b/nemo/collections/llm/recipes/gemma_7b.py index 44efb3fe56b8..0bfd62b33e9e 100644 --- a/nemo/collections/llm/recipes/gemma_7b.py +++ b/nemo/collections/llm/recipes/gemma_7b.py @@ -14,11 +14,11 @@ import os from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm import GemmaConfig7B, GemmaModel diff --git a/nemo/collections/llm/recipes/gpt3_175b.py b/nemo/collections/llm/recipes/gpt3_175b.py index 5932ce5346b9..189f0ca6baf1 100644 --- a/nemo/collections/llm/recipes/gpt3_175b.py +++ b/nemo/collections/llm/recipes/gpt3_175b.py @@ -15,11 +15,11 @@ from typing import Callable, Optional +import lightning.pytorch as 
pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import pretrain diff --git a/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py index f5a52cd351be..d93b167b45b6 100644 --- a/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py +++ b/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py @@ -15,10 +15,10 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch -from pytorch_lightning.callbacks.callback import Callback +from lightning.pytorch.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index 31c83713b6e7..b1ef9719975c 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -15,11 +15,11 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/llama31_70b.py b/nemo/collections/llm/recipes/llama31_70b.py index 91e4e10c83e6..6c0a108e10c7 100644 --- a/nemo/collections/llm/recipes/llama31_70b.py +++ b/nemo/collections/llm/recipes/llama31_70b.py @@ -15,11 +15,11 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/llama31_8b.py b/nemo/collections/llm/recipes/llama31_8b.py index a4f0082e8535..d2cbd454d483 100644 --- a/nemo/collections/llm/recipes/llama31_8b.py +++ b/nemo/collections/llm/recipes/llama31_8b.py @@ -15,11 +15,11 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index d43302a0a0ee..2b721b3f71c6 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -15,11 +15,11 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig 
-from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/llama3_70b_16k.py b/nemo/collections/llm/recipes/llama3_70b_16k.py index 928f961f7cf3..0a394d386afd 100644 --- a/nemo/collections/llm/recipes/llama3_70b_16k.py +++ b/nemo/collections/llm/recipes/llama3_70b_16k.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/llama3_70b_64k.py b/nemo/collections/llm/recipes/llama3_70b_64k.py index ffadf5ca8084..e035424d3506 100644 --- a/nemo/collections/llm/recipes/llama3_70b_64k.py +++ b/nemo/collections/llm/recipes/llama3_70b_64k.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 4f6f6ce17443..13e2d13cc21a 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -15,11 +15,11 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py index d6c1677a3b4b..b81d01c6ec9a 100644 --- a/nemo/collections/llm/recipes/llama3_8b_16k.py +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py index 692347ea8dd0..ff176fb372bb 100644 --- a/nemo/collections/llm/recipes/llama3_8b_64k.py +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py index d83580a1a543..023e4e459d5f 100644 --- a/nemo/collections/llm/recipes/log/default.py +++ b/nemo/collections/llm/recipes/log/default.py @@ -16,8 +16,8 @@ from datetime import timedelta from typing import Optional +from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger from nemo_run import Config, cli -from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger from nemo import lightning as nl diff --git a/nemo/collections/llm/recipes/mamba2_130m.py b/nemo/collections/llm/recipes/mamba2_130m.py index 08640604a112..3f13f91f6609 100644 --- a/nemo/collections/llm/recipes/mamba2_130m.py +++ b/nemo/collections/llm/recipes/mamba2_130m.py @@ -15,11 +15,11 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch 
+from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections import llm diff --git a/nemo/collections/llm/recipes/mamba2_1_3b.py b/nemo/collections/llm/recipes/mamba2_1_3b.py index 58eaf049b059..1a280b8b92a1 100644 --- a/nemo/collections/llm/recipes/mamba2_1_3b.py +++ b/nemo/collections/llm/recipes/mamba2_1_3b.py @@ -15,11 +15,11 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections import llm diff --git a/nemo/collections/llm/recipes/mamba2_2_7b.py b/nemo/collections/llm/recipes/mamba2_2_7b.py index 5cb37c6a02a5..0915cec748dd 100644 --- a/nemo/collections/llm/recipes/mamba2_2_7b.py +++ b/nemo/collections/llm/recipes/mamba2_2_7b.py @@ -15,11 +15,11 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections import llm diff --git a/nemo/collections/llm/recipes/mamba2_370m.py b/nemo/collections/llm/recipes/mamba2_370m.py index bb8bddc4045a..bb063dfcfc3f 100644 --- a/nemo/collections/llm/recipes/mamba2_370m.py +++ b/nemo/collections/llm/recipes/mamba2_370m.py @@ -15,11 +15,11 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections import llm diff --git a/nemo/collections/llm/recipes/mamba2_780m.py b/nemo/collections/llm/recipes/mamba2_780m.py index 2f6ab6717ae1..e89905b2269a 100644 --- a/nemo/collections/llm/recipes/mamba2_780m.py +++ b/nemo/collections/llm/recipes/mamba2_780m.py @@ -15,11 +15,11 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections import llm diff --git a/nemo/collections/llm/recipes/mamba2_8b.py b/nemo/collections/llm/recipes/mamba2_8b.py index 58883deba732..873d79fcb0f0 100644 --- a/nemo/collections/llm/recipes/mamba2_8b.py +++ b/nemo/collections/llm/recipes/mamba2_8b.py @@ -15,11 +15,11 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections import llm diff --git a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py 
b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py index eff37da46fca..09bb88a57089 100644 --- a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py +++ b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py @@ -15,11 +15,11 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections import llm diff --git a/nemo/collections/llm/recipes/mistral_7b.py b/nemo/collections/llm/recipes/mistral_7b.py index 7685bcd3ace6..3bc1e568185a 100644 --- a/nemo/collections/llm/recipes/mistral_7b.py +++ b/nemo/collections/llm/recipes/mistral_7b.py @@ -15,11 +15,11 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/mistral_nemo_12b.py b/nemo/collections/llm/recipes/mistral_nemo_12b.py index e6616826d9a8..7d9fa1d792e9 100644 --- a/nemo/collections/llm/recipes/mistral_nemo_12b.py +++ b/nemo/collections/llm/recipes/mistral_nemo_12b.py @@ -15,11 +15,11 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index f768bf0499b1..16e6168e649b 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -15,11 +15,11 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index d4286a15843f..5fbb0ac22c61 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -15,11 +15,11 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py index 7cbfaf723544..499280cc8542 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py +++ 
b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py index 3606be5ec12b..e0702f7b2a63 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/nemotron.py b/nemo/collections/llm/recipes/nemotron.py index 104c3798567a..7982665eb3d5 100644 --- a/nemo/collections/llm/recipes/nemotron.py +++ b/nemo/collections/llm/recipes/nemotron.py @@ -14,11 +14,11 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.gpt.model.nemotron import ( diff --git a/nemo/collections/llm/recipes/nemotron3_22b.py b/nemo/collections/llm/recipes/nemotron3_22b.py index 724e21f002e3..2dd9c3ff5205 100644 --- a/nemo/collections/llm/recipes/nemotron3_22b.py +++ b/nemo/collections/llm/recipes/nemotron3_22b.py @@ -14,8 +14,8 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/nemotron3_22b_16k.py b/nemo/collections/llm/recipes/nemotron3_22b_16k.py index 81f4253ad37a..5ae58d1a757d 100644 --- a/nemo/collections/llm/recipes/nemotron3_22b_16k.py +++ b/nemo/collections/llm/recipes/nemotron3_22b_16k.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import pretrain diff --git a/nemo/collections/llm/recipes/nemotron3_22b_64k.py b/nemo/collections/llm/recipes/nemotron3_22b_64k.py index 676694697e4c..22f6291cfadb 100644 --- a/nemo/collections/llm/recipes/nemotron3_22b_64k.py +++ b/nemo/collections/llm/recipes/nemotron3_22b_64k.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import pretrain diff --git a/nemo/collections/llm/recipes/nemotron3_4b.py b/nemo/collections/llm/recipes/nemotron3_4b.py index e1c2ef345d7e..c208ee740265 100644 --- a/nemo/collections/llm/recipes/nemotron3_4b.py +++ b/nemo/collections/llm/recipes/nemotron3_4b.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/nemotron3_8b.py b/nemo/collections/llm/recipes/nemotron3_8b.py index 202efe658d83..7799512c6260 100644 --- a/nemo/collections/llm/recipes/nemotron3_8b.py +++ b/nemo/collections/llm/recipes/nemotron3_8b.py @@ -14,8 +14,8 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as 
run -import pytorch_lightning as pl import torch from nemo import lightning as nl diff --git a/nemo/collections/llm/recipes/nemotron4_15b.py b/nemo/collections/llm/recipes/nemotron4_15b.py index 0f15c47c67b9..ad0f884b0d3b 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b.py +++ b/nemo/collections/llm/recipes/nemotron4_15b.py @@ -14,8 +14,8 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/nemotron4_15b_16k.py b/nemo/collections/llm/recipes/nemotron4_15b_16k.py index 75eced72761f..e16c2b03b032 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b_16k.py +++ b/nemo/collections/llm/recipes/nemotron4_15b_16k.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import pretrain diff --git a/nemo/collections/llm/recipes/nemotron4_15b_64k.py b/nemo/collections/llm/recipes/nemotron4_15b_64k.py index 8286778aa7ba..2cedfbed398b 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b_64k.py +++ b/nemo/collections/llm/recipes/nemotron4_15b_64k.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import pretrain diff --git a/nemo/collections/llm/recipes/nemotron4_340b.py b/nemo/collections/llm/recipes/nemotron4_340b.py index c02950109669..b22abc43d558 100644 --- a/nemo/collections/llm/recipes/nemotron4_340b.py +++ b/nemo/collections/llm/recipes/nemotron4_340b.py @@ -14,8 +14,8 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo import lightning as nl diff --git a/nemo/collections/llm/recipes/qwen2.py b/nemo/collections/llm/recipes/qwen2.py index ff0c76a714f1..db9dcfc88865 100644 --- a/nemo/collections/llm/recipes/qwen2.py +++ b/nemo/collections/llm/recipes/qwen2.py @@ -14,10 +14,10 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch -from pytorch_lightning.callbacks.callback import Callback +from lightning.pytorch.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.gpt.model.qwen2 import ( diff --git a/nemo/collections/llm/recipes/qwen2_1p5b.py b/nemo/collections/llm/recipes/qwen2_1p5b.py index 662f8e98899d..a3d705c4fb3a 100644 --- a/nemo/collections/llm/recipes/qwen2_1p5b.py +++ b/nemo/collections/llm/recipes/qwen2_1p5b.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/qwen2_500m.py b/nemo/collections/llm/recipes/qwen2_500m.py index ac6cbfe84464..08541ca9e421 100644 --- a/nemo/collections/llm/recipes/qwen2_500m.py +++ b/nemo/collections/llm/recipes/qwen2_500m.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/qwen2_72b.py b/nemo/collections/llm/recipes/qwen2_72b.py index 0b94761e5749..c0bc9bf40611 100644 --- a/nemo/collections/llm/recipes/qwen2_72b.py +++ 
b/nemo/collections/llm/recipes/qwen2_72b.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/qwen2_7b.py b/nemo/collections/llm/recipes/qwen2_7b.py index 10c990f15142..67bcc5e953bf 100644 --- a/nemo/collections/llm/recipes/qwen2_7b.py +++ b/nemo/collections/llm/recipes/qwen2_7b.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/starcoder2.py b/nemo/collections/llm/recipes/starcoder2.py index c3a19326585c..b090ce1cf9ef 100644 --- a/nemo/collections/llm/recipes/starcoder2.py +++ b/nemo/collections/llm/recipes/starcoder2.py @@ -14,10 +14,11 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch -from pytorch_lightning.callbacks.callback import Callback +from lightning.pytorch.callbacks.callback import Callback + from nemo import lightning as nl from nemo.collections.llm.gpt.model.starcoder2 import ( Starcoder2Config3B, diff --git a/nemo/collections/llm/recipes/starcoder2_15b.py b/nemo/collections/llm/recipes/starcoder2_15b.py index a59ec272c865..14b53809111a 100644 --- a/nemo/collections/llm/recipes/starcoder2_15b.py +++ b/nemo/collections/llm/recipes/starcoder2_15b.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/starcoder2_3b.py b/nemo/collections/llm/recipes/starcoder2_3b.py index 55884b353d8f..3ee81522ebc9 100644 --- a/nemo/collections/llm/recipes/starcoder2_3b.py +++ b/nemo/collections/llm/recipes/starcoder2_3b.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/starcoder2_7b.py b/nemo/collections/llm/recipes/starcoder2_7b.py index 46e34b8b0c77..96b5ab36b876 100644 --- a/nemo/collections/llm/recipes/starcoder2_7b.py +++ b/nemo/collections/llm/recipes/starcoder2_7b.py @@ -14,8 +14,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/starcoder_15b.py b/nemo/collections/llm/recipes/starcoder_15b.py index cb0ba14df868..d87788be5613 100644 --- a/nemo/collections/llm/recipes/starcoder_15b.py +++ b/nemo/collections/llm/recipes/starcoder_15b.py @@ -14,10 +14,10 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch -from pytorch_lightning.callbacks.callback import Callback +from lightning.pytorch.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/t5_11b.py b/nemo/collections/llm/recipes/t5_11b.py index b3806e6f2540..8baf54b4f42f 100644 --- a/nemo/collections/llm/recipes/t5_11b.py +++ b/nemo/collections/llm/recipes/t5_11b.py @@ -15,12 +15,12 @@ from typing import Optional +import 
lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/t5_220m.py b/nemo/collections/llm/recipes/t5_220m.py index e220eb3fb1b0..77b68f43f0e1 100644 --- a/nemo/collections/llm/recipes/t5_220m.py +++ b/nemo/collections/llm/recipes/t5_220m.py @@ -15,12 +15,12 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/recipes/t5_3b.py b/nemo/collections/llm/recipes/t5_3b.py index e7f215d57635..333661d97117 100644 --- a/nemo/collections/llm/recipes/t5_3b.py +++ b/nemo/collections/llm/recipes/t5_3b.py @@ -15,12 +15,12 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/llm/t5/data/fine_tuning.py b/nemo/collections/llm/t5/data/fine_tuning.py index 4180b4f135cb..ced4ea1a0b37 100644 --- a/nemo/collections/llm/t5/data/fine_tuning.py +++ b/nemo/collections/llm/t5/data/fine_tuning.py @@ -17,7 +17,7 @@ from pathlib import Path from typing import TYPE_CHECKING, List, Optional, Union -import pytorch_lightning as pl +import lightning.pytorch as pl from torch.utils.data import DataLoader from nemo.collections.llm.t5.data.core import create_sft_dataset diff --git a/nemo/collections/llm/t5/data/mock.py b/nemo/collections/llm/t5/data/mock.py index eaf41d290da4..25dcf4d560fc 100644 --- a/nemo/collections/llm/t5/data/mock.py +++ b/nemo/collections/llm/t5/data/mock.py @@ -14,10 +14,10 @@ from typing import TYPE_CHECKING, Dict, List, Optional +import lightning.pytorch as pl import numpy as np -import pytorch_lightning as pl import torch -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils import data from torch.utils.data import DataLoader, Dataset diff --git a/nemo/collections/llm/t5/data/pre_training.py b/nemo/collections/llm/t5/data/pre_training.py index 45d485ba2074..4bd6e5ed5e93 100644 --- a/nemo/collections/llm/t5/data/pre_training.py +++ b/nemo/collections/llm/t5/data/pre_training.py @@ -17,8 +17,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional -import pytorch_lightning as pl -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +import lightning.pytorch as pl +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils 
import data from nemo.lightning.data import WrappedDataLoader diff --git a/nemo/collections/llm/t5/model/t5.py b/nemo/collections/llm/t5/model/t5.py index 058acaaec7b0..7d8ff65e8d99 100644 --- a/nemo/collections/llm/t5/model/t5.py +++ b/nemo/collections/llm/t5/model/t5.py @@ -16,7 +16,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional, Union -import pytorch_lightning as L +import lightning.pytorch as L import torch import torch.distributed from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig diff --git a/nemo/collections/multimodal/data/dreambooth/dreambooth_dataset.py b/nemo/collections/multimodal/data/dreambooth/dreambooth_dataset.py index 1c39b1a72216..baead0c47962 100644 --- a/nemo/collections/multimodal/data/dreambooth/dreambooth_dataset.py +++ b/nemo/collections/multimodal/data/dreambooth/dreambooth_dataset.py @@ -15,8 +15,8 @@ from pathlib import Path import torch +from lightning.pytorch.utilities import rank_zero_only from PIL import Image -from pytorch_lightning.utilities import rank_zero_only from torch.utils.data import Dataset from tqdm import tqdm diff --git a/nemo/collections/multimodal/data/energon/base.py b/nemo/collections/multimodal/data/energon/base.py index 0a99b1a1baad..4e90dce55c7a 100644 --- a/nemo/collections/multimodal/data/energon/base.py +++ b/nemo/collections/multimodal/data/energon/base.py @@ -16,10 +16,10 @@ from typing import Any, Dict, Literal, Optional import fiddle as fdl -import pytorch_lightning as pl +import lightning.pytorch as pl +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from megatron.core import parallel_state from megatron.energon import WorkerConfig, get_savable_loader, get_train_dataset -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils.data import DataLoader from typing_extensions import Self diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 5291497f92c3..5d19b8544305 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -22,8 +22,8 @@ import torch import torch.nn.functional as F from einops import rearrange, reduce, repeat +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import DictConfig, ListConfig, OmegaConf -from pytorch_lightning.trainer.trainer import Trainer from transformers import CLIPVisionModel, SiglipVisionModel from nemo.collections.common.parts.utils import extend_instance diff --git a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py index 158fa7595782..981600fcc3a1 100644 --- a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py +++ b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py @@ -18,9 +18,9 @@ import torch import torch.nn as nn from einops import rearrange, repeat +from lightning.pytorch import Trainer +from lightning.pytorch.utilities.rank_zero import rank_zero_only from omegaconf import DictConfig -from pytorch_lightning import Trainer -from pytorch_lightning.utilities.rank_zero import rank_zero_only from torch._inductor import config as inductor_config from nemo.collections.multimodal.data.controlnet.controlnet_dataset import 
build_train_valid_datasets diff --git a/nemo/collections/multimodal/models/text_to_image/controlnet/util.py b/nemo/collections/multimodal/models/text_to_image/controlnet/util.py index 3d9a7d16b1c3..f890426c98f4 100644 --- a/nemo/collections/multimodal/models/text_to_image/controlnet/util.py +++ b/nemo/collections/multimodal/models/text_to_image/controlnet/util.py @@ -17,9 +17,9 @@ import numpy as np import torch import torchvision +from lightning.pytorch import Callback +from lightning.pytorch.utilities.rank_zero import rank_zero_only from PIL import Image -from pytorch_lightning import Callback -from pytorch_lightning.utilities.rank_zero import rank_zero_only class ImageLogger(Callback): diff --git a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py index 47548b02961d..8906263faeba 100644 --- a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py +++ b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py @@ -15,8 +15,8 @@ from typing import Any, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from torch._inductor import config as inductor_config from nemo.collections.multimodal.data.dreambooth.dreambooth_dataset import DreamBoothDataset diff --git a/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py b/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py index ed9be58178c4..1772e465f604 100644 --- a/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py +++ b/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py @@ -17,8 +17,8 @@ from typing import Any import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, open_dict -from pytorch_lightning import Trainer from nemo.collections.multimodal.data.imagen.imagen_dataset import build_train_valid_datasets from nemo.collections.multimodal.models.text_to_image.imagen.precond import ContinousDDPMPrecond, EDMPrecond diff --git a/nemo/collections/multimodal/models/text_to_image/imagen/imagen_pipeline.py b/nemo/collections/multimodal/models/text_to_image/imagen/imagen_pipeline.py index 43660c9000a1..63963321fcf7 100644 --- a/nemo/collections/multimodal/models/text_to_image/imagen/imagen_pipeline.py +++ b/nemo/collections/multimodal/models/text_to_image/imagen/imagen_pipeline.py @@ -17,8 +17,8 @@ from typing import Callable, List, Optional, Union import torch +from lightning.pytorch import Trainer from omegaconf.omegaconf import OmegaConf -from pytorch_lightning import Trainer from torch.cuda.amp import autocast from nemo.collections.multimodal.models.text_to_image.imagen.imagen import Imagen, MegatronImagen @@ -73,7 +73,9 @@ def _load_model(model_ckpt: str, model_cfg: str, eval_mode: bool = True, trainer model_cfg.micro_batch_size = 1 model_cfg.global_batch_size = 1 model = MegatronImagen.restore_from( - restore_path=model_ckpt, override_config_path=model_cfg, trainer=trainer, + restore_path=model_ckpt, + override_config_path=model_cfg, + trainer=trainer, ) elif model_ckpt.endswith('.ckpt'): model_cfg = OmegaConf.load(model_cfg) @@ -128,7 +130,9 @@ def model_cfg_modifier(model_cfg): models = [] print('Load base model.') model = ImagenPipeline._load_model( - model_ckpt=customized_models.base_ckpt, model_cfg=customized_models.base_cfg, trainer=trainer, + model_ckpt=customized_models.base_ckpt, + model_cfg=customized_models.base_cfg, + trainer=trainer, ) 
models.append(model) diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py index 8b18fe2b25fe..c7e8795a749c 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py @@ -17,14 +17,14 @@ from typing import Any, Dict, List, Tuple, Union import hydra -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import torch._dynamo import torch.nn as nn from einops import rearrange +from lightning.pytorch import Trainer +from lightning.pytorch.utilities import rank_zero_only from omegaconf import DictConfig, ListConfig, OmegaConf -from pytorch_lightning import Trainer -from pytorch_lightning.utilities import rank_zero_only from safetensors.torch import load_file as load_safetensors from torch._dynamo import optimize from torch.optim.lr_scheduler import LambdaLR diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py index d79d85c2e026..311ebc0f06f5 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py @@ -13,7 +13,7 @@ # limitations under the License. from contextlib import contextmanager -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import torch.nn.functional as F from nemo.utils import logging diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index 744dc6945394..163b2fb27e0f 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -17,18 +17,18 @@ from functools import partial from typing import Any, Dict, List, Optional, Union +import lightning.pytorch as pl import numpy as np -import pytorch_lightning as pl import torch import torch.nn as nn from einops import rearrange, repeat -from lightning_fabric.utilities.cloud_io import _load as pl_load +from lightning.fabric.utilities.cloud_io import _load as pl_load +from lightning.pytorch import Trainer +from lightning.pytorch.core.saving import _load_state as ptl_load_state +from lightning.pytorch.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml +from lightning.pytorch.utilities.migration import pl_legacy_patch +from lightning.pytorch.utilities.rank_zero import rank_zero_only from omegaconf import DictConfig, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.core.saving import _load_state as ptl_load_state -from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml -from pytorch_lightning.utilities.migration import pl_legacy_patch -from pytorch_lightning.utilities.rank_zero import rank_zero_only from torch._inductor import config as inductor_config from torchvision.utils import make_grid from tqdm import tqdm diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py index a9e51610bedd..84718f99262f 100644 --- 
a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py @@ -23,10 +23,10 @@ import numpy as np import torch import torch.nn.functional as F +from lightning.pytorch.accelerators import CPUAccelerator +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig -from pytorch_lightning.accelerators import CPUAccelerator -from pytorch_lightning.trainer.trainer import Trainer from tqdm import tqdm from nemo.collections.multimodal.data.clip.clip_dataset import ( diff --git a/nemo/collections/multimodal/models/vision_language_foundation/megatron_nsfw_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/megatron_nsfw_clip_models.py index 79c0f3910be0..37e33f892890 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/megatron_nsfw_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/megatron_nsfw_clip_models.py @@ -19,11 +19,11 @@ import torch import torch.nn as nn import torch.nn.functional as F +from lightning.pytorch.accelerators import CPUAccelerator +from lightning.pytorch.trainer.trainer import Trainer from megatron.core import parallel_state from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from omegaconf.dictconfig import DictConfig -from pytorch_lightning.accelerators import CPUAccelerator -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.multimodal.data.clip.clip_dataset import tokenize from nemo.collections.multimodal.data.nsfw.nsfw_dataset import build_dataset @@ -38,7 +38,6 @@ from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging - try: from megatron.core.num_microbatches_calculator import get_num_microbatches diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index 6ba2e8ca91f9..8773b47025bc 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -17,10 +17,10 @@ import numpy as np import torch +from lightning.pytorch import Trainer +from lightning.pytorch.plugins.environments import TorchElasticEnvironment from omegaconf import DictConfig, OmegaConf, open_dict from PIL import Image -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment from transformers import CLIPImageProcessor, SiglipImageProcessor from nemo.collections.multimodal.data.clip.augmentations.augmentations import image_transform diff --git a/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py b/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py index 50b4d29c05a4..53ae4a2dfb65 100644 --- a/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py +++ b/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py @@ -20,8 +20,8 @@ from typing import Dict, List, Optional, Union import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from tqdm.auto import tqdm from nemo.collections.asr.data import audio_to_text_dataset diff --git a/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_bpe_models.py index 106fbc432926..8249e5d8a7f8 100644 --- 
a/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_bpe_models.py @@ -17,8 +17,8 @@ from typing import Dict, Optional, Union import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from nemo.collections.asr.losses.ctc import CTCLoss from nemo.collections.asr.losses.rnnt import RNNTLoss @@ -90,7 +90,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # Setup decoding object self.decoding = RNNTBPEDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + decoding_cfg=self.cfg.decoding, + decoder=self.decoder, + joint=self.joint, + tokenizer=self.tokenizer, ) # Setup wer object @@ -282,7 +285,10 @@ def change_vocabulary( decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + decoding_cfg=decoding_cfg, + decoder=self.decoder, + joint=self.joint, + tokenizer=self.tokenizer, ) self.wer = WER( @@ -388,7 +394,10 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig, decoder_type: str = decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + decoding_cfg=decoding_cfg, + decoder=self.decoder, + joint=self.joint, + tokenizer=self.tokenizer, ) self.wer = WER( diff --git a/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py b/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py index 1b30263985da..158bfaddcc96 100644 --- a/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py +++ b/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py @@ -19,8 +19,8 @@ from typing import List, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from tqdm.auto import tqdm from nemo.collections.asr.losses.ctc import CTCLoss diff --git a/nemo/collections/multimodal/speech_cv/models/visual_rnnt_bpe_models.py b/nemo/collections/multimodal/speech_cv/models/visual_rnnt_bpe_models.py index eeffb906981a..11e9d43e1737 100644 --- a/nemo/collections/multimodal/speech_cv/models/visual_rnnt_bpe_models.py +++ b/nemo/collections/multimodal/speech_cv/models/visual_rnnt_bpe_models.py @@ -17,8 +17,8 @@ from typing import Dict, List, Optional, Union import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from nemo.collections.asr.losses.rnnt import RNNTLoss from nemo.collections.asr.metrics.wer import WER @@ -68,7 +68,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # Setup decoding object self.decoding = RNNTBPEDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + decoding_cfg=self.cfg.decoding, + decoder=self.decoder, + joint=self.joint, + tokenizer=self.tokenizer, ) # Setup wer object @@ -165,7 +168,10 @@ def change_vocabulary( decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + 
decoding_cfg=decoding_cfg, + decoder=self.decoder, + joint=self.joint, + tokenizer=self.tokenizer, ) self.wer = WER( @@ -214,7 +220,10 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig): decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + decoding_cfg=decoding_cfg, + decoder=self.decoder, + joint=self.joint, + tokenizer=self.tokenizer, ) self.wer = WER( diff --git a/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py b/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py index 5a86eed93019..75202238d2d0 100644 --- a/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py +++ b/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py @@ -20,8 +20,8 @@ from typing import Dict, List, Optional, Tuple, Union import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from tqdm.auto import tqdm from nemo.collections.asr.data import audio_to_text_dataset diff --git a/nemo/collections/multimodal/speech_llm/models/modular_models.py b/nemo/collections/multimodal/speech_llm/models/modular_models.py index 46b2ca3e26fd..aab27cf2d908 100644 --- a/nemo/collections/multimodal/speech_llm/models/modular_models.py +++ b/nemo/collections/multimodal/speech_llm/models/modular_models.py @@ -21,11 +21,11 @@ import sacrebleu import torch from hydra.utils import get_class +from lightning.pytorch.trainer.trainer import Trainer +from lightning.pytorch.utilities import rank_zero_only from omegaconf import ListConfig from omegaconf.dictconfig import DictConfig from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer -from pytorch_lightning.utilities import rank_zero_only from nemo.collections.asr.models import ASRModel, EncDecSpeakerLabelModel from nemo.collections.asr.parts.utils.eval_utils import remove_punctuations diff --git a/nemo/collections/multimodal/speech_llm/models/modular_t5_models.py b/nemo/collections/multimodal/speech_llm/models/modular_t5_models.py index 79fc0468e819..a99f5c346831 100644 --- a/nemo/collections/multimodal/speech_llm/models/modular_t5_models.py +++ b/nemo/collections/multimodal/speech_llm/models/modular_t5_models.py @@ -21,10 +21,10 @@ import sacrebleu import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import ListConfig from omegaconf.dictconfig import DictConfig from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.asr.models import ASRModel, SpeechEncDecSelfSupervisedModel from nemo.collections.common.data.utils import move_data_to_device diff --git a/nemo/collections/nlp/data/machine_translation/preproc_mt_data.py b/nemo/collections/nlp/data/machine_translation/preproc_mt_data.py index b95993ded69e..59181d8cb89f 100644 --- a/nemo/collections/nlp/data/machine_translation/preproc_mt_data.py +++ b/nemo/collections/nlp/data/machine_translation/preproc_mt_data.py @@ -21,8 +21,8 @@ import tempfile from joblib import Parallel, delayed +from lightning.pytorch import Trainer from omegaconf import ListConfig, OmegaConf -from pytorch_lightning import Trainer from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer, create_spt_model from nemo.collections.nlp.data.language_modeling.sentence_dataset import SentenceDataset @@ 
-33,23 +33,23 @@ class MTDataPreproc: - """ Automatically trains tokenizers and preprocesses machine translation data based on the MTEncDecModelConfig. - For training NMT models with datasets larger than 5M sentence pairs, - it can be inefficient to train them without first creating a tarred dataset. - If the user wants to change the tokenizer, vocab size, or batch size, for example, - they must reprocess the data with the correct configuration. - With MTDataPreproc users can sweep through data configurations and the tarred dataset will - be automatically created according to the model configuration. - To train tokenizer model and create tarred dataset specify in configuration: - model.preproc_out_dir=/path/to/preproc_out - model.encoder_tokenizer.vocab_size=32000 - model.decoder_tokenizer.vocab_size=32000 - model.train_ds.use_tarred_dataset=True - model.train_ds.src_file_name=/path/to/src.txt - model.train_ds.tgt_file_name=/path/to/tgt.txt - model.train_ds.tokens_in_batch=16000 - Once a dataset has been constructed based on this configuration, MTDataPreproc will not process it again. - If a previously trained tokenizer model or tarred dataset is found, MTDataPreproc will not preprocess the data. + """Automatically trains tokenizers and preprocesses machine translation data based on the MTEncDecModelConfig. + For training NMT models with datasets larger than 5M sentence pairs, + it can be inefficient to train them without first creating a tarred dataset. + If the user wants to change the tokenizer, vocab size, or batch size, for example, + they must reprocess the data with the correct configuration. + With MTDataPreproc users can sweep through data configurations and the tarred dataset will + be automatically created according to the model configuration. + To train tokenizer model and create tarred dataset specify in configuration: + model.preproc_out_dir=/path/to/preproc_out + model.encoder_tokenizer.vocab_size=32000 + model.decoder_tokenizer.vocab_size=32000 + model.train_ds.use_tarred_dataset=True + model.train_ds.src_file_name=/path/to/src.txt + model.train_ds.tgt_file_name=/path/to/tgt.txt + model.train_ds.tokens_in_batch=16000 + Once a dataset has been constructed based on this configuration, MTDataPreproc will not process it again. + If a previously trained tokenizer model or tarred dataset is found, MTDataPreproc will not preprocess the data. 
""" def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None) -> None: @@ -147,12 +147,16 @@ def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None) -> None: global_rank=self.global_rank, encoder_training_sample_size=cfg.encoder_tokenizer.get('training_sample_size', -1), decoder_training_sample_size=cfg.decoder_tokenizer.get('training_sample_size', -1), - encoder_special_tokens=OmegaConf.to_container(cfg.encoder_tokenizer.special_tokens) - if cfg.encoder_tokenizer.special_tokens - else None, - decoder_special_tokens=OmegaConf.to_container(cfg.decoder_tokenizer.special_tokens) - if cfg.decoder_tokenizer.special_tokens - else None, + encoder_special_tokens=( + OmegaConf.to_container(cfg.encoder_tokenizer.special_tokens) + if cfg.encoder_tokenizer.special_tokens + else None + ), + decoder_special_tokens=( + OmegaConf.to_container(cfg.decoder_tokenizer.special_tokens) + if cfg.decoder_tokenizer.special_tokens + else None + ), spt_symbols=spt_symbols, ) # update config @@ -280,10 +284,10 @@ def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None) -> None: ) def tar_files_to_string(self, tar_files): - """ Tar files are generated in the following format: basename.number.tar + """Tar files are generated in the following format: basename.number.tar Where number is an integer from 1 to the number of tar files. We convert this list to a string that can be used in the model config to specify - tarred datasets: basename_OP_1..num_tar_files_CL_.tar + tarred datasets: basename_OP_1..num_tar_files_CL_.tar Args: tar_files (List[str]): List of tar files generated by preprocess_parallel_dataset @@ -337,7 +341,9 @@ def get_enc_dec_tokenizers( @staticmethod def get_monolingual_tokenizer( - tokenizer_name=None, tokenizer_model=None, bpe_dropout=0.0, + tokenizer_name=None, + tokenizer_model=None, + bpe_dropout=0.0, ): if tokenizer_name == 'sentencepiece': tokenizer = SentencePieceTokenizer(model_path=tokenizer_model) @@ -385,14 +391,14 @@ def preprocess_parallel_dataset( src_fname (str): path to source text data tgt_fname (str): path to target text data out_dir (str): path to write tarred dataset - encoder_tokenizer (Any): tokenizer for encoder + encoder_tokenizer (Any): tokenizer for encoder decoder_tokenizer (Any): tokenizer for decoder - max_seq_length (int): maximum sequence length - min_seq_length (int): minimum sequence length - tokens_in_batch (int): tokens per batch per GPU, effectively batch size + max_seq_length (int): maximum sequence length + min_seq_length (int): minimum sequence length + tokens_in_batch (int): tokens per batch per GPU, effectively batch size lines_per_dataset_fragment (int): number of lines to consider for bucketing and padding num_batches_per_tarfile (int): number of batches (pickle files) within each tarfile - tar_file_prefix (str) : add string prefix to tar files + tar_file_prefix (str) : add string prefix to tar files n_jobs (int): number of processes to use for data processing (-2 to use all but 2) """ @@ -471,7 +477,10 @@ def preprocess_parallel_dataset( out_dir, f'remainder-batches.tokens.{tokens_in_batch}.tar_file_{remainder_tar_file_ctr}.tar', ) - remainder_tar_file_ptr = tarfile.open(remainder_tar_file_path, 'w',) + remainder_tar_file_ptr = tarfile.open( + remainder_tar_file_path, + 'w', + ) batch_in_tar_ctr = 0 tar_file_ptr.close() os.remove(tar_file_path) @@ -631,9 +640,9 @@ def preprocess_monolingual_dataset( fname (str): Path to source text data out_dir (str): Path to write tarred dataset tokenizer (Any): Path to tokenizer model 
- max_seq_length (int): maximum sequence length - min_seq_length (int): minimum sequence length - tokens_in_batch (int): tokens per batch per GPU, effectively batch size + max_seq_length (int): maximum sequence length + min_seq_length (int): minimum sequence length + tokens_in_batch (int): tokens per batch per GPU, effectively batch size lines_per_dataset_fragment (int): number of lines to consider for bucketing and padding num_batches_per_tarfile (int): number of batches (pickle files) within each tarfile global_rank (int): if set to zero, data will be processed on this node @@ -808,7 +817,8 @@ def train_tokenizers( split_by_whitespace=split_by_whitespace, ) os.rename( - os.path.join(out_dir, 'tokenizer.model'), encoder_tokenizer_model, + os.path.join(out_dir, 'tokenizer.model'), + encoder_tokenizer_model, ) else: if encoder_tokenizer_name in supported_train_tokenizers: @@ -1007,7 +1017,10 @@ def write_parallel_batches_to_tarfiles( tar_file_path = os.path.join( out_dir, 'fragment-%s-batches.tokens.%d.%d.tar' % (fragment_index, num_tokens, tar_file_ctr) ) - tar_file_ptr = tarfile.open(tar_file_path, 'w',) + tar_file_ptr = tarfile.open( + tar_file_path, + 'w', + ) batch_ctr = 0 # return tar files paths that have batches remaining diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py index 07ca790866c7..6c7472b95c42 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py @@ -21,8 +21,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from torch.utils.data import DataLoader from transformers import AutoModelWithLMHead diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py index 116605b65d52..7fb0ba770189 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py @@ -19,8 +19,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from torch.utils.data import DataLoader from transformers import AutoModelWithLMHead diff --git a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py index 29e2627fa038..9bf7ae2a9116 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py @@ -19,8 +19,8 @@ import numpy as np import torch import torch.nn.functional as F +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from transformers import AutoModel from nemo.collections.nlp.data.dialogue import DialogueSGDDataProcessor diff --git a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py index 48f3e5127a88..3f0d09d7dc66 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py @@ -18,8 +18,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from 
omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from torch.utils.data import DataLoader from transformers import AutoModelForSeq2SeqLM diff --git a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py index 5298c060df08..1df19cf8a556 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py @@ -19,8 +19,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from transformers import AutoModelForSequenceClassification, AutoTokenizer from nemo.collections.nlp.data.dialogue import DialogueSGDDataProcessor diff --git a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py index 777d468084e2..09a81b33c973 100644 --- a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py @@ -16,8 +16,8 @@ from typing import Dict, List, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from torch.utils.data import DataLoader from nemo.collections.common.losses import AggregatorLoss, CrossEntropyLoss diff --git a/nemo/collections/nlp/models/dialogue/sgdqa_model.py b/nemo/collections/nlp/models/dialogue/sgdqa_model.py index 3b30dfccd9ce..6cd2243423a4 100644 --- a/nemo/collections/nlp/models/dialogue/sgdqa_model.py +++ b/nemo/collections/nlp/models/dialogue/sgdqa_model.py @@ -22,8 +22,8 @@ from typing import List, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from torch.utils.data import DataLoader from nemo.collections.nlp.data.dialogue import DialogueSGDBERTDataset, DialogueSGDDataProcessor diff --git a/nemo/collections/nlp/models/duplex_text_normalization/duplex_decoder.py b/nemo/collections/nlp/models/duplex_text_normalization/duplex_decoder.py index 7d4cac46cc28..253962e55621 100644 --- a/nemo/collections/nlp/models/duplex_text_normalization/duplex_decoder.py +++ b/nemo/collections/nlp/models/duplex_text_normalization/duplex_decoder.py @@ -19,8 +19,8 @@ from typing import Dict, List, Optional, Union import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq import nemo.collections.nlp.data.text_normalization.constants as constants @@ -307,7 +307,7 @@ def _infer( span_ends: List[List[int]], inst_directions: List[str], ): - """ Main function for Inference + """Main function for Inference Args: sents: A list of inputs tokenized by a basic tokenizer. nb_spans: A list of ints where each int indicates the number of semiotic spans in each input. 
@@ -521,9 +521,9 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, data_split: str): tokenizer_name=self.transformer_name, mode=self.mode, max_len=self.max_sequence_len, - decoder_data_augmentation=cfg.get('decoder_data_augmentation', False) - if data_split == "train" - else False, + decoder_data_augmentation=( + cfg.get('decoder_data_augmentation', False) if data_split == "train" else False + ), lang=self.lang, use_cache=cfg.get('use_cache', False), max_insts=cfg.get('max_insts', -1), diff --git a/nemo/collections/nlp/models/duplex_text_normalization/duplex_tagger.py b/nemo/collections/nlp/models/duplex_text_normalization/duplex_tagger.py index feeda99bdbe5..1ce005403999 100644 --- a/nemo/collections/nlp/models/duplex_text_normalization/duplex_tagger.py +++ b/nemo/collections/nlp/models/duplex_text_normalization/duplex_tagger.py @@ -16,8 +16,8 @@ from typing import Dict, List, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from torch import nn from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification from transformers.tokenization_utils_base import BatchEncoding @@ -151,7 +151,7 @@ def on_test_epoch_end(self): # Functions for inference @torch.no_grad() def _infer(self, sents: List[List[str]], inst_directions: List[str]): - """ Main function for Inference + """Main function for Inference Args: sents: A list of inputs tokenized by a basic tokenizer. @@ -248,7 +248,7 @@ def _infer(self, sents: List[List[str]], inst_directions: List[str]): return all_tag_preds, nb_spans, span_starts, span_ends def _postprocess_tag_preds(self, words: List[str], inst_dir: str, preds: List[str]): - """ Function for postprocessing the raw tag predictions of the model. It + """Function for postprocessing the raw tag predictions of the model. It corrects obvious mistakes in the tag predictions such as a TRANSFORM span starts with I_TRANSFORM_TAG (instead of B_TRANSFORM_TAG). @@ -280,7 +280,7 @@ def _postprocess_tag_preds(self, words: List[str], inst_dir: str, preds: List[st return final_preds def decode_tag_preds(self, tag_preds: List[List[str]]): - """ Decoding the raw tag predictions to locate the semiotic spans in the + """Decoding the raw tag predictions to locate the semiotic spans in the input texts. Args: diff --git a/nemo/collections/nlp/models/enc_dec_nlp_model.py b/nemo/collections/nlp/models/enc_dec_nlp_model.py index d9aa3c017bae..60c6b616c20a 100644 --- a/nemo/collections/nlp/models/enc_dec_nlp_model.py +++ b/nemo/collections/nlp/models/enc_dec_nlp_model.py @@ -15,8 +15,8 @@ from dataclasses import dataclass from typing import Any +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.omegaconf import MISSING -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.collections.nlp.modules.common.decoder_module import DecoderModule @@ -35,8 +35,7 @@ class EncDecNLPModelConfig(ModelConfig): class EncDecNLPModel(NLPModel): - """Base class for encoder-decoder NLP models. 
- """ + """Base class for encoder-decoder NLP models.""" def __init__(self, cfg: EncDecNLPModelConfig, trainer: Trainer = None): super().__init__(cfg=cfg, trainer=trainer) diff --git a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py index 4afae81e3893..640520cdaaa7 100644 --- a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py +++ b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py @@ -15,8 +15,8 @@ from typing import Dict, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from transformers import AutoTokenizer from nemo.collections.common.losses import MultiSimilarityLoss diff --git a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py index 4447ebb89386..e90cf9d88c30 100644 --- a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py +++ b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py @@ -19,8 +19,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.common.losses import CrossEntropyLoss, MSELoss from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUE_TASKS_NUM_LABELS, GLUEDataset diff --git a/nemo/collections/nlp/models/information_retrieval/base_ir_model.py b/nemo/collections/nlp/models/information_retrieval/base_ir_model.py index 67424320d185..91d86fef1851 100644 --- a/nemo/collections/nlp/models/information_retrieval/base_ir_model.py +++ b/nemo/collections/nlp/models/information_retrieval/base_ir_model.py @@ -17,8 +17,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from nemo.collections.nlp.data import BertInformationRetrievalDataset from nemo.collections.nlp.models.nlp_model import NLPModel diff --git a/nemo/collections/nlp/models/information_retrieval/bert_dpr_model.py b/nemo/collections/nlp/models/information_retrieval/bert_dpr_model.py index 03b62d91170c..bfbec123d13e 100644 --- a/nemo/collections/nlp/models/information_retrieval/bert_dpr_model.py +++ b/nemo/collections/nlp/models/information_retrieval/bert_dpr_model.py @@ -15,8 +15,8 @@ from typing import Dict, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.common.losses import SmoothedCrossEntropyLoss from nemo.collections.nlp.data import BertInformationRetrievalDataset @@ -63,29 +63,50 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): @typecheck() def forward( - self, q_input_ids, q_token_type_ids, q_attention_mask, p_input_ids, p_token_type_ids, p_attention_mask, + self, + q_input_ids, + q_token_type_ids, + q_attention_mask, + p_input_ids, + p_token_type_ids, + p_attention_mask, ): q_vectors = self.q_encoder( - input_ids=q_input_ids, token_type_ids=q_token_type_ids, attention_mask=q_attention_mask, + input_ids=q_input_ids, + token_type_ids=q_token_type_ids, + attention_mask=q_attention_mask, ) q_vectors = q_vectors[:, 0] batch_size, hidden_size = q_vectors.size() p_vectors = self.p_encoder( - input_ids=p_input_ids, token_type_ids=p_token_type_ids, attention_mask=p_attention_mask, + input_ids=p_input_ids, + token_type_ids=p_token_type_ids, + 
attention_mask=p_attention_mask, ) num_passages = p_vectors.shape[0] // batch_size p_vectors = p_vectors[:, 0].view(-1, num_passages, hidden_size) p_positives, p_negatives = p_vectors[:, 0], p_vectors[:, 1:] scores = torch.cat( - (torch.matmul(q_vectors, p_positives.T), torch.einsum("ij,ipj->ip", q_vectors, p_negatives),), dim=1, + ( + torch.matmul(q_vectors, p_positives.T), + torch.einsum("ij,ipj->ip", q_vectors, p_negatives), + ), + dim=1, ) return scores def compute_scores_and_loss(self, inputs): - (q_input_ids, q_input_mask, q_input_type_ids, p_input_ids, p_input_mask, p_input_type_ids,) = inputs + ( + q_input_ids, + q_input_mask, + q_input_type_ids, + p_input_ids, + p_input_mask, + p_input_type_ids, + ) = inputs batch_size, num_passages, p_seq_length = p_input_ids.size() q_seq_length = q_input_ids.size()[-1] @@ -100,10 +121,17 @@ def compute_scores_and_loss(self, inputs): normalized_scores = torch.log_softmax(scores, dim=-1) labels = torch.arange(batch_size)[:, None].long().to(normalized_scores.device) - loss = self.loss(log_probs=normalized_scores, labels=labels, output_mask=torch.ones_like(labels),) + loss = self.loss( + log_probs=normalized_scores, + labels=labels, + output_mask=torch.ones_like(labels), + ) scores = scores[:, 0] - scores = torch.cat((torch.diag(scores)[:, None], scores[:, batch_size:]), dim=1,) + scores = torch.cat( + (torch.diag(scores)[:, None], scores[:, batch_size:]), + dim=1, + ) return scores, loss diff --git a/nemo/collections/nlp/models/information_retrieval/bert_joint_ir_model.py b/nemo/collections/nlp/models/information_retrieval/bert_joint_ir_model.py index a4dc4356342a..33885e6b50c6 100644 --- a/nemo/collections/nlp/models/information_retrieval/bert_joint_ir_model.py +++ b/nemo/collections/nlp/models/information_retrieval/bert_joint_ir_model.py @@ -15,8 +15,8 @@ from typing import Dict, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.common.losses import SmoothedCrossEntropyLoss from nemo.collections.nlp.models.information_retrieval.base_ir_model import BaseIRModel @@ -53,7 +53,9 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.bert_model = self.get_lm_model_with_padded_embedding(cfg) hidden_size = self.bert_model.config.hidden_size self.sim_score_regressor = SequenceRegression( - hidden_size=hidden_size, num_layers=1, dropout=cfg.language_model.sim_score_dropout, + hidden_size=hidden_size, + num_layers=1, + dropout=cfg.language_model.sim_score_dropout, ) self.loss = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id) @@ -61,7 +63,9 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): def forward(self, input_ids, attention_mask, token_type_ids): hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, + input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, ) if isinstance(hidden_states, tuple): hidden_states = hidden_states[0] diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py index 5e38b61938c9..a5b71d5bcb69 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py @@ -16,13 +16,11 @@ import os import numpy as np - - import torch +from 
lightning.pytorch.trainer.trainer import Trainer from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec from omegaconf import DictConfig, OmegaConf, open_dict from omegaconf.dictconfig import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from torch.distributed import all_gather as all_gather_no_backprop from torch.distributed.nn.functional import all_gather as all_gather_with_backprop @@ -46,7 +44,6 @@ from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import logging - try: from megatron.core import parallel_state from megatron.core.pipeline_parallel.schedules import get_forward_backward_func diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py index c7565f45358e..b5240ec2e170 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py @@ -17,8 +17,8 @@ import numpy as np import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import DictConfig, ListConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.information_retrieval.gpt_embedding_dataset import GPTEmbeddingDataset from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py index e316871fe607..fa593adf5c8f 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py @@ -17,8 +17,8 @@ import numpy as np import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import DictConfig, ListConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.information_retrieval.gpt_embedding_dataset import GPTRerankerDataset from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( diff --git a/nemo/collections/nlp/models/intent_slot_classification/intent_slot_classification_model.py b/nemo/collections/nlp/models/intent_slot_classification/intent_slot_classification_model.py index 0cd1d07af5dd..a49bc699ab24 100644 --- a/nemo/collections/nlp/models/intent_slot_classification/intent_slot_classification_model.py +++ b/nemo/collections/nlp/models/intent_slot_classification/intent_slot_classification_model.py @@ -17,8 +17,8 @@ from typing import Dict, List, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from torch.utils.data import DataLoader from nemo.collections.common.losses import AggregatorLoss, CrossEntropyLoss @@ -38,8 +38,7 @@ class IntentSlotClassificationModel(NLPModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """ Initializes BERT Joint Intent and Slot model. - """ + """Initializes BERT Joint Intent and Slot model.""" self.max_seq_length = cfg.language_model.max_seq_length # init superclass # Check the presence of data_dir. 
@@ -75,7 +74,7 @@ def _set_defaults_data_desc(self, cfg): OmegaConf.set_struct(cfg, True) def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): - """ Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc. """ + """Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc.""" # Save data from data desc to config - so it can be reused later, e.g. in inference. data_desc = IntentSlotDataDesc(data_dir=data_dir, modes=[train_ds.prefix, validation_ds.prefix]) OmegaConf.set_struct(cfg, False) @@ -109,7 +108,7 @@ def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): OmegaConf.set_struct(cfg, True) def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: - """ Saves label ids map to a file """ + """Saves label ids map to a file""" with open(filename, 'w') as out: labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) out.write('\n'.join(labels)) @@ -117,7 +116,7 @@ def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: logging.info(f'Labels mapping saved to : {out.name}') def _reconfigure_classifier(self): - """ Method reconfigures the classifier depending on the settings of model cfg.data_desc """ + """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" self.classifier = SequenceTokenClassifier( hidden_size=self.hidden_size, diff --git a/nemo/collections/nlp/models/intent_slot_classification/multi_label_intent_slot_classification_model.py b/nemo/collections/nlp/models/intent_slot_classification/multi_label_intent_slot_classification_model.py index c689b97ab0a5..7a2bec1f2cc0 100644 --- a/nemo/collections/nlp/models/intent_slot_classification/multi_label_intent_slot_classification_model.py +++ b/nemo/collections/nlp/models/intent_slot_classification/multi_label_intent_slot_classification_model.py @@ -18,8 +18,8 @@ import numpy as np import numpy.typing as npt import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from sklearn.metrics import f1_score, precision_score, recall_score from torch.utils.data import DataLoader @@ -38,10 +38,10 @@ class MultiLabelIntentSlotClassificationModel(IntentSlotClassificationModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """ + """ Initializes BERT Joint Intent and Slot model. - Args: + Args: cfg: configuration object trainer: trainer for Pytorch Lightning """ @@ -69,12 +69,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): def _set_data_desc_to_cfg( self, cfg: DictConfig, data_dir: str, train_ds: DictConfig, validation_ds: DictConfig ) -> None: - """ - Creates MultiLabelIntentSlotDataDesc and copies generated values to Configuration object's data descriptor. - - Args: + """ + Creates MultiLabelIntentSlotDataDesc and copies generated values to Configuration object's data descriptor. 
+ + Args: cfg: configuration object - data_dir: data directory + data_dir: data directory train_ds: training dataset file name validation_ds: validation dataset file name @@ -101,7 +101,10 @@ def _set_data_desc_to_cfg( if not hasattr(cfg, "class_labels") or cfg.class_labels is None: cfg.class_labels = {} cfg.class_labels = OmegaConf.create( - {"intent_labels_file": "intent_labels.csv", "slot_labels_file": "slot_labels.csv",} + { + "intent_labels_file": "intent_labels.csv", + "slot_labels_file": "slot_labels.csv", + } ) slot_labels_file = os.path.join(data_dir, cfg.class_labels.slot_labels_file) @@ -114,7 +117,7 @@ def _set_data_desc_to_cfg( OmegaConf.set_struct(cfg, True) def _reconfigure_classifier(self) -> None: - """ Method reconfigures the classifier depending on the settings of model cfg.data_desc """ + """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" self.classifier = SequenceTokenClassifier( hidden_size=self.bert_model.config.hidden_size, @@ -135,7 +138,8 @@ def _reconfigure_classifier(self) -> None: self.slot_loss = CrossEntropyLoss(logits_ndim=3) self.total_loss = AggregatorLoss( - num_inputs=2, weights=[self.cfg.intent_loss_weight, 1.0 - self.cfg.intent_loss_weight], + num_inputs=2, + weights=[self.cfg.intent_loss_weight, 1.0 - self.cfg.intent_loss_weight], ) # setup to track metrics @@ -161,12 +165,22 @@ def validation_step(self, batch, batch_idx) -> None: batch: batches of data from DataLoader batch_idx: batch idx from DataLoader - Returns: + Returns: None """ - (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, intent_labels, slot_labels,) = batch + ( + input_ids, + input_type_ids, + input_mask, + loss_mask, + subtokens_mask, + intent_labels, + slot_labels, + ) = batch intent_logits, slot_logits = self( - input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask, + input_ids=input_ids, + token_type_ids=input_type_ids, + attention_mask=input_mask, ) # calculate combined loss for intents and slots @@ -201,7 +215,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig) -> DataLoader: Args: cfg: configuration object - + Returns: DataLoader for model's data """ @@ -289,8 +303,8 @@ def prediction_probabilities(self, queries: List[str], test_ds: DictConfig) -> n def optimize_threshold(self, test_ds: DictConfig, file_name: str) -> None: """ - Set the optimal threshold of the model from performance on validation set. This threshold is used to round the - logits to 0 or 1. + Set the optimal threshold of the model from performance on validation set. This threshold is used to round the + logits to 0 or 1. Args: test_ds: location of test dataset @@ -361,16 +375,16 @@ def predict_from_examples( queries: text sequences test_ds: Dataset configuration section. threshold: Threshold for rounding prediction logits - + Returns: predicted_intents: model intent predictions with their probabilities - Example: [[('flight', 0.84)], [('airfare', 0.54), + Example: [[('flight', 0.84)], [('airfare', 0.54), ('flight', 0.73), ('meal', 0.24)]] predicted_slots: model slot predictions Example: ['O B-depart_date.month_name B-depart_date.day_number', 'O O B-flight_stop O O O'] - predicted_vector: model intent predictions for each individual query. Binary values within each list + predicted_vector: model intent predictions for each individual query. 
Binary values within each list indicate whether a class is prediced for the given query (1 for True, 0 for False) Example: [[1,0,0,0,0,0], [0,0,1,0,0,0]] """ diff --git a/nemo/collections/nlp/models/language_modeling/bert_lm_model.py b/nemo/collections/nlp/models/language_modeling/bert_lm_model.py index 6b03d86982b0..dc7103b67aa6 100644 --- a/nemo/collections/nlp/models/language_modeling/bert_lm_model.py +++ b/nemo/collections/nlp/models/language_modeling/bert_lm_model.py @@ -16,8 +16,8 @@ from typing import Dict, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from torch.utils.data import DataLoader from nemo.collections.common.losses import AggregatorLoss, CrossEntropyLoss, SmoothedCrossEntropyLoss @@ -75,7 +75,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): config_file = self.register_artifact('language_model.config_file', cfg.language_model.config_file) self.bert_model = get_lm_model( - config_file=config_file, config_dict=config_dict, vocab_file=vocab_file, trainer=trainer, cfg=cfg, + config_file=config_file, + config_dict=config_dict, + vocab_file=vocab_file, + trainer=trainer, + cfg=cfg, ) self.hidden_size = self.bert_model.config.hidden_size @@ -127,7 +131,9 @@ def forward(self, input_ids, attention_mask, token_type_ids): in the `nn.Module` in vanilla PyTorch. """ hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, + input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, ) if isinstance(hidden_states, tuple): hidden_states = hidden_states[0] @@ -225,7 +231,9 @@ def _setup_preprocessed_dataloader(self, cfg: Optional[DictConfig]): files = [dataset] files.sort() dl = BertPretrainingPreprocessedDataloader( - data_files=files, max_predictions_per_seq=max_predictions_per_seq, batch_size=batch_size, + data_files=files, + max_predictions_per_seq=max_predictions_per_seq, + batch_size=batch_size, ) return dl diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bart_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bart_model.py index 1c768829e3e2..4a53edacb566 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bart_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bart_model.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model @@ -48,7 +48,9 @@ def _validate_cfg(self): @property def _build_train_valid_test_datasets_kwargs(self): """allows child classes to add kwargs to dataset building""" - return dict(delete_mask_prob=self._cfg.data.get('delete_mask_prob', 0.0),) + return dict( + delete_mask_prob=self._cfg.data.get('delete_mask_prob', 0.0), + ) def list_available_models(self): pass diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index d2a21e50e486..dee36b255297 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -23,12 +23,12 @@ import omegaconf import torch import torch.nn as nn +from lightning.pytorch.plugins.precision import MixedPrecisionPlugin +from lightning.pytorch.trainer.connectors.logger_connector.fx_validator import _FxValidator +from lightning.pytorch.trainer.trainer import Trainer +from lightning.pytorch.utilities.exceptions import MisconfigurationException from omegaconf import OmegaConf, open_dict from omegaconf.dictconfig import DictConfig -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin -from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator -from pytorch_lightning.trainer.trainer import Trainer -from pytorch_lightning.utilities.exceptions import MisconfigurationException from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.collections.nlp.modules.common.megatron.attention import HAVE_FLASH_ATTENTION diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py index 2a356012c728..b00b6fcf0302 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py @@ -18,9 +18,9 @@ from typing import Any, Optional import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig from omegaconf.omegaconf import open_dict -from pytorch_lightning.trainer.trainer import Trainer from torch import Tensor from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index 0eb5ea1c0048..e6945d1ada56 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -18,8 +18,8 @@ import torch import torch.nn.functional as F +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.language_modeling.megatron import dataset_utils from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py b/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py index c0a4b6351530..d3829c3e8de1 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import ( TextToTextGLUEDataset, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_adapter_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_adapter_model.py index c6b4d055ef6e..44860c3178f6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_adapter_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_adapter_model.py @@ -21,8 +21,8 @@ import os import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel @@ -162,7 +162,7 @@ def state_dict(self, destination=None, prefix=None, keep_vars=False): def load_state_dict(self, state_dict, strict: bool = True): """ - Loads a state_dict expecting the state_dict to contain key,values + Loads a state_dict expecting the state_dict to contain key,values only for the adapter parameters. """ for name, module in self.frozen_model.named_modules(): @@ -176,13 +176,13 @@ def load_state_dict(self, state_dict, strict: bool = True): def setup_optimizer_param_groups(self): """ - ModelPT override. Optimizer will get self._optimizer_param_groups. + ModelPT override. Optimizer will get self._optimizer_param_groups. Makes two optimizer param groups, one for the frozen model params - and one for the prompt-table/prompt-encoder params. The learning + and one for the prompt-table/prompt-encoder params. The learning rate for the frozen model's params will always be zero effectively freezing the model's params but still allowing for the needed gradients - to be passed around in pipeline parallel models. The prompt-encoder - and/or prompt table will use the learning rate set by the user. + to be passed around in pipeline parallel models. The prompt-encoder + and/or prompt table will use the learning rate set by the user. """ self.frozen_model.freeze() # Freeze the entire model opt_params = [] @@ -246,8 +246,8 @@ class MegatronGPTAdapterLearningModel(MegatronGPTBaseAdapterModel): Two adapter's are inserted into each Transformer layer in the base GPT Model. It is assumed that these set of adapters will then be trained for a specific task. - Once trained, the adapter weights will be saved and can be re-loaded - and infused into the same GPT Model for inference. + Once trained, the adapter weights will be saved and can be re-loaded + and infused into the same GPT Model for inference. 
""" def __init__(self, cfg: DictConfig, trainer: Trainer): @@ -295,7 +295,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): for adapter_key in self.adapter_name_keys: if model_utils.import_class_by_path(adapter_cfg._target_) in module.get_accepted_adapter_types(): module.add_adapter( - name=adapter_key, cfg=adapter_cfg, + name=adapter_key, + cfg=adapter_cfg, ) logging.info(f'After adding adapters:\n{self.frozen_model.summarize()}') @@ -313,8 +314,8 @@ class MegatronGPTInfusedAdapterModel(MegatronGPTBaseAdapterModel): Three adapter's are inserted into each Transformer layer in the base GPT Model. Each adapter is basically a vector that simply scales the key, value or ffn hidden representations. It is assumed that these set of adapters will then be trained for a specific task. - Once trained, the adapter weights will be saved and can be re-loaded - and infused into the same GPT Model for inference. + Once trained, the adapter weights will be saved and can be re-loaded + and infused into the same GPT Model for inference. """ def __init__(self, cfg: DictConfig, trainer: Trainer): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8f541e5703e6..0e1bd83dfe1c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -24,11 +24,11 @@ import packaging import torch +from lightning.pytorch.accelerators import CPUAccelerator +from lightning.pytorch.loops.fetchers import _DataFetcherWrapper +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig -from pytorch_lightning.accelerators import CPUAccelerator -from pytorch_lightning.loops.fetchers import _DataFetcherWrapper -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.parts.utils import apply_rope_scaling, extend_instance from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 78f671142c1b..7d39459ae654 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -18,9 +18,9 @@ from typing import Any, List, Optional, Union import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig from omegaconf.omegaconf import open_dict -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer from nemo.collections.nlp.data.language_modeling.megatron.gpt_prompt_learning_dataset import GPTPromptLearningDataset diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 08bc5501363c..2d3f43b2f2a8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -17,9 +17,9 @@ from typing import Any, Optional import torch +from lightning.pytorch.loops.fetchers import _DataFetcherWrapper +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import DictConfig, 
ListConfig -from pytorch_lightning.loops.fetchers import _DataFetcherWrapper -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.metrics import MetricStringToTorchMetric from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_griffin_model.py b/nemo/collections/nlp/models/language_modeling/megatron_griffin_model.py index 1e5a2f0c15c0..40e147b90903 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_griffin_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_griffin_model.py @@ -13,8 +13,8 @@ # limitations under the License. import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron.griffin.griffin_model import GriffinModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/nemo/collections/nlp/models/language_modeling/megatron_griffin_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_griffin_sft_model.py index c53d231b2719..584a4b0572f7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_griffin_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_griffin_sft_model.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import DictConfig from omegaconf.dictconfig import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel @@ -48,8 +48,8 @@ def _reset_activation_checkpointing_args(self): def on_validation_model_zero_grad(self) -> None: """ - Skip gradient zeroing at the beginning of validation routine. - This is needed when overlapping the AllGather of the updated parameters with the following valdation step. - """ + Skip gradient zeroing at the beginning of validation routine. + This is needed when overlapping the AllGather of the updated parameters with the following valdation step. 
+ """ if not self.validation_param_sync_overlap: MegatronBaseModel.on_validation_model_zero_grad(self) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 7b92b9e25d69..f5b675c94891 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -18,11 +18,11 @@ from typing import Any, Dict, List, Optional import torch +from lightning.pytorch.accelerators import CPUAccelerator +from lightning.pytorch.loops.fetchers import _DataFetcherWrapper +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf, open_dict from omegaconf.dictconfig import DictConfig -from pytorch_lightning.accelerators import CPUAccelerator -from pytorch_lightning.loops.fetchers import _DataFetcherWrapper -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( MegatronPretrainingRandomSampler, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py index 4f0000dafaa2..ad92421ee607 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py @@ -13,8 +13,8 @@ # limitations under the License. import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.utils import logging diff --git a/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py index ebcc47004711..cacdb1c190e7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py @@ -12,15 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from lightning.pytorch.trainer.trainer import Trainer from omegaconf import DictConfig from omegaconf.dictconfig import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel - __all__ = ['MegatronMambaSFTModel'] diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py index 42323e503f7d..147c832f4b9a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py @@ -16,8 +16,8 @@ from typing import Any, List, Optional, Union import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( MegatronPretrainingRandomSampler, @@ -294,7 +294,10 @@ def training_step(self, batch, batch_idx): self.log('lr', lr, batch_size=1) self.log('global_step', self.trainer.global_step, prog_bar=True, batch_size=1) self.log( - 'consumed_samples', self._compute_consumed_samples_after_training_step(), prog_bar=True, batch_size=1, + 'consumed_samples', + self._compute_consumed_samples_after_training_step(), + prog_bar=True, + batch_size=1, ) self._reduced_loss_buffer = [] return lm_loss @@ -427,7 +430,10 @@ def build_pretraining_data_loader(self, dataset, consumed_samples): # Torch dataloader. return torch.utils.data.DataLoader( - dataset, batch_sampler=batch_sampler, num_workers=self.cfg.data.num_workers, pin_memory=True, + dataset, + batch_sampler=batch_sampler, + num_workers=self.cfg.data.num_workers, + pin_memory=True, ) def setup(self, stage=None): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retro_fine_tune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retro_fine_tune_model.py index 1eaec4238648..924da5825024 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retro_fine_tune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retro_fine_tune_model.py @@ -15,8 +15,8 @@ from functools import partial import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import DictConfig, ListConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.data import ConcatMapDataset from nemo.collections.common.metrics import MetricStringToTorchMetric @@ -50,11 +50,13 @@ def build_all_datasets( - cfg, tokenizer, train_valid_test_num_samples, + cfg, + tokenizer, + train_valid_test_num_samples, ): """Build train, valid, and test RETRO datasets. - There is one to one mapping between data_prefix and knn_map_path. - Currently only supports one retrieval dataset. + There is one to one mapping between data_prefix and knn_map_path. + Currently only supports one retrieval dataset. 
""" train_dataset = RetroQAFineTuneDataset( cfg.train_ds.get('file_name'), @@ -97,7 +99,7 @@ def build_all_datasets( class MegatronRetroFinetuneModel(MegatronRetrievalModel): - """Finetune RETRO Model """ + """Finetune RETRO Model""" def build_train_valid_test_datasets(self): logging.info('Building RETRO datasets.') @@ -114,7 +116,9 @@ def build_train_valid_test_datasets(self): ] self._train_ds, self._validation_ds, self._test_ds = build_all_datasets( - cfg=self.cfg.data, tokenizer=self.tokenizer, train_valid_test_num_samples=train_valid_test_num_samples, + cfg=self.cfg.data, + tokenizer=self.tokenizer, + train_valid_test_num_samples=train_valid_test_num_samples, ) if self._train_ds is not None: logging.info(f'Length of train dataset: {len(self._train_ds)}') @@ -143,5 +147,9 @@ def build_pretraining_data_loader(self, dataset, consumed_samples): drop_last=True, ) return torch.utils.data.DataLoader( - dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, num_workers=0, pin_memory=True, + dataset, + batch_sampler=batch_sampler, + collate_fn=collate_fn, + num_workers=0, + pin_memory=True, ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py index a6bf75fb9444..493d512fd30e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py @@ -23,10 +23,10 @@ from typing import Any, Dict, Iterator, List, Optional, Union import torch +from lightning.pytorch.accelerators import CPUAccelerator +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf, open_dict from omegaconf.dictconfig import DictConfig -from pytorch_lightning.accelerators import CPUAccelerator -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( MegatronPretrainingRandomSampler, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t0_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t0_model.py index cee1b11a160b..92827b31a259 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t0_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t0_model.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import DictConfig, ListConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( get_datasets_weights_and_num_samples, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py index 31eb4519ded2..a6e6afc8b7eb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py @@ -21,9 +21,9 @@ from typing import Any import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig from omegaconf.omegaconf import open_dict -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model @@ -60,7 +60,15 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.adapter_name_keys = [] def forward( - self, input_ids, dec_input, enc_mask, dec_mask, position_ids, taskname_ids, labels=None, inference=False, + self, + input_ids, + dec_input, + enc_mask, + dec_mask, + position_ids, + taskname_ids, + labels=None, + inference=False, ): # Call forward on T5 model with preprocessed embeddings if self.autocast_dtype == torch.float32: @@ -195,13 +203,13 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> A def setup_optimizer_param_groups(self): """ - ModelPT override. Optimizer will get self._optimizer_param_groups. + ModelPT override. Optimizer will get self._optimizer_param_groups. Makes two optimizer param groups, one for the frozen model params - and one for the prompt-table/prompt-encoder params. The learning + and one for the prompt-table/prompt-encoder params. The learning rate for the frozen model's params will always be zero effectively freezing the model's params but still allowing for the needed gradients - to be passed around in pipeline parallel models. The prompt-encoder - and/or prompt table will use the learning rate set by the user. + to be passed around in pipeline parallel models. The prompt-encoder + and/or prompt table will use the learning rate set by the user. """ self.frozen_model.freeze() # Freeze the entire model opt_params = [] @@ -266,7 +274,7 @@ def state_dict(self, destination=None, prefix=None, keep_vars=False): def load_state_dict(self, state_dict, strict: bool = True): """ - Loads a state_dict expecting the state_dict to contain key,values + Loads a state_dict expecting the state_dict to contain key,values only for the adapter parameters. """ for name, module in self.frozen_model.named_modules(): @@ -319,7 +327,7 @@ def on_validation_epoch_end(self): gather_results_dedup = list(set(itertools.chain(*gather_results))) correct = 0 - for (input, pred, label) in gather_results_dedup: + for input, pred, label in gather_results_dedup: if pred == label: correct += 1 @@ -559,8 +567,8 @@ class MegatronT5InfusedAdapterModel(MegatronT5BaseAdapterModel): Three adapter's are inserted into each Transformer layer in the base GPT Model. Each adapter is basically a vector that simply scales the key, value or ffn hidden representations. It is assumed that these set of adapters will then be trained for a specific task. 
- Once trained, the adapter weights will be saved and can be re-loaded - and infused into the same GPT Model for inference. + Once trained, the adapter weights will be saved and can be re-loaded + and infused into the same GPT Model for inference. """ def __init__(self, cfg: DictConfig, trainer: Trainer): @@ -670,7 +678,7 @@ def state_dict(self, destination=None, prefix=None, keep_vars=False): def load_state_dict(self, state_dict, strict: bool = True): """ - Loads a state_dict expecting the state_dict to contain key,values + Loads a state_dict expecting the state_dict to contain key,values only for the adapter parameters. """ encoder = self.frozen_model.enc_dec_model.enc_dec_model.encoder diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_model.py index 0f5022795446..1df10403a9e7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_model.py @@ -15,8 +15,8 @@ import enum import math +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import build_train_valid_test_datasets from nemo.collections.nlp.models.language_modeling.megatron_lm_encoder_decoder_model import ( @@ -79,7 +79,9 @@ def _validate_cfg(self): @property def _build_train_valid_test_datasets_kwargs(self): """allows child classes to add kwargs to dataset building""" - return dict(max_seq_length_dec=self._cfg.data.seq_length_dec,) + return dict( + max_seq_length_dec=self._cfg.data.seq_length_dec, + ) def _build_vocab(self): self.num_sentinel_tokens = self._cfg.tokenizer.num_sentinel_tokens @@ -210,9 +212,9 @@ def build_train_valid_test_datasets(self): ] if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): - train_valid_test_num_samples[ - 1 - ] = 1 # This is to make sure we only have one epoch on every validation iteration + train_valid_test_num_samples[1] = ( + 1 # This is to make sure we only have one epoch on every validation iteration + ) self._train_ds, self._validation_ds, self._test_ds = build_train_valid_test_datasets( cfg=self._cfg, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py index 1f54cb87428e..187f24c884b7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py @@ -16,10 +16,10 @@ from typing import Any, List import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig from omegaconf.omegaconf import open_dict -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.language_modeling.megatron.t5_prompt_learning_dataset import T5PromptLearningDataset from nemo.collections.nlp.models.language_modeling.megatron_base_prompt_learning_model import ( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py index c70f44925d33..6f9a69f27529 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py +++ 
b/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py @@ -16,9 +16,9 @@ from typing import Dict, List import torch +from lightning.pytorch.loops.fetchers import _DataFetcherWrapper +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import DictConfig, ListConfig -from pytorch_lightning.loops.fetchers import _DataFetcherWrapper -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.data import ConcatMapDataset from nemo.collections.common.metrics import MetricStringToTorchMetric diff --git a/nemo/collections/nlp/models/language_modeling/transformer_lm_model.py b/nemo/collections/nlp/models/language_modeling/transformer_lm_model.py index 69db0d46e75e..3b8e1f819ea1 100644 --- a/nemo/collections/nlp/models/language_modeling/transformer_lm_model.py +++ b/nemo/collections/nlp/models/language_modeling/transformer_lm_model.py @@ -19,8 +19,8 @@ import numpy as np import torch import torch.utils.data as pt_data +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from nemo.collections.common.losses import SmoothedCrossEntropyLoss from nemo.collections.common.metrics import GlobalAverageLossMetric @@ -59,9 +59,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): tokenizer_model=cfg.tokenizer.get("tokenizer_model", None), vocab_file=cfg.tokenizer.get("vocab_file", None), bpe_dropout=cfg.tokenizer.get("bpe_dropout", 0.0), - special_tokens=OmegaConf.to_container(cfg.tokenizer.special_tokens) - if cfg.tokenizer.get("special_tokens", None) - else None, + special_tokens=( + OmegaConf.to_container(cfg.tokenizer.special_tokens) + if cfg.tokenizer.get("special_tokens", None) + else None + ), ) # init superclass @@ -99,7 +101,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # tie weights of embedding and softmax matrices self.log_softmax.mlp.layer0.weight = self.encoder.embedding.token_embedding.weight - std_init_range = 1 / self.encoder.hidden_size ** 0.5 + std_init_range = 1 / self.encoder.hidden_size**0.5 # initialize weights if not using pretrained encoder if not self._cfg.encoder.get('pretrained', False): @@ -199,7 +201,12 @@ def on_test_epoch_end(self): self.test_step_outputs.clear() # free memory def setup_tokenizer( - self, tokenizer_name=None, tokenizer_model=None, vocab_file=None, bpe_dropout=0.0, special_tokens=None, + self, + tokenizer_name=None, + tokenizer_model=None, + vocab_file=None, + bpe_dropout=0.0, + special_tokens=None, ): supported_tokenizers = ['huggingface', 'sentencepiece', 'word'] diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index 4461b417f311..b5f228f21e1a 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -19,10 +19,10 @@ import numpy as np import torch +from lightning.pytorch.loops.fetchers import _DataFetcherWrapper +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig from omegaconf.listconfig import ListConfig -from pytorch_lightning.loops.fetchers import _DataFetcherWrapper -from pytorch_lightning.trainer.trainer import Trainer from sacrebleu import corpus_bleu from nemo.collections.nlp.data.common.sequence_to_sequence_dataset import ( diff --git a/nemo/collections/nlp/models/machine_translation/mt_enc_dec_bottleneck_model.py 
b/nemo/collections/nlp/models/machine_translation/mt_enc_dec_bottleneck_model.py index 41c6125ba05f..96077c4da82e 100644 --- a/nemo/collections/nlp/models/machine_translation/mt_enc_dec_bottleneck_model.py +++ b/nemo/collections/nlp/models/machine_translation/mt_enc_dec_bottleneck_model.py @@ -16,7 +16,7 @@ import numpy as np import torch -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from nemo.collections.common.losses import NLLLoss from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTBottleneckModelConfig @@ -184,7 +184,11 @@ def loss( output_mask = (tgt_labels != self.decoder_tokenizer.pad_id).type_as(tgt_log_probs) log_p_x_given_z_per_token = ( - -recon_loss_fn(log_probs=tgt_log_probs, labels=tgt_labels,).view(tgt_log_probs.shape[:2]) * output_mask + -recon_loss_fn( + log_probs=tgt_log_probs, + labels=tgt_labels, + ).view(tgt_log_probs.shape[:2]) + * output_mask ) # probability per sample @@ -216,7 +220,10 @@ def loss( if self.model_type in ["mim", "vae"]: # tokens = tgt_mask.sum() - q_z_given_x = torch.distributions.Normal(loc=z_mean, scale=torch.exp(0.5 * z_logv),) + q_z_given_x = torch.distributions.Normal( + loc=z_mean, + scale=torch.exp(0.5 * z_logv), + ) # average latent distribution to match averaging of observations if self.recon_per_token: # average latent per dimension - to heuristically match per-token reconstruction @@ -225,7 +232,10 @@ def loss( log_q_z_given_x = q_z_given_x.log_prob(z).sum(-1).sum(-1).mean() # build prior distribution - p_z = torch.distributions.Normal(loc=torch.zeros_like(z), scale=torch.ones_like(z),) + p_z = torch.distributions.Normal( + loc=torch.zeros_like(z), + scale=torch.ones_like(z), + ) if self.recon_per_token: # average latent distribution similar to averaging of observations log_p_z = p_z.log_prob(z).mean(-1).mean(-1).mean() @@ -267,7 +277,11 @@ def forward(self, src, src_mask, tgt, tgt_mask, timer=None): if timer is not None: timer.start("encoder") - enc_hiddens, enc_mask = self.encoder(input_ids=src, encoder_mask=src_mask, return_mask=True,) + enc_hiddens, enc_mask = self.encoder( + input_ids=src, + encoder_mask=src_mask, + return_mask=True, + ) # build posterior distribution q(x|z) z, z_mean, z_logv = self.encode_latent(hidden=enc_hiddens) @@ -283,7 +297,10 @@ def forward(self, src, src_mask, tgt, tgt_mask, timer=None): context_hiddens = self.latent2hidden(z) tgt_hiddens = self.decoder( - input_ids=tgt, decoder_mask=tgt_mask, encoder_embeddings=context_hiddens, encoder_mask=enc_mask, + input_ids=tgt, + decoder_mask=tgt_mask, + encoder_embeddings=context_hiddens, + encoder_mask=enc_mask, ) # build decoding distribution @@ -426,18 +443,25 @@ def eval_step(self, batch, batch_idx, mode, dataloader_idx=0): return_info=True, ) # pass cache to sampler in order to reuse encoder's output - cache = dict(z=z, z_mean=z_mean, z_mask=z_mask, timer=timer,) + cache = dict( + z=z, + z_mean=z_mean, + z_mask=z_mask, + timer=timer, + ) inputs, translations = self.batch_translate(src=src_ids, src_mask=src_mask, cache=cache) num_measurements = labels.shape[0] * labels.shape[1] if dataloader_idx == 0: getattr(self, f'{mode}_loss')( - loss=eval_loss, num_measurements=num_measurements, + loss=eval_loss, + num_measurements=num_measurements, ) else: getattr(self, f'{mode}_loss_{dataloader_idx}')( - loss=eval_loss, num_measurements=num_measurements, + loss=eval_loss, + num_measurements=num_measurements, ) np_tgt = tgt_ids.detach().cpu().numpy() ground_truths = [self.decoder_tokenizer.ids_to_text(tgt) for tgt 
in np_tgt] diff --git a/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py b/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py index 708d4236be7f..78b701699259 100644 --- a/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py +++ b/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py @@ -25,9 +25,9 @@ import torch import torch.distributed as dist import torch.utils.data as pt_data +from lightning.pytorch import Trainer +from lightning.pytorch.utilities import rank_zero_only from omegaconf import DictConfig, ListConfig, OmegaConf -from pytorch_lightning import Trainer -from pytorch_lightning.utilities import rank_zero_only from sacrebleu import corpus_bleu from nemo.collections.common.data import ConcatDataset @@ -120,17 +120,21 @@ def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None): encoder_tokenizer, decoder_tokenizer = MTEncDecModel.setup_enc_dec_tokenizers( encoder_tokenizer_library=self.encoder_tokenizer_library, encoder_tokenizer_model=encoder_tokenizer_model, - encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0) - if cfg.encoder_tokenizer.get('bpe_dropout', 0.0) is not None - else 0.0, + encoder_bpe_dropout=( + cfg.encoder_tokenizer.get('bpe_dropout', 0.0) + if cfg.encoder_tokenizer.get('bpe_dropout', 0.0) is not None + else 0.0 + ), encoder_model_name=cfg.encoder.get('model_name') if hasattr(cfg.encoder, 'model_name') else None, encoder_r2l=cfg.encoder_tokenizer.get('r2l', False), decoder_tokenizer_library=self.decoder_tokenizer_library, encoder_tokenizer_vocab_file=encoder_vocab_file, decoder_tokenizer_model=decoder_tokenizer_model, - decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0) - if cfg.decoder_tokenizer.get('bpe_dropout', 0.0) is not None - else 0.0, + decoder_bpe_dropout=( + cfg.decoder_tokenizer.get('bpe_dropout', 0.0) + if cfg.decoder_tokenizer.get('bpe_dropout', 0.0) is not None + else 0.0 + ), decoder_model_name=cfg.decoder.get('model_name') if hasattr(cfg.decoder, 'model_name') else None, decoder_r2l=cfg.decoder_tokenizer.get('r2l', False), special_tokens=self.special_tokens, @@ -254,7 +258,7 @@ def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None): self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight # TODO: encoder and decoder with different hidden size? 
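Editor's note on the std_init_range lines touched just above and below: they implement weight tying between the token embedding and the output softmax projection, with weights initialized at std = 1/sqrt(hidden_size). A minimal standalone sketch of that pattern (illustrative only, not NeMo code; the sizes are made up):

    import torch.nn as nn

    hidden_size, vocab_size = 512, 32000
    embedding = nn.Embedding(vocab_size, hidden_size)
    output_proj = nn.Linear(hidden_size, vocab_size, bias=False)

    # tie weights of embedding and softmax matrices (same idea as the NeMo lines above)
    output_proj.weight = embedding.weight

    # scaled init: std = 1 / hidden_size ** 0.5
    std_init_range = 1 / hidden_size**0.5
    nn.init.normal_(embedding.weight, mean=0.0, std=std_init_range)
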
- std_init_range = 1 / self.encoder.hidden_size ** 0.5 + std_init_range = 1 / self.encoder.hidden_size**0.5 # initialize weights if not using pretrained encoder/decoder if not self._cfg.encoder.get('pretrained', False): @@ -341,7 +345,10 @@ def filter_predicted_ids(cls, ids, decoder_tokenizer): return ids def test_encoder_ids(self, ids, raise_error=False): - invalid_ids = torch.logical_or((ids >= self.encoder_tokenizer.vocab_size).any(), (ids < 0).any(),) + invalid_ids = torch.logical_or( + (ids >= self.encoder_tokenizer.vocab_size).any(), + (ids < 0).any(), + ) if raise_error and invalid_ids: raise ValueError("Encoder ids are out of range (tip: check encoder tokenizer)") @@ -349,7 +356,10 @@ def test_encoder_ids(self, ids, raise_error=False): return not invalid_ids def test_decoder_ids(self, ids, raise_error=False): - invalid_ids = torch.logical_or((ids >= self.decoder_tokenizer.vocab_size).any(), (ids < 0).any(),) + invalid_ids = torch.logical_or( + (ids >= self.decoder_tokenizer.vocab_size).any(), + (ids < 0).any(), + ) if raise_error and invalid_ids: raise ValueError("Decoder ids are out of range (tip: check decoder tokenizer)") @@ -655,7 +665,10 @@ def setup_training_data(self, train_data_config: Optional[DictConfig]): multilingual=self.multilingual, multilingual_ids=self.multilingual_ids, ) - self._train_dl = MTEncDecModel._setup_dataloader_from_config(cfg=train_data_config, dataset=self._train_ds,) + self._train_dl = MTEncDecModel._setup_dataloader_from_config( + cfg=train_data_config, + dataset=self._train_ds, + ) # Need to set this because if using an IterableDataset, the length of the dataloader is the total number # of samples rather than the number of batches, and this messes up the tqdm progress bar. @@ -714,7 +727,9 @@ def setup_validation_data(self, val_data_config: Optional[DictConfig]): for dataloader_idx in range(len(self._validation_dl)): if dataloader_idx == 0: setattr( - self, f'val_loss', GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True), + self, + f'val_loss', + GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True), ) else: setattr( @@ -737,7 +752,9 @@ def setup_test_data(self, test_data_config: Optional[DictConfig]): for dataloader_idx in range(len(self._test_dl)): if dataloader_idx == 0: setattr( - self, f'test_loss', GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True), + self, + f'test_loss', + GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True), ) else: setattr( @@ -886,13 +903,15 @@ def _setup_dataloader_from_config(cls, cfg, dataset): return torch.utils.data.DataLoader( dataset=dataset, batch_size=1, - sampler=None - if ( - cfg.get("use_tarred_dataset", False) - or cfg.get("dataset_type", "") == "tarred" - or isinstance(dataset, ConcatDataset) - ) - else sampler, + sampler=( + None + if ( + cfg.get("use_tarred_dataset", False) + or cfg.get("dataset_type", "") == "tarred" + or isinstance(dataset, ConcatDataset) + ) + else sampler + ), num_workers=cfg.get("num_workers", 2), pin_memory=cfg.get("pin_memory", False), drop_last=cfg.get("drop_last", False), @@ -983,9 +1002,11 @@ def _setup_eval_dataloader_from_config(cls, cfg, datasets): torch.utils.data.DataLoader( dataset=dataset, batch_size=1, - sampler=None - if (cfg.get("use_tarred_dataset", False) or isinstance(datasets[0], ConcatDataset)) - else sampler, + sampler=( + None + if (cfg.get("use_tarred_dataset", False) or isinstance(datasets[0], ConcatDataset)) + else sampler + ), num_workers=cfg.get("num_workers", 2), 
pin_memory=cfg.get("pin_memory", False), drop_last=cfg.get("drop_last", False), @@ -1188,7 +1209,10 @@ def translate( ) if return_beam_scores: _, all_translations, scores, best_translations = self.batch_translate( - src, src_mask, return_beam_scores=True, cache=cache, + src, + src_mask, + return_beam_scores=True, + cache=cache, ) return_val = all_translations, scores, best_translations else: diff --git a/nemo/collections/nlp/models/nlp_model.py b/nemo/collections/nlp/models/nlp_model.py index b27c00c5d7c3..0c61b085bc7f 100644 --- a/nemo/collections/nlp/models/nlp_model.py +++ b/nemo/collections/nlp/models/nlp_model.py @@ -19,13 +19,13 @@ from typing import Any, Mapping, Optional, Union import torch -from lightning_fabric.utilities.cloud_io import _load as pl_load +from lightning.fabric.utilities.cloud_io import _load as pl_load +from lightning.pytorch import Trainer +from lightning.pytorch.core.saving import _load_state as ptl_load_state +from lightning.pytorch.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml +from lightning.pytorch.utilities import rank_zero_only +from lightning.pytorch.utilities.migration import pl_legacy_patch from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer -from pytorch_lightning.core.saving import _load_state as ptl_load_state -from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml -from pytorch_lightning.utilities import rank_zero_only -from pytorch_lightning.utilities.migration import pl_legacy_patch from transformers import TRANSFORMERS_CACHE from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer diff --git a/nemo/collections/nlp/models/question_answering/qa_base_model.py b/nemo/collections/nlp/models/question_answering/qa_base_model.py index 7ca78f2e136e..cb07e43c3dc1 100644 --- a/nemo/collections/nlp/models/question_answering/qa_base_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_base_model.py @@ -15,8 +15,8 @@ from typing import Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from nemo.collections.nlp.data.question_answering.data_processor.qa_processing import ( EVALUATION_MODE, diff --git a/nemo/collections/nlp/models/question_answering/qa_bert_model.py b/nemo/collections/nlp/models/question_answering/qa_bert_model.py index d4bdef6d871d..4036b23999d8 100644 --- a/nemo/collections/nlp/models/question_answering/qa_bert_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_bert_model.py @@ -17,8 +17,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from transformers.models.bert.tokenization_bert import BasicTokenizer from nemo.collections.common.losses import SpanningLoss diff --git a/nemo/collections/nlp/models/question_answering/qa_gpt_model.py b/nemo/collections/nlp/models/question_answering/qa_gpt_model.py index 059cf5625f15..f8c883643fe0 100644 --- a/nemo/collections/nlp/models/question_answering/qa_gpt_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_gpt_model.py @@ -16,8 +16,8 @@ from typing import List, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from transformers import AutoModelForCausalLM from nemo.collections.nlp.data.question_answering.data_processor.qa_processing import QAProcessor diff --git 
a/nemo/collections/nlp/models/question_answering/qa_model.py b/nemo/collections/nlp/models/question_answering/qa_model.py index 2147d7d6a5bf..01b07bb8b3b0 100644 --- a/nemo/collections/nlp/models/question_answering/qa_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_model.py @@ -16,8 +16,8 @@ from typing import Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from torch.cuda.amp import autocast from nemo.collections.common.losses import SpanningLoss diff --git a/nemo/collections/nlp/models/question_answering/qa_s2s_model.py b/nemo/collections/nlp/models/question_answering/qa_s2s_model.py index 5ad959fd1b6f..a703e23bc837 100644 --- a/nemo/collections/nlp/models/question_answering/qa_s2s_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_s2s_model.py @@ -16,8 +16,8 @@ from typing import List, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from torch.cuda.amp import autocast from transformers import AutoModelForSeq2SeqLM diff --git a/nemo/collections/nlp/models/rag/custom_bert_embedder.py b/nemo/collections/nlp/models/rag/custom_bert_embedder.py index d27ee98a14ef..84361e2728b5 100644 --- a/nemo/collections/nlp/models/rag/custom_bert_embedder.py +++ b/nemo/collections/nlp/models/rag/custom_bert_embedder.py @@ -15,10 +15,10 @@ from typing import Any, List import torch +from lightning.pytorch.trainer.trainer import Trainer from llama_index.core.bridge.pydantic import PrivateAttr from llama_index.core.embeddings import BaseEmbedding from omegaconf import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.information_retrieval.megatron_bert_embedding_model import MegatronBertEmbeddingModel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy diff --git a/nemo/collections/nlp/models/rag/custom_gpt_llm.py b/nemo/collections/nlp/models/rag/custom_gpt_llm.py index f26a86cfaaf7..1bbeed38991b 100644 --- a/nemo/collections/nlp/models/rag/custom_gpt_llm.py +++ b/nemo/collections/nlp/models/rag/custom_gpt_llm.py @@ -14,10 +14,10 @@ from typing import Any +from lightning.pytorch.trainer.trainer import Trainer from llama_index.core.bridge.pydantic import PrivateAttr from llama_index.core.llms import CompletionResponse, CompletionResponseGen, CustomLLM, LLMMetadata from llama_index.core.llms.callbacks import llm_completion_callback -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam diff --git a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py index d9e08f6764fc..6d4974993bcb 100644 --- a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py +++ b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py @@ -17,8 +17,8 @@ from typing import Dict, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.common.losses import CrossEntropyLoss from nemo.collections.nlp.data.spellchecking_asr_customization import ( diff --git 
a/nemo/collections/nlp/models/text2sparql/text2sparql_model.py b/nemo/collections/nlp/models/text2sparql/text2sparql_model.py index 6503364fc07e..df7eefa310bb 100644 --- a/nemo/collections/nlp/models/text2sparql/text2sparql_model.py +++ b/nemo/collections/nlp/models/text2sparql/text2sparql_model.py @@ -19,8 +19,8 @@ from typing import Dict, List, Optional, Tuple import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from transformers import AutoModel, BartForConditionalGeneration, EncoderDecoderModel from nemo.collections.common.metrics import Perplexity @@ -145,7 +145,10 @@ def training_step(self, batch: Tuple, batch_idx: int) -> Dict: """ input_ids, input_mask, decoder_input_ids, labels = batch loss = self.forward( - input_ids=input_ids, attention_mask=input_mask, decoder_input_ids=decoder_input_ids, labels=labels, + input_ids=input_ids, + attention_mask=input_mask, + decoder_input_ids=decoder_input_ids, + labels=labels, )[0] tensorboard_logs = {"train_loss": loss, "lr": self._optimizer.param_groups[0]["lr"]} @@ -159,7 +162,10 @@ def validation_step(self, batch: Tuple, batch_idx: int) -> Dict: """ input_ids, input_mask, decoder_input_ids, labels = batch loss, logits = self.forward( - input_ids=input_ids, attention_mask=input_mask, decoder_input_ids=decoder_input_ids, labels=labels, + input_ids=input_ids, + attention_mask=input_mask, + decoder_input_ids=decoder_input_ids, + labels=labels, )[:2] self.validation_perplexity(logits=logits) diff --git a/nemo/collections/nlp/models/text_classification/text_classification_model.py b/nemo/collections/nlp/models/text_classification/text_classification_model.py index 033447304bbf..b2da2fe21701 100644 --- a/nemo/collections/nlp/models/text_classification/text_classification_model.py +++ b/nemo/collections/nlp/models/text_classification/text_classification_model.py @@ -17,8 +17,8 @@ from typing import Dict, List, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.common.losses import CrossEntropyLoss from nemo.collections.nlp.data.text_classification import TextClassificationDataset, calc_class_weights diff --git a/nemo/collections/nlp/models/text_normalization_as_tagging/thutmose_tagger.py b/nemo/collections/nlp/models/text_normalization_as_tagging/thutmose_tagger.py index 4c11dc157b2b..ddcb3a774055 100644 --- a/nemo/collections/nlp/models/text_normalization_as_tagging/thutmose_tagger.py +++ b/nemo/collections/nlp/models/text_normalization_as_tagging/thutmose_tagger.py @@ -17,8 +17,8 @@ from typing import Dict, List, Optional import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.common.losses import CrossEntropyLoss from nemo.collections.nlp.data.text_normalization_as_tagging import ( @@ -289,7 +289,7 @@ def on_test_epoch_end(self): # Functions for inference @torch.no_grad() def _infer(self, sents: List[str]) -> List[List[int]]: - """ Main function for Inference + """Main function for Inference Args: sents: A list of input sentences (lowercase spoken-domain words separated by space). 
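Editor's note on the forward(...)[0] and forward(...)[:2] indexing in the Text2Sparql training and validation steps reformatted above: when labels are passed to a Hugging Face seq2seq model, element 0 of the returned output is the loss and element 1 the logits. A minimal sketch with a plain transformers model standing in for the NeMo wrapper (the model name and inputs are placeholders, not taken from this PR):

    from transformers import BartForConditionalGeneration, BartTokenizer

    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

    batch = tokenizer(["list all rivers that flow through paris"], return_tensors="pt")
    labels = tokenizer(["SELECT ?river WHERE { ?river flows_through paris }"], return_tensors="pt").input_ids

    # decoder_input_ids are derived from labels by the model when not given explicitly
    outputs = model(input_ids=batch.input_ids, attention_mask=batch.attention_mask, labels=labels)
    loss, logits = outputs[:2]  # element 0 is the loss when labels are given, element 1 the logits
    loss.backward()
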
diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_lexical_audio_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_lexical_audio_model.py index 69df9b6ac009..bd42517a5720 100644 --- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_lexical_audio_model.py +++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_lexical_audio_model.py @@ -17,8 +17,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, open_dict -from pytorch_lightning import Trainer from torch.nn import Linear from tqdm import tqdm @@ -53,27 +53,27 @@ def update_model_config_to_support_adapter(model_cfg): class PunctuationCapitalizationLexicalAudioModel(PunctuationCapitalizationModel): """ - A model for restoring punctuation and capitalization in text using lexical and audio features. - - The model consists of a language model and two multilayer perceptrons (MLP) on top the fusion of LM and AM. The first - MLP serves for punctuation prediction and the second is for capitalization prediction. You can use only BERT-like - HuggingFace language models (model ``forward`` method accepts ``input_ids``, ``token_types_ids``, - ``attention_mask`` arguments). See more about model config options :ref:`here`. - And any :class:`~nemo.collections.asr.models.EncDecCTCModel` which has encoder module which is used as an AM. - - For training and testing use dataset - :class:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_dataset.BertPunctuationCapitalizationDataset` with parameter ``use_audio`` set to ``True``, - for training on huge amounts of data which cannot be loaded into memory simultaneously use - :class:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset.BertPunctuationCapitalizationTarredDataset` with parameter ``use_audio`` set to ``True``. - - Args: - cfg: a model configuration. It should follow dataclass - :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_config.PunctuationCapitalizationLexicalAudioModelConfig` - See an example of full config in - `nemo/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml - `_ - trainer: an instance of a PyTorch Lightning trainer - """ + A model for restoring punctuation and capitalization in text using lexical and audio features. + + The model consists of a language model and two multilayer perceptrons (MLP) on top the fusion of LM and AM. The first + MLP serves for punctuation prediction and the second is for capitalization prediction. You can use only BERT-like + HuggingFace language models (model ``forward`` method accepts ``input_ids``, ``token_types_ids``, + ``attention_mask`` arguments). See more about model config options :ref:`here`. + And any :class:`~nemo.collections.asr.models.EncDecCTCModel` which has encoder module which is used as an AM. + + For training and testing use dataset + :class:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_dataset.BertPunctuationCapitalizationDataset` with parameter ``use_audio`` set to ``True``, + for training on huge amounts of data which cannot be loaded into memory simultaneously use + :class:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset.BertPunctuationCapitalizationTarredDataset` with parameter ``use_audio`` set to ``True``. 
+ + Args: + cfg: a model configuration. It should follow dataclass + :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_config.PunctuationCapitalizationLexicalAudioModelConfig` + See an example of full config in + `nemo/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml + `_ + trainer: an instance of a PyTorch Lightning trainer + """ def __init__(self, cfg: DictConfig, trainer: Trainer = None) -> None: super().__init__(cfg, trainer) @@ -199,31 +199,31 @@ def forward( features_length: torch.Tensor = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """ - Executes a forward pass through the model. For more details see ``forward`` method of :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_config.PunctuationCapitalizationLexicalAudioModelConfig` - and ``forward`` method of :class:'~nemo.collections.asr.models.EncDecCTCModel' - - Args: - input_ids (:obj:`torch.Tensor`): an integer torch tensor of shape ``[Batch, Time]``. Contains encoded - source tokens. - attention_mask (:obj:`torch.Tensor`): a boolean torch tensor of shape ``[Batch, Time]``. Contains an - attention mask for excluding paddings. - token_type_ids (:obj:`torch.Tensor`): an integer torch Tensor of shape ``[Batch, Time]``. Contains an index - of segment to which a token belongs. If ``token_type_ids`` is not ``None``, then it should be a zeros - tensor. - features (:obj:`torch.Tensor`): tensor that represents a batch of raw audio signals, - of shape [B, T]. T here represents timesteps, with 1 second of audio represented as - sample_rate number of floating point values. - features_length (:obj:`torch.Tensor`): Vector of length B, that contains the individual lengths of the audio - sequences. - - Returns: - :obj:`Tuple[torch.Tensor, torch.Tensor]`: a tuple containing - - - ``punct_logits`` (:obj:`torch.Tensor`): a float torch tensor of shape - ``[Batch, Time, NumPunctuationLabels]`` containing punctuation logits - - ``capit_logits`` (:obj:`torch.Tensor`): a float torch tensor of shape - ``[Batch, Time, NumCapitalizationLabels]`` containing capitalization logits - """ + Executes a forward pass through the model. For more details see ``forward`` method of :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_config.PunctuationCapitalizationLexicalAudioModelConfig` + and ``forward`` method of :class:'~nemo.collections.asr.models.EncDecCTCModel' + + Args: + input_ids (:obj:`torch.Tensor`): an integer torch tensor of shape ``[Batch, Time]``. Contains encoded + source tokens. + attention_mask (:obj:`torch.Tensor`): a boolean torch tensor of shape ``[Batch, Time]``. Contains an + attention mask for excluding paddings. + token_type_ids (:obj:`torch.Tensor`): an integer torch Tensor of shape ``[Batch, Time]``. Contains an index + of segment to which a token belongs. If ``token_type_ids`` is not ``None``, then it should be a zeros + tensor. + features (:obj:`torch.Tensor`): tensor that represents a batch of raw audio signals, + of shape [B, T]. T here represents timesteps, with 1 second of audio represented as + sample_rate number of floating point values. + features_length (:obj:`torch.Tensor`): Vector of length B, that contains the individual lengths of the audio + sequences. 
+ + Returns: + :obj:`Tuple[torch.Tensor, torch.Tensor]`: a tuple containing + + - ``punct_logits`` (:obj:`torch.Tensor`): a float torch tensor of shape + ``[Batch, Time, NumPunctuationLabels]`` containing punctuation logits + - ``capit_logits`` (:obj:`torch.Tensor`): a float torch tensor of shape + ``[Batch, Time, NumCapitalizationLabels]`` containing capitalization logits + """ self.update_max_seq_length(seq_length=features.size(1), device=features.device) lexical_hidden_states = self.bert_model( input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask @@ -232,7 +232,8 @@ def forward( lexical_hidden_states = lexical_hidden_states[0] processed_signal, processed_signal_length = self.audio_encoder.preprocessor( - input_signal=features, length=features_length, + input_signal=features, + length=features_length, ) if self.audio_encoder.spec_augmentation is not None and self.training: @@ -301,49 +302,49 @@ def add_punctuation_capitalization( target_sr: Optional[int] = None, ) -> List[str]: """ - Adds punctuation and capitalization to the queries. Use this method for inference. - - Parameters ``max_seq_length``, ``step``, ``margin`` are for controlling the way queries are split into segments - which are processed by the model. Parameter ``max_seq_length`` is a length of a segment after tokenization - including special tokens [CLS] in the beginning and [SEP] in the end of a segment. Parameter ``step`` is a - shift between consequent segments. Parameter ``margin`` is used to exclude negative effect of subtokens near - borders of segments which have only one side context. - - If segments overlap, probabilities of overlapping predictions are multiplied and then the label with - corresponding to the maximum probability is selected. - - Args: - queries (:obj:`List[str]`): lower cased text without punctuation. - batch_size (:obj:`List[str]`, `optional`): batch size to use during inference. If ``batch_size`` parameter - is not provided, then it will be equal to length of ``queries`` list. - max_seq_length (:obj:`int`, `optional`, defaults to :obj:`64`): maximum sequence length of a segment after - tokenization including :code:`[CLS]` and :code:`[SEP]` tokens. - step (:obj:`int`, `optional`, defaults to :obj:`8`): relative shift of consequent segments into which long - queries are split. Long queries are split into segments which can overlap. Parameter ``step`` controls - such overlapping. Imagine that queries are tokenized into characters, ``max_seq_length=5``, and - ``step=2``. In such case, query ``"hello"`` is tokenized into segments - ``[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]``. - margin (:obj:`int`, `optional`, defaults to :obj:`16`): number of subtokens in the beginning and the end of - segments which are not used for prediction computation. The first segment does not have left margin and - the last segment does not have right margin. For example, if an input sequence is tokenized into - characters, ``max_seq_length=5``, ``step=1``, and ``margin=1``, then query ``"hello"`` will be - tokenized into segments ``[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'e', 'l', 'l', '[SEP]'], - ['[CLS]', 'l', 'l', 'o', '[SEP]']]``. These segments are passed to the model. Before final predictions - computation, margins are removed. 
In the next list, subtokens which logits are not used for final - predictions computation are marked with asterisk: ``[['[CLS]'*, 'h', 'e', 'l'*, '[SEP]'*], - ['[CLS]'*, 'e'*, 'l', 'l'*, '[SEP]'*], ['[CLS]'*, 'l'*, 'l', 'o', '[SEP]'*]]``. - return_labels (:obj:`bool`, `optional`, defaults to :obj:`False`): whether to return labels in NeMo format - (see :ref:`nlp/punctuation_and_capitalization/NeMo Data Format`) instead of queries with restored - punctuation and capitalization. - dataloader_kwargs (:obj:`Dict[str, Any]`, `optional`): an optional dictionary with parameters of PyTorch - data loader. May include keys: ``'num_workers'``, ``'pin_memory'``, ``'worker_init_fn'``, - ``'prefetch_factor'``, ``'persistent_workers'``. - audio_queries (:obj:`List[str]`, `optional`): paths to audio files. - target_sr (:obj:`int`, `optional`): target sample rate for audios. - Returns: - :obj:`List[str]`: a list of queries with restored capitalization and punctuation if - ``return_labels=False``, else a list of punctuation and capitalization labels strings for all queries - """ + Adds punctuation and capitalization to the queries. Use this method for inference. + + Parameters ``max_seq_length``, ``step``, ``margin`` are for controlling the way queries are split into segments + which are processed by the model. Parameter ``max_seq_length`` is a length of a segment after tokenization + including special tokens [CLS] in the beginning and [SEP] in the end of a segment. Parameter ``step`` is a + shift between consequent segments. Parameter ``margin`` is used to exclude negative effect of subtokens near + borders of segments which have only one side context. + + If segments overlap, probabilities of overlapping predictions are multiplied and then the label with + corresponding to the maximum probability is selected. + + Args: + queries (:obj:`List[str]`): lower cased text without punctuation. + batch_size (:obj:`List[str]`, `optional`): batch size to use during inference. If ``batch_size`` parameter + is not provided, then it will be equal to length of ``queries`` list. + max_seq_length (:obj:`int`, `optional`, defaults to :obj:`64`): maximum sequence length of a segment after + tokenization including :code:`[CLS]` and :code:`[SEP]` tokens. + step (:obj:`int`, `optional`, defaults to :obj:`8`): relative shift of consequent segments into which long + queries are split. Long queries are split into segments which can overlap. Parameter ``step`` controls + such overlapping. Imagine that queries are tokenized into characters, ``max_seq_length=5``, and + ``step=2``. In such case, query ``"hello"`` is tokenized into segments + ``[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]``. + margin (:obj:`int`, `optional`, defaults to :obj:`16`): number of subtokens in the beginning and the end of + segments which are not used for prediction computation. The first segment does not have left margin and + the last segment does not have right margin. For example, if an input sequence is tokenized into + characters, ``max_seq_length=5``, ``step=1``, and ``margin=1``, then query ``"hello"`` will be + tokenized into segments ``[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'e', 'l', 'l', '[SEP]'], + ['[CLS]', 'l', 'l', 'o', '[SEP]']]``. These segments are passed to the model. Before final predictions + computation, margins are removed. 
In the next list, subtokens which logits are not used for final + predictions computation are marked with asterisk: ``[['[CLS]'*, 'h', 'e', 'l'*, '[SEP]'*], + ['[CLS]'*, 'e'*, 'l', 'l'*, '[SEP]'*], ['[CLS]'*, 'l'*, 'l', 'o', '[SEP]'*]]``. + return_labels (:obj:`bool`, `optional`, defaults to :obj:`False`): whether to return labels in NeMo format + (see :ref:`nlp/punctuation_and_capitalization/NeMo Data Format`) instead of queries with restored + punctuation and capitalization. + dataloader_kwargs (:obj:`Dict[str, Any]`, `optional`): an optional dictionary with parameters of PyTorch + data loader. May include keys: ``'num_workers'``, ``'pin_memory'``, ``'worker_init_fn'``, + ``'prefetch_factor'``, ``'persistent_workers'``. + audio_queries (:obj:`List[str]`, `optional`): paths to audio files. + target_sr (:obj:`int`, `optional`): target sample rate for audios. + Returns: + :obj:`List[str]`: a list of queries with restored capitalization and punctuation if + ``return_labels=False``, else a list of punctuation and capitalization labels strings for all queries + """ if len(queries) == 0: return [] @@ -408,7 +409,9 @@ def add_punctuation_capitalization( acc_probs[q_i] = b_probs_i else: all_preds[q_i], acc_probs[q_i] = self._move_acc_probs_to_token_preds( - all_preds[q_i], acc_probs[q_i], start_word_id - len(all_preds[q_i]), + all_preds[q_i], + acc_probs[q_i], + start_word_id - len(all_preds[q_i]), ) acc_probs[q_i] = self._update_accumulated_probabilities(acc_probs[q_i], b_probs_i) for all_preds, acc_probs in [(all_punct_preds, acc_punct_probs), (all_capit_preds, acc_capit_probs)]: diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py index 6e2d1f5762ec..8cf153dfdf76 100644 --- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py +++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py @@ -20,8 +20,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from tqdm import tqdm from nemo.collections.common.losses import AggregatorLoss, CrossEntropyLoss @@ -812,7 +812,13 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u raise ValueError( f"If `use_tarred_dataset` is `False`, then you need to provide `tokens_in_batch` parameter." 
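Editor's note: the max_seq_length/step segmentation described in the add_punctuation_capitalization docstring above can be reproduced with a small toy sketch (character-level "tokenization" as in the docstring's "hello" example; this is not the NeMo implementation):

    # Toy illustration of splitting a long query into overlapping segments.
    def make_segments(tokens, max_seq_length=5, step=2):
        # each segment holds max_seq_length - 2 content tokens plus [CLS]/[SEP]
        content = max_seq_length - 2
        segments, start = [], 0
        while True:
            piece = tokens[start:start + content]
            segments.append(['[CLS]'] + piece + ['[SEP]'])
            if start + content >= len(tokens):
                break
            start += step
        return segments

    print(make_segments(list("hello")))
    # [['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]

With step=1 the same helper produces the three overlapping segments used in the docstring's margin example; margin then simply discards the leading/trailing subtoken predictions of each inner segment before the overlapping probabilities are multiplied.
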
) - text_file, labels_file, = Path(cfg.ds_item) / cfg.text_file, Path(cfg.ds_item) / cfg.labels_file + ( + text_file, + labels_file, + ) = ( + Path(cfg.ds_item) / cfg.text_file, + Path(cfg.ds_item) / cfg.labels_file, + ) if cfg.audio_file: audio_file = Path(cfg.ds_item) / cfg.audio_file if self.label_ids_are_set: @@ -1010,7 +1016,8 @@ def _transform_logit_to_prob_and_remove_margins_and_extract_word_probs( stm = self._remove_margins(stm, margin, keep_left=first, keep_right=last) for b_probs, logits in [(b_punct_probs, pl), (b_capit_probs, cl)]: p = torch.nn.functional.softmax( - self._remove_margins(logits, margin, keep_left=first, keep_right=last)[stm], dim=-1, + self._remove_margins(logits, margin, keep_left=first, keep_right=last)[stm], + dim=-1, ) b_probs.append(p.detach().cpu().numpy()) return b_punct_probs, b_capit_probs, new_start_word_ids @@ -1191,7 +1198,9 @@ def add_punctuation_capitalization( ): inp_ids, inp_type_ids, inp_mask, subtokens_mask, start_word_ids, query_ids, is_first, is_last = batch punct_logits, capit_logits = self.forward( - input_ids=inp_ids.to(d), token_type_ids=inp_type_ids.to(d), attention_mask=inp_mask.to(d), + input_ids=inp_ids.to(d), + token_type_ids=inp_type_ids.to(d), + attention_mask=inp_mask.to(d), ) _res = self._transform_logit_to_prob_and_remove_margins_and_extract_word_probs( punct_logits, capit_logits, subtokens_mask, start_word_ids, margin, is_first, is_last @@ -1208,7 +1217,9 @@ def add_punctuation_capitalization( acc_probs[q_i] = b_probs_i else: all_preds[q_i], acc_probs[q_i] = self._move_acc_probs_to_token_preds( - all_preds[q_i], acc_probs[q_i], start_word_id - len(all_preds[q_i]), + all_preds[q_i], + acc_probs[q_i], + start_word_id - len(all_preds[q_i]), ) acc_probs[q_i] = self._update_accumulated_probabilities(acc_probs[q_i], b_probs_i) for all_preds, acc_probs in [(all_punct_preds, acc_punct_probs), (all_capit_preds, acc_capit_probs)]: diff --git a/nemo/collections/nlp/models/token_classification/token_classification_model.py b/nemo/collections/nlp/models/token_classification/token_classification_model.py index 0b465bae663c..99bb2328b956 100644 --- a/nemo/collections/nlp/models/token_classification/token_classification_model.py +++ b/nemo/collections/nlp/models/token_classification/token_classification_model.py @@ -16,8 +16,8 @@ from typing import List, Optional, Union import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from torch.utils.data import DataLoader from nemo.collections.common.losses import CrossEntropyLoss diff --git a/nemo/collections/nlp/models/zero_shot_intent_recognition/zero_shot_intent_model.py b/nemo/collections/nlp/models/zero_shot_intent_recognition/zero_shot_intent_model.py index e65f3d7749eb..07e0826c712c 100644 --- a/nemo/collections/nlp/models/zero_shot_intent_recognition/zero_shot_intent_model.py +++ b/nemo/collections/nlp/models/zero_shot_intent_recognition/zero_shot_intent_model.py @@ -18,8 +18,8 @@ import numpy as np import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.nlp.data.zero_shot_intent_recognition.zero_shot_intent_dataset import ( ZeroShotIntentDataset, @@ -155,7 +155,6 @@ def predict( entailment_idx=1, contradiction_idx=0, ) -> List[Dict]: - """ Given a list of queries and a list of candidate labels, return a ranked list of labels and scores for each query. 
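Editor's note on the ZeroShotIntentModel.predict signature whose tail appears in the hunk above: per the docstring, it takes a list of queries plus candidate labels and returns a ranked list of labels and scores per query. A hedged usage sketch follows; the import path, checkpoint name, and positional argument names are assumptions drawn from the docstring, not verified API:

    # Hypothetical usage sketch; checkpoint name and argument order are assumptions.
    from nemo.collections.nlp.models import ZeroShotIntentModel

    model = ZeroShotIntentModel.from_pretrained("zeroshotintent_en_bert_base_uncased")  # assumed name
    queries = ["I'd like to book a table for two at 7pm"]
    candidate_labels = ["restaurant reservation", "weather", "play music"]

    results = model.predict(queries, candidate_labels, entailment_idx=1, contradiction_idx=0)
    for r in results:
        print(r)  # expected: a dict with the ranked labels and scores for this query
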
diff --git a/nemo/collections/nlp/modules/common/lm_utils.py b/nemo/collections/nlp/modules/common/lm_utils.py index af6fc9ecb0a7..86792059b28f 100644 --- a/nemo/collections/nlp/modules/common/lm_utils.py +++ b/nemo/collections/nlp/modules/common/lm_utils.py @@ -17,8 +17,8 @@ from typing import List, Optional, Union from attr import asdict +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.nlp.modules.common.bert_module import BertModule from nemo.collections.nlp.modules.common.decoder_module import DecoderModule diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index eeaaea26beac..4743c3216e6a 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -24,7 +24,7 @@ import numpy as np import torch import torch.nn.functional as F -from lightning_fabric.utilities.seed import seed_everything +from lightning.fabric.utilities.seed import seed_everything from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer from nemo.collections.multimodal.data.neva.conversation import ( diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index 7c7360ba3400..11f79baa819a 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -15,11 +15,11 @@ import sys from typing import Optional, Union -from lightning_fabric.utilities.exceptions import MisconfigurationException +from lightning.fabric.utilities.exceptions import MisconfigurationException +from lightning.pytorch import Trainer +from lightning.pytorch.callbacks import ModelSummary +from lightning.pytorch.plugins.environments import TorchElasticEnvironment from omegaconf import DictConfig, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelSummary -from pytorch_lightning.plugins.environments import TorchElasticEnvironment from nemo.collections.common.metrics.perf_metrics import FLOPsMeasurementCallback from nemo.collections.nlp.parts.nlp_overrides import ( diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 2100e9c1ba8f..c16116145a12 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -23,24 +23,24 @@ from pathlib import Path from typing import Any, Callable, Dict, Generator, Iterator, List, Literal, Mapping, Optional, Sized, Union -import pytorch_lightning as pl +import lightning.pytorch as pl import torch -from lightning_fabric.plugins import TorchCheckpointIO -from lightning_fabric.utilities.cloud_io import get_filesystem -from lightning_fabric.utilities.optimizer import _optimizer_to_device +from lightning.fabric.plugins import TorchCheckpointIO +from lightning.fabric.utilities.cloud_io import get_filesystem +from lightning.fabric.utilities.optimizer import _optimizer_to_device +from lightning.pytorch.callbacks.progress import TQDMProgressBar +from lightning.pytorch.callbacks.progress.tqdm_progress import _update_n +from lightning.pytorch.core.optimizer import LightningOptimizer +from lightning.pytorch.loops.fetchers import _DataFetcher +from lightning.pytorch.plugins import ClusterEnvironment +from lightning.pytorch.plugins.io.checkpoint_plugin import CheckpointIO +from 
lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO +from lightning.pytorch.plugins.precision import MixedPrecisionPlugin +from lightning.pytorch.plugins.precision.fsdp import FSDPPrecision +from lightning.pytorch.strategies import DDPStrategy, FSDPStrategy +from lightning.pytorch.trainer.states import TrainerFn +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.callbacks.progress import TQDMProgressBar -from pytorch_lightning.callbacks.progress.tqdm_progress import _update_n -from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.loops.fetchers import _DataFetcher -from pytorch_lightning.plugins import ClusterEnvironment -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO -from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin -from pytorch_lightning.plugins.precision.fsdp import FSDPPrecision -from pytorch_lightning.strategies import DDPStrategy, FSDPStrategy -from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.trainer.trainer import Trainer from torch._C._distributed_c10d import ReduceOp from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook from torch.distributed.fsdp import BackwardPrefetch, FullStateDictConfig @@ -107,6 +107,7 @@ from megatron.core.tensor_parallel.layers import param_is_not_tensor_parallel_duplicate from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_layer import TransformerLayer as MCoreTransformerLayer + from nemo.utils.callbacks.dist_ckpt_io import DistributedCheckpointIO HAVE_MEGATRON_CORE = True diff --git a/nemo/collections/nlp/parts/utils_funcs.py b/nemo/collections/nlp/parts/utils_funcs.py index a989ff3f606c..87fc1aa6f73c 100644 --- a/nemo/collections/nlp/parts/utils_funcs.py +++ b/nemo/collections/nlp/parts/utils_funcs.py @@ -28,9 +28,9 @@ import numpy as np import torch import torch.nn.functional as F +from lightning.pytorch.trainer.trainer import Trainer from matplotlib import pyplot as plt from omegaconf.dictconfig import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from sklearn.metrics import classification_report, confusion_matrix from torch import Tensor diff --git a/nemo/collections/tts/g2p/models/ctc.py b/nemo/collections/tts/g2p/models/ctc.py index 2e180e766211..1859b09594ff 100644 --- a/nemo/collections/tts/g2p/models/ctc.py +++ b/nemo/collections/tts/g2p/models/ctc.py @@ -19,8 +19,8 @@ import torch from hydra.utils import instantiate +from lightning.pytorch import Trainer from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from transformers import AutoConfig, AutoModel, AutoTokenizer from nemo.collections.tts.g2p.data.ctc import CTCG2PBPEDataset @@ -101,11 +101,21 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer) - self.wer = WER(decoding=self.decoding, use_cer=False, log_prediction=False, dist_sync_on_step=True,) - self.per = WER(decoding=self.decoding, use_cer=True, log_prediction=False, dist_sync_on_step=True,) + self.wer = WER( + decoding=self.decoding, + use_cer=False, + log_prediction=False, + dist_sync_on_step=True, + ) + self.per = WER( + decoding=self.decoding, + use_cer=True, + log_prediction=False, + dist_sync_on_step=True, + ) def 
setup_grapheme_tokenizer(self, cfg): - """ Initialized grapheme tokenizer """ + """Initialized grapheme tokenizer""" if self.mode == "byt5": # Load appropriate tokenizer from HuggingFace @@ -315,7 +325,10 @@ def _setup_infer_dataloader(self, cfg: DictConfig) -> 'torch.utils.data.DataLoad ) @torch.no_grad() - def _infer(self, config: DictConfig,) -> List[int]: + def _infer( + self, + config: DictConfig, + ) -> List[int]: """ Runs model inference. diff --git a/nemo/collections/tts/g2p/models/heteronym_classification.py b/nemo/collections/tts/g2p/models/heteronym_classification.py index 54b9a8b07413..47d08eb16e17 100644 --- a/nemo/collections/tts/g2p/models/heteronym_classification.py +++ b/nemo/collections/tts/g2p/models/heteronym_classification.py @@ -19,8 +19,8 @@ import torch from hydra.utils import instantiate +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.common.losses import CrossEntropyLoss from nemo.collections.nlp.metrics.classification_report import ClassificationReport @@ -113,9 +113,9 @@ def make_step(self, batch): def training_step(self, batch, batch_idx): """ - Lightning calls this inside the training loop with the data from the training dataloader - passed in as `batch`. - """ + Lightning calls this inside the training loop with the data from the training dataloader + passed in as `batch`. + """ loss, logits = self.make_step(batch) self.log('train_loss', loss) @@ -267,7 +267,11 @@ def disambiguate( item = {"text_graphemes": cur_sentence, "start_end": cur_start_ends, "heteronym_span": cur_heteronyms} f.write(json.dumps(item, ensure_ascii=False) + '\n') - all_preds = self._disambiguate(manifest=tmp_manifest, batch_size=batch_size, num_workers=num_workers,) + all_preds = self._disambiguate( + manifest=tmp_manifest, + batch_size=batch_size, + num_workers=num_workers, + ) if wordid_to_phonemes_file is not None: self.set_wordid_to_phonemes(wordid_to_phonemes_file) diff --git a/nemo/collections/tts/g2p/models/t5.py b/nemo/collections/tts/g2p/models/t5.py index 19f976081687..4c673b18dc4a 100644 --- a/nemo/collections/tts/g2p/models/t5.py +++ b/nemo/collections/tts/g2p/models/t5.py @@ -17,8 +17,8 @@ import torch from hydra.utils import instantiate +from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer from transformers import AutoTokenizer, T5ForConditionalGeneration from nemo.collections.asr.metrics.wer import word_error_rate diff --git a/nemo/collections/tts/models/aligner.py b/nemo/collections/tts/models/aligner.py index d8e65d6e6821..5fea8615f7f2 100644 --- a/nemo/collections/tts/models/aligner.py +++ b/nemo/collections/tts/models/aligner.py @@ -18,9 +18,9 @@ import omegaconf import torch from hydra.utils import instantiate +from lightning.pytorch import Trainer +from lightning.pytorch.loggers import WandbLogger from omegaconf import DictConfig -from pytorch_lightning import Trainer -from pytorch_lightning.loggers import WandbLogger from torch import nn from nemo.collections.tts.losses.aligner_loss import BinLoss, ForwardSumLoss diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index 0c5e41157613..230a24e36cb0 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -21,8 +21,8 @@ import torch.nn.functional as F from einops import rearrange from hydra.utils import instantiate +from lightning.pytorch import Trainer from omegaconf 
import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer from nemo.collections.tts.losses.audio_codec_loss import ( FeatureMatchingLoss, diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index b1e702c89124..34213303abf4 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -18,9 +18,9 @@ import torch from hydra.utils import instantiate +from lightning.pytorch import Trainer +from lightning.pytorch.loggers import TensorBoardLogger from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.loggers import TensorBoardLogger from nemo.collections.common.parts.preprocessing import parsers from nemo.collections.tts.losses.aligner_loss import BinLoss, ForwardSumLoss diff --git a/nemo/collections/tts/models/fastpitch_ssl.py b/nemo/collections/tts/models/fastpitch_ssl.py index fe743edf8783..f2384c41c5b5 100644 --- a/nemo/collections/tts/models/fastpitch_ssl.py +++ b/nemo/collections/tts/models/fastpitch_ssl.py @@ -16,9 +16,9 @@ import torch from hydra.utils import instantiate +from lightning.pytorch import Trainer +from lightning.pytorch.loggers import TensorBoardLogger from omegaconf import DictConfig -from pytorch_lightning import Trainer -from pytorch_lightning.loggers import TensorBoardLogger from nemo.collections.tts.losses.fastpitchloss import DurationLoss, MelLoss, PitchLoss from nemo.collections.tts.modules.fastpitch import FastPitchSSLModule, average_features @@ -34,7 +34,7 @@ class FastPitchModel_SSL(ModelPT): """ FastPitch based model that can synthesize mel spectrograms from content and speaker embeddings - obtained from SSLDisentangler. This model can be used for voice conversion by swapping the speaker embedding + obtained from SSLDisentangler. This model can be used for voice conversion by swapping the speaker embedding of a given source utterance, with the speaker embedding of a target speaker. 
""" @@ -133,9 +133,21 @@ def tb_logger(self): return self._tb_logger def forward( - self, *, enc_out=None, enc_mask=None, durs=None, pitch=None, pace=1.0, + self, + *, + enc_out=None, + enc_mask=None, + durs=None, + pitch=None, + pace=1.0, ): - return self.fastpitch(enc_out=enc_out, enc_mask=enc_mask, durs=durs, pitch=pitch, pace=pace,) + return self.fastpitch( + enc_out=enc_out, + enc_mask=enc_mask, + durs=durs, + pitch=pitch, + pace=pace, + ) def compute_encoding(self, content_embedding, speaker_embedding, dataset_id=None): # content embedding is (B, C, T) @@ -177,7 +189,11 @@ def training_step(self, batch, batch_idx): enc_mask = enc_mask[:, :, None] mels_pred, _, _, log_durs_pred, pitch_pred, pitch = self( - enc_out=enc_out, enc_mask=enc_mask, durs=durs, pitch=pitch, pace=1.0, + enc_out=enc_out, + enc_mask=enc_mask, + durs=durs, + pitch=pitch, + pace=1.0, ) loss = 0 @@ -208,7 +224,10 @@ def training_step(self, batch, batch_idx): ) spec_predict = mels_pred[0].data.cpu().float().numpy() self.tb_logger.add_image( - "train_mel_predicted", plot_spectrogram_to_numpy(spec_predict), self.global_step, dataformats="HWC", + "train_mel_predicted", + plot_spectrogram_to_numpy(spec_predict), + self.global_step, + dataformats="HWC", ) return loss @@ -286,7 +305,10 @@ def on_validation_epoch_end(self, outputs): ) spec_predict = spec_predict[_rand_idx].data.cpu().float().numpy() self.tb_logger.add_image( - "val_mel_predicted", plot_spectrogram_to_numpy(spec_predict), self.global_step, dataformats="HWC", + "val_mel_predicted", + plot_spectrogram_to_numpy(spec_predict), + self.global_step, + dataformats="HWC", ) if self.pitch_conditioning: @@ -321,10 +343,10 @@ def generate_wav( ): """ Args: - content_embedding : Content embedding from SSL backbone (B, C, T) + content_embedding : Content embedding from SSL backbone (B, C, T) speaker_embedding : Speaker embedding from SSL backbone (B, C) pitch_contour : Normalized Pitch contour derived from the mel spectrogram - encoded_len: Length of each content embedding, optional if batch size is 1. + encoded_len: Length of each content embedding, optional if batch size is 1. compute_pitch: if true, predict pitch contour from content and speaker embedding. compute_duration: if true, predict duration from content and speaker embedding. durs_gt: Ground truth duration of each content embedding, ignored if compute_duration is True. 
diff --git a/nemo/collections/tts/models/hifigan.py b/nemo/collections/tts/models/hifigan.py index 7a9a6d30671f..1a5462349c4d 100644 --- a/nemo/collections/tts/models/hifigan.py +++ b/nemo/collections/tts/models/hifigan.py @@ -18,8 +18,8 @@ import torch import torch.nn.functional as F from hydra.utils import instantiate +from lightning.pytorch.loggers.wandb import WandbLogger from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning.loggers.wandb import WandbLogger from nemo.collections.tts.losses.hifigan_losses import DiscriminatorLoss, FeatureMatchingLoss, GeneratorLoss from nemo.collections.tts.models.base import Vocoder @@ -313,7 +313,7 @@ def stft(x): comp = torch.stft(x.squeeze(1), n_fft=1024, hop_length=256, win_length=1024, return_complex=True) comp = torch.view_as_real(comp) real, imag = comp[..., 0], comp[..., 1] - mags = torch.sqrt(real ** 2 + imag ** 2) + mags = torch.sqrt(real**2 + imag**2) phase = torch.atan2(imag, real) return mags, phase diff --git a/nemo/collections/tts/models/mixer_tts.py b/nemo/collections/tts/models/mixer_tts.py index c260df22e3c0..58b7f6f9706b 100644 --- a/nemo/collections/tts/models/mixer_tts.py +++ b/nemo/collections/tts/models/mixer_tts.py @@ -20,9 +20,9 @@ import transformers import wandb from hydra.utils import instantiate +from lightning.pytorch import Trainer +from lightning.pytorch.loggers import WandbLogger from omegaconf import DictConfig -from pytorch_lightning import Trainer -from pytorch_lightning.loggers import WandbLogger from torch import nn from torch.nn import functional as F from transformers import AlbertTokenizer diff --git a/nemo/collections/tts/models/radtts.py b/nemo/collections/tts/models/radtts.py index 82f85d1ed6a2..3f04f2ca3908 100644 --- a/nemo/collections/tts/models/radtts.py +++ b/nemo/collections/tts/models/radtts.py @@ -15,9 +15,9 @@ import torch from hydra.utils import instantiate +from lightning.pytorch import Trainer +from lightning.pytorch.loggers import TensorBoardLogger from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer -from pytorch_lightning.loggers import TensorBoardLogger from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import BaseTokenizer from nemo.collections.tts.losses.radttsloss import AttentionBinarizationLoss, RADTTSLoss diff --git a/nemo/collections/tts/models/spectrogram_enhancer.py b/nemo/collections/tts/models/spectrogram_enhancer.py index 65934d9a10ce..3644a77eb6fe 100644 --- a/nemo/collections/tts/models/spectrogram_enhancer.py +++ b/nemo/collections/tts/models/spectrogram_enhancer.py @@ -43,9 +43,9 @@ import torch.nn.functional as F from einops import rearrange from hydra.utils import instantiate +from lightning.pytorch import Trainer +from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger from omegaconf import DictConfig -from pytorch_lightning import Trainer -from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger from torch.utils.tensorboard.writer import SummaryWriter from nemo.collections.common.parts.utils import mask_sequence_tensor diff --git a/nemo/collections/tts/models/ssl_tts.py b/nemo/collections/tts/models/ssl_tts.py index 298a1a599008..f2cc4f798ec5 100644 --- a/nemo/collections/tts/models/ssl_tts.py +++ b/nemo/collections/tts/models/ssl_tts.py @@ -18,10 +18,10 @@ import librosa import torch from hydra.utils import instantiate +from lightning.pytorch import Trainer +from lightning.pytorch.loggers import TensorBoardLogger +from lightning.pytorch.utilities.combined_loader import 
CombinedLoader from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer -from pytorch_lightning.loggers import TensorBoardLogger -from pytorch_lightning.utilities.combined_loader import CombinedLoader from nemo.collections.asr.losses.angularloss import AngularSoftmaxLoss from nemo.collections.tts.data.dataset import TTSDataset @@ -38,10 +38,10 @@ class SSLDisentangler(ModelPT): """ SSLDisentangler is a Conformer based model for extracting disentangled content and speaker embeddings - from an audio waveform. This model uses a pre-trained Conformer SSL model. To extract the linguistic content - and speaker representations using a pre-trained Conformer, two randomly initialized downstream - heads are added and the entire setup is finetuned in multi-task manner for speech recognition and speaker verification. - These representations can be used by FastPitchModel_SSL for voice conversion by swapping the speaker embedding + from an audio waveform. This model uses a pre-trained Conformer SSL model. To extract the linguistic content + and speaker representations using a pre-trained Conformer, two randomly initialized downstream + heads are added and the entire setup is finetuned in multi-task manner for speech recognition and speaker verification. + These representations can be used by FastPitchModel_SSL for voice conversion by swapping the speaker embedding of a given source utterance, with the speaker embedding of a target speaker. """ @@ -92,7 +92,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): librosa_mel_filter = librosa.filters.mel( sr=stft_cfg.sample_rate, n_fft=stft_cfg.n_fft, n_mels=stft_cfg.features, fmin=0, fmax=8000 ) - fb = torch.tensor(librosa_mel_filter, dtype=torch.float,).unsqueeze(0) + fb = torch.tensor( + librosa_mel_filter, + dtype=torch.float, + ).unsqueeze(0) self.register_buffer("fb", fb) @@ -212,7 +215,10 @@ def configure_optimizers(self): sched_downstream_config = optim_downstream_config.pop("sched", None) OmegaConf.set_struct(optim_downstream_config, True) - optim_backbone = instantiate(optim_backbone_config, params=self.encoder.parameters(),) + optim_backbone = instantiate( + optim_backbone_config, + params=self.encoder.parameters(), + ) optim_downstream = instantiate( optim_downstream_config, params=itertools.chain( @@ -254,7 +260,8 @@ def configure_optimizers(self): def forward(self, input_signal=None, input_signal_length=None, normalize_content=True): processed_signal, processed_signal_length = self.preprocessor_disentangler( - input_signal=input_signal, length=input_signal_length, + input_signal=input_signal, + length=input_signal_length, ) encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) # b,c,t @@ -292,7 +299,9 @@ def forward_for_export(self, input_signal=None, input_signal_length=None, normal # Same as forward right now. Earlier version of encoder had a different forward for export. # This function is still kept for compatibility with older evaluation/inference scripts. 
return self.forward( - input_signal=input_signal, input_signal_length=input_signal_length, normalize_content=normalize_content, + input_signal=input_signal, + input_signal_length=input_signal_length, + normalize_content=normalize_content, ) def training_step(self, batch, batch_idx): diff --git a/nemo/collections/tts/models/tacotron2.py b/nemo/collections/tts/models/tacotron2.py index 2fb005d80ca6..33d476029011 100644 --- a/nemo/collections/tts/models/tacotron2.py +++ b/nemo/collections/tts/models/tacotron2.py @@ -18,9 +18,9 @@ import torch from hydra.utils import instantiate +from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger from omegaconf import MISSING, DictConfig, OmegaConf, open_dict from omegaconf.errors import ConfigAttributeError -from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger from torch import nn from nemo.collections.common.parts.preprocessing import parsers diff --git a/nemo/collections/tts/models/univnet.py b/nemo/collections/tts/models/univnet.py index 64ee891b0754..12500be8d180 100644 --- a/nemo/collections/tts/models/univnet.py +++ b/nemo/collections/tts/models/univnet.py @@ -18,8 +18,8 @@ import torch import torch.nn.functional as F from hydra.utils import instantiate +from lightning.pytorch.loggers.wandb import WandbLogger from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning.loggers.wandb import WandbLogger from nemo.collections.tts.losses.hifigan_losses import DiscriminatorLoss, GeneratorLoss from nemo.collections.tts.losses.stftlosses import MultiResolutionSTFTLoss @@ -114,8 +114,14 @@ def configure_optimizers(self): if sched_config is None and 'sched' in self._cfg: sched_config = self._cfg.sched - optim_g = instantiate(optim_config, params=self.generator.parameters(),) - optim_d = instantiate(optim_config, params=itertools.chain(self.mrd.parameters(), self.mpd.parameters()),) + optim_g = instantiate( + optim_config, + params=self.generator.parameters(), + ) + optim_d = instantiate( + optim_config, + params=itertools.chain(self.mrd.parameters(), self.mpd.parameters()), + ) if sched_config is not None: max_steps = self._cfg.get("max_steps", None) @@ -290,7 +296,7 @@ def stft(x): comp = torch.stft(x.squeeze(1), n_fft=1024, hop_length=256, win_length=1024, return_complex=True) comp = torch.view_as_real(comp) real, imag = comp[..., 0], comp[..., 1] - mags = torch.sqrt(real ** 2 + imag ** 2) + mags = torch.sqrt(real**2 + imag**2) phase = torch.atan2(imag, real) return mags, phase diff --git a/nemo/collections/tts/models/vits.py b/nemo/collections/tts/models/vits.py index 4a891fa8823e..3c53442a0863 100644 --- a/nemo/collections/tts/models/vits.py +++ b/nemo/collections/tts/models/vits.py @@ -18,9 +18,9 @@ import omegaconf import torch from hydra.utils import instantiate +from lightning.pytorch import Trainer +from lightning.pytorch.loggers import WandbLogger from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer -from pytorch_lightning.loggers import WandbLogger from torch.cuda.amp import autocast from torch.nn import functional as F diff --git a/nemo/collections/tts/models/waveglow.py b/nemo/collections/tts/models/waveglow.py index 728b5b94b084..04eec734b26e 100644 --- a/nemo/collections/tts/models/waveglow.py +++ b/nemo/collections/tts/models/waveglow.py @@ -15,8 +15,8 @@ import torch from hydra.utils import instantiate +from lightning.pytorch.loggers import TensorBoardLogger from omegaconf import DictConfig, open_dict -from pytorch_lightning.loggers import TensorBoardLogger from 
nemo.collections.tts.losses.waveglowloss import WaveGlowLoss from nemo.collections.tts.models.base import GlowVocoder diff --git a/nemo/collections/tts/parts/utils/callbacks.py b/nemo/collections/tts/parts/utils/callbacks.py index c4ec09031cf9..1856dee0ce0f 100644 --- a/nemo/collections/tts/parts/utils/callbacks.py +++ b/nemo/collections/tts/parts/utils/callbacks.py @@ -23,10 +23,10 @@ import soundfile as sf import torch from einops import rearrange -from pytorch_lightning import Callback, LightningModule, Trainer -from pytorch_lightning.loggers import TensorBoardLogger -from pytorch_lightning.loggers.logger import Logger -from pytorch_lightning.loggers.wandb import WandbLogger +from lightning.pytorch import Callback, LightningModule, Trainer +from lightning.pytorch.loggers import TensorBoardLogger +from lightning.pytorch.loggers.logger import Logger +from lightning.pytorch.loggers.wandb import WandbLogger from torch import Tensor from nemo.collections.tts.parts.utils.helpers import create_plot @@ -194,7 +194,10 @@ def _log_audio(self, audio: AudioArtifact, log_dir: Path, step: int): if self.tensorboard_logger: self.tensorboard_logger.add_audio( - tag=audio.id, snd_tensor=audio.data, global_step=step, sample_rate=audio.sample_rate, + tag=audio.id, + snd_tensor=audio.data, + global_step=step, + sample_rate=audio.sample_rate, ) if self.wandb_logger: @@ -212,7 +215,10 @@ def _log_image(self, image: ImageArtifact, log_dir: Path, step: int): if self.tensorboard_logger: self.tensorboard_logger.add_image( - tag=image.id, img_tensor=image_plot, global_step=step, dataformats="HWC", + tag=image.id, + img_tensor=image_plot, + global_step=step, + dataformats="HWC", ) if self.wandb_logger: @@ -220,8 +226,7 @@ def _log_image(self, image: ImageArtifact, log_dir: Path, step: int): self.wandb_logger.log({image.id: wandb_image}) def _log_artifacts(self, audio_list: list, image_list: list, log_dir: Optional[Path] = None, global_step: int = 0): - """Log audio and image artifacts. - """ + """Log audio and image artifacts.""" if log_dir is not None: log_dir.mkdir(parents=True, exist_ok=True) @@ -232,8 +237,7 @@ def _log_artifacts(self, audio_list: list, image_list: list, log_dir: Optional[P self._log_image(image=image, log_dir=log_dir, step=global_step) def on_fit_start(self, trainer: Trainer, model: LightningModule): - """Log initial data artifacts. - """ + """Log initial data artifacts.""" audio_list = [] image_list = [] for batch_dict in self.data_loader: @@ -255,8 +259,7 @@ def on_fit_start(self, trainer: Trainer, model: LightningModule): self._log_artifacts(audio_list=audio_list, image_list=image_list, log_dir=log_dir) def on_train_epoch_end(self, trainer: Trainer, model: LightningModule): - """Log artifacts at the end of an epoch. 
- """ + """Log artifacts at the end of an epoch.""" epoch = 1 + model.current_epoch if (epoch not in self.log_epochs) and (epoch % self.epoch_frequency != 0): return @@ -306,7 +309,10 @@ def generate_artifacts( audio_gt_path = Path(f"{dataset_name}/{audio_id}_gt.wav") audio_gt_i = audio[i, : audio_len[i]].cpu().numpy() audio_artifact = AudioArtifact( - id=f"audio_gt_{audio_id}", data=audio_gt_i, filepath=audio_gt_path, sample_rate=model.sample_rate, + id=f"audio_gt_{audio_id}", + data=audio_gt_i, + filepath=audio_gt_path, + sample_rate=model.sample_rate, ) audio_artifacts.append(audio_artifact) return audio_artifacts, [] @@ -321,7 +327,10 @@ def generate_artifacts( audio_pred_path = Path(f"{dataset_name}/{audio_id}.wav") audio_pred_i = audio_pred[i, : audio_len[i]].cpu().numpy() audio_artifact = AudioArtifact( - id=f"audio_{audio_id}", data=audio_pred_i, filepath=audio_pred_path, sample_rate=model.sample_rate, + id=f"audio_{audio_id}", + data=audio_pred_i, + filepath=audio_pred_path, + sample_rate=model.sample_rate, ) audio_artifacts.append(audio_artifact) @@ -378,7 +387,10 @@ def _generate_audio( audio_pred_path = Path(f"{dataset_name}/{audio_id}_audio_out.wav") audio_pred_i = audio_pred[i, : audio_pred_len[i]].cpu().numpy() audio_artifact = AudioArtifact( - id=f"audio_out_{audio_id}", data=audio_pred_i, filepath=audio_pred_path, sample_rate=model.sample_rate, + id=f"audio_out_{audio_id}", + data=audio_pred_i, + filepath=audio_pred_path, + sample_rate=model.sample_rate, ) audio_artifacts.append(audio_artifact) @@ -388,7 +400,10 @@ def _generate_audio( audio_in_path = Path(f"{dataset_name}/{audio_id}_audio_in.wav") audio_in_i = audio[i, : audio_len[i]].cpu().numpy() audio_artifact = AudioArtifact( - id=f"audio_in_{audio_id}", data=audio_in_i, filepath=audio_in_path, sample_rate=model.sample_rate, + id=f"audio_in_{audio_id}", + data=audio_in_i, + filepath=audio_in_path, + sample_rate=model.sample_rate, ) audio_artifacts.append(audio_artifact) @@ -538,7 +553,11 @@ def _create_ground_truth_artifacts( spec_gt_path = Path(f"{dataset_name}/{audio_id}_spec_gt.png") spec_gt_i = spec[i, :, : spec_len[i]].cpu().numpy() spec_artifact = ImageArtifact( - id=f"spec_{audio_id}", data=spec_gt_i, filepath=spec_gt_path, x_axis="Audio Frames", y_axis="Channels", + id=f"spec_{audio_id}", + data=spec_gt_i, + filepath=spec_gt_path, + x_axis="Audio Frames", + y_axis="Channels", ) image_artifacts.append(spec_artifact) @@ -565,14 +584,22 @@ def _generate_predictions( with torch.no_grad(): # [B, C, T_spec] - mels_pred, mels_pred_len, *_ = model.forward(text=text, input_lens=text_lens, speaker=speaker,) + mels_pred, mels_pred_len, *_ = model.forward( + text=text, + input_lens=text_lens, + speaker=speaker, + ) if self.log_spectrogram: for i, (dataset_name, audio_id) in enumerate(zip(dataset_names, audio_ids)): spec_path = Path(f"{dataset_name}/{audio_id}_spec.png") spec_i = mels_pred[i, :, : mels_pred_len[i]].cpu().numpy() spec_artifact = ImageArtifact( - id=f"spec_{audio_id}", data=spec_i, filepath=spec_path, x_axis="Audio Frames", y_axis="Channels", + id=f"spec_{audio_id}", + data=spec_i, + filepath=spec_path, + x_axis="Audio Frames", + y_axis="Channels", ) image_artifacts.append(spec_artifact) diff --git a/nemo/collections/tts/parts/utils/helpers.py b/nemo/collections/tts/parts/utils/helpers.py index 85d12a4261da..28be259502c5 100644 --- a/nemo/collections/tts/parts/utils/helpers.py +++ b/nemo/collections/tts/parts/utils/helpers.py @@ -63,7 +63,7 @@ HAVE_WANDB = False try: - from pytorch_lightning.utilities import 
rank_zero_only + from lightning.pytorch.utilities import rank_zero_only except ModuleNotFoundError: from functools import wraps diff --git a/nemo/collections/vision/models/megatron_vit_classification_models.py b/nemo/collections/vision/models/megatron_vit_classification_models.py index 5cffdd6d12a3..c4024a5a47a7 100644 --- a/nemo/collections/vision/models/megatron_vit_classification_models.py +++ b/nemo/collections/vision/models/megatron_vit_classification_models.py @@ -17,9 +17,9 @@ from typing import Any, Optional import torch +from lightning.pytorch.accelerators import CPUAccelerator +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig -from pytorch_lightning.accelerators import CPUAccelerator -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import MegatronPretrainingSampler from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel diff --git a/nemo/collections/vlm/mllama/data/lazy.py b/nemo/collections/vlm/mllama/data/lazy.py index 30b8b2ea9d9c..5069f8593377 100644 --- a/nemo/collections/vlm/mllama/data/lazy.py +++ b/nemo/collections/vlm/mllama/data/lazy.py @@ -18,10 +18,10 @@ import re from typing import Any, Dict, List, Optional, Sequence -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import torch.nn.functional as F -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils import data from torch.utils.data import DataLoader, default_collate diff --git a/nemo/collections/vlm/mllama/data/mock.py b/nemo/collections/vlm/mllama/data/mock.py index bb3afe83ea46..a88838b0025f 100644 --- a/nemo/collections/vlm/mllama/data/mock.py +++ b/nemo/collections/vlm/mllama/data/mock.py @@ -14,10 +14,10 @@ from typing import Dict, List, Optional, Tuple +import lightning.pytorch as pl import numpy as np -import pytorch_lightning as pl import torch -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils import data from torch.utils.data import DataLoader, Dataset diff --git a/nemo/collections/vlm/mllama/model/base.py b/nemo/collections/vlm/mllama/model/base.py index f03af078987d..7dd84fefbb18 100644 --- a/nemo/collections/vlm/mllama/model/base.py +++ b/nemo/collections/vlm/mllama/model/base.py @@ -17,7 +17,7 @@ from dataclasses import dataclass from typing import Callable, Dict, List, Optional, Tuple -import pytorch_lightning as L +import lightning.pytorch as L import torch import torch.distributed from einops import rearrange diff --git a/nemo/collections/vlm/neva/data/api.py b/nemo/collections/vlm/neva/data/api.py index c2e51e033d8a..15ba45c82fd9 100644 --- a/nemo/collections/vlm/neva/data/api.py +++ b/nemo/collections/vlm/neva/data/api.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule from nemo.collections.vlm.neva.data.mock import MockDataModule diff --git a/nemo/collections/vlm/neva/data/lazy.py b/nemo/collections/vlm/neva/data/lazy.py index 57aa5b408835..fddaca14faeb 100644 --- a/nemo/collections/vlm/neva/data/lazy.py +++ b/nemo/collections/vlm/neva/data/lazy.py @@ -20,12 +20,12 @@ from typing import Any, Dict, List, Optional, Sequence import decord +import lightning.pytorch as pl import numpy as np -import pytorch_lightning as pl import torch import torch.nn.functional as F +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from PIL import Image -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils import data from torch.utils.data import DataLoader, Dataset, default_collate from transformers import CLIPImageProcessor, SiglipImageProcessor diff --git a/nemo/collections/vlm/neva/data/mock.py b/nemo/collections/vlm/neva/data/mock.py index ac4bc56a068c..ede06e9f5778 100644 --- a/nemo/collections/vlm/neva/data/mock.py +++ b/nemo/collections/vlm/neva/data/mock.py @@ -14,10 +14,10 @@ from typing import Dict, List, Optional +import lightning.pytorch as pl import numpy as np -import pytorch_lightning as pl import torch -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils import data from torch.utils.data import DataLoader, Dataset diff --git a/nemo/collections/vlm/neva/model/api.py b/nemo/collections/vlm/neva/model/api.py index 62374d536712..19e94c70381e 100644 --- a/nemo/collections/vlm/neva/model/api.py +++ b/nemo/collections/vlm/neva/model/api.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.collections.vlm.neva.model import Llava1_5Config7B, Llava1_5Config13B, LlavaModel diff --git a/nemo/collections/vlm/neva/model/base.py b/nemo/collections/vlm/neva/model/base.py index 260b7e7e0f4a..d4e578218ed2 100644 --- a/nemo/collections/vlm/neva/model/base.py +++ b/nemo/collections/vlm/neva/model/base.py @@ -17,7 +17,7 @@ from dataclasses import dataclass from typing import Callable, Dict, List, Optional, Union -import pytorch_lightning as L +import lightning.pytorch as L import torch import torch.distributed import torch.nn.functional as F diff --git a/nemo/collections/vlm/recipes/mllama_11b.py b/nemo/collections/vlm/recipes/mllama_11b.py index 697be9990faf..e4842ae63d52 100644 --- a/nemo/collections/vlm/recipes/mllama_11b.py +++ b/nemo/collections/vlm/recipes/mllama_11b.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo import lightning as nl diff --git a/nemo/collections/vlm/recipes/mllama_90b.py b/nemo/collections/vlm/recipes/mllama_90b.py index 8822aa9b189f..28a6ff7ff9a6 100644 --- a/nemo/collections/vlm/recipes/mllama_90b.py +++ b/nemo/collections/vlm/recipes/mllama_90b.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo import lightning as nl diff --git a/nemo/core/classes/__init__.py b/nemo/core/classes/__init__.py index 3a6db2602648..e773972c6d7b 100644 --- a/nemo/core/classes/__init__.py +++ b/nemo/core/classes/__init__.py @@ -14,8 +14,8 @@ import hydra +import lightning.pytorch import omegaconf -import pytorch_lightning from nemo.core.classes.common import ( FileIO, diff --git a/nemo/core/classes/exportable.py b/nemo/core/classes/exportable.py index aab09d42d907..ba284e7c28cd 100644 --- a/nemo/core/classes/exportable.py +++ b/nemo/core/classes/exportable.py @@ -15,7 +15,7 @@ from typing import Dict, List, Optional, Union import torch -from pytorch_lightning.core.module import _jit_is_scripting +from lightning.pytorch.core.module import _jit_is_scripting from nemo.core.classes import typecheck from nemo.core.neural_types import NeuralType diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index a15f769e9d88..308f5cbb8bee 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -35,9 +35,9 @@ HAVE_MEGATRON_CORE = False +from lightning.pytorch import LightningModule, Trainer +from lightning.pytorch.utilities import model_summary, rank_zero_only from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.utilities import model_summary, rank_zero_only from nemo import package_info from nemo.core import optim @@ -79,7 +79,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): """ if trainer is not None and not isinstance(trainer, Trainer): raise ValueError( - f"trainer constructor argument must be either None or pytorch_lightning.Trainer. But got {type(trainer)} instead." + f"trainer constructor argument must be either None or lightning.pytorch.Trainer. But got {type(trainer)} instead." 
) super().__init__() diff --git a/nemo/core/connectors/save_restore_connector.py b/nemo/core/connectors/save_restore_connector.py index cd9971a9c383..2c4c826d1daf 100644 --- a/nemo/core/connectors/save_restore_connector.py +++ b/nemo/core/connectors/save_restore_connector.py @@ -23,9 +23,9 @@ from typing import Callable, Generator, Optional, Set, Union import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import DictConfig, OmegaConf from omegaconf.omegaconf import open_dict -from pytorch_lightning.trainer.trainer import Trainer from nemo.core import classes as nemo_classes # to avoid circular import do not import ModelPT directly from nemo.utils import logging, model_utils diff --git a/nemo/core/utils/k2_guard.py b/nemo/core/utils/k2_guard.py index a9f64ce39c6b..b0e86d319ec0 100644 --- a/nemo/core/utils/k2_guard.py +++ b/nemo/core/utils/k2_guard.py @@ -21,8 +21,9 @@ import textwrap +from lightning.pytorch.utilities.imports import package_available from packaging.version import Version -from pytorch_lightning.utilities.imports import package_available + from nemo.core.utils.k2_utils import K2_INSTALLATION_MESSAGE __K2_MINIMUM_MAJOR_VERSION = 1 diff --git a/nemo/deploy/deploy_base.py b/nemo/deploy/deploy_base.py index 63746199bac6..41e0e7ddbdc9 100644 --- a/nemo/deploy/deploy_base.py +++ b/nemo/deploy/deploy_base.py @@ -18,7 +18,7 @@ use_pytorch_lightning = True try: - from pytorch_lightning import Trainer + from lightning.pytorch import Trainer except Exception: use_pytorch_lightning = False diff --git a/nemo/deploy/nlp/megatronllm_deployable.py b/nemo/deploy/nlp/megatronllm_deployable.py index 64cf6114ceba..0ce5991cdc95 100644 --- a/nemo/deploy/nlp/megatronllm_deployable.py +++ b/nemo/deploy/nlp/megatronllm_deployable.py @@ -20,7 +20,7 @@ import numpy as np import torch import wrapt -from pytorch_lightning.trainer.trainer import Trainer +from lightning.pytorch.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.text_generation_utils import ( diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index 91d3b3f936d0..e01a2d5e5765 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -14,8 +14,8 @@ from typing import Union -from lightning_fabric.plugins.environments import slurm -from pytorch_lightning import plugins as _pl_plugins +from lightning.fabric.plugins.environments import slurm +from lightning.pytorch import plugins as _pl_plugins # This is here to import it once, which improves the speed of launch when in debug-mode from nemo.utils.import_utils import safe_import diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index 1bee71e26e17..ea6fa4c4d226 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -28,7 +28,7 @@ if TYPE_CHECKING: - from lightning_fabric.utilities.types import Optimizable + from lightning.fabric.utilities.types import Optimizable from megatron.core.model_parallel_config import ModelParallelConfig diff --git a/nemo/lightning/base.py b/nemo/lightning/base.py index b6ba14726818..3b0b1c0c7234 100644 --- a/nemo/lightning/base.py +++ b/nemo/lightning/base.py @@ -19,7 +19,7 @@ import torch import torch.distributed -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from torch import nn diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index 9cf686464417..9cb685a096fa 100644 --- 
a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -19,7 +19,7 @@ from typing import List, Literal, Optional import torch -from pytorch_lightning.overrides.distributed import _IndexBatchSamplerWrapper +from lightning.pytorch.overrides.distributed import _IndexBatchSamplerWrapper from torch.utils.data import DataLoader, Dataset diff --git a/nemo/lightning/fabric/conversion.py b/nemo/lightning/fabric/conversion.py index 9ad713ec5261..d1c7affe3f40 100644 --- a/nemo/lightning/fabric/conversion.py +++ b/nemo/lightning/fabric/conversion.py @@ -15,10 +15,10 @@ from functools import singledispatch from typing import Any, TypeVar -from lightning_fabric import plugins as fl_plugins -from lightning_fabric import strategies as fl_strategies -from pytorch_lightning import plugins as pl_plugins -from pytorch_lightning import strategies as pl_strategies +from lightning.fabric import plugins as fl_plugins +from lightning.fabric import strategies as fl_strategies +from lightning.pytorch import plugins as pl_plugins +from lightning.pytorch import strategies as pl_strategies T = TypeVar('T') FabricT = TypeVar('FabricT') @@ -39,8 +39,8 @@ def to_fabric(obj: Any) -> Any: NotImplementedError: If no converter is registered for the object's type. Example: - >>> from pytorch_lightning.strategies import Strategy as PLStrategy - >>> from lightning_fabric.strategies import Strategy as FabricStrategy + >>> from lightning.pytorch.strategies import Strategy as PLStrategy + >>> from lightning.fabric.strategies import Strategy as FabricStrategy >>> from nemo.lightning.fabric.conversion import to_fabric >>> >>> # Define a custom PyTorch Lightning strategy @@ -70,7 +70,7 @@ def to_fabric(obj: Any) -> Any: f"No Fabric converter registered for {type(obj).__name__}. " f"To register a new conversion, use the @to_fabric.register decorator:\n\n" f"from nemo.lightning.fabric.conversion import to_fabric\n" - f"from lightning_fabric import strategies as fl_strategies\n\n" + f"from lightning.fabric import strategies as fl_strategies\n\n" f"@to_fabric.register({type(obj).__name__})\n" f"def _{type(obj).__name__.lower()}_converter(obj: {type(obj).__name__}) -> fl_strategies.Strategy:\n" f" return fl_strategies.SomeStrategy(\n" diff --git a/nemo/lightning/fabric/fabric.py b/nemo/lightning/fabric/fabric.py index 60eb518a1e42..7d604de749d6 100644 --- a/nemo/lightning/fabric/fabric.py +++ b/nemo/lightning/fabric/fabric.py @@ -17,8 +17,8 @@ from typing import TYPE_CHECKING, Optional, Protocol, Sequence, Type, TypeVar, Union, runtime_checkable import fiddle as fdl -import lightning_fabric as lb -import pytorch_lightning as pl +import lightning.fabric as lb +import lightning.pytorch as pl from torch import nn from typing_extensions import Self, override diff --git a/nemo/lightning/fabric/plugins.py b/nemo/lightning/fabric/plugins.py index 723b48b6b357..58bf5f5ca9f9 100644 --- a/nemo/lightning/fabric/plugins.py +++ b/nemo/lightning/fabric/plugins.py @@ -16,7 +16,7 @@ from typing import TYPE_CHECKING, Any, Generator, Literal, TypeVar import torch -from lightning_fabric.plugins.precision import MixedPrecision +from lightning.fabric.plugins.precision import MixedPrecision from torch import nn from torch.optim import Optimizer diff --git a/nemo/lightning/fabric/strategies.py b/nemo/lightning/fabric/strategies.py index 575f69a58caf..30a03504060f 100644 --- a/nemo/lightning/fabric/strategies.py +++ b/nemo/lightning/fabric/strategies.py @@ -29,21 +29,21 @@ ) import torch -from lightning_fabric.accelerators import CPUAccelerator -from 
lightning_fabric.accelerators.accelerator import Accelerator -from lightning_fabric.plugins.collectives.torch_collective import default_pg_timeout -from lightning_fabric.plugins.environments.cluster_environment import ClusterEnvironment -from lightning_fabric.plugins.io.checkpoint_io import CheckpointIO -from lightning_fabric.plugins.precision import Precision -from lightning_fabric.strategies import DDPStrategy -from lightning_fabric.strategies.strategy import _validate_keys_for_strict_loading -from lightning_fabric.utilities.types import _PATH, _Stateful +from lightning.fabric.accelerators import CPUAccelerator +from lightning.fabric.accelerators.accelerator import Accelerator +from lightning.fabric.plugins.collectives.torch_collective import default_pg_timeout +from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment +from lightning.fabric.plugins.io.checkpoint_io import CheckpointIO +from lightning.fabric.plugins.precision import Precision +from lightning.fabric.strategies import DDPStrategy +from lightning.fabric.strategies.strategy import _validate_keys_for_strict_loading +from lightning.fabric.utilities.types import _PATH, _Stateful +from lightning.pytorch import LightningDataModule +from lightning.pytorch.loops.fetchers import _DataFetcher +from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO +from lightning.pytorch.utilities.combined_loader import CombinedLoader from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning import LightningDataModule -from pytorch_lightning.loops.fetchers import _DataFetcher -from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO -from pytorch_lightning.utilities.combined_loader import CombinedLoader from torch import Tensor, nn from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook from torch.nn import Module diff --git a/nemo/lightning/io/api.py b/nemo/lightning/io/api.py index be9372f2e79b..869ec6e613cb 100644 --- a/nemo/lightning/io/api.py +++ b/nemo/lightning/io/api.py @@ -16,7 +16,7 @@ from typing import Callable, Optional, Type, overload import fiddle as fdl -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.lightning.io.mixin import ConnectorMixin, ConnT, ModelConnector, load from nemo.lightning.io.pl import TrainerContext diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index e699f15565bd..a38be6ee8f0a 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -18,9 +18,9 @@ from pathlib import Path, PosixPath, PurePath, WindowsPath from typing import Generic, Optional, Tuple, TypeVar -import pytorch_lightning as pl +import lightning.pytorch as pl from filelock import FileLock, Timeout -from pytorch_lightning.trainer.states import TrainerFn +from lightning.pytorch.trainer.states import TrainerFn from nemo.lightning.ckpt_utils import ckpt_to_context_subdir diff --git a/nemo/lightning/io/pl.py b/nemo/lightning/io/pl.py index 10ed52b136c2..788697887e39 100644 --- a/nemo/lightning/io/pl.py +++ b/nemo/lightning/io/pl.py @@ -17,12 +17,12 @@ from pathlib import Path from typing import Any, Callable, Dict, Generic, Optional, TypeVar, Union -import pytorch_lightning as pl +import lightning.pytorch as pl import torch -from lightning_fabric.plugins import CheckpointIO -from lightning_fabric.plugins.io.checkpoint_io import CheckpointIO -from lightning_fabric.utilities.cloud_io import get_filesystem -from 
lightning_fabric.utilities.types import _PATH +from lightning.fabric.plugins import CheckpointIO +from lightning.fabric.plugins.io.checkpoint_io import CheckpointIO +from lightning.fabric.utilities.cloud_io import get_filesystem +from lightning.fabric.utilities.types import _PATH from megatron.core.dist_checkpointing.serialization import ( get_default_load_sharded_strategy, get_default_save_sharded_strategy, diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 6a3138b1da29..0f84f3be0a23 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -42,13 +42,12 @@ import torch import torch.distributed +from lightning.pytorch.utilities import move_data_to_device from megatron.core import parallel_state from megatron.core.distributed import DistributedDataParallel as McoreDDP from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig from megatron.core.transformer.transformer_config import TransformerConfig -from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import move_data_to_device from torch import Tensor, nn from typing_extensions import override @@ -58,7 +57,7 @@ STEP_OUTPUT = Optional[Union[Tensor, Mapping[str, Any]]] if TYPE_CHECKING: - import pytorch_lightning as pl + import lightning.pytorch as pl @runtime_checkable @@ -836,7 +835,7 @@ def add(self, *callbacks) -> "CallbackConnector": """ _pl_callback = None try: - import pytorch_lightning as pl + import lightning.pytorch as pl _pl_callback = pl.Callback except ImportError: diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index a901a3a8842a..79f622ebc6a8 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -18,10 +18,10 @@ from pathlib import Path from typing import List, Optional, Union -import lightning_fabric as fl -import pytorch_lightning as pl -from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint as PTLModelCheckpoint -from pytorch_lightning.loggers import Logger, TensorBoardLogger, WandbLogger +import lightning.fabric as fl +import lightning.pytorch as pl +from lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint as PTLModelCheckpoint +from lightning.pytorch.loggers import Logger, TensorBoardLogger, WandbLogger from nemo.lightning.io.mixin import IOMixin from nemo.lightning.pytorch.callbacks import ModelCheckpoint diff --git a/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py b/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py index 391666fb8f32..320140d76f3a 100644 --- a/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py +++ b/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py @@ -15,8 +15,8 @@ from functools import cache import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.utils import check_param_hashes_across_dp_replicas -from pytorch_lightning.callbacks.callback import Callback from nemo.lightning import io from nemo.utils import logging diff --git a/nemo/lightning/pytorch/callbacks/debugging.py b/nemo/lightning/pytorch/callbacks/debugging.py index 5f6e722ef89b..135e8e486837 100644 --- a/nemo/lightning/pytorch/callbacks/debugging.py +++ b/nemo/lightning/pytorch/callbacks/debugging.py @@ -14,9 +14,9 @@ from typing import Callable, Dict, List, Optional, Union -import pytorch_lightning as pl +import lightning.pytorch as pl import torch -from pytorch_lightning.callbacks import Callback +from 
lightning.pytorch.callbacks import Callback from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule from nemo.utils import logging diff --git a/nemo/lightning/pytorch/callbacks/garbage_collection.py b/nemo/lightning/pytorch/callbacks/garbage_collection.py index ba4d378ee893..90e122f6d3e4 100644 --- a/nemo/lightning/pytorch/callbacks/garbage_collection.py +++ b/nemo/lightning/pytorch/callbacks/garbage_collection.py @@ -15,7 +15,7 @@ import gc from typing import Any -import pytorch_lightning as pl +import lightning.pytorch as pl from nemo.utils import logging diff --git a/nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py b/nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py index fc4312e2ff84..58173642527a 100644 --- a/nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py +++ b/nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py @@ -13,12 +13,12 @@ # limitations under the License. from dataclasses import asdict, dataclass, fields -import pytorch_lightning as pl +import lightning.pytorch as pl +from lightning.pytorch.callbacks.callback import Callback from megatron.core import ModelParallelConfig from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.callbacks.callback import Callback from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import TransformerLayerTPOverlapCfg from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy, ParallelismConfig diff --git a/nemo/lightning/pytorch/callbacks/memory_profiler.py b/nemo/lightning/pytorch/callbacks/memory_profiler.py index 5b2ee1d46e11..2813bd141a7a 100644 --- a/nemo/lightning/pytorch/callbacks/memory_profiler.py +++ b/nemo/lightning/pytorch/callbacks/memory_profiler.py @@ -15,7 +15,7 @@ import os import torch -from pytorch_lightning.callbacks.callback import Callback +from lightning.pytorch.callbacks.callback import Callback from torch.utils.viz._cycles import warn_tensor_cycles from nemo.lightning import io diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index b384976d82bd..ca4ca08cab08 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -19,12 +19,12 @@ from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Union -import pytorch_lightning +import lightning.pytorch import torch from _weakref import proxy -from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint as PTLModelCheckpoint -from pytorch_lightning.callbacks.model_checkpoint import _is_local_file_protocol -from pytorch_lightning.utilities import rank_zero_info +from lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint as PTLModelCheckpoint +from lightning.pytorch.callbacks.model_checkpoint import _is_local_file_protocol +from lightning.pytorch.utilities import rank_zero_info from nemo.lightning.ckpt_utils import ckpt_to_dir from nemo.lightning.io.pl import TrainerContext @@ -312,7 +312,7 @@ def _del_model_without_trainer(self, filepath: str) -> None: if torch.distributed.is_initialized(): torch.distributed.barrier() - def _ema_callback(self, trainer: 'pytorch_lightning.Trainer'): + def _ema_callback(self, trainer: 'lightning.pytorch.Trainer'): from nemo.collections.common.callbacks import EMA ema_callback = None @@ -393,7 +393,7 @@ def remove_checkpoint_unfinished_marker(checkpoint_path: Union[Path, str], barri except: 
return - def file_exists(self, filepath: str, trainer: "pytorch_lightning.Trainer", check_dist_ckpt: bool = True) -> bool: + def file_exists(self, filepath: str, trainer: "lightning.pytorch.Trainer", check_dist_ckpt: bool = True) -> bool: """Checks if a file or a file without a suffix (distributed checkpoint) exists.""" exists = self._fs.exists(filepath) or (check_dist_ckpt and self._fs.exists(ckpt_to_dir(filepath))) return trainer.strategy.broadcast(exists) @@ -432,7 +432,7 @@ def _link_checkpoint(self, trainer: "pl.Trainer", filepath: str, linkpath: str, linkpath = ckpt_to_dir(linkpath) super()._link_checkpoint(trainer, filepath, linkpath) - def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) -> None: + def _save_checkpoint(self, trainer: 'lightning.pytorch.Trainer', filepath: str) -> None: from nemo.utils.get_rank import is_global_rank_zero # barrier_after=True, so all ranks continue after the unfinished checkpoint marker is placed. @@ -499,7 +499,7 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) finalize_fn() def _get_finalize_save_checkpoint_callback( - self, trainer: 'pytorch_lightning.Trainer', filepath: str, global_step: int + self, trainer: 'lightning.pytorch.Trainer', filepath: str, global_step: int ): """Creates a callback that can be used to finalize async (and sync) ckpt saves.""" @@ -534,7 +534,7 @@ def _cb(): return _cb - def _remove_checkpoint(self, trainer: "pytorch_lightning.Trainer", filepath: str, override_async=False) -> None: + def _remove_checkpoint(self, trainer: "lightning.pytorch.Trainer", filepath: str, override_async=False) -> None: """Performs checkpoint removal. With async save, `self._remove_checkpoint` is called before the checkpoint diff --git a/nemo/lightning/pytorch/callbacks/model_transform.py b/nemo/lightning/pytorch/callbacks/model_transform.py index 64602b501ac3..b3c3310aa30f 100644 --- a/nemo/lightning/pytorch/callbacks/model_transform.py +++ b/nemo/lightning/pytorch/callbacks/model_transform.py @@ -15,7 +15,7 @@ from functools import wraps from typing import Any, Callable, Optional, TypeVar -import pytorch_lightning as pl +import lightning.pytorch as pl from torch import nn from nemo.utils import logging @@ -85,7 +85,7 @@ def _maybe_apply_transform(self, trainer): def apply_transform(self, trainer): self.model_transform(trainer.model) - from pytorch_lightning.utilities import model_summary + from lightning.pytorch.utilities import model_summary logging.info( f"After applying model_transform:\n" f"{model_summary.summarize(trainer.lightning_module, max_depth=1)}" diff --git a/nemo/lightning/pytorch/callbacks/moe_token_drop.py b/nemo/lightning/pytorch/callbacks/moe_token_drop.py index 10483dca5096..b0c7ff7999eb 100644 --- a/nemo/lightning/pytorch/callbacks/moe_token_drop.py +++ b/nemo/lightning/pytorch/callbacks/moe_token_drop.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytorch_lightning as pl +import lightning.pytorch as pl +from lightning.pytorch.callbacks.callback import Callback from megatron.core import ModelParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy diff --git a/nemo/lightning/pytorch/callbacks/nsys.py b/nemo/lightning/pytorch/callbacks/nsys.py index 2a5707d3166c..f350eae40730 100644 --- a/nemo/lightning/pytorch/callbacks/nsys.py +++ b/nemo/lightning/pytorch/callbacks/nsys.py @@ -15,7 +15,7 @@ from typing import List, Optional import torch -from pytorch_lightning.callbacks.callback import Callback +from lightning.pytorch.callbacks.callback import Callback from nemo.utils import logging from nemo.utils.get_rank import get_rank diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index 5336615a4a38..3790b7788419 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -18,12 +18,12 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import torch.nn as nn -from lightning_fabric.utilities.types import _PATH -from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO -from pytorch_lightning.trainer.states import TrainerFn +from lightning.fabric.utilities.types import _PATH +from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO +from lightning.pytorch.trainer.states import TrainerFn from typing_extensions import override from nemo.lightning.ckpt_utils import ADAPTER_META_FILENAME diff --git a/nemo/lightning/pytorch/callbacks/preemption.py b/nemo/lightning/pytorch/callbacks/preemption.py index 69ac378ed698..98b59a9da0d0 100644 --- a/nemo/lightning/pytorch/callbacks/preemption.py +++ b/nemo/lightning/pytorch/callbacks/preemption.py @@ -18,8 +18,8 @@ from typing import Optional import torch -from pytorch_lightning.callbacks import Callback -from pytorch_lightning.trainer.trainer import Trainer +from lightning.pytorch.callbacks import Callback +from lightning.pytorch.trainer.trainer import Trainer from nemo.lightning.io.mixin import IOMixin from nemo.utils import logging diff --git a/nemo/lightning/pytorch/callbacks/progress_bar.py b/nemo/lightning/pytorch/callbacks/progress_bar.py index 6912c3fc57d4..f3c3c4555bac 100644 --- a/nemo/lightning/pytorch/callbacks/progress_bar.py +++ b/nemo/lightning/pytorch/callbacks/progress_bar.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from pytorch_lightning.callbacks.progress import TQDMProgressBar -from pytorch_lightning.callbacks.progress.tqdm_progress import _update_n +from lightning.pytorch.callbacks.progress import TQDMProgressBar +from lightning.pytorch.callbacks.progress.tqdm_progress import _update_n class MegatronProgressBar(TQDMProgressBar): diff --git a/nemo/lightning/pytorch/callbacks/progress_printer.py b/nemo/lightning/pytorch/callbacks/progress_printer.py index d32f7d70cbdd..12d05ed2950c 100644 --- a/nemo/lightning/pytorch/callbacks/progress_printer.py +++ b/nemo/lightning/pytorch/callbacks/progress_printer.py @@ -15,9 +15,9 @@ from collections import defaultdict from typing import Any +from lightning.pytorch.callbacks.progress import ProgressBar +from lightning.pytorch.utilities.types import STEP_OUTPUT from megatron.core.num_microbatches_calculator import get_num_microbatches -from pytorch_lightning.callbacks.progress import ProgressBar -from pytorch_lightning.utilities.types import STEP_OUTPUT from typing_extensions import override diff --git a/nemo/lightning/pytorch/optim/base.py b/nemo/lightning/pytorch/optim/base.py index 1d476142941a..fec3b7c118a4 100644 --- a/nemo/lightning/pytorch/optim/base.py +++ b/nemo/lightning/pytorch/optim/base.py @@ -17,8 +17,8 @@ from copy import deepcopy from typing import List, Optional -import pytorch_lightning as L -from pytorch_lightning.utilities.types import OptimizerLRScheduler +import lightning.pytorch as L +from lightning.pytorch.utilities.types import OptimizerLRScheduler from torch.optim import Optimizer from nemo.lightning.io.mixin import IOMixin diff --git a/nemo/lightning/pytorch/optim/megatron.py b/nemo/lightning/pytorch/optim/megatron.py index 7ac413d4544f..9f9d2029be9e 100644 --- a/nemo/lightning/pytorch/optim/megatron.py +++ b/nemo/lightning/pytorch/optim/megatron.py @@ -15,7 +15,7 @@ import inspect from typing import Callable, List, Optional -import pytorch_lightning as pl +import lightning.pytorch as pl from megatron.core.distributed import finalize_model_grads from megatron.core.optimizer import OptimizerConfig from megatron.core.utils import get_model_config diff --git a/nemo/lightning/pytorch/optim/pytorch.py b/nemo/lightning/pytorch/optim/pytorch.py index 9d773917e4f4..ccd03f563ef8 100644 --- a/nemo/lightning/pytorch/optim/pytorch.py +++ b/nemo/lightning/pytorch/optim/pytorch.py @@ -14,8 +14,8 @@ from typing import Callable, List, Optional -import pytorch_lightning as pl -import pytorch_lightning as L +import lightning.pytorch as pl +import lightning.pytorch as L from torch.optim import Optimizer from torch.optim.optimizer import ParamsT diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 024e2577c868..479e442d5ccb 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -16,7 +16,7 @@ import logging from typing import List, Literal, Optional -import pytorch_lightning as pl +import lightning.pytorch as pl from torch.utils.data import DataLoader from nemo.lightning.megatron_parallel import MegatronStep diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index 5c318b59e54a..830978ba11e7 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -16,9 +16,8 @@ from dataclasses import dataclass, fields from typing import Any, Callable, Generator, List, Literal, Tuple, TypeVar, Union -import 
pytorch_lightning as pl import torch -from pytorch_lightning.plugins.precision import Precision +from lightning.pytorch.plugins.precision import Precision from torch.nn import Module from torch.optim import Optimizer diff --git a/nemo/lightning/pytorch/strategies/fsdp_strategy.py b/nemo/lightning/pytorch/strategies/fsdp_strategy.py index 83d5781c0dde..4c5a165c2d8d 100644 --- a/nemo/lightning/pytorch/strategies/fsdp_strategy.py +++ b/nemo/lightning/pytorch/strategies/fsdp_strategy.py @@ -17,14 +17,14 @@ from pathlib import Path from typing import Any, Dict, Optional, Union -import pytorch_lightning as pl +import lightning.pytorch as pl import torch -from lightning_fabric.plugins import CheckpointIO -from lightning_fabric.strategies.fsdp import _get_sharded_state_dict_context +from lightning.fabric.plugins import CheckpointIO +from lightning.fabric.strategies.fsdp import _get_sharded_state_dict_context +from lightning.pytorch.strategies.fsdp import FSDPStrategy as PLFSDPStrategy +from lightning.pytorch.trainer.states import TrainerFn +from lightning.pytorch.utilities.types import STEP_OUTPUT from megatron.core.transformer.transformer_layer import TransformerLayer -from pytorch_lightning.strategies.fsdp import FSDPStrategy as PLFSDPStrategy -from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.types import STEP_OUTPUT from torch.distributed.checkpoint.state_dict import ( # get_state_dict, StateDictOptions, get_optimizer_state_dict, diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index c62a90313b45..870cc0aaaddd 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -35,20 +35,20 @@ cast, ) -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import torch.distributed -from lightning_fabric.plugins import CheckpointIO, ClusterEnvironment -from lightning_fabric.utilities.optimizer import _optimizer_to_device, _optimizers_to_device +from lightning.fabric.plugins import CheckpointIO, ClusterEnvironment +from lightning.fabric.utilities.optimizer import _optimizer_to_device, _optimizers_to_device +from lightning.pytorch.accelerators import CPUAccelerator +from lightning.pytorch.loops import _AutomaticOptimization, evaluation_loop, fit_loop, prediction_loop +from lightning.pytorch.loops.fetchers import _DataLoaderIterDataFetcher +from lightning.pytorch.overrides.distributed import _sync_module_states +from lightning.pytorch.strategies.ddp import DDPStrategy +from lightning.pytorch.trainer.states import RunningStage, TrainerFn +from lightning.pytorch.utilities.types import STEP_OUTPUT from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.accelerators import CPUAccelerator -from pytorch_lightning.loops import _AutomaticOptimization, evaluation_loop, fit_loop, prediction_loop -from pytorch_lightning.loops.fetchers import _DataLoaderIterDataFetcher -from pytorch_lightning.overrides.distributed import _sync_module_states -from pytorch_lightning.strategies.ddp import DDPStrategy -from pytorch_lightning.trainer.states import RunningStage, TrainerFn -from pytorch_lightning.utilities.types import STEP_OUTPUT from torch import nn from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook from torch.nn.parallel import DistributedDataParallel diff --git 
a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py index 43a5a9243aa5..4f5a78419d6d 100644 --- a/nemo/lightning/pytorch/strategies/utils.py +++ b/nemo/lightning/pytorch/strategies/utils.py @@ -17,15 +17,14 @@ from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast -import pytorch_lightning as pl +import lightning.pytorch as pl import torch -from lightning_fabric.plugins import ClusterEnvironment +from lightning.fabric.plugins import ClusterEnvironment +from lightning.pytorch.callbacks import TQDMProgressBar from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedBase, ShardedObject, ShardedTensor from megatron.core.dist_checkpointing.strategies.torch import sharded_tensor_to_torch_sharded_tensor from megatron.core.transformer.utils import _get_extra_state_offsets -from pytorch_lightning.callbacks import TQDMProgressBar -from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from torch.distributed._sharded_tensor import ShardedTensor as TorchShardedTensor from torch.distributed._tensor import DTensor, Replicate, Shard from torch.distributed.device_mesh import DeviceMesh diff --git a/nemo/lightning/pytorch/trainer.py b/nemo/lightning/pytorch/trainer.py index c97c59ef524d..701c1cde4eaf 100644 --- a/nemo/lightning/pytorch/trainer.py +++ b/nemo/lightning/pytorch/trainer.py @@ -16,9 +16,9 @@ from copy import deepcopy import fiddle as fdl -import pytorch_lightning as pl -from pytorch_lightning.loops import _TrainingEpochLoop -from pytorch_lightning.loops.fetchers import _DataFetcher +import lightning.pytorch as pl +from lightning.pytorch.loops import _TrainingEpochLoop +from lightning.pytorch.loops.fetchers import _DataFetcher from typing_extensions import Self from nemo.lightning.fabric.conversion import to_fabric diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py index 412ca8665b84..7b534646731c 100644 --- a/nemo/lightning/resume.py +++ b/nemo/lightning/resume.py @@ -18,8 +18,8 @@ from pathlib import Path, PosixPath, WindowsPath from typing import Optional, Union -import lightning_fabric as fl -import pytorch_lightning as pl +import lightning.fabric as fl +import lightning.pytorch as pl from nemo.lightning import io from nemo.lightning.base import NEMO_MODELS_CACHE diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index 9d2936e567ec..00738f1fc7d1 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -20,9 +20,9 @@ import nemo_run as run import yaml +from lightning.pytorch import Callback +from lightning.pytorch.loggers import WandbLogger from nemo_run.core.serialization.yaml import YamlSerializer -from pytorch_lightning import Callback -from pytorch_lightning.loggers import WandbLogger from nemo.lightning.pytorch.callbacks import NsysCallback, PreemptionCallback from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy diff --git a/nemo/utils/callbacks/cuda_graph.py b/nemo/utils/callbacks/cuda_graph.py index c78196934108..b44006828963 100644 --- a/nemo/utils/callbacks/cuda_graph.py +++ b/nemo/utils/callbacks/cuda_graph.py @@ -37,15 +37,15 @@ from types import MethodType from typing import Any, Dict -import pytorch_lightning as pl +import lightning.pytorch as pl import torch -from pytorch_lightning import LightningModule -from pytorch_lightning.callbacks import Callback -from pytorch_lightning.loops.optimization.automatic import ClosureResult -from 
pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection, _ResultMetric -from pytorch_lightning.utilities import CombinedLoader, rank_zero_info -from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature -from pytorch_lightning.utilities.types import STEP_OUTPUT +from lightning.pytorch import LightningModule +from lightning.pytorch.callbacks import Callback +from lightning.pytorch.loops.optimization.automatic import ClosureResult +from lightning.pytorch.trainer.connectors.logger_connector.result import _ResultCollection, _ResultMetric +from lightning.pytorch.utilities import CombinedLoader, rank_zero_info +from lightning.pytorch.utilities.signature_utils import is_param_in_hook_signature +from lightning.pytorch.utilities.types import STEP_OUTPUT from torch.nn.parallel import DistributedDataParallel __all__ = ["CUDAGraphCallback"] @@ -431,8 +431,8 @@ def on_save_checkpoint( Called when saving a checkpoint to give you a chance to store anything else you might want to save. Args: - trainer: the current :class:`~pytorch_lightning.trainer.Trainer` instance. - pl_module: the current :class:`~pytorch_lightning.core.module.LightningModule` instance. + trainer: the current :class:`~lightning.pytorch.trainer.Trainer` instance. + pl_module: the current :class:`~lightning.pytorch.core.module.LightningModule` instance. checkpoint: the checkpoint dictionary that will be saved. """ # Since we've add bound method to optimizer and lr_scheduler, it can lead to more diff --git a/nemo/utils/callbacks/dist_ckpt_io.py b/nemo/utils/callbacks/dist_ckpt_io.py index 091075488878..b78ec9b4ac0f 100644 --- a/nemo/utils/callbacks/dist_ckpt_io.py +++ b/nemo/utils/callbacks/dist_ckpt_io.py @@ -19,12 +19,12 @@ from time import time from typing import Any, Dict, Optional, Union -import pytorch_lightning as pl -from lightning_fabric.plugins import CheckpointIO -from lightning_fabric.utilities.cloud_io import get_filesystem -from lightning_fabric.utilities.types import _PATH -from pytorch_lightning import Callback -from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO +import lightning.pytorch as pl +from lightning.fabric.plugins import CheckpointIO +from lightning.fabric.utilities.cloud_io import get_filesystem +from lightning.fabric.utilities.types import _PATH +from lightning.pytorch import Callback +from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO from nemo.utils import logging diff --git a/nemo/utils/callbacks/nemo_model_checkpoint.py b/nemo/utils/callbacks/nemo_model_checkpoint.py index dc1da9ce1875..8fe3beaaa985 100644 --- a/nemo/utils/callbacks/nemo_model_checkpoint.py +++ b/nemo/utils/callbacks/nemo_model_checkpoint.py @@ -19,14 +19,13 @@ from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Tuple, Union -import pytorch_lightning +import lightning.pytorch import torch from _weakref import proxy - -from lightning_fabric.utilities.cloud_io import get_filesystem -from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint, _is_local_file_protocol -from pytorch_lightning.trainer import call -from pytorch_lightning.utilities import rank_zero_info +from lightning.fabric.utilities.cloud_io import get_filesystem +from lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint, _is_local_file_protocol +from lightning.pytorch.trainer import call +from lightning.pytorch.utilities import rank_zero_info from nemo.collections.common.callbacks import EMA from nemo.utils import logging @@ 
-357,7 +356,7 @@ def _del_model_without_trainer(self, filepath: str) -> None: except: logging.info(f"Tried to remove checkpoint: {filepath} but failed.") - def _ema_callback(self, trainer: 'pytorch_lightning.Trainer') -> Optional[EMA]: + def _ema_callback(self, trainer: 'lightning.pytorch.Trainer') -> Optional[EMA]: ema_callback = None for callback in trainer.callbacks: if isinstance(callback, EMA): @@ -506,12 +505,12 @@ def remove_checkpoint_unfinished_marker(checkpoint_path: Union[Path, str], barri except: return - def file_exists(self, filepath: str, trainer: "pytorch_lightning.Trainer", check_dist_ckpt: bool = True) -> bool: + def file_exists(self, filepath: str, trainer: "lightning.pytorch.Trainer", check_dist_ckpt: bool = True) -> bool: """Checks if a file or a file without a suffix (distributed checkpoint) exists.""" exists = self._fs.exists(filepath) or (check_dist_ckpt and self._fs.exists(ckpt_to_dir(filepath))) return trainer.strategy.broadcast(exists) - def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) -> None: + def _save_checkpoint(self, trainer: 'lightning.pytorch.Trainer', filepath: str) -> None: # barrier_after=True, so all ranks continue after the unfinished checkpoint marker is placed. # if anything goes wrong during checkpointing, we should be able to detect that data is incomplete. self.set_checkpoint_unfinished_marker(filepath, barrier_after=True) @@ -552,7 +551,7 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) self._drop_optimizer_states(trainer, filepath, storage_options) def _get_finalize_save_checkpoint_callback( - self, trainer: 'pytorch_lightning.Trainer', filepath: str, global_step: int + self, trainer: 'lightning.pytorch.Trainer', filepath: str, global_step: int ): """Creates a callback that can be used to finalize async (and sync) ckpt saves.""" @@ -585,7 +584,7 @@ def _cb(): return _cb - def _remove_checkpoint(self, trainer: "pytorch_lightning.Trainer", filepath: str, override_async=False) -> None: + def _remove_checkpoint(self, trainer: "lightning.pytorch.Trainer", filepath: str, override_async=False) -> None: """Performs checkpoint removal or deferred removal. With async save, `self._remove_checkpoint` is called before the checkpoint diff --git a/nemo/utils/callbacks/preemption.py b/nemo/utils/callbacks/preemption.py index e9b5f95022f3..178fe94cee7c 100644 --- a/nemo/utils/callbacks/preemption.py +++ b/nemo/utils/callbacks/preemption.py @@ -16,7 +16,7 @@ import sys import torch -from pytorch_lightning.callbacks import Callback +from lightning.pytorch.callbacks import Callback from nemo.utils import logging @@ -24,7 +24,7 @@ class PreemptionCallback(Callback): """ PreemptionCallback class creates a callback that checks for preemption during training at the end of every step. - Upon preemption the callback provides a function to gracefully exit the training immediately and also saves the current state in a checkpoint as *last.ckpt. + Upon preemption the callback provides a function to gracefully exit the training immediately and also saves the current state in a checkpoint as *last.ckpt. (to be able to start from the same step without wasting any compute while resuming the next time). PreemptionCallback is always enabled by default via the arg create_preemption_callback under ExpManagerConfig. 
To disable please pass @@ -47,7 +47,7 @@ def interrupted(self): def on_train_start(self, trainer, pl_module): """ - Defines custom handlers at the beginning of training to be executed when the + Defines custom handlers at the beginning of training to be executed when the preemption signal is received. """ diff --git a/nemo/utils/callbacks/s3_checkpoint_io.py b/nemo/utils/callbacks/s3_checkpoint_io.py index 7a9f984fee1b..4a48198311a2 100644 --- a/nemo/utils/callbacks/s3_checkpoint_io.py +++ b/nemo/utils/callbacks/s3_checkpoint_io.py @@ -22,7 +22,7 @@ from typing import Any, Callable, Dict, Optional, Union import torch -from lightning_fabric.plugins.io.checkpoint_io import CheckpointIO +from lightning.fabric.plugins.io.checkpoint_io import CheckpointIO from nemo.utils import logging from nemo.utils.s3_utils import ( diff --git a/nemo/utils/cloud.py b/nemo/utils/cloud.py index 7245567d636c..d565028bdf8c 100644 --- a/nemo/utils/cloud.py +++ b/nemo/utils/cloud.py @@ -17,8 +17,8 @@ from time import sleep import wget -from pytorch_lightning.plugins.environments import LightningEnvironment -from pytorch_lightning.strategies import DDPStrategy, StrategyRegistry +from lightning.pytorch.plugins.environments import LightningEnvironment +from lightning.pytorch.strategies import DDPStrategy, StrategyRegistry from nemo.utils import logging @@ -105,7 +105,10 @@ def initialize_sagemaker() -> None: """ StrategyRegistry.register( - name='smddp', strategy=SageMakerDDPStrategy, process_group_backend="smddp", find_unused_parameters=False, + name='smddp', + strategy=SageMakerDDPStrategy, + process_group_backend="smddp", + find_unused_parameters=False, ) def _install_system_libraries() -> None: diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index b512bc57cbab..04c43c46d247 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -26,18 +26,18 @@ from shutil import copy, move from typing import Any, Collection, Dict, List, Optional, Tuple, Union -import pytorch_lightning +import lightning.pytorch import torch from hydra.core.hydra_config import HydraConfig from hydra.utils import get_original_cwd +from lightning.pytorch.callbacks import Callback, ModelCheckpoint +from lightning.pytorch.callbacks.early_stopping import EarlyStopping +from lightning.pytorch.callbacks.timer import Interval, Timer +from lightning.pytorch.loggers import MLFlowLogger, NeptuneLogger, TensorBoardLogger, WandbLogger +from lightning.pytorch.loops import _TrainingEpochLoop +from lightning.pytorch.strategies.ddp import DDPStrategy +from lightning.pytorch.trainer.connectors.checkpoint_connector import _CheckpointConnector from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning.callbacks import Callback, ModelCheckpoint -from pytorch_lightning.callbacks.early_stopping import EarlyStopping -from pytorch_lightning.callbacks.timer import Interval, Timer -from pytorch_lightning.loggers import MLFlowLogger, NeptuneLogger, TensorBoardLogger, WandbLogger -from pytorch_lightning.loops import _TrainingEpochLoop -from pytorch_lightning.strategies.ddp import DDPStrategy -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.common.callbacks import EMA from nemo.constants import NEMO_ENV_VARNAME_TESTING, NEMO_ENV_VARNAME_VERSION @@ -343,7 +343,7 @@ def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) self._on_batch_end("validation_step_timing in s", trainer, pl_module) -def exp_manager(trainer: 
'pytorch_lightning.Trainer', cfg: Optional[Union[DictConfig, Dict]] = None) -> Optional[Path]: +def exp_manager(trainer: 'lightning.pytorch.Trainer', cfg: Optional[Union[DictConfig, Dict]] = None) -> Optional[Path]: """ exp_manager is a helper function used to manage folders for experiments. It follows the pytorch lightning paradigm of exp_dir/model_or_experiment_name/version. If the lightning trainer has a logger, exp_manager will get exp_dir, @@ -362,7 +362,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo resume_if_exists is set to True, creating the version folders is ignored. Args: - trainer (pytorch_lightning.Trainer): The lightning trainer. + trainer (lightning.pytorch.Trainer): The lightning trainer. cfg (DictConfig, dict): Can have the following keys: - explicit_log_dir (str, Path): Can be used to override exp_dir/name/version folder creation. Defaults to @@ -680,7 +680,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo return log_dir -def error_checks(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictConfig, Dict]] = None): +def error_checks(trainer: 'lightning.pytorch.Trainer', cfg: Optional[Union[DictConfig, Dict]] = None): """ Checks that the passed trainer is compliant with NeMo and exp_manager's passed configuration. Checks that: - Throws error when hydra has changed the working directory. This causes issues with lightning's DDP @@ -728,7 +728,7 @@ def _filter_out_unfinished_checkpoints(checkpoint_paths: Collection[Union[Path, def check_resume( - trainer: 'pytorch_lightning.Trainer', + trainer: 'lightning.pytorch.Trainer', log_dir: str, resume_if_exists: bool = False, resume_past_end: bool = False, @@ -886,7 +886,7 @@ def check_resume( def check_explicit_log_dir( - trainer: 'pytorch_lightning.Trainer', explicit_log_dir: Union[Path, str], exp_dir: str, name: str, version: str + trainer: 'lightning.pytorch.Trainer', explicit_log_dir: Union[Path, str], exp_dir: str, name: str, version: str ) -> Tuple[Path, str, str, str]: """Checks that the passed arguments are compatible with explicit_log_dir. @@ -917,7 +917,7 @@ def check_explicit_log_dir( def get_log_dir( - trainer: 'pytorch_lightning.Trainer', + trainer: 'lightning.pytorch.Trainer', exp_dir: str = None, name: str = None, version: str = None, @@ -1025,7 +1025,7 @@ def get_git_diff(): def configure_loggers( - trainer: 'pytorch_lightning.Trainer', + trainer: 'lightning.pytorch.Trainer', exp_dir: [Path, str], log_dir: [Path, str], name: str, @@ -1136,7 +1136,7 @@ def resume_start(self, checkpoint_path=None) -> None: def configure_checkpointing( - trainer: 'pytorch_lightning.Trainer', + trainer: 'lightning.pytorch.Trainer', log_dir: Path, name: str, resume: bool, @@ -1257,12 +1257,12 @@ def _check_time_remaining(self, trainer: "pl.Trainer") -> None: monitor_candidates = checkpoint_callback._monitor_candidates(trainer) checkpoint_callback._save_last_checkpoint(trainer, monitor_candidates) # Throw this exception to signal to Lightning to terminate gracefully. - from pytorch_lightning.utilities.exceptions import _TunerExitException + from lightning.pytorch.utilities.exceptions import _TunerExitException raise _TunerExitException() -def configure_no_restart_validation_training_loop(trainer: pytorch_lightning.Trainer) -> None: +def configure_no_restart_validation_training_loop(trainer: lightning.pytorch.Trainer) -> None: if type(trainer.fit_loop.epoch_loop) != _TrainingEpochLoop: warnings.warn("Detected custom epoch loop. 
Skipping no validation on restart support.", UserWarning) return diff --git a/nemo/utils/lightning_logger_patch.py b/nemo/utils/lightning_logger_patch.py index 1b21ce3b1ae5..1528146c64b5 100644 --- a/nemo/utils/lightning_logger_patch.py +++ b/nemo/utils/lightning_logger_patch.py @@ -15,7 +15,7 @@ import logging as _logging from logging.handlers import MemoryHandler -import pytorch_lightning as pl +import lightning.pytorch as pl HANDLERS = {} PATCHED = False diff --git a/nemo/utils/loggers/clearml_logger.py b/nemo/utils/loggers/clearml_logger.py index 4e2063705b4f..c7c3945ad853 100644 --- a/nemo/utils/loggers/clearml_logger.py +++ b/nemo/utils/loggers/clearml_logger.py @@ -19,11 +19,11 @@ from typing import Any, List, Literal, Mapping, Optional, Union import pandas as pd +from lightning.pytorch.callbacks import Checkpoint +from lightning.pytorch.loggers import Logger +from lightning.pytorch.utilities.parsing import AttributeDict from lightning_utilities.core.apply_func import apply_to_collection from omegaconf import DictConfig, ListConfig, OmegaConf -from pytorch_lightning.callbacks import Checkpoint -from pytorch_lightning.loggers import Logger -from pytorch_lightning.utilities.parsing import AttributeDict from torch import Tensor from nemo.utils import logging diff --git a/nemo/utils/loggers/dllogger.py b/nemo/utils/loggers/dllogger.py index cdeef63b75f7..871d7ee3f7a2 100644 --- a/nemo/utils/loggers/dllogger.py +++ b/nemo/utils/loggers/dllogger.py @@ -17,11 +17,11 @@ from pathlib import Path from typing import Optional +from lightning.pytorch.loggers import Logger +from lightning.pytorch.utilities import rank_zero_only +from lightning.pytorch.utilities.parsing import AttributeDict from lightning_utilities.core.apply_func import apply_to_collection from omegaconf import DictConfig, ListConfig, OmegaConf -from pytorch_lightning.loggers import Logger -from pytorch_lightning.utilities import rank_zero_only -from pytorch_lightning.utilities.parsing import AttributeDict from nemo.utils import logging @@ -34,7 +34,7 @@ HAVE_DLLOGGER = False try: - from lightning_fabric.utilities.logger import _convert_params, _flatten_dict, _sanitize_callable_params + from lightning.fabric.utilities.logger import _convert_params, _flatten_dict, _sanitize_callable_params PL_LOGGER_UTILITIES = True except (ImportError, ModuleNotFoundError): diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index e8020f244821..137fb2236971 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,6 +1,7 @@ cloudpickle fiddle hydra-core>1.3,<=1.3.2 +lightning>2.2.1 omegaconf<=2.3 pytorch-lightning>2.2.1 torchmetrics>=0.11.0 diff --git a/scripts/checkpoint_averaging/average_model_checkpoints.py b/scripts/checkpoint_averaging/average_model_checkpoints.py index 06c522f1e192..ce88bba9716b 100644 --- a/scripts/checkpoint_averaging/average_model_checkpoints.py +++ b/scripts/checkpoint_averaging/average_model_checkpoints.py @@ -60,7 +60,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import OmegaConf, open_dict diff --git a/scripts/checkpoint_averaging/megatron_checkpoint_averaging.py b/scripts/checkpoint_averaging/megatron_checkpoint_averaging.py index 59f02a117da4..7b964fd7bade 100755 --- a/scripts/checkpoint_averaging/megatron_checkpoint_averaging.py +++ b/scripts/checkpoint_averaging/megatron_checkpoint_averaging.py @@ -35,8 +35,8 @@ import sys import torch +from 
lightning.pytorch.trainer.trainer import Trainer from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.core import ModelPT @@ -60,7 +60,10 @@ def main(): help='A list of Python file names to "from FILE import *" (Needed when some classes were defined in __main__ of a script)', ) parser.add_argument( - '--class_path', type=str, default='', help='A path to class "module.submodule.class" (if given)', + '--class_path', + type=str, + default='', + help='A path to class "module.submodule.class" (if given)', ) args = parser.parse_args() diff --git a/scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py b/scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py index b87f7e028cdb..b35fb201865e 100644 --- a/scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py @@ -25,9 +25,9 @@ from collections import OrderedDict import torch +from lightning.pytorch.core.saving import _load_state as ptl_load_state +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.core.saving import _load_state as ptl_load_state -from pytorch_lightning.trainer.trainer import Trainer from transformers import AutoModelForCausalLM, AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel @@ -158,7 +158,7 @@ def convert(args): scaler = None if precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=nemo_config.get('native_amp_init_scale', 2 ** 32), + init_scale=nemo_config.get('native_amp_init_scale', 2**32), growth_interval=nemo_config.get('native_amp_growth_interval', 1000), hysteresis=nemo_config.get('hysteresis', 2), ) diff --git a/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py index f84350f75f60..335989309791 100644 --- a/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py @@ -17,7 +17,7 @@ from collections import OrderedDict import torch -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from transformers import AutoModelForCausalLM from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_bert_nemo_to_hf.py b/scripts/checkpoint_converters/convert_bert_nemo_to_hf.py index e970ea29fca2..0ec5cc1e474b 100644 --- a/scripts/checkpoint_converters/convert_bert_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_bert_nemo_to_hf.py @@ -26,7 +26,7 @@ import torch import torch.nn.functional as F -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from transformers import AutoTokenizer, BertConfig, BertModel from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel @@ -207,10 +207,16 @@ def convert_config(ref_config, hf_state_dict): def get_args(): parser = ArgumentParser() parser.add_argument( - "--input_name_or_path", type=str, required=True, help="Path to .nemo file", + "--input_name_or_path", + type=str, + required=True, + help="Path to .nemo file", ) parser.add_argument( - "--output_path", type=str, required=True, help="Output HF model path", + "--output_path", + type=str, + required=True, + help="Output HF model path", ) args = 
parser.parse_args() diff --git a/scripts/checkpoint_converters/convert_chatglm_hf_to_nemo.py b/scripts/checkpoint_converters/convert_chatglm_hf_to_nemo.py index 363e4de09ef7..2545181ce968 100644 --- a/scripts/checkpoint_converters/convert_chatglm_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_chatglm_hf_to_nemo.py @@ -25,8 +25,8 @@ from collections import OrderedDict import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.trainer.trainer import Trainer from transformers import AutoModel, AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel @@ -126,7 +126,7 @@ def convert(args): scaler = None if precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=nemo_config.get('native_amp_init_scale', 2 ** 32), + init_scale=nemo_config.get('native_amp_init_scale', 2**32), growth_interval=nemo_config.get('native_amp_growth_interval', 1000), hysteresis=nemo_config.get('hysteresis', 2), ) @@ -211,7 +211,11 @@ def convert(args): qkv_bias = torch.cat((qkv_bias, q[i * heads_per_group : (i + 1) * heads_per_group, :])) qkv_bias = torch.cat((qkv_bias, k[i : i + 1, :])) qkv_bias = torch.cat((qkv_bias, v[i : i + 1, :])) - qkv_bias = qkv_bias.reshape([head_size * (head_num + 2 * num_query_groups),]) + qkv_bias = qkv_bias.reshape( + [ + head_size * (head_num + 2 * num_query_groups), + ] + ) if mcore_gpt: qkv_weights_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.weight' diff --git a/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py b/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py index 11279c15c0c9..241e4254a9be 100644 --- a/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py @@ -17,7 +17,7 @@ from collections import OrderedDict import torch -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from transformers import AutoModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_clip_hf_to_nemo.py b/scripts/checkpoint_converters/convert_clip_hf_to_nemo.py index 2b8156ad4b26..c47444534604 100644 --- a/scripts/checkpoint_converters/convert_clip_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_clip_hf_to_nemo.py @@ -38,9 +38,9 @@ from argparse import ArgumentParser import torch +from lightning.pytorch.plugins.environments import TorchElasticEnvironment +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.trainer import Trainer from transformers import CLIPModel from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import MegatronCLIPModel diff --git a/scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py b/scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py index ae8885f4de93..8a880a290484 100644 --- a/scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py @@ -32,7 +32,7 @@ import time from typing import Dict -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import yaml from omegaconf import OmegaConf @@ -83,11 +83,11 @@ def get_new_key(old_key): def load_falcon_config(args) -> FalconConfig: - """ Helper utility to load FalconConfig. 
+ """Helper utility to load FalconConfig. Legacy Falcon-7B and Falcon-40B are not compatible with `transformers.FalconConfig` and `transformers.FalconModel`. need to manually set the config values - and force to `falcon` model type. + and force to `falcon` model type. """ config = FalconConfig.from_pretrained(args.input_name_or_path) if config.model_type == 'RefinedWeb': diff --git a/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py b/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py index da8f15b92649..cc1d99b6d1c6 100644 --- a/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py @@ -17,7 +17,7 @@ from collections import OrderedDict import torch -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from transformers import AutoModelForCausalLM from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py b/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py index 35039f8d02e9..61443a3bcb28 100644 --- a/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py +++ b/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py @@ -17,8 +17,8 @@ from collections import OrderedDict import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py index 4eb8cb6330ca..44de38497b44 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py @@ -27,8 +27,8 @@ from collections import OrderedDict import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.trainer.trainer import Trainer from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py index 42d3e77ce4c8..75bd0d0ab6ed 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py @@ -28,8 +28,8 @@ from collections import OrderedDict import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.trainer.trainer import Trainer from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py index f7096996e5b1..4a8a409a88fd 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py @@ -27,8 +27,8 @@ from collections import OrderedDict import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.trainer.trainer 
import Trainer from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py index 126d50be8355..87b7151aa961 100644 --- a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py @@ -17,8 +17,8 @@ from collections import OrderedDict import torch +from lightning.pytorch import Trainer from omegaconf import open_dict -from pytorch_lightning import Trainer from transformers import AutoModelForCausalLM, LlamaTokenizer, LlamaTokenizerFast, convert_slow_tokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py index 4bceb250999f..3cf5bbd4acf9 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py @@ -29,9 +29,9 @@ import torch import torch.nn +from lightning.pytorch.core.saving import _load_state as ptl_load_state +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.core.saving import _load_state as ptl_load_state -from pytorch_lightning.trainer.trainer import Trainer from transformers import AutoModelForCausalLM, AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py index 70e7f0b751d3..1f0a31076f8e 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py @@ -25,7 +25,7 @@ import torch import torch.nn -from pytorch_lightning.trainer.trainer import Trainer +from lightning.pytorch.trainer.trainer import Trainer from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py index 36e4c0c2c3ea..a75c6876e70a 100644 --- a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py @@ -30,9 +30,9 @@ import megatron.core.parallel_state as parallel_state import torch import torch.nn +from lightning.pytorch.core.saving import _load_state as ptl_load_state +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.core.saving import _load_state as ptl_load_state -from pytorch_lightning.trainer.trainer import Trainer from transformers import AutoModelForCausalLM, AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py index 2961b11870d5..eb934803f164 100644 --- a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py @@ -26,7 +26,7 @@ import megatron.core.parallel_state as parallel_state import torch 
import torch.nn -from pytorch_lightning.trainer.trainer import Trainer +from lightning.pytorch.trainer.trainer import Trainer from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_mpt_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mpt_hf_to_nemo.py index e7d81f709092..d4a450a8e046 100644 --- a/scripts/checkpoint_converters/convert_mpt_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mpt_hf_to_nemo.py @@ -56,7 +56,7 @@ import argparse import os -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import yaml from omegaconf import OmegaConf @@ -68,7 +68,11 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( - "--input_name_or_path", type=str, default=None, required=True, help="Path to Huggingface MPT checkpoints", + "--input_name_or_path", + type=str, + default=None, + required=True, + help="Path to Huggingface MPT checkpoints", ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") parser.add_argument( diff --git a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py index fc0f660cbd42..392e3628ccdb 100644 --- a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py @@ -19,7 +19,7 @@ from collections import OrderedDict import torch -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from transformers import LlamaTokenizer, PreTrainedTokenizerFast from transformers.convert_slow_tokenizer import LlamaConverter diff --git a/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py b/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py index 223c7af50843..b472a7e5c6f3 100644 --- a/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py @@ -25,8 +25,8 @@ from collections import OrderedDict import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.trainer.trainer import Trainer from transformers import Qwen2ForCausalLM, Qwen2Tokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py index 164d1a28b6c0..968caade917c 100644 --- a/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py @@ -17,7 +17,7 @@ from collections import OrderedDict import torch -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from transformers import Qwen2ForCausalLM, Qwen2Tokenizer, Qwen2TokenizerFast, convert_slow_tokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_starcoder2_hf_to_nemo.py b/scripts/checkpoint_converters/convert_starcoder2_hf_to_nemo.py index fc898c797a9e..862777cf52a8 100644 --- a/scripts/checkpoint_converters/convert_starcoder2_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_starcoder2_hf_to_nemo.py @@ -28,9 +28,9 @@ import torch import torch.nn +from lightning.pytorch.core.saving import _load_state as ptl_load_state +from 
lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf -from pytorch_lightning.core.saving import _load_state as ptl_load_state -from pytorch_lightning.trainer.trainer import Trainer from transformers import AutoModelForCausalLM, AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel @@ -168,7 +168,7 @@ def convert(args): scaler = None if precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=nemo_config.get('native_amp_init_scale', 2 ** 32), + init_scale=nemo_config.get('native_amp_init_scale', 2**32), growth_interval=nemo_config.get('native_amp_growth_interval', 1000), hysteresis=nemo_config.get('hysteresis', 2), ) diff --git a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py index a271e44f0bdb..c418a714be0a 100644 --- a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py @@ -25,7 +25,7 @@ import torch import torch.nn -from pytorch_lightning.trainer.trainer import Trainer +from lightning.pytorch.trainer.trainer import Trainer from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py b/scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py index e600c65e6de1..6b9f30ab427b 100644 --- a/scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py @@ -52,7 +52,7 @@ import os from typing import Dict -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import yaml from omegaconf import OmegaConf diff --git a/scripts/checkpoint_converters/quantize_model_to_nf4.py b/scripts/checkpoint_converters/quantize_model_to_nf4.py index db3a48aaa16d..8fbaeb875f7a 100644 --- a/scripts/checkpoint_converters/quantize_model_to_nf4.py +++ b/scripts/checkpoint_converters/quantize_model_to_nf4.py @@ -16,7 +16,7 @@ from typing import List import torch -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from torch import nn from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel diff --git a/scripts/confidence_ensembles/build_ensemble.py b/scripts/confidence_ensembles/build_ensemble.py index 4c05e2e4ff3f..dfb3793b42f4 100644 --- a/scripts/confidence_ensembles/build_ensemble.py +++ b/scripts/confidence_ensembles/build_ensemble.py @@ -80,8 +80,8 @@ from typing import Dict, List, Optional, Tuple import joblib +import lightning.pytorch as pl import numpy as np -import pytorch_lightning as pl from omegaconf import MISSING, DictConfig, OmegaConf from sklearn.linear_model import LogisticRegression from sklearn.metrics import confusion_matrix @@ -215,7 +215,12 @@ class BuildEnsembleConfig: preserve_frame_confidence=True, exclude_blank=True, aggregation="mean", - method_cfg=ConfidenceMethodConfig(name="entropy", entropy_type="renyi", alpha=0.25, entropy_norm="lin",), + method_cfg=ConfidenceMethodConfig( + name="entropy", + entropy_type="renyi", + alpha=0.25, + entropy_norm="lin", + ), ) ) temperature: float = 1.0 @@ -499,7 +504,12 @@ def find_best_confidence( dev_features = np.array(list(zip(*cur_dev_confidences))) dev_labels = np.array(dev_labels) pipe, score = train_model_selection( - training_features, training_labels, 
dev_features, dev_labels, tune_lr, tune_lr_config, + training_features, + training_labels, + dev_features, + dev_labels, + tune_lr, + tune_lr_config, ) if max_score < score: max_score = score @@ -513,7 +523,7 @@ def find_best_confidence( @hydra_runner(config_name="BuildEnsembleConfig", schema=BuildEnsembleConfig) def main(cfg: BuildEnsembleConfig): # silencing all messages from nemo/ptl to avoid dumping tons of configs to the stdout - logging.getLogger('pytorch_lightning').setLevel(logging.CRITICAL) + logging.getLogger('lightning.pytorch').setLevel(logging.CRITICAL) logging.getLogger('nemo_logger').setLevel(logging.CRITICAL) LOG.info(f'Build ensemble config:\n{OmegaConf.to_yaml(cfg)}') diff --git a/scripts/diffusion_model_lora_merge/merge_lora_weights_into_base_model.py b/scripts/diffusion_model_lora_merge/merge_lora_weights_into_base_model.py index 57d9964cad3d..a80d9d2639e3 100644 --- a/scripts/diffusion_model_lora_merge/merge_lora_weights_into_base_model.py +++ b/scripts/diffusion_model_lora_merge/merge_lora_weights_into_base_model.py @@ -16,7 +16,7 @@ from typing import Any, Dict import torch -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm import MegatronLatentDiffusion from nemo.collections.multimodal.parts.utils import setup_trainer_and_model_for_inference diff --git a/scripts/export.py b/scripts/export.py index acfd3e3e3450..6e0b9b72e15b 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -30,8 +30,8 @@ import sys import torch +from lightning.pytorch import Trainer from omegaconf import OmegaConf -from pytorch_lightning import Trainer import nemo from nemo.core import ModelPT diff --git a/scripts/nemo_legacy_import/nlp_checkpoint_port.py b/scripts/nemo_legacy_import/nlp_checkpoint_port.py index b7541ffdb8cd..058f9e072f5f 100644 --- a/scripts/nemo_legacy_import/nlp_checkpoint_port.py +++ b/scripts/nemo_legacy_import/nlp_checkpoint_port.py @@ -30,7 +30,7 @@ import logging import sys -import pytorch_lightning as pl +import lightning.pytorch as pl from omegaconf import OmegaConf, open_dict from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector diff --git a/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py b/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py index 334b3415a93b..3e96186552a5 100644 --- a/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py @@ -14,7 +14,7 @@ import os -from pytorch_lightning.trainer.trainer import Trainer +from lightning.pytorch.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import ( MegatronGPTPromptLearningModel, diff --git a/scripts/nlp_language_modeling/hf_t5-v1_1_to_nemo.py b/scripts/nlp_language_modeling/hf_t5-v1_1_to_nemo.py index 6a94e8f501bb..2361e000ef7e 100644 --- a/scripts/nlp_language_modeling/hf_t5-v1_1_to_nemo.py +++ b/scripts/nlp_language_modeling/hf_t5-v1_1_to_nemo.py @@ -53,8 +53,8 @@ from argparse import ArgumentParser import torch +from lightning.pytorch import Trainer from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer from transformers import AutoTokenizer, T5ForConditionalGeneration from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model diff --git a/scripts/nlp_language_modeling/merge_lora_weights/merge.py 
b/scripts/nlp_language_modeling/merge_lora_weights/merge.py index 55d50502705c..3a6d110997ba 100644 --- a/scripts/nlp_language_modeling/merge_lora_weights/merge.py +++ b/scripts/nlp_language_modeling/merge_lora_weights/merge.py @@ -33,8 +33,8 @@ from typing import Any, Dict, List import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer from torch.utils.data import DataLoader, Dataset from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/nlp_language_modeling/service_launch_scripts/start_retro_model_service.py b/scripts/nlp_language_modeling/service_launch_scripts/start_retro_model_service.py index ee32f69bf734..dd7c1a3656be 100644 --- a/scripts/nlp_language_modeling/service_launch_scripts/start_retro_model_service.py +++ b/scripts/nlp_language_modeling/service_launch_scripts/start_retro_model_service.py @@ -15,8 +15,8 @@ import os import torch +from lightning.pytorch import Trainer from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer @@ -66,7 +66,10 @@ def main(cfg) -> None: save_restore_connector.model_extracted_dir = model_path model_cfg = MegatronRetrievalModel.restore_from( - model_path, trainer=trainer, return_config=True, save_restore_connector=save_restore_connector, + model_path, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, ) with open_dict(model_cfg): @@ -76,7 +79,10 @@ def main(cfg) -> None: model_cfg.activations_checkpoint_method = None model = MegatronRetrievalModel.restore_from( - model_path, trainer=trainer, save_restore_connector=save_restore_connector, override_config_path=model_cfg, + model_path, + trainer=trainer, + save_restore_connector=save_restore_connector, + override_config_path=model_cfg, ) # check whether the DDP is initialized diff --git a/scripts/speech_recognition/confidence/benchmark_asr_confidence.py b/scripts/speech_recognition/confidence/benchmark_asr_confidence.py index 9c42ef6cca5b..7208867ff938 100644 --- a/scripts/speech_recognition/confidence/benchmark_asr_confidence.py +++ b/scripts/speech_recognition/confidence/benchmark_asr_confidence.py @@ -18,7 +18,7 @@ from pathlib import Path from typing import Optional -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from omegaconf import MISSING, OmegaConf from sklearn.model_selection import ParameterGrid diff --git a/scripts/speech_recognition/oomptimizer.py b/scripts/speech_recognition/oomptimizer.py index 3d5eb5a4dbb1..8d215cbc14eb 100755 --- a/scripts/speech_recognition/oomptimizer.py +++ b/scripts/speech_recognition/oomptimizer.py @@ -20,7 +20,7 @@ from typing import Iterable, Literal import click -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from lhotse import compute_num_samples from omegaconf import OmegaConf diff --git a/tests/collections/asr/confidence/test_asr_confidence.py b/tests/collections/asr/confidence/test_asr_confidence.py index 015264a9debe..89beb61f50bf 100644 --- a/tests/collections/asr/confidence/test_asr_confidence.py +++ b/tests/collections/asr/confidence/test_asr_confidence.py @@ -19,8 +19,8 @@ import numpy as np import pytest +from lightning.pytorch import Trainer from omegaconf import 
OmegaConf -from pytorch_lightning import Trainer from nemo.collections.asr.models import ASRModel, EncDecCTCModelBPE, EncDecRNNTBPEModel from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig diff --git a/tests/collections/asr/test_asr_context_biasing.py b/tests/collections/asr/test_asr_context_biasing.py index 0fa76fdfb95d..b23b12655a8d 100644 --- a/tests/collections/asr/test_asr_context_biasing.py +++ b/tests/collections/asr/test_asr_context_biasing.py @@ -19,7 +19,7 @@ import numpy as np import pytest import torch -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from nemo.collections.asr.models import EncDecCTCModelBPE from nemo.collections.asr.parts import context_biasing @@ -105,25 +105,43 @@ def test_merge_alignment_with_ws_hyps(self, conformer_ctc_bpe_model): # ctc argmax predictions preds = np.array([120, 29, blank_idx, blank_idx]) pred_text, raw_text = context_biasing.merge_alignment_with_ws_hyps( - preds, asr_model, ws_results, decoder_type="ctc", blank_idx=blank_idx, + preds, + asr_model, + ws_results, + decoder_type="ctc", + blank_idx=blank_idx, ) assert raw_text == "gp" assert pred_text == "gpu" # rnnt token predictions preds = rnnt_utils.Hypothesis( - y_sequence=torch.tensor([120, 29]), score=0.0, timestep=torch.tensor([0, 1, 2, 3]), + y_sequence=torch.tensor([120, 29]), + score=0.0, + timestep=torch.tensor([0, 1, 2, 3]), ) pred_text, raw_text = context_biasing.merge_alignment_with_ws_hyps( - preds, asr_model, ws_results, decoder_type="rnnt", blank_idx=blank_idx, + preds, + asr_model, + ws_results, + decoder_type="rnnt", + blank_idx=blank_idx, ) assert raw_text == "gp" assert pred_text == "gpu" # rnnt empty token predictions - preds = rnnt_utils.Hypothesis(y_sequence=[], score=0.0, timestep=[],) + preds = rnnt_utils.Hypothesis( + y_sequence=[], + score=0.0, + timestep=[], + ) pred_text, raw_text = context_biasing.merge_alignment_with_ws_hyps( - preds, asr_model, ws_results, decoder_type="rnnt", blank_idx=blank_idx, + preds, + asr_model, + ws_results, + decoder_type="rnnt", + blank_idx=blank_idx, ) assert raw_text == "" assert pred_text == "gpu" diff --git a/tests/collections/asr/test_asr_interctc_models.py b/tests/collections/asr/test_asr_interctc_models.py index 8d5e4b0b689c..a8d7101033ab 100644 --- a/tests/collections/asr/test_asr_interctc_models.py +++ b/tests/collections/asr/test_asr_interctc_models.py @@ -13,8 +13,8 @@ # limitations under the License. 
from typing import Dict +import lightning.pytorch as pl import pytest -import pytorch_lightning as pl import torch from omegaconf import DictConfig, ListConfig @@ -68,7 +68,8 @@ def squeezeformer_encoder_config() -> Dict: class TestInterCTCLoss: @pytest.mark.unit @pytest.mark.parametrize( - "model_class", [EncDecCTCModel, EncDecHybridRNNTCTCModel], + "model_class", + [EncDecCTCModel, EncDecHybridRNNTCTCModel], ) @pytest.mark.parametrize( "encoder_config", @@ -241,10 +242,12 @@ def __getitem__(self, idx): trainer.fit( asr_model, train_dataloaders=torch.utils.data.DataLoader( - DummyDataset([input_signal, input_length, target, target_length]), collate_fn=lambda x: x[0], + DummyDataset([input_signal, input_length, target, target_length]), + collate_fn=lambda x: x[0], ), val_dataloaders=torch.utils.data.DataLoader( - DummyDataset([input_signal, input_length, target, target_length]), collate_fn=lambda x: x[0], + DummyDataset([input_signal, input_length, target, target_length]), + collate_fn=lambda x: x[0], ), ) required_metrics = ['final_loss'] if len(loss_weights) > 0 else [] @@ -264,7 +267,8 @@ def __getitem__(self, idx): trainer.test( asr_model, dataloaders=torch.utils.data.DataLoader( - DummyDataset([input_signal, input_length, target, target_length]), collate_fn=lambda x: x[0], + DummyDataset([input_signal, input_length, target, target_length]), + collate_fn=lambda x: x[0], ), ) required_metrics = [f'inter_ctc_loss_l{idx}' for idx in apply_at_layers] diff --git a/tests/collections/asr/test_asr_local_attn.py b/tests/collections/asr/test_asr_local_attn.py index 257dc0949af3..3013c0efbddf 100644 --- a/tests/collections/asr/test_asr_local_attn.py +++ b/tests/collections/asr/test_asr_local_attn.py @@ -15,8 +15,8 @@ import shutil import tempfile +import lightning.pytorch as pl import pytest -import pytorch_lightning as pl import torch from omegaconf import DictConfig @@ -89,10 +89,12 @@ def test_change_save_restore(self): @pytest.mark.unit @pytest.mark.parametrize( - "global_tokens", [0, 1, 4], + "global_tokens", + [0, 1, 4], ) @pytest.mark.parametrize( - "global_tokens_spacing", [1, 4], + "global_tokens_spacing", + [1, 4], ) def test_train(self, global_tokens, global_tokens_spacing): preprocessor_config = {'_target_': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor'} @@ -178,15 +180,18 @@ def __getitem__(self, idx): trainer.fit( asr_model, train_dataloaders=torch.utils.data.DataLoader( - DummyDataset([input_signal, input_length, target, target_length]), collate_fn=lambda x: x[0], + DummyDataset([input_signal, input_length, target, target_length]), + collate_fn=lambda x: x[0], ), val_dataloaders=torch.utils.data.DataLoader( - DummyDataset([input_signal, input_length, target, target_length]), collate_fn=lambda x: x[0], + DummyDataset([input_signal, input_length, target, target_length]), + collate_fn=lambda x: x[0], ), ) trainer.test( asr_model, dataloaders=torch.utils.data.DataLoader( - DummyDataset([input_signal, input_length, target, target_length]), collate_fn=lambda x: x[0], + DummyDataset([input_signal, input_length, target, target_length]), + collate_fn=lambda x: x[0], ), ) diff --git a/tests/collections/common/test_ema.py b/tests/collections/common/test_ema.py index 98f733f1c568..18ee04e371e2 100644 --- a/tests/collections/common/test_ema.py +++ b/tests/collections/common/test_ema.py @@ -15,13 +15,13 @@ import os.path from typing import Any, Dict, Union +import lightning.pytorch as pl import pytest -import pytorch_lightning as pl import torch +from lightning.pytorch import 
Callback, Trainer +from lightning.pytorch.utilities.exceptions import MisconfigurationException +from lightning.pytorch.utilities.types import STEP_OUTPUT from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Callback, Trainer -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.types import STEP_OUTPUT from nemo.collections.common.callbacks import EMA from nemo.collections.common.callbacks.ema import EMAOptimizer @@ -349,7 +349,12 @@ class TestEMATrain: @pytest.mark.parametrize("validate_original_weights", [True, False]) @pytest.mark.run_only_on('GPU') def test_ema_run_cuda( - self, test_data_dir, precision, accumulate_grad_batches, validate_original_weights, tmpdir, + self, + test_data_dir, + precision, + accumulate_grad_batches, + validate_original_weights, + tmpdir, ): self.run_training_test( accumulate_grad_batches=accumulate_grad_batches, diff --git a/tests/collections/llm/common.py b/tests/collections/llm/common.py index 95b8bc0de584..c17243936bd1 100644 --- a/tests/collections/llm/common.py +++ b/tests/collections/llm/common.py @@ -14,7 +14,7 @@ import os -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from nemo import lightning as nl diff --git a/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py b/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py index d7ecaafaaf8c..55bea59d6274 100644 --- a/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py +++ b/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py @@ -16,9 +16,11 @@ ## There are no guarantees that this script is up-to-date with latest NeMo. import argparse + import torch +from lightning.pytorch.loggers import TensorBoardLogger from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import TensorBoardLogger + from nemo import lightning as nl from nemo.collections import llm from nemo.collections.llm.api import train diff --git a/tests/collections/llm/lora_mistralai.py b/tests/collections/llm/lora_mistralai.py index 09a52668e3ee..0415569304ac 100644 --- a/tests/collections/llm/lora_mistralai.py +++ b/tests/collections/llm/lora_mistralai.py @@ -14,7 +14,7 @@ import argparse -import pytorch_lightning as pl +import lightning.pytorch as pl import torch from megatron.core.optimizer import OptimizerConfig diff --git a/tests/collections/llm/megatron_gpt_pretraining.py b/tests/collections/llm/megatron_gpt_pretraining.py index a73b2a694c76..9722ba9d6c68 100644 --- a/tests/collections/llm/megatron_gpt_pretraining.py +++ b/tests/collections/llm/megatron_gpt_pretraining.py @@ -18,8 +18,8 @@ import argparse import torch +from lightning.pytorch.loggers import TensorBoardLogger from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import TensorBoardLogger from nemo import lightning as nl from nemo.collections import llm diff --git a/tests/collections/llm/megatron_t5_finetuning.py b/tests/collections/llm/megatron_t5_finetuning.py index e8f4947c9674..9b8195b0895a 100644 --- a/tests/collections/llm/megatron_t5_finetuning.py +++ b/tests/collections/llm/megatron_t5_finetuning.py @@ -18,8 +18,8 @@ import argparse import torch +from lightning.pytorch.loggers import WandbLogger from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import WandbLogger from nemo import lightning as nl from nemo.collections import llm diff --git a/tests/collections/llm/megatron_t5_pretraining.py b/tests/collections/llm/megatron_t5_pretraining.py 
index a5460be3d154..ad63ae88fb73 100644 --- a/tests/collections/llm/megatron_t5_pretraining.py +++ b/tests/collections/llm/megatron_t5_pretraining.py @@ -18,8 +18,8 @@ import argparse import torch +from lightning.pytorch.loggers import WandbLogger from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import WandbLogger from nemo import lightning as nl from nemo.collections import llm diff --git a/tests/collections/llm/test_mnist_model_nemo2.py b/tests/collections/llm/test_mnist_model_nemo2.py index a5c2aa96fc03..92cffc2a35bb 100644 --- a/tests/collections/llm/test_mnist_model_nemo2.py +++ b/tests/collections/llm/test_mnist_model_nemo2.py @@ -23,16 +23,16 @@ from pathlib import Path from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, TypedDict, TypeVar, Union +import lightning.pytorch as pl import megatron.core.num_microbatches_calculator import pytest -import pytorch_lightning as pl import torch import torch.distributed +from lightning.pytorch.loggers import TensorBoardLogger from megatron.core import ModelParallelConfig, parallel_state from megatron.core.optimizer import OptimizerConfig from megatron.core.transformer.enums import ModelType from megatron.core.transformer.module import MegatronModule -from pytorch_lightning.loggers import TensorBoardLogger from torch import Tensor, nn from torch.utils.data import DataLoader from torchvision import transforms diff --git a/tests/collections/llm/test_mnist_model_nemo2_fsdp.py b/tests/collections/llm/test_mnist_model_nemo2_fsdp.py index 8a6c1f993d28..da45d0e1fc38 100644 --- a/tests/collections/llm/test_mnist_model_nemo2_fsdp.py +++ b/tests/collections/llm/test_mnist_model_nemo2_fsdp.py @@ -23,16 +23,16 @@ from pathlib import Path from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, TypedDict, TypeVar, Union +import lightning.pytorch as pl import megatron.core.num_microbatches_calculator import pytest -import pytorch_lightning as pl import torch import torch.distributed +from lightning.pytorch.loggers import TensorBoardLogger from megatron.core import ModelParallelConfig, parallel_state from megatron.core.optimizer import OptimizerConfig from megatron.core.transformer.enums import ModelType from megatron.core.transformer.module import MegatronModule -from pytorch_lightning.loggers import TensorBoardLogger from torch import Tensor, nn from torch.optim import Adam from torch.utils.data import DataLoader diff --git a/tests/collections/multimodal/test_speechllm_models.py b/tests/collections/multimodal/test_speechllm_models.py index 8698fed205ea..09149064b657 100644 --- a/tests/collections/multimodal/test_speechllm_models.py +++ b/tests/collections/multimodal/test_speechllm_models.py @@ -16,13 +16,13 @@ import tempfile from pathlib import Path +import lightning.pytorch as pl import numpy as np import pytest -import pytorch_lightning as pl import torch +from lightning.pytorch.plugins.environments import TorchElasticEnvironment from megatron.core import parallel_state from omegaconf import DictConfig, OmegaConf -from pytorch_lightning.plugins.environments import TorchElasticEnvironment from nemo.collections.multimodal.speech_llm.models import modular_models from nemo.collections.multimodal.speech_llm.parts.utils.data_utils import shift_tokens_by_multi_audios diff --git a/tests/collections/nlp/test_falcon_model.py b/tests/collections/nlp/test_falcon_model.py index 23430ad36300..62a4591092a9 100644 --- a/tests/collections/nlp/test_falcon_model.py +++ 
b/tests/collections/nlp/test_falcon_model.py @@ -14,8 +14,8 @@ import pytest import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/tests/collections/nlp/test_flash_attention.py b/tests/collections/nlp/test_flash_attention.py index f5585ddc1636..c8309b34b433 100644 --- a/tests/collections/nlp/test_flash_attention.py +++ b/tests/collections/nlp/test_flash_attention.py @@ -16,8 +16,8 @@ import pytest import torch +from lightning.pytorch.trainer.trainer import Trainer from megatron.core import ModelParallelConfig -from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.modules.common.megatron.attention import CoreAttention from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo diff --git a/tests/collections/nlp/test_gpt_eval.py b/tests/collections/nlp/test_gpt_eval.py index fb3f9fda5ac3..020185ec7385 100644 --- a/tests/collections/nlp/test_gpt_eval.py +++ b/tests/collections/nlp/test_gpt_eval.py @@ -16,7 +16,7 @@ import numpy as np import pytest -from pytorch_lightning.trainer.trainer import Trainer +from lightning.pytorch.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam diff --git a/tests/collections/nlp/test_gpt_model.py b/tests/collections/nlp/test_gpt_model.py index 7b6c02f948a4..334167f3dcf8 100644 --- a/tests/collections/nlp/test_gpt_model.py +++ b/tests/collections/nlp/test_gpt_model.py @@ -16,8 +16,8 @@ import pytest import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/tests/collections/nlp/test_nlp_exportables.py b/tests/collections/nlp/test_nlp_exportables.py index b404764e7eed..6da0f8c93cc0 100644 --- a/tests/collections/nlp/test_nlp_exportables.py +++ b/tests/collections/nlp/test_nlp_exportables.py @@ -14,9 +14,9 @@ import os import tempfile +import lightning.pytorch as pl import onnx import pytest -import pytorch_lightning as pl import torch import wget from omegaconf import DictConfig, OmegaConf diff --git a/tests/collections/nlp/test_pretrained_models_performance.py b/tests/collections/nlp/test_pretrained_models_performance.py index 82ff6ed103f1..b51f00681f57 100644 --- a/tests/collections/nlp/test_pretrained_models_performance.py +++ b/tests/collections/nlp/test_pretrained_models_performance.py @@ -17,8 +17,8 @@ from shutil import rmtree from unittest import TestCase +import lightning.pytorch as pl import pytest -import pytorch_lightning as pl from omegaconf import OmegaConf import nemo.collections.nlp.models as models diff --git a/tests/collections/nlp/test_rampup_batch_size.py b/tests/collections/nlp/test_rampup_batch_size.py index c7efb5f57f4c..763dfaaf3c51 100644 --- a/tests/collections/nlp/test_rampup_batch_size.py +++ b/tests/collections/nlp/test_rampup_batch_size.py @@ -16,8 +16,8 @@ import pytest import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from 
pytorch_lightning import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy diff --git a/tests/collections/nlp/test_retrieval_module.py b/tests/collections/nlp/test_retrieval_module.py index 426e393c85bf..381d009f0e02 100644 --- a/tests/collections/nlp/test_retrieval_module.py +++ b/tests/collections/nlp/test_retrieval_module.py @@ -16,7 +16,7 @@ import pytest import torch from einops import rearrange -from pytorch_lightning.trainer.trainer import Trainer +from lightning.pytorch.trainer.trainer import Trainer from nemo.collections.nlp.modules.common.megatron.attention import ParallelChunkedCrossAttention from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType @@ -73,7 +73,13 @@ def setup_class(cls): MB_SIZE = 4 GB_SIZE = 8 SEED = 1234 - trainer = Trainer(strategy=NLPDDPStrategy(), devices=GPUS, accelerator='gpu', num_nodes=1, logger=None,) + trainer = Trainer( + strategy=NLPDDPStrategy(), + devices=GPUS, + accelerator='gpu', + num_nodes=1, + logger=None, + ) initialize_model_parallel_for_nemo( world_size=trainer.world_size, @@ -134,7 +140,9 @@ def test_cross_attn(self, model_parallel_config): dec_attn_mask = rearrange(hidden_mask, '(k n) b -> (b k) n', k=chunks) context_attn_mask = rearrange(context_mask, 'k r n b -> (b k) (r n)') enc_dec_attn_mask_3d = build_attention_mask_3d( - source_mask=dec_attn_mask, target_mask=context_attn_mask, attn_mask_type=AttnMaskType.padding, + source_mask=dec_attn_mask, + target_mask=context_attn_mask, + attn_mask_type=AttnMaskType.padding, ) enc_dec_attn_mask_3d = enc_dec_attn_mask_3d[:, None, :, :] diff --git a/tests/collections/nlp/test_retrieval_module_inference.py b/tests/collections/nlp/test_retrieval_module_inference.py index ccb426ce4ab1..a7da05340708 100644 --- a/tests/collections/nlp/test_retrieval_module_inference.py +++ b/tests/collections/nlp/test_retrieval_module_inference.py @@ -17,7 +17,7 @@ import torch import torch.nn.functional as F from einops import rearrange -from pytorch_lightning.trainer.trainer import Trainer +from lightning.pytorch.trainer.trainer import Trainer from nemo.collections.nlp.modules.common.megatron.attention import ParallelChunkedCrossAttention from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType @@ -73,7 +73,13 @@ def setup_class(cls): MB_SIZE = 4 GB_SIZE = 8 SEED = 1234 - trainer = Trainer(strategy=NLPDDPStrategy(), devices=GPUS, accelerator='gpu', num_nodes=1, logger=None,) + trainer = Trainer( + strategy=NLPDDPStrategy(), + devices=GPUS, + accelerator='gpu', + num_nodes=1, + logger=None, + ) initialize_model_parallel_for_nemo( world_size=trainer.world_size, @@ -176,15 +182,33 @@ def test_retrieval_encoder_inference(self, model_parallel_config): neighbors=neighbors, ) assert (encoder.encoder_output - hidden_emb[:, :64]).abs().max().item() < 1e-5 - assert (out_gt[:, 0,] - out_2[:, 0]).abs().max().item() < 1e-2 + assert ( + out_gt[ + :, + 0, + ] + - out_2[:, 0] + ).abs().max().item() < 1e-2 out_test = encoder( retrieved_emb[:, :1], context_mask[:, :1], context_attn_mask=hidden_mask[:, :64], encoder_output=hidden_emb[:, :64], ) - assert (out_gt[:, 0,] - out_test[:, 0]).abs().max().item() < 1e-2 - assert (out_gt[:, 0,] - out_2[:, 0]).abs().max().item() < 1e-2 + assert ( + out_gt[ + :, + 0, + ] + - out_test[:, 0] + ).abs().max().item() < 1e-2 + assert ( + out_gt[ + :, + 0, + ] + - out_2[:, 0] + ).abs().max().item() < 1e-2 for i in range(64, 127): out_3 = 
encoder( @@ -207,7 +231,13 @@ def test_retrieval_encoder_inference(self, model_parallel_config): neighbors=neighbors, ) assert (encoder.encoder_output - hidden_emb[:, 64:128]).abs().max().item() < 1e-5 - assert (out_gt[:, :2,] - out_3).abs().max().item() < 1e-2 + assert ( + out_gt[ + :, + :2, + ] + - out_3 + ).abs().max().item() < 1e-2 # test inference for i in range(128, 191): out_4 = encoder( @@ -231,7 +261,13 @@ def test_retrieval_encoder_inference(self, model_parallel_config): ) assert (encoder.encoder_output - hidden_emb[:, 128:192]).abs().max().item() < 1e-5 - assert (out_gt[:, :3,] - out_4).abs().max().item() < 1e-2 + assert ( + out_gt[ + :, + :3, + ] + - out_4 + ).abs().max().item() < 1e-2 out_2 = encoder( retrieved_emb[:, :2], @@ -263,7 +299,13 @@ def test_retrieval_encoder_inference(self, model_parallel_config): neighbors=neighbors, ) assert (encoder.encoder_output - hidden_emb[:, 128:192]).abs().max().item() < 1e-5 - assert (out_gt[:, :3,] - out_4).abs().max().item() < 1e-2 + assert ( + out_gt[ + :, + :3, + ] + - out_4 + ).abs().max().item() < 1e-2 @pytest.mark.unit def test_cross_attn_inference(self, model_parallel_config): @@ -309,7 +351,9 @@ def get_attn_mask_3d(hidden_mask, context_mask, chunks): dec_attn_mask = rearrange(hidden_mask, '(k n) b -> (b k) n', k=chunks) context_attn_mask = rearrange(context_mask, 'k r n b -> (b k) (r n)') enc_dec_attn_mask_3d = build_attention_mask_3d( - source_mask=dec_attn_mask, target_mask=context_attn_mask, attn_mask_type=AttnMaskType.padding, + source_mask=dec_attn_mask, + target_mask=context_attn_mask, + attn_mask_type=AttnMaskType.padding, ) enc_dec_attn_mask_3d = enc_dec_attn_mask_3d[:, None, :, :] return enc_dec_attn_mask_3d diff --git a/tests/collections/nlp/test_retro_model.py b/tests/collections/nlp/test_retro_model.py index b96016c8d7ec..e91590915ba5 100644 --- a/tests/collections/nlp/test_retro_model.py +++ b/tests/collections/nlp/test_retro_model.py @@ -16,8 +16,8 @@ import pytest import torch +from lightning.pytorch import Trainer from omegaconf import DictConfig -from pytorch_lightning import Trainer from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids diff --git a/tests/core/test_config_utils.py b/tests/core/test_config_utils.py index bb0a0f177dfb..9716fc160629 100644 --- a/tests/core/test_config_utils.py +++ b/tests/core/test_config_utils.py @@ -15,9 +15,9 @@ from dataclasses import dataclass from typing import Any +import lightning.pytorch as ptl import pytest -import pytorch_lightning as ptl -from pytorch_lightning.callbacks.early_stopping import EarlyStopping +from lightning.pytorch.callbacks.early_stopping import EarlyStopping from nemo.core.config.pytorch_lightning import TrainerConfig from nemo.utils import config_utils @@ -126,7 +126,9 @@ def test_ptl_config(self): assert dataclass_subset is None @pytest.mark.unit - def test_early_stopping_config(self,): + def test_early_stopping_config( + self, + ): result = config_utils.assert_dataclass_signature_match(EarlyStopping, EarlyStoppingParams) signatures_match, cls_subset, dataclass_subset = result diff --git a/tests/core/test_dist_ckpt.py b/tests/core/test_dist_ckpt.py index 0a483c0f58ab..6c066d1856a2 100644 --- a/tests/core/test_dist_ckpt.py +++ b/tests/core/test_dist_ckpt.py @@ -17,11 +17,11 @@ from pathlib import Path from typing import Any, Dict +import lightning.pytorch as pl import pytest -import pytorch_lightning as pl import 
torch -from lightning_fabric.plugins import TorchCheckpointIO -from pytorch_lightning.demos.boring_classes import BoringModel +from lightning.fabric.plugins import TorchCheckpointIO +from lightning.pytorch.demos.boring_classes import BoringModel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.utils.callbacks.dist_ckpt_io import ( diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index d4b1d37c1938..32d401b2051f 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -18,13 +18,13 @@ from pathlib import Path from typing import Any +import lightning.pytorch as pl import pytest -import pytorch_lightning as pl import torch +from lightning.pytorch import Callback +from lightning.pytorch.loops import _TrainingEpochLoop from omegaconf import OmegaConf from omegaconf.errors import OmegaConfBaseException -from pytorch_lightning import Callback -from pytorch_lightning.loops import _TrainingEpochLoop from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.constants import NEMO_ENV_VARNAME_VERSION diff --git a/tests/core/test_fault_tolerance.py b/tests/core/test_fault_tolerance.py index 5b4e0ecba4aa..f916a7b44454 100644 --- a/tests/core/test_fault_tolerance.py +++ b/tests/core/test_fault_tolerance.py @@ -13,8 +13,8 @@ # limitations under the License. import os +import lightning.pytorch as pl import pytest -import pytorch_lightning as pl from nemo.utils.exp_manager import exp_manager diff --git a/tests/core/test_optimizers_schedulers.py b/tests/core/test_optimizers_schedulers.py index 5e5d1ee20c83..419db309a918 100644 --- a/tests/core/test_optimizers_schedulers.py +++ b/tests/core/test_optimizers_schedulers.py @@ -15,12 +15,12 @@ import math import random +import lightning.pytorch as pl import omegaconf import pytest -import pytorch_lightning as pl import torch import torch.optim -from pytorch_lightning.utilities import rank_zero_only +from lightning.pytorch.utilities import rank_zero_only from nemo.core import config, optim from nemo.core.optim.lr_scheduler import AVAILABLE_SCHEDULERS @@ -936,7 +936,13 @@ def train( enable_progress_bar=False, ) max_steps = optim.lr_scheduler.compute_max_steps( - max_epochs, accumulate_grad_batches, limit_train_batches, devices, dataset_len, batch_size, drop_last, + max_epochs, + accumulate_grad_batches, + limit_train_batches, + devices, + dataset_len, + batch_size, + drop_last, ) model = ExampleModel(batch_size, dataset_len, drop_last, max_steps) trainer.callbacks.append(Callback()) @@ -991,7 +997,13 @@ def train( dataset_len = random.randint(20, devices * 500) batch_size = random.randint(math.ceil(5.0 / devices), min(dataset_len // devices, 128)) train( - max_epochs, accumulate_grad_batches, limit_train_batches, devices, batch_size, dataset_len, drop_last, + max_epochs, + accumulate_grad_batches, + limit_train_batches, + devices, + batch_size, + dataset_len, + drop_last, ) @pytest.mark.unit diff --git a/tests/core/test_straggler_det.py b/tests/core/test_straggler_det.py index ee5222854889..1f938214d792 100644 --- a/tests/core/test_straggler_det.py +++ b/tests/core/test_straggler_det.py @@ -14,8 +14,8 @@ import sys +import lightning.pytorch as pl import pytest -import pytorch_lightning as pl import torch from omegaconf import OmegaConf diff --git a/tests/core_ptl/check_for_ranks.py b/tests/core_ptl/check_for_ranks.py index a1eae66790c4..dfbc05166c5a 100644 --- a/tests/core_ptl/check_for_ranks.py +++ b/tests/core_ptl/check_for_ranks.py @@ -16,9 +16,9 @@ import 
shutil import torch +from lightning.pytorch import Trainer +from lightning.pytorch.utilities import rank_zero_only from omegaconf import OmegaConf -from pytorch_lightning import Trainer -from pytorch_lightning.utilities import rank_zero_only from nemo.core import ModelPT from nemo.utils import logging diff --git a/tests/core_ptl/check_manual_upload_to_hf_hub.py b/tests/core_ptl/check_manual_upload_to_hf_hub.py index f411ee72332c..912eabb805bf 100644 --- a/tests/core_ptl/check_manual_upload_to_hf_hub.py +++ b/tests/core_ptl/check_manual_upload_to_hf_hub.py @@ -14,7 +14,7 @@ import shutil from huggingface_hub import HfApi -from pytorch_lightning.utilities import rank_zero_only +from lightning.pytorch.utilities import rank_zero_only from nemo.core import ModelPT from nemo.utils import AppState, logging @@ -40,7 +40,9 @@ def load_model_from_unpacked_hf_dir(repo_id): def upload_model_as_single_nemo_file(model: ModelPT, repo_id, token): # Upload the model to HF Hub model.push_to_hf_hub( - repo_id=repo_id, pack_nemo_file=True, token=token, + repo_id=repo_id, + pack_nemo_file=True, + token=token, ) @@ -48,7 +50,9 @@ def upload_model_as_single_nemo_file(model: ModelPT, repo_id, token): def upload_model_as_single_nemo_file(model: ModelPT, repo_id, token): # Upload the model to HF Hub model.push_to_hf_hub( - repo_id=repo_id, pack_nemo_file=True, token=token, + repo_id=repo_id, + pack_nemo_file=True, + token=token, ) @@ -56,7 +60,9 @@ def upload_model_as_single_nemo_file(model: ModelPT, repo_id, token): def upload_model_as_unpacked_files(model: ModelPT, repo_id, token): # Upload the model to HF Hub model.push_to_hf_hub( - repo_id=repo_id, pack_nemo_file=False, token=token, + repo_id=repo_id, + pack_nemo_file=False, + token=token, ) diff --git a/tests/core_ptl/test_ptl_stateless_timer.py b/tests/core_ptl/test_ptl_stateless_timer.py index 25f354a23c0d..5cfbbda39bbf 100644 --- a/tests/core_ptl/test_ptl_stateless_timer.py +++ b/tests/core_ptl/test_ptl_stateless_timer.py @@ -17,8 +17,8 @@ import pytest import torch +from lightning.pytorch import Trainer from omegaconf import OmegaConf -from pytorch_lightning import Trainer from nemo.core import ModelPT from nemo.utils import logging diff --git a/tests/lightning/_fabric/test_conversion.py b/tests/lightning/_fabric/test_conversion.py index e690557ec2eb..e97e766c86a7 100644 --- a/tests/lightning/_fabric/test_conversion.py +++ b/tests/lightning/_fabric/test_conversion.py @@ -13,10 +13,10 @@ # limitations under the License. 
import pytest -from lightning_fabric import plugins as fl_plugins -from lightning_fabric import strategies as fl_strategies -from pytorch_lightning import plugins as pl_plugins -from pytorch_lightning import strategies as pl_strategies +from lightning.fabric import plugins as fl_plugins +from lightning.fabric import strategies as fl_strategies +from lightning.pytorch import plugins as pl_plugins +from lightning.pytorch import strategies as pl_strategies from nemo import lightning as nl from nemo.lightning.fabric.conversion import to_fabric diff --git a/tests/lightning/_io/test_api.py b/tests/lightning/_io/test_api.py index a4d458cef17b..e0aaac1a6aa2 100644 --- a/tests/lightning/_io/test_api.py +++ b/tests/lightning/_io/test_api.py @@ -19,7 +19,7 @@ import fiddle as fdl import pytest import yaml -from pytorch_lightning.loggers import TensorBoardLogger +from lightning.pytorch.loggers import TensorBoardLogger from nemo import lightning as nl from nemo.collections import llm diff --git a/tests/lightning/pytorch/callbacks/test_model_checkpoint.py b/tests/lightning/pytorch/callbacks/test_model_checkpoint.py index 802f2b28c25c..edaa8a6f4ec9 100644 --- a/tests/lightning/pytorch/callbacks/test_model_checkpoint.py +++ b/tests/lightning/pytorch/callbacks/test_model_checkpoint.py @@ -17,12 +17,12 @@ from pathlib import Path from typing import Iterator, Optional, Sequence, Tuple +import lightning.pytorch as pl import megatron import pytest -import pytorch_lightning as pl import torch +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from megatron.core import ModelParallelConfig, parallel_state -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch import Tensor import nemo.lightning as nl diff --git a/tests/lightning/pytorch/callbacks/test_model_transform.py b/tests/lightning/pytorch/callbacks/test_model_transform.py index c59a82895125..cfae55cf99a9 100644 --- a/tests/lightning/pytorch/callbacks/test_model_transform.py +++ b/tests/lightning/pytorch/callbacks/test_model_transform.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import lightning.pytorch as pl import pytest -import pytorch_lightning as pl from torch import nn from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform diff --git a/tests/lightning/pytorch/callbacks/test_peft.py b/tests/lightning/pytorch/callbacks/test_peft.py index 49a6aa0784aa..fb6728acee8f 100644 --- a/tests/lightning/pytorch/callbacks/test_peft.py +++ b/tests/lightning/pytorch/callbacks/test_peft.py @@ -15,7 +15,7 @@ from unittest.mock import MagicMock, call, patch import torch.nn as nn -from pytorch_lightning.trainer.states import TrainerFn +from lightning.pytorch.trainer.states import TrainerFn from nemo.collections.llm import fn from nemo.lightning.pytorch.callbacks.peft import PEFT, WrappedAdapterIO from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO diff --git a/tests/lightning/pytorch/callbacks/test_preemption.py b/tests/lightning/pytorch/callbacks/test_preemption.py index 4152f7fcce59..802d898c5a2b 100644 --- a/tests/lightning/pytorch/callbacks/test_preemption.py +++ b/tests/lightning/pytorch/callbacks/test_preemption.py @@ -17,7 +17,7 @@ import pytest import torch -from pytorch_lightning import Trainer +from lightning.pytorch import Trainer from nemo.lightning.pytorch.callbacks.preemption import PreemptionCallback diff --git a/tests/lightning/test_dist_ckpt.py b/tests/lightning/test_dist_ckpt.py index 886b1085ed55..107d15061792 100644 --- a/tests/lightning/test_dist_ckpt.py +++ b/tests/lightning/test_dist_ckpt.py @@ -21,8 +21,8 @@ def set_env(): from pathlib import Path +import lightning.pytorch as pl import pytest -import pytorch_lightning as pl import torch import nemo.lightning as nl diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index a5a5ec32c886..8a63a92f0ee6 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -19,8 +19,8 @@ from unittest.mock import patch import pytest -from pytorch_lightning.callbacks import ModelCheckpoint as PTLModelCheckpoint -from pytorch_lightning.loggers import WandbLogger +from lightning.pytorch.callbacks import ModelCheckpoint as PTLModelCheckpoint +from lightning.pytorch.loggers import WandbLogger from nemo import lightning as nl from nemo.constants import NEMO_ENV_VARNAME_VERSION diff --git a/tests/lightning/test_precision_plugin.py b/tests/lightning/test_precision_plugin.py index 44ffa5939fab..960e658187c5 100644 --- a/tests/lightning/test_precision_plugin.py +++ b/tests/lightning/test_precision_plugin.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
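(As an aside, the hunks above and below all apply the same mechanical migration: imports from the standalone pytorch_lightning / lightning_fabric packages move to the unified lightning distribution. A minimal sketch of the new import convention, assuming only that the lightning package already listed in requirements_lightning.txt is installed:

    import lightning.pytorch as pl
    from lightning.pytorch import Trainer
    from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
    from lightning.pytorch.utilities import rank_zero_only
    from lightning.fabric.plugins import TorchCheckpointIO  # lightning_fabric.* moves as well

    # The public API is unchanged; only the module paths differ.
    trainer = pl.Trainer(devices=1, accelerator="cpu", max_epochs=1, logger=TensorBoardLogger("logs"))
)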
+import lightning.pytorch as pl import pytest -import pytorch_lightning as pl import torch from megatron.core.optimizer import OptimizerConfig diff --git a/tests/lightning/test_state_restoration.py b/tests/lightning/test_state_restoration.py index ccc0eed64d56..59c5cc2234f7 100644 --- a/tests/lightning/test_state_restoration.py +++ b/tests/lightning/test_state_restoration.py @@ -17,8 +17,8 @@ import pytest import torch +from lightning.pytorch.callbacks import Callback from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.callbacks import Callback from nemo import lightning as nl from nemo.collections import llm diff --git a/tests/utils/test_trainer_utils.py b/tests/utils/test_trainer_utils.py index 55eee92a523c..251e59d4b648 100644 --- a/tests/utils/test_trainer_utils.py +++ b/tests/utils/test_trainer_utils.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning.pytorch.strategies import DDPStrategy from omegaconf import OmegaConf -from pytorch_lightning.strategies import DDPStrategy from nemo.utils.trainer_utils import resolve_trainer_cfg @@ -25,7 +25,7 @@ def test_resolve_trainer_cfg_strategy(): assert ans["strategy"] == "ddp" cfg = OmegaConf.create( - {"strategy": {"_target_": "pytorch_lightning.strategies.DDPStrategy", "gradient_as_bucket_view": True}} + {"strategy": {"_target_": "lightning.pytorch.strategies.DDPStrategy", "gradient_as_bucket_view": True}} ) ans = resolve_trainer_cfg(cfg) assert isinstance(ans, dict) From bca6b095be4bbe67cf9b24f676d82137c110460c Mon Sep 17 00:00:00 2001 From: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Date: Mon, 18 Nov 2024 03:12:31 -0500 Subject: [PATCH 12/48] fix perf plugin CUDA_DEVICE_MAX_CONNECTIONS setting (#11299) * fix Signed-off-by: Jimmy Zhang * Docstrings Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 --------- Signed-off-by: Jimmy Zhang Signed-off-by: JimmyZhang12 Co-authored-by: Jimmy Zhang Co-authored-by: JimmyZhang12 --- nemo/lightning/run/plugins.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index 00738f1fc7d1..4130272bfc90 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -262,6 +262,8 @@ class PerfEnvPlugin(run.Plugin): enable_vboost: bool = False def get_vboost_srun_cmd(self, nodes, job_dir): + "Create the vboost `sudo nvidia-smi boost-slider --vboost 1` command" + import shlex vboost_cmd = " ".join( @@ -281,12 +283,13 @@ def get_vboost_srun_cmd(self, nodes, job_dir): return vboost_cmd def setup(self, task: run.Partial | run.Script, executor: run.Executor): + """Enable the performance environment settings""" if task.trainer.strategy.__fn_or_cls__ == MegatronStrategy: # Force program order kernel launch for TP, CP overlap tp_size = task.trainer.strategy.tensor_model_parallel_size cp_size = task.trainer.strategy.context_parallel_size - if tp_size > 1 and cp_size > 1: + if tp_size > 1 or cp_size > 1: executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" # Set LayerNorm SM margin to support the overlap with LayerNorm kernel From 956b54d48046b2fb8c40ad46da926030858994c4 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 18 Nov 2024 13:59:21 +0100 Subject: [PATCH 13/48] PTQ via NeMo-Run CLI (#10984) * PTQ support in nemo CLI Signed-off-by: Jan Lasek * Naming engine vs checkpoint Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek --- 
nemo/collections/llm/__init__.py | 3 +- nemo/collections/llm/api.py | 67 +++++++++++++++++-- .../collections/llm/quantization/quantizer.py | 22 ++++-- 3 files changed, 79 insertions(+), 13 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 13f0ffe3db88..64246cf80ec7 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -214,7 +214,7 @@ try: import nemo_run as run - from nemo.collections.llm.api import export_ckpt, finetune, generate, import_ckpt, pretrain, train, validate + from nemo.collections.llm.api import export_ckpt, finetune, generate, import_ckpt, pretrain, ptq, train, validate from nemo.collections.llm.recipes import * # noqa __all__.extend( @@ -226,6 +226,7 @@ "validate", "finetune", "generate", + "ptq", ] ) except ImportError as error: diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index d41bad067476..e7e660060f54 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -24,6 +24,7 @@ from typing_extensions import Annotated import nemo.lightning as nl +from nemo.collections.llm.quantization import ExportConfig, QuantizationConfig from nemo.lightning import ( AutoResume, NeMoLogger, @@ -67,7 +68,8 @@ def train( resume (Optional[Union[AutoResume, Resume]]): Resume training from a checkpoint. optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default optimizer from the model will be used. - tokenizer (Optional[TokenizerType]): Tokenizer setting to be applied. Can be 'data' or 'model' or an instance of TokenizerSpec. + tokenizer (Optional[TokenizerType]): Tokenizer setting to be applied. Can be 'data' or 'model' + or an instance of TokenizerSpec. export (Optional[str]): Filename to save the exported checkpoint after training. model_transform (Optional[Union[Callable[[nn.Module], nn.Module], PEFT]]): A model transform to be applied. @@ -83,7 +85,7 @@ def train( >>> data = llm.SquadDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) >>> precision = nl.MegatronMixedPrecision(precision="bf16-mixed") >>> trainer = nl.Trainer(strategy=nl.MegatronStrategy(tensor_model_parallel_size=2), plugins=precision) - >>> train(model, data, trainer, tokenizer="data") + >>> llm.train(model, data, trainer, tokenizer="data") PosixPath('/path/to/log_dir') """ app_state = _setup( @@ -185,7 +187,7 @@ def finetune( >>> data = llm.SquadDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) >>> precision = nl.MegatronMixedPrecision(precision="bf16-mixed") >>> trainer = nl.Trainer(strategy=nl.MegatronStrategy(tensor_model_parallel_size=2), plugins=precision) - >>> finetune(model, data, trainer, peft=llm.peft.LoRA()]) + >>> llm.finetune(model, data, trainer, peft=llm.peft.LoRA()]) PosixPath('/path/to/log_dir') """ @@ -223,7 +225,8 @@ def validate( resume (Optional[AutoResume]): Resume from a checkpoint for validation. optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default optimizer from the model will be used. - tokenizer (Optional[TokenizerType]): Tokenizer setting to be applied. Can be 'data' or 'model' or an instance of TokenizerSpec. + tokenizer (Optional[TokenizerType]): Tokenizer setting to be applied. Can be 'data' or 'model' + or an instance of TokenizerSpec. model_transform (Optional[Union[Callable[[nn.Module], nn.Module], PEFT]]): A model transform to be applied. 
Returns: @@ -236,7 +239,7 @@ def validate( >>> data = llm.SquadDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) >>> precision = nl.MegatronMixedPrecision(precision="bf16-mixed") >>> trainer = nl.Trainer(strategy=nl.MegatronStrategy(tensor_model_parallel_size=2), plugins=precision) - >>> validate(model, data, trainer, tokenizer="data") + >>> llm.validate(model, data, trainer, tokenizer="data") PosixPath('/path/to/log_dir') """ app_state = _setup( @@ -255,6 +258,60 @@ def validate( return app_state.exp_dir +@run.cli.entrypoint(name="ptq", namespace="llm") +def ptq( + nemo_checkpoint: str, + calib_tp: int = 1, + calib_pp: int = 1, + quantization_config: Annotated[Optional[QuantizationConfig], run.Config[QuantizationConfig]] = None, + export_config: Optional[Union[ExportConfig, run.Config[ExportConfig]]] = None, +) -> Path: + # TODO: Fix "nemo_run.cli.cli_parser.CLIException: An unexpected error occurred (Argument: , Context: {})" + """ + Applies Post-Training Quantization (PTQ) for a model using the specified quantization and export configs. It runs + calibration for a small dataset to collect scaling factors low-precision GEMMs used by desired quantization method. + This function produces TensorRT-LLM checkpoint ready for deployment using nemo.export and nemo.deploy modules + or direcly using TensorRT-LLM library. + The function can be used through the NeMo CLI in the following way: + ```bash + # Run calibration using tensor parallel set to 8 and export quantized checkpoint with tensor parallel equal 2 + nemo llm ptq nemo_checkpoint=/models/Llama-3-70B \ + export_config.path=/models/Llama-3-70B-FP8 \ + calib_tp=8 \ + export_config.inference_tensor_parallel=2 + # Choose different quantization method, for example, INT8 SmoothQuant + nemo llm ptq nemo_checkpoint=/models/Llama-3-8B \ + export_config.path=/models/Llama-3-8B-INT8_SQ \ + quantization_config.algorithm=int8_sq + ``` + Args: + nemo_checkpoint (str): The path to model to be quantized. + calib_tp (int): Calibration tensor parallelism. + calib_pp (int): Calibration pipeline parallelism. + quantization_config (QuantizationConfig): Configuration for quantization algorithm. + export_config (ExportConfig): Export configuration for TensorRT-LLM checkpoint. + Returns: + Path: The path where the quantized checkpoint has been saved after calibration. 
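(Beyond the CLI invocations shown above, the same entrypoint can also be driven from Python. A hedged sketch only: the paths are placeholders and the config fields follow the signature and CLI examples above, not a verified recipe.

    from nemo.collections import llm
    from nemo.collections.llm.quantization import ExportConfig, QuantizationConfig

    # INT8 SmoothQuant calibration of a NeMo 2.0 checkpoint, exported for TensorRT-LLM.
    llm.ptq(
        nemo_checkpoint="/models/Llama-3-8B",                        # placeholder checkpoint path
        calib_tp=1,
        calib_pp=1,
        quantization_config=QuantizationConfig(algorithm="int8_sq"),
        export_config=ExportConfig(path="/models/Llama-3-8B-INT8_SQ"),
    )
)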
+ """ + if export_config.path is None: + raise ValueError("The export_config.path needs to be specified, got None.") + + from nemo.collections.llm import quantization + + quantizer = quantization.Quantizer(quantization_config, export_config) + + model = quantization.load_with_modelopt_layer_spec(nemo_checkpoint, calib_tp, calib_pp) + + model = quantizer.quantize(model) + + quantizer.export(model, nemo_checkpoint) + + console = Console() + console.print(f"[green]✓ PTQ succeded, quantized checkpoint exported to {export_config.path}[/green]") + + return export_config.path + + @run.cli.entrypoint(namespace="llm") def deploy( nemo_checkpoint: Path = None, diff --git a/nemo/collections/llm/quantization/quantizer.py b/nemo/collections/llm/quantization/quantizer.py index 2f3e0e1e986e..5fa0afb86e74 100644 --- a/nemo/collections/llm/quantization/quantizer.py +++ b/nemo/collections/llm/quantization/quantizer.py @@ -15,6 +15,7 @@ import os import shutil from dataclasses import dataclass +from pathlib import Path from typing import Optional, Union import torch @@ -75,17 +76,20 @@ class QuantizationConfig: @dataclass class ExportConfig: - """Inference configuration for the quantized TensorRT-LLM engine""" + """Inference configuration for the quantized TensorRT-LLM checkpoint.""" - path: str + path: Union[Path, str] dtype: Union[str, int] = "bf16" decoder_type: Optional[str] = None inference_tensor_parallel: int = 1 inference_pipeline_parallel: int = 1 + def __post_init__(self): + self.path = Path(self.path) + def get_modelopt_decoder_type(config: llm.GPTConfig) -> str: - """Infers the modelopt decoder type from GPTConfig class""" + """Infers the modelopt decoder type from GPTConfig class.""" mapping = [ (llm.Baichuan2Config, "baichuan"), (llm.ChatGLMConfig, "chatglm"), @@ -109,17 +113,17 @@ def get_modelopt_decoder_type(config: llm.GPTConfig) -> str: class Quantizer: - """Post-training quantization (PTQ) and TRT-LLM export of NeMo 2.0 checkpoints. + """Post-training quantization (PTQ) and TensorRT-LLM export of NeMo 2.0 checkpoints. PTQ converts selected model layers to low-precision format (e.g., INT4, FP8) for efficient serving. The process consist of several steps: 1. Loading a Nemo model from disk using appropriate parallelism strategy 2. Calibrating the model to obtain appropriate algorithm-specific scaling factors - 3. Producing output directory + 3. Producing an output directory with a quantized checkpoint and a tokenizer The output directory produced is intended to be consumed by TensorRT-LLM toolbox - for efficient inference. This can be achieved using NeMo inference containers. + for efficient inference. This can be achieved using nemo.export.tensorrt_llm module. 
""" def __init__(self, quantization_config: QuantizationConfig, export_config: ExportConfig): @@ -231,6 +235,7 @@ def quantize(self, model: llm.GPTModel, forward_loop=None): def create_megatron_forward_loop( self, get_dataloader, num_batches, seq_length=None, micro_batch_size=None, decoder_seq_length=None ): + """Create a forward loop for over a given data iterator.""" from megatron.core.pipeline_parallel.schedules import get_forward_backward_func forward_backward_func = get_forward_backward_func() @@ -262,6 +267,7 @@ def loop(model): return loop def export(self, model: llm.GPTModel, model_dir: str) -> None: + """Export model to a TensorRT-LLM checkpoint.""" assert self.export_config is not None, "Export config is not set" # TODO: Add sample generate # TODO: Support megatron_amp_O2 @@ -294,7 +300,7 @@ def export(self, model: llm.GPTModel, model_dir: str) -> None: def get_calib_data_iter( data: str = "cnn_dailymail", batch_size: int = 64, calib_size: int = 512, max_sequence_length: int = 512 ): - """Creates a sample data iterator for calibration""" + """Creates a sample data iterator for calibration.""" if data == "wikitext": dataset = load_dataset("wikitext", "wikitext-103-v1", split="train") text_column = "text" @@ -314,6 +320,8 @@ def get_calib_data_iter( def create_data_iterator_getter(model, dataset, seq_len, batch_size, calibration_size): + """Create a function that provides iterator over a given dataset.""" + def _iterator(): CHARACTERS_PER_TOKEN = 4 From a4c935f92126c53b7ce91a84925ff7a33a50bc3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Kami=C5=84ski?= <67481570+Laplasjan107@users.noreply.github.com> Date: Mon, 18 Nov 2024 17:57:21 +0100 Subject: [PATCH 14/48] PTQ memory optimization (#11257) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Initial commit Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Add sample generate Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Nemotron quantization, reduce diff Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Reduce diff Signed-off-by: Piotr Kaminski * code review suggestions Signed-off-by: Piotr Kaminski * Bug fixes Signed-off-by: Piotr Kaminski * remove not needed import Signed-off-by: Piotr Kaminski * fix model type and allow ddp/optim setup Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Co-authored-by: Piotr Kaminski Co-authored-by: Laplasjan107 Co-authored-by: Jan Lasek --- .../collections/llm/quantization/quantizer.py | 19 +++++---- nemo/collections/llm/quantization/utils.py | 42 ++++++++++++++----- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/nemo/collections/llm/quantization/quantizer.py b/nemo/collections/llm/quantization/quantizer.py index 5fa0afb86e74..45f72f06741e 100644 --- a/nemo/collections/llm/quantization/quantizer.py +++ b/nemo/collections/llm/quantization/quantizer.py @@ -272,9 +272,8 @@ def export(self, model: llm.GPTModel, model_dir: str) -> None: # TODO: Add sample generate # TODO: Support megatron_amp_O2 export_dir = self.export_config.path - use_nfs_workspace = (model.trainer._fabric.__io__.num_nodes > 1) or ( - 
model.config.pipeline_model_parallel_size > 1 - ) + + use_nfs_workspace = model.config.pipeline_model_parallel_size > 1 export_tensorrt_llm_checkpoint( model=get_unwrapped_mcore_model(model), decoder_type=self._get_decoder_type(model.config), @@ -284,15 +283,17 @@ def export(self, model: llm.GPTModel, model_dir: str) -> None: inference_pipeline_parallel=self.export_config.inference_pipeline_parallel, use_nfs_workspace=use_nfs_workspace, ) + dist.barrier() # Save the model context in order to restore its tokenizer later. The destination # path is "nemo_context" as this name is used in nemo.export to setup tokenizer. - shutil.copytree( - os.path.join(model_dir, CONTEXT_PATH), - os.path.join(export_dir, "nemo_context"), - dirs_exist_ok=True, - ) - logging.info(f"Model context saved.") + if dist.get_rank() == 0: + shutil.copytree( + os.path.join(model_dir, CONTEXT_PATH), + os.path.join(export_dir, "nemo_context"), + dirs_exist_ok=True, + ) + logging.info("Model context saved.") logging.info(f"Export succeeded, model has been exported to {export_dir}.") diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py index c4c533fe38d0..bdfccb208d06 100644 --- a/nemo/collections/llm/quantization/utils.py +++ b/nemo/collections/llm/quantization/utils.py @@ -18,6 +18,7 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.llm.inference.base import _setup_trainer_and_restore_model from nemo.lightning.ckpt_utils import ckpt_to_context_subdir from nemo.utils import logging @@ -42,25 +43,44 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig: return model_cfg -def load_with_modelopt_layer_spec(nemo_checkpoint_path: str, calib_tp: int = 1, calib_pp: int = 1) -> llm.GPTModel: +def load_with_modelopt_layer_spec( + nemo_checkpoint_path: str, calib_tp: int = 1, calib_pp: int = 1, inference_only: bool = True +): + # TODO: setting ddp="pytorch" with manually deleting model.optim is a hackish way to disable DDP initialization. Needs a systematic solution. 
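(A hedged sketch of how this inference-only path might be exercised directly during calibration; the helper name and flags follow the signature above, and the checkpoint path is a placeholder.

    from nemo.collections.llm import quantization

    # Calibration only needs forward passes, so skip optimizer state and DDP wrapping.
    model = quantization.load_with_modelopt_layer_spec(
        "/models/Llama-3-8B",   # placeholder NeMo 2.0 checkpoint
        calib_tp=1,
        calib_pp=1,
        inference_only=True,
    )

Passing inference_only=False instead keeps the regular MegatronStrategy defaults for workflows that continue training after calibration.)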
+ if inference_only: + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=calib_tp, + pipeline_model_parallel_size=calib_pp, + pipeline_dtype=torch.bfloat16, + ckpt_load_optimizer=False, + ckpt_parallel_save_optim=False, + setup_optimizers=False, + lazy_init=True, + ddp="pytorch", + ) + else: + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=calib_tp, pipeline_model_parallel_size=calib_pp, pipeline_dtype=torch.bfloat16 + ) + trainer = nl.Trainer( devices=calib_tp, num_nodes=calib_pp, - strategy=nl.MegatronStrategy( - tensor_model_parallel_size=calib_tp, pipeline_model_parallel_size=calib_pp, pipeline_dtype=torch.bfloat16 - ), - plugins=nl.MegatronMixedPrecision(precision='bf16', pipeline_dtype=torch.bfloat16, autocast_enabled=True), + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision='bf16', params_dtype=torch.bfloat16, autocast_enabled=True), ) - fabric = trainer.to_fabric() - fabric.launch() - model_path = Path(nemo_checkpoint_path) - model = nl.io.load_context(ckpt_to_context_subdir(model_path)).model + model = nl.io.load_context(path=ckpt_to_context_subdir(model_path), subpath="model") model.config = quantizable_model_config(model.config) - return fabric.load_model(nemo_checkpoint_path, model=model) + + if inference_only: + del model.optim + + _setup_trainer_and_restore_model(nemo_checkpoint_path, trainer, model) + return model -def get_unwrapped_mcore_model(model: llm.GPTModel): +def get_unwrapped_mcore_model(model): from megatron.core.models.gpt import GPTModel as MCoreGPTModel unwrapped_model = model From 59b8b489cb70c0935ed6d4561a812de336b78ce3 Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 18 Nov 2024 11:15:55 -0800 Subject: [PATCH 15/48] update README.md (#11223) Signed-off-by: yaoyu-33 --- nemo/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/nemo/README.md b/nemo/README.md index a6025e77822a..ebc23f4d5803 100644 --- a/nemo/README.md +++ b/nemo/README.md @@ -2,7 +2,12 @@ NeMo (**Ne**ural **Mo**dules) is a toolkit for creating AI applications built ar **NeMo Core** provides common APIs all modules and models have to implement. -**NeMo Collections** +**NeMo 2.0 Collections** + +* LLM - A collection of data modules, models, configurations, and recipes for building training and parameter-efficient fine-tuning (PEFT) pipelines, including decoder-only models like those in the Llama, Gemma, and Mamba families. +* VLM - A collection of data modules, models, configurations, and recipes for training and PEFT pipelines in vision-language models. 
+ +**NeMo 1.0 Collections** * ASR - collection of modules and models for building speech recognition networks * TTS - collection of modules and models for building speech synthesis networks From 168c3e5f0c89c6af6ff91c25e8df5e5f4e74ca49 Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:45:30 -0800 Subject: [PATCH 16/48] Add `attention_bias` argument in transformer block and transformer layer modules, addressing change in MCore (#11289) * fix api Signed-off-by: yaoyu-33 * fix ci Signed-off-by: yaoyu-33 * add docstring Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix docstring2 Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix line too long Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 --- .../megatron/bert/bert_model.py | 1 + .../megatron/falcon/falcon_decoder_layer.py | 1 + .../gpt_full_te_layer_autocast_spec.py | 1 + .../common/megatron/adapters/mcore_mixins.py | 20 +-- nemo/collections/vlm/mllama/model/vision.py | 116 +++++++++++++++--- 5 files changed, 113 insertions(+), 26 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py index 0d75ab7cc706..c629db5af3c3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py @@ -208,6 +208,7 @@ def forward( rotary_pos_emb=None, rotary_pos_cos=None, rotary_pos_sin=None, + attention_bias=None, inference_params=None, packed_seq_params=None, ): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 131f154d6709..7c3f3c194f14 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -108,6 +108,7 @@ def forward( rotary_pos_emb=None, rotary_pos_cos=None, rotary_pos_sin=None, + attention_bias=None, inference_params=None, packed_seq_params=None, ): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py index d1945139dee9..1def214113ee 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py @@ -252,6 +252,7 @@ def forward( rotary_pos_emb=None, rotary_pos_cos=None, rotary_pos_sin=None, + attention_bias=None, inference_params=None, packed_seq_params=None, # TODO: handle this ): diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index da9c98fd94ea..e306a0a9b6b7 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -82,19 +82,20 @@ def forward( rotary_pos_emb: Tensor = None, rotary_pos_cos: Tensor = None, rotary_pos_sin: Tensor = None, + attention_bias: Tensor = None, inference_params: InferenceParams = None, packed_seq_params: PackedSeqParams = None, ): hidden_states = 
super().forward( - hidden_states, - attention_mask, - context, - context_mask, - rotary_pos_emb, - rotary_pos_cos, - rotary_pos_sin, - inference_params, - packed_seq_params, + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=rotary_pos_cos, + rotary_pos_sin=rotary_pos_sin, + inference_params=inference_params, + packed_seq_params=packed_seq_params, ) mlp_head_adapter = self.get_adapter_module(AdapterName.MLP_HEAD_ADAPTER) @@ -232,6 +233,7 @@ def forward( packed_seq_params=None, rotary_pos_cos=None, rotary_pos_sin=None, + attention_bias=None, ): # hidden_states: [sq, b, h] diff --git a/nemo/collections/vlm/mllama/model/vision.py b/nemo/collections/vlm/mllama/model/vision.py index f662546d21ae..f023cc7bf943 100644 --- a/nemo/collections/vlm/mllama/model/vision.py +++ b/nemo/collections/vlm/mllama/model/vision.py @@ -59,6 +59,9 @@ def to_2tuple(x): + """ + Convert an input to a 2-tuple. + """ if isinstance(x, collections.abc.Iterable): return x return (x, x) @@ -71,9 +74,16 @@ def _stack_images( max_num_images: int, ) -> Tuple[torch.Tensor, List[int]]: """ - Takes a list of list of images and stacks them into a tensor. - This function is needed since images can be of completely - different resolutions and aspect ratios. + Stack a list of image lists into a tensor while accounting for varying resolutions and aspect ratios. + + Args: + images (List[List[PIL_Image.Image]]): List of image lists for stacking. + max_num_chunks (int): Maximum number of chunks per image. + image_res (int): Target resolution for each image. + max_num_images (int): Maximum number of images to stack. + + Returns: + Tuple[torch.Tensor, List[int]]: Tensor of stacked images and a list of chunk counts for each image. """ out_images, out_num_chunks = [], [] for imgs_sample in images: @@ -97,7 +107,17 @@ def build_encoder_attention_mask( x: torch.Tensor, ar_ids: torch.Tensor, ntok: int, num_chunks: int, supported_aspect_ratios: List[List[int]] ): """ - Build vision encoder attention mask that omits padding tiles and tokens. + Build attention masks for a vision encoder to handle padding and token alignment. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, sequence_length). + ar_ids (torch.Tensor): Aspect ratio IDs for masking. + ntok (int): Number of tokens. + num_chunks (int): Number of chunks in the data. + supported_aspect_ratios (List[List[int]]): List of supported aspect ratios. + + Returns: + torch.Tensor: Tensor containing the attention mask. """ masks = [] for ar_id in ar_ids: @@ -113,6 +133,9 @@ def build_encoder_attention_mask( def apply_scaling(freqs: torch.Tensor): + """ + Scale frequency values based on predefined thresholds and a smoothing factor. + """ # Values obtained from grid search scale_factor = 8 low_freq_factor = 1 @@ -137,6 +160,9 @@ def apply_scaling(freqs: torch.Tensor): # Use this spec for an implementation using modules in TE def get_image_transformer_layer_spec() -> ModuleSpec: + """ + Create a specification for an image transformer layer. + """ image_transformer_submodules = TransformerLayerSubmodules( input_layernorm=TENorm, self_attention=ModuleSpec( @@ -175,6 +201,10 @@ def forward_with_return_intermediate( packed_seq_params: PackedSeqParams = None, return_intermediate: List[int] = None, ): + """ + Perform a forward pass through the transformer layers with optional intermediate outputs. + Override regular MCore transformer layer forward pass. 
+ """ # hidden_states (float): [s, b, h] # attention_mask (bool): [1, 1, s, s] @@ -278,16 +308,22 @@ def forward_with_return_intermediate( class ColumnParallelConv2dPatch(MegatronModule): - """Conv2D Patching layer with model parallelism. - Column parallel over unfolded input. - Arguments: - in_channels: Input channels. - out_channels: Output channels. - kernel_size: Size of convolution kernel. - stride (default 1): Stride for convolution. - bias (default False): Use bias in Conv2d. - Input: (bsz, in_channels, width, height) - Output: (bsz, num_tokens, out_channels) + """ + Conv2D Patching layer with model parallelism. Applies convolution in a column-parallel fashion. + + Args: + config (TransformerConfig): Configuration object for the layer. + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (Union[int, Tuple[int, int]]): Size of the convolution kernel. + stride (Union[int, Tuple[int, int]]): Stride of the convolution. + bias (Optional[bool], default=False): Whether to include a bias term. + + Input: + torch.Tensor: Input tensor of shape (batch_size, in_channels, width, height). + + Output: + torch.Tensor: Output tensor of shape (batch_size, num_tokens, out_channels). """ def __init__( @@ -316,6 +352,7 @@ def __init__( ) def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward.""" x = self._unfold(x) x = x.permute(0, 2, 1) x = F.linear(x, self._linear.weight) @@ -324,6 +361,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class PrecomputedTilePositionEmbedding(torch.nn.Module): + """ + Module to compute positional embeddings for tiles with optional gating. + + Args: + config (TransformerConfig): Configuration object. + gated (bool, default=False): Whether to apply gating to the embeddings. + """ + def __init__( self, config: TransformerConfig, @@ -340,6 +385,7 @@ def __init__( self.gate = nn.Parameter(torch.zeros(1)) def forward(self, hidden_states: torch.Tensor, aspect_ratio_ids: torch.Tensor) -> torch.Tensor: + """Forward.""" embeddings = self.embedding(aspect_ratio_ids) embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, self.hidden_size) @@ -351,7 +397,15 @@ def forward(self, hidden_states: torch.Tensor, aspect_ratio_ids: torch.Tensor) - class SelfAttentionNoBias(SelfAttention): - """Self-attention layer class without bias""" + """ + Self-attention layer implementation without bias. + + Args: + config (TransformerConfig): Configuration for the transformer. + submodules (SelfAttentionSubmodules): Submodules required for self-attention. + layer_number (int): The layer number in the transformer stack. + attn_mask_type (AttnMaskType): Type of attention mask to apply. + """ def __init__( self, @@ -396,6 +450,16 @@ def __init__( class ImageTransformerLayer(TransformerLayer): + """ + Transformer layer adapted for processing image data with optional gating. + + Args: + config (TransformerConfig): Transformer configuration object. + submodules (TransformerLayerSubmodules): Submodules to use in the layer. + layer_number (int, default=1): Layer number in the transformer. + hidden_dropout (float, optional): Dropout rate for hidden layers. + """ + def __init__( self, config: TransformerConfig, @@ -423,9 +487,11 @@ def forward( rotary_pos_emb=None, rotary_pos_cos=None, rotary_pos_sin=None, + attention_bias=None, inference_params=None, packed_seq_params=None, ): + """Forward.""" # hidden_states: [s, b, h] # Residual connection. 
@@ -485,6 +551,19 @@ def forward( class VisionEncoder(MegatronModule): + """ + Vision encoder module for processing image inputs with patch-based embeddings. + + Args: + config ('CrossAttentionVisionConfig'): Configuration object for the encoder. + image_size (int, default=560): Input image size. + patch_size (int, default=14): Size of patches extracted from the image. + in_channels (int, default=3): Number of input channels. + pre_process (bool, default=True): Whether to preprocess input. + post_process (bool, default=True): Whether to postprocess output. + return_intermediate (Optional[bool]): Whether to return intermediate layers. + """ + def __init__( self, config: 'CrossAttentionVisionConfig', @@ -556,7 +635,7 @@ def __init__( self.gated_positional_embedding_gate = nn.Parameter(torch.zeros(1)) def apply_positional_embedding(self, x, aspect_ratio_ids): - # apply regular position embedding + """Apply regular position embedding and tile positonal embedding.""" bsz, num_chunks, num_tokens, dim = x.shape x = x.view(bsz * num_chunks, num_tokens, dim) x = x + self.positional_embedding * (1 - self.gated_positional_embedding_gate.tanh()) @@ -567,6 +646,7 @@ def apply_positional_embedding(self, x, aspect_ratio_ids): return x def apply_class_embedding(self, x): + """Concat class embedding tokens.""" x = torch.cat( [ self.class_embedding.to(x.dtype) @@ -578,6 +658,7 @@ def apply_class_embedding(self, x): return x def forward(self, images: torch.Tensor, ar_ids: torch.Tensor) -> torch.Tensor: + """Forward.""" if images.ndim == 5: num_concurrent_media = 1 bsz, num_chunks, nch, w, h = images.shape @@ -617,7 +698,8 @@ def forward(self, images: torch.Tensor, ar_ids: torch.Tensor) -> torch.Tensor: return_intermediate=self.return_intermediate, ) - # [ntok * num_concurrent_media * num_chunks, bsz, hidden_size] -> [bsz, ntok * num_concurrent_media * num_chunks, hidden_size] + # [ntok * num_concurrent_media * num_chunks, bsz, hidden_size] + # -> [bsz, ntok * num_concurrent_media * num_chunks, hidden_size] x, int_x = x.transpose(0, 1).contiguous(), int_x.transpose(0, 1).contiguous() x = self.ln_post(x) x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok + npad, dim) From bc9e6a898b79db2be3a90a4a11ecf4dc311608d2 Mon Sep 17 00:00:00 2001 From: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Date: Mon, 18 Nov 2024 13:27:38 -0800 Subject: [PATCH 17/48] Remove pytorch-lightning (#11306) * update import in docs Signed-off-by: Maanu Grover * update import in tutorials Signed-off-by: Maanu Grover * remove pl requirement Signed-off-by: Maanu Grover * missed import updates Signed-off-by: Maanu Grover --------- Signed-off-by: Maanu Grover --- docs/source/nlp/punctuation_and_capitalization.rst | 2 +- docs/source/starthere/fundamentals.rst | 2 +- nemo/collections/llm/recipes/phi3_mini_4k_instruct.py | 4 ++-- .../models/speechllm/megatron_base_speechllm_prompt_model.py | 2 +- .../tts/models/speechllm/megatron_t5_speechllm_model.py | 2 +- requirements/requirements_lightning.txt | 1 - tutorials/01_NeMo_Models.ipynb | 2 +- tutorials/asr/ASR_CTC_Language_Finetuning.ipynb | 4 ++-- tutorials/asr/ASR_TTS_Tutorial.ipynb | 2 +- tutorials/asr/ASR_with_NeMo.ipynb | 2 +- tutorials/asr/ASR_with_Subword_Tokenization.ipynb | 2 +- tutorials/asr/ASR_with_Transducers.ipynb | 2 +- tutorials/asr/Confidence_Ensembles.ipynb | 2 +- tutorials/asr/Multilang_ASR.ipynb | 2 +- tutorials/asr/Self_Supervised_Pre_Training.ipynb | 2 +- tutorials/asr/Speech_Commands.ipynb | 2 +- tutorials/asr/Transducers_with_HF_Datasets.ipynb | 2 +- 
tutorials/asr/Voice_Activity_Detection.ipynb | 2 +- tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb | 2 +- tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb | 2 +- .../speech_enhancement/Speech_Enhancement_with_NeMo.ipynb | 2 +- .../Speech_Enhancement_with_Online_Augmentation.ipynb | 4 ++-- tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb | 2 +- tutorials/nlp/Punctuation_and_Capitalization.ipynb | 2 +- .../nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb | 2 +- tutorials/nlp/Relation_Extraction-BioMegatron.ipynb | 2 +- tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb | 2 +- tutorials/nlp/Token_Classification-BioMegatron.ipynb | 2 +- .../nlp/Token_Classification_Named_Entity_Recognition.ipynb | 2 +- tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb | 2 +- tutorials/nlp/lora.ipynb | 2 +- tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb | 2 +- .../speaker_tasks/Speaker_Identification_Verification.ipynb | 2 +- .../tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb | 2 +- tutorials/tts/Tacotron2_Training.ipynb | 2 +- tutorials/tts/Vits_Training.ipynb | 2 +- 36 files changed, 38 insertions(+), 39 deletions(-) diff --git a/docs/source/nlp/punctuation_and_capitalization.rst b/docs/source/nlp/punctuation_and_capitalization.rst index 4be0d2151d8e..d67332eb00c1 100755 --- a/docs/source/nlp/punctuation_and_capitalization.rst +++ b/docs/source/nlp/punctuation_and_capitalization.rst @@ -240,7 +240,7 @@ An example of a config file is - trainer config - - Parameters of - `pytorch_lightning.Trainer `_. + `lightning.pytorch.Trainer `_. * - **exp_manager** - exp manager config - diff --git a/docs/source/starthere/fundamentals.rst b/docs/source/starthere/fundamentals.rst index e3014e0f5a03..f486bf3d6e49 100644 --- a/docs/source/starthere/fundamentals.rst +++ b/docs/source/starthere/fundamentals.rst @@ -116,7 +116,7 @@ Below is an example training script for our ``ExampleEncDecModel`` model. 
We hig :linenos: :emphasize-lines: 10, 11, 12 - import pytorch_lightning as pl + import lightning.pytorch as pl from nemo.collections.path_to_model_class import ExampleEncDecModel from nemo.core.config import hydra_runner diff --git a/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py b/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py index d3bc64f54d91..80ac60d054ce 100644 --- a/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py +++ b/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py @@ -14,11 +14,11 @@ from typing import Callable, Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch +from lightning.pytorch.callbacks.callback import Callback from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain diff --git a/nemo/collections/tts/models/speechllm/megatron_base_speechllm_prompt_model.py b/nemo/collections/tts/models/speechllm/megatron_base_speechllm_prompt_model.py index f5c3ba720224..658ace21726f 100644 --- a/nemo/collections/tts/models/speechllm/megatron_base_speechllm_prompt_model.py +++ b/nemo/collections/tts/models/speechllm/megatron_base_speechllm_prompt_model.py @@ -15,8 +15,8 @@ import re import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig -from pytorch_lightning.trainer.trainer import Trainer from torch import Tensor from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer diff --git a/nemo/collections/tts/models/speechllm/megatron_t5_speechllm_model.py b/nemo/collections/tts/models/speechllm/megatron_t5_speechllm_model.py index b3b5d857155d..d35d53b3cac7 100644 --- a/nemo/collections/tts/models/speechllm/megatron_t5_speechllm_model.py +++ b/nemo/collections/tts/models/speechllm/megatron_t5_speechllm_model.py @@ -25,10 +25,10 @@ import numpy as np import soundfile as sf import torch +from lightning.pytorch.trainer.trainer import Trainer from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig from omegaconf.omegaconf import open_dict -from pytorch_lightning.trainer.trainer import Trainer import nemo.collections.asr as nemo_asr from nemo.collections.asr.metrics.wer import word_error_rate diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 137fb2236971..adca2283f577 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -3,7 +3,6 @@ fiddle hydra-core>1.3,<=1.3.2 lightning>2.2.1 omegaconf<=2.3 -pytorch-lightning>2.2.1 torchmetrics>=0.11.0 transformers>=4.45.0 wandb diff --git a/tutorials/01_NeMo_Models.ipynb b/tutorials/01_NeMo_Models.ipynb index 4255a6656b8a..eb76e00cd981 100644 --- a/tutorials/01_NeMo_Models.ipynb +++ b/tutorials/01_NeMo_Models.ipynb @@ -984,7 +984,7 @@ "id": "0TsfmCYthMux" }, "source": [ - "import pytorch_lightning as ptl\n", + "import lightning.pytorch as ptl\n", "from nemo.core import ModelPT\n", "from omegaconf import OmegaConf" ], diff --git a/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb b/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb index a02ee4f99714..6ad3307da496 100644 --- a/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb +++ b/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb @@ -1292,7 +1292,7 @@ }, "source": [ "import torch\n", - "import pytorch_lightning as ptl\n", + "import lightning.pytorch as ptl\n", "\n", "if 
torch.cuda.is_available():\n", " accelerator = 'gpu'\n", @@ -2088,7 +2088,7 @@ }, "source": [ "import torch\n", - "import pytorch_lightning as ptl\n", + "import lightning.pytorch as ptl\n", "\n", "if torch.cuda.is_available():\n", " accelerator = 'gpu'\n", diff --git a/tutorials/asr/ASR_TTS_Tutorial.ipynb b/tutorials/asr/ASR_TTS_Tutorial.ipynb index 709f96d14ba5..544255f76d06 100644 --- a/tutorials/asr/ASR_TTS_Tutorial.ipynb +++ b/tutorials/asr/ASR_TTS_Tutorial.ipynb @@ -172,7 +172,7 @@ "import tempfile\n", "\n", "from omegaconf import OmegaConf\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "import torch\n", "from tqdm.auto import tqdm\n", "import wget\n", diff --git a/tutorials/asr/ASR_with_NeMo.ipynb b/tutorials/asr/ASR_with_NeMo.ipynb index bd95c7194655..bb62e2f5eb9d 100644 --- a/tutorials/asr/ASR_with_NeMo.ipynb +++ b/tutorials/asr/ASR_with_NeMo.ipynb @@ -619,7 +619,7 @@ "id": "GUfR6tAK0k2u" }, "source": [ - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=50)" ], "execution_count": null, diff --git a/tutorials/asr/ASR_with_Subword_Tokenization.ipynb b/tutorials/asr/ASR_with_Subword_Tokenization.ipynb index ff15a5f75532..7a69735ae542 100644 --- a/tutorials/asr/ASR_with_Subword_Tokenization.ipynb +++ b/tutorials/asr/ASR_with_Subword_Tokenization.ipynb @@ -765,7 +765,7 @@ "id": "3rslHEKeq9qy" }, "source": [ - "import pytorch_lightning as pl\r\n", + "import lightning.pytorch as pl\r\n", "trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=50)" ], "execution_count": null, diff --git a/tutorials/asr/ASR_with_Transducers.ipynb b/tutorials/asr/ASR_with_Transducers.ipynb index d20042b9b970..95eecbfb8916 100644 --- a/tutorials/asr/ASR_with_Transducers.ipynb +++ b/tutorials/asr/ASR_with_Transducers.ipynb @@ -754,7 +754,7 @@ "outputs": [], "source": [ "import torch\n", - "from pytorch_lightning import Trainer\n", + "from lightning.pytorch import Trainer\n", "\n", "if torch.cuda.is_available():\n", " accelerator = 'gpu'\n", diff --git a/tutorials/asr/Confidence_Ensembles.ipynb b/tutorials/asr/Confidence_Ensembles.ipynb index 734ddc9a0604..5a999df304b0 100644 --- a/tutorials/asr/Confidence_Ensembles.ipynb +++ b/tutorials/asr/Confidence_Ensembles.ipynb @@ -214,7 +214,7 @@ "# check out https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb\n", "# to learn more about finetuning NeMo ASR models\n", "from omegaconf import open_dict, OmegaConf\n", - "from pytorch_lightning import Trainer\n", + "from lightning.pytorch import Trainer\n", "\n", "from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE\n", "import nemo.utils.exp_manager as exp_manager\n", diff --git a/tutorials/asr/Multilang_ASR.ipynb b/tutorials/asr/Multilang_ASR.ipynb index 612271a8baab..800f8a2d2ded 100644 --- a/tutorials/asr/Multilang_ASR.ipynb +++ b/tutorials/asr/Multilang_ASR.ipynb @@ -1527,7 +1527,7 @@ "outputs": [], "source": [ "import torch\n", - "import pytorch_lightning as ptl" + "import lightning.pytorch as ptl" ] }, { diff --git a/tutorials/asr/Self_Supervised_Pre_Training.ipynb b/tutorials/asr/Self_Supervised_Pre_Training.ipynb index c2e1e7362b3e..0506bafb56e3 100644 --- a/tutorials/asr/Self_Supervised_Pre_Training.ipynb +++ b/tutorials/asr/Self_Supervised_Pre_Training.ipynb @@ -433,7 +433,7 @@ }, "outputs": [], "source": [ - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "from omegaconf import OmegaConf\n", "\n", "from 
nemo.collections.asr.models.ssl_models import SpeechEncDecSelfSupervisedModel\n", diff --git a/tutorials/asr/Speech_Commands.ipynb b/tutorials/asr/Speech_Commands.ipynb index 438533f0f03a..c8a54e5135b2 100644 --- a/tutorials/asr/Speech_Commands.ipynb +++ b/tutorials/asr/Speech_Commands.ipynb @@ -408,7 +408,7 @@ }, "source": [ "import torch\n", - "import pytorch_lightning as pl" + "import lightning.pytorch as pl" ], "execution_count": null, "outputs": [] diff --git a/tutorials/asr/Transducers_with_HF_Datasets.ipynb b/tutorials/asr/Transducers_with_HF_Datasets.ipynb index a47cd00a0b9a..82f17fe8c1ac 100644 --- a/tutorials/asr/Transducers_with_HF_Datasets.ipynb +++ b/tutorials/asr/Transducers_with_HF_Datasets.ipynb @@ -554,7 +554,7 @@ "outputs": [], "source": [ "import torch\n", - "from pytorch_lightning import Trainer\n", + "from lightning.pytorch import Trainer\n", "\n", "if torch.cuda.is_available():\n", " accelerator = 'gpu'\n", diff --git a/tutorials/asr/Voice_Activity_Detection.ipynb b/tutorials/asr/Voice_Activity_Detection.ipynb index 123a03efc28e..fb3cef1b44ea 100644 --- a/tutorials/asr/Voice_Activity_Detection.ipynb +++ b/tutorials/asr/Voice_Activity_Detection.ipynb @@ -425,7 +425,7 @@ "outputs": [], "source": [ "import torch\n", - "import pytorch_lightning as pl" + "import lightning.pytorch as pl" ] }, { diff --git a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb index c9c547a8383e..c3334a59b0d2 100644 --- a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb @@ -260,7 +260,7 @@ "source": [ "import torch\n", "from omegaconf import OmegaConf, open_dict\n", - "from pytorch_lightning import Trainer\n", + "from lightning.pytorch import Trainer\n", "\n", "import nemo.collections.asr as nemo_asr" ], diff --git a/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb b/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb index cb364ab7396d..0d35feb11a9a 100644 --- a/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb @@ -908,7 +908,7 @@ "\n", "from torch.utils.data import DataLoader, Dataset\n", "\n", - "import pytorch_lightning as L\n", + "import lightning.pytorch as L\n", "\n", "from transformers import T5Tokenizer, T5ForConditionalGeneration\n", "\n", diff --git a/tutorials/audio/speech_enhancement/Speech_Enhancement_with_NeMo.ipynb b/tutorials/audio/speech_enhancement/Speech_Enhancement_with_NeMo.ipynb index 5c697840ba09..faef27d18abf 100644 --- a/tutorials/audio/speech_enhancement/Speech_Enhancement_with_NeMo.ipynb +++ b/tutorials/audio/speech_enhancement/Speech_Enhancement_with_NeMo.ipynb @@ -91,7 +91,7 @@ "import IPython.display as ipd\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "import soundfile as sf\n", "\n", "from omegaconf import OmegaConf, open_dict\n", diff --git a/tutorials/audio/speech_enhancement/Speech_Enhancement_with_Online_Augmentation.ipynb b/tutorials/audio/speech_enhancement/Speech_Enhancement_with_Online_Augmentation.ipynb index ff6970d98522..e8b734537a41 100644 --- a/tutorials/audio/speech_enhancement/Speech_Enhancement_with_Online_Augmentation.ipynb +++ b/tutorials/audio/speech_enhancement/Speech_Enhancement_with_Online_Augmentation.ipynb @@ -93,7 +93,7 @@ "import IPython.display as ipd\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import pytorch_lightning as pl\n", + "import 
lightning.pytorch as pl\n", "import soundfile as sf\n", "from pathlib import Path\n", "from omegaconf import OmegaConf, open_dict\n", @@ -981,4 +981,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb b/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb index 6204bf2516bb..b028b2d5c190 100644 --- a/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb +++ b/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb @@ -249,7 +249,7 @@ "\n", "import wget \n", "import torch\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "from omegaconf import OmegaConf\n", "import pandas as pd" ] diff --git a/tutorials/nlp/Punctuation_and_Capitalization.ipynb b/tutorials/nlp/Punctuation_and_Capitalization.ipynb index f88c33fada34..cbdab3941b6f 100644 --- a/tutorials/nlp/Punctuation_and_Capitalization.ipynb +++ b/tutorials/nlp/Punctuation_and_Capitalization.ipynb @@ -72,7 +72,7 @@ "import os\n", "import wget \n", "import torch\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "from omegaconf import OmegaConf" ] }, diff --git a/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb b/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb index 2afbb19c0e66..51d3a66c91fc 100644 --- a/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb +++ b/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb @@ -74,7 +74,7 @@ "import os\n", "import wget\n", "import torch\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "from omegaconf import OmegaConf" ] }, diff --git a/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb b/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb index d6b1e98b428e..3c9e427e7e09 100644 --- a/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb +++ b/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb @@ -71,7 +71,7 @@ "import os\n", "import wget\n", "import torch\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "from omegaconf import OmegaConf" ] }, diff --git a/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb b/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb index fdcff979ea46..0ed846881d02 100644 --- a/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb +++ b/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb @@ -58,7 +58,7 @@ "import os\n", "import wget \n", "import torch\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "from omegaconf import OmegaConf" ] }, diff --git a/tutorials/nlp/Token_Classification-BioMegatron.ipynb b/tutorials/nlp/Token_Classification-BioMegatron.ipynb index 85cb769b28c0..a59eae67dde1 100644 --- a/tutorials/nlp/Token_Classification-BioMegatron.ipynb +++ b/tutorials/nlp/Token_Classification-BioMegatron.ipynb @@ -45,7 +45,7 @@ "import os\n", "import wget \n", "import torch\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "from omegaconf import OmegaConf" ] }, diff --git a/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb b/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb index 3ab98f6c19fd..4c34c293dcca 100644 --- a/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb +++ b/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb @@ -94,7 +94,7 @@ "import os\n", "import wget \n", "import torch\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "from omegaconf import OmegaConf" 
], "execution_count": null, diff --git a/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb b/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb index 7f1baf536d87..b1eca63b8fd1 100644 --- a/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb +++ b/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb @@ -66,7 +66,7 @@ "from nemo.utils import logging\n", "from omegaconf import OmegaConf\n", "import pandas as pd\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "import torch\n", "import wget " ] diff --git a/tutorials/nlp/lora.ipynb b/tutorials/nlp/lora.ipynb index c67fa6c2de15..0429dd7f053c 100644 --- a/tutorials/nlp/lora.ipynb +++ b/tutorials/nlp/lora.ipynb @@ -422,7 +422,7 @@ "source": [ "from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy\n", "import torch\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder\n", "\n", "# let's modify some trainer configs\n", diff --git a/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb b/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb index 7db905b6d225..c193e6600666 100644 --- a/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb +++ b/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb @@ -777,7 +777,7 @@ "metadata": {}, "outputs": [], "source": [ - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "from nemo.collections.asr.models import EncDecDiarLabelModel\n", "from nemo.utils.exp_manager import exp_manager\n", "\n", diff --git a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb index 27a01b894eae..c4f7fbaca67e 100644 --- a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb +++ b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb @@ -438,7 +438,7 @@ "outputs": [], "source": [ "import torch\n", - "import pytorch_lightning as pl" + "import lightning.pytorch as pl" ] }, { diff --git a/tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb b/tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb index afd202f99d4a..8b0114690540 100644 --- a/tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb +++ b/tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb @@ -1636,7 +1636,7 @@ "outputId": "67209ee3-5161-40dc-a179-83d8219c3d71" }, "source": [ - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "from omegaconf import DictConfig\n", "import copy\n", "\n", diff --git a/tutorials/tts/Tacotron2_Training.ipynb b/tutorials/tts/Tacotron2_Training.ipynb index 79546bb79db9..edc814cf12ec 100644 --- a/tutorials/tts/Tacotron2_Training.ipynb +++ b/tutorials/tts/Tacotron2_Training.ipynb @@ -178,7 +178,7 @@ "Let's take a look at the tacotron2.py file\n", "\n", "```python\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "\n", "from nemo.collections.common.callbacks import LogEpochTimeCallback\n", "from nemo.collections.tts.models import Tacotron2Model\n", diff --git a/tutorials/tts/Vits_Training.ipynb b/tutorials/tts/Vits_Training.ipynb index 9d3919e8dc6a..060c6bda43bb 100644 --- a/tutorials/tts/Vits_Training.ipynb +++ b/tutorials/tts/Vits_Training.ipynb @@ -191,7 +191,7 @@ "Let's take a look at the vits.py file\n", "\n", "```python\n", - "import pytorch_lightning as pl\n", + "import lightning.pytorch as pl\n", "\n", "from nemo.collections.tts.models.vits import 
VitsModel\n",
 "from nemo.core.config import hydra_runner\n",

From ccee416f2c8233ec6a1fbcb1f95b36cf502ff7f3 Mon Sep 17 00:00:00 2001
From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com>
Date: Mon, 18 Nov 2024 17:15:56 -0800
Subject: [PATCH 18/48] Adding multimodal examples (#11279)

* Adding multimodal examples

* Apply isort and black reformatting

Signed-off-by: shanmugamr1992

---------

Signed-off-by: shanmugamr1992
Co-authored-by: Shanmugam Ramasamy
Co-authored-by: shanmugamr1992
---
 examples/multimodal_autoregressive/README.md  |   3 +
 ...tron_mm_ar_inference_image_generation.yaml |  36 +++
 ..._mm_ar_inference_vision_understanding.yaml |  32 +++
 ...mm_autoregressive_eval_image_generation.py | 196 ++++++++++++++++
 ...utoregressive_eval_vision_understanding.py | 220 ++++++++++++++++++
 .../multimodal_autoregressive/data/README.md  |  34 +--
 6 files changed, 493 insertions(+), 28 deletions(-)
 create mode 100644 examples/multimodal_autoregressive/README.md
 create mode 100644 examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_image_generation.yaml
 create mode 100644 examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_vision_understanding.yaml
 create mode 100644 examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py
 create mode 100644 examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py

diff --git a/examples/multimodal_autoregressive/README.md b/examples/multimodal_autoregressive/README.md
new file mode 100644
index 000000000000..5934074a7d17
--- /dev/null
+++ b/examples/multimodal_autoregressive/README.md
@@ -0,0 +1,3 @@
+### MULTIMODAL AUTOREGRESSIVE GENERATION
+
+For information on how to get started with autoregressive generation for multimodal datasets using discrete tokenizers follow this [guide](nemo/collections/multimodal_autoregressive/data/README.md)
diff --git a/examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_image_generation.yaml b/examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_image_generation.yaml
new file mode 100644
index 000000000000..806800c96155
--- /dev/null
+++ b/examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_image_generation.yaml
@@ -0,0 +1,36 @@
+inference:
+  greedy: True # Whether or not to use sampling ; use greedy decoding otherwise
+  top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering.
+  top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+  temperature: 1.0 # sampling temperature
+  add_BOS: True # add the bos token at the beginning of the prompt
+  tokens_to_generate: 30 # The minimum length of the sequence to be generated.
+  all_probs: False # whether return the log prob for all the tokens in vocab
+  repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
+  min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
+ compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + end_strings: ["<|extra_204|>"] # generation will stop when one of these tokens is generated + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: bf16 # 16, 32, or bf16 + use_distributed_sampler: False + +tensor_model_parallel_size: -1 +pipeline_model_parallel_size: -1 +pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) +megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory +image_encoder: Cosmos-Tokenizer-DV8x16x16 +gpt_model_file: null # GPT nemo file path +checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training +checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading +hparams_file: null # model configuration file, only used for PTL checkpoint loading +captions: # prompts for GPT inference + - "a drawing of a green pokemon with red eyes" + - "a red pokemon with green eyes" + - "a cartoon fish with a big smile" +images_output_path: null # Path to the directory to store the output images + diff --git a/examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_vision_understanding.yaml b/examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_vision_understanding.yaml new file mode 100644 index 000000000000..c392f5dcc5c2 --- /dev/null +++ b/examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_vision_understanding.yaml @@ -0,0 +1,32 @@ +inference: + greedy: True # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + add_BOS: False # add the bos token at the begining of the prompt + tokens_to_generate: 30 # The minimum length of the sequence to be generated. + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. + compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + end_strings: ["<|extra_204|>"] # generation will stop when one of these tokens is generated + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: bf16 # 16, 32, or bf16 + use_distributed_sampler: False + +tensor_model_parallel_size: -1 +pipeline_model_parallel_size: -1 +pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) +megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory +gpt_model_file: null # GPT nemo file path +checkpoint_dir: null # checkpoint file dir. 
This is used to load the PTL checkpoint generated during the GPT training +checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading +hparams_file: null # model configuration file, only used for PTL checkpoint loading +images_path: # prompts for GPT inference + - "/path/to/image1" + - "/path/to/image2" diff --git a/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py new file mode 100644 index 000000000000..ae8dddb29553 --- /dev/null +++ b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import math +import os +import re + +import torch +import torchvision +from examples.nlp.language_modeling.megatron_gpt_eval import ( + load_model_from_config, + remove_padded_prompts, + round_to_mult, +) +from pytorch_lightning.trainer.trainer import Trainer + +# pylint: disable=line-too-long +from nemo.collections.common.video_tokenizers.cosmos_tokenizer import CausalVideoTokenizer +from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam +from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy +from nemo.core.config import hydra_runner + +""" +This is the script to run multimodal autoregresssive text generation. + +Make sure you install tiktoken==0.6.0 + +Usage: + Assume the model has TP=1, PP=1 in the following use cases. + a. run greedy inference from a nemo file: + python megatron_mm_autoregresssive_eval.py \ + gpt_model_file=PATH_TO_MODEL \ + inference.greedy=True \ + inference.add_BOS=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + captions=[caption1,caption2] + + b. run greedy inference from a PTL checkpoint file: + python megatron_mm_autoregresssive_eval.py \ + checkpoint_dir=PATH_TO_CHECKPOINT_FILE \ + checkpoint_name=CHECKPOINT_FILE_NAME \ + hparams_file=HPARAMS_FILE \ + inference.greedy=True \ + inference.add_BOS=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + captions=[caption1,caption2] + + c. run top_p inference from a nemo file: + python megatron_mm_autoregresssive_eval.py \ + gpt_model_file=PATH_TO_MODEL \ + inference.greedy=False \ + inference.top_k=0 \ + inference.top_p=0.9 \ + inference.repetition_penalty=1.2 \ + inference.add_BOS=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + captions=[caption1,caption2] + + d. 
If you don't need to generate tokens and need model to compute logprobs: + python megatron_mm_autoregresssive_eval.py \ + gpt_model_file=PATH_TO_MODEL \ + inference.compute_logprob=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + captions=[caption1,caption2] +""" + + +def to_img(tokens_string, image_tokenizer): + """Converts visual tokens to images + + Given input visual tokens, we extract the indices, pass it to the decoder to get the image + """ + visual_token_pattern = r"<\|visual token (\d+)\|>" + visual_tokens = [int(match) for match in re.findall(visual_token_pattern, tokens_string)] + # We assume image is square. So if 64 tokensa are present, we reshape it to 8x8 and then pass it to decoder + dim = int(math.sqrt(len(visual_tokens))) + visual_tokens_tensor = torch.tensor(visual_tokens[: dim * dim]) + # Decoder accepts input of the following format [bs, channel_dim, h, w] + visual_tokens_tensor_reshaped = visual_tokens_tensor.reshape((dim, dim)).unsqueeze(0).unsqueeze(0) + visual_tokens_final = visual_tokens_tensor_reshaped.to(image_tokenizer._device) + img = image_tokenizer.decode(visual_tokens_final) + + # Convert from bf16 to 16 and to format [channel_dim, h, w] + image = torchvision.transforms.functional.to_pil_image(img.float().squeeze()) + return image + + +def load_prompts(cfg): + """Function to return the prompts passed into the model""" + prompts = [] + for caption in cfg.captions: + prompt = f'You are a helpful assistant. Draw a picture for the caption given by the user. USER: {caption}. ASSISTANT: ' + prompts.append(prompt) + return prompts + + +if not torch.cuda.is_available(): + raise EnvironmentError("GPU is needed for the inference") + + +@hydra_runner(config_path="conf", config_name="megatron_mm_ar_inference_image_generation") +def main(cfg) -> None: + """Main function""" + + callbacks = [] + # enable_progress_bar is True by default. 
If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks
+    if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar:
+        callbacks.append(CustomProgressBar())
+    # trainer required for restoring model parallel models
+    trainer = Trainer(
+        strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)),
+        **cfg.trainer,
+        callbacks=callbacks,
+    )
+
+    image_tokenizer = CausalVideoTokenizer.from_pretrained(
+        tokenizer_type=cfg.image_encoder, load_encoder=False, load_decoder=True, load_full_model=False
+    )
+
+    model = load_model_from_config(trainer, cfg)
+    model.freeze()
+
+    # Have to turn off activations_checkpoint_method for inference
+    try:
+        model.model.language_model.encoder.activations_checkpoint_method = None
+    except AttributeError:
+        pass
+
+    length_params: LengthParam = {
+        "max_length": cfg.inference.tokens_to_generate,
+        "min_length": cfg.inference.min_tokens_to_generate,
+    }
+
+    sampling_params: SamplingParam = {
+        "use_greedy": cfg.inference.greedy,
+        "temperature": cfg.inference.temperature,
+        "top_k": cfg.inference.top_k,
+        "top_p": cfg.inference.top_p,
+        "repetition_penalty": cfg.inference.repetition_penalty,
+        "add_BOS": cfg.inference.add_BOS,
+        "all_probs": cfg.inference.all_probs,
+        "compute_logprob": cfg.inference.compute_logprob,
+        "end_strings": cfg.inference.end_strings,
+    }
+
+    prompts = []
+    with torch.no_grad():
+        prompts = load_prompts(cfg)
+
+    fp8_enabled = hasattr(model.cfg, "fp8") and (model.cfg.fp8 == True)
+    if fp8_enabled and len(prompts) > 0:
+        padded_len = round_to_mult(len(prompts), 8)
+        nb_paddings = padded_len - len(prompts)
+        if nb_paddings > 0:
+            prompts += [''] * nb_paddings
+
+    # First method of running text generation, call model.generate method
+    response = model.generate(inputs=prompts, length_params=length_params, sampling_params=sampling_params)
+
+    if fp8_enabled:
+        response = remove_padded_prompts(response, nb_paddings)
+
+    output_tokens_strings = response['sentences']
+    for idx, output_token_string in enumerate(output_tokens_strings):
+        image = to_img(output_token_string, image_tokenizer)
+        image.save(os.path.join(cfg.images_output_path, f'{idx}.jpg'))
+
+    print(f'Images saved to {cfg.images_output_path}')
+
+
+if __name__ == '__main__':
+    main() # noqa pylint: disable=no-value-for-parameter
diff --git a/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py
new file mode 100644
index 000000000000..4aea4d9898ae
--- /dev/null
+++ b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py
@@ -0,0 +1,220 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+ + +import datetime + +import torch +import torchvision +from examples.nlp.language_modeling.megatron_gpt_eval import ( + RequestDataSet, + load_model_from_config, + remove_padded_prompts, + round_to_mult, +) +from omegaconf import OmegaConf +from PIL import Image +from pytorch_lightning.trainer.trainer import Trainer +from torch.utils.data import DataLoader +from transformers import AutoModel, AutoTokenizer + +# pylint: disable=line-too-long +from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam +from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy +from nemo.core.config import hydra_runner + +""" +This is the script to run multimodal autoregresssive text generation. + +Make sure you install tiktoken==0.6.0 + +Usage: + Assume the model has TP=1, PP=1 in the following use cases. + a. run greedy inference from a nemo file: + python megatron_mm_autoregresssive_eval.py \ + gpt_model_file=PATH_TO_MODEL \ + inference.greedy=True \ + inference.add_BOS=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + images_path=[image_path1,image_path2] + + b. run greedy inference from a PTL checkpoint file: + python megatron_mm_autoregresssive_eval.py \ + checkpoint_dir=PATH_TO_CHECKPOINT_FILE \ + checkpoint_name=CHECKPOINT_FILE_NAME \ + hparams_file=HPARAMS_FILE \ + inference.greedy=True \ + inference.add_BOS=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + images_path=[image_path1,image_path2] + + c. run top_p inference from a nemo file: + python megatron_mm_autoregresssive_eval.py \ + gpt_model_file=PATH_TO_MODEL \ + inference.greedy=False \ + inference.top_k=0 \ + inference.top_p=0.9 \ + inference.repetition_penalty=1.2 \ + inference.add_BOS=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + images_path=[image_path1,image_path2] + + d. If you don't need to generate tokens and need model to compute logprobs: + python megatron_mm_autoregresssive_eval.py \ + gpt_model_file=PATH_TO_MODEL \ + inference.compute_logprob=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + images_path=[image_path1,image_path2] +""" + +EMU_HUB = "BAAI/Emu3-Gen" +VQ_HUB = "BAAI/Emu3-VisionTokenizer" + + +def to_imgstr(image_tokens, tokenizer): + """Convert integer image tokens to visual tokens string""" + image_tokens = image_tokens.cpu().numpy().tolist() + image_token_str = [ + ['<|visual token {token_id:0>6d}|>'.format(token_id=token_id) for token_id in token_row] + for token_row in image_tokens + ] + image_row_str = ["".join(token_row) for token_row in image_token_str] + imgstr = tokenizer.eol_token.join(image_row_str) + return imgstr + + +def load_prompts(cfg, image_tokenizer, tokenizer): + """Function to generate prompts + + The prompts generated here are fed to the model. 
+    """
+    prompts = []
+    text = "Please describe the image"
+    for image_path in cfg.images_path:
+        image = Image.open(image_path)
+        image_tensor = torchvision.transforms.functional.pil_to_tensor(image).unsqueeze(0)
+        image_tokens = image_tokenizer.encode(image_tensor.to(image_tokenizer.device, image_tokenizer.dtype))
+        bs, h, w = image_tokens.shape
+        imgstr = to_imgstr(image_tokens[0], tokenizer=tokenizer)
+        image_prompt = (
+            tokenizer.boi_token
+            + f'{h}*{w}'
+            + tokenizer.img_token
+            + imgstr
+            + tokenizer.eol_token
+            + tokenizer.eof_token
+            + tokenizer.eoi_token
+        )
+        prompt = f'{tokenizer.bos_token}You are a helpful assistant. USER: {image_prompt}{text}. ASSISTANT:'
+        prompts.append(prompt)
+    return prompts
+
+
+if not torch.cuda.is_available():
+    raise EnvironmentError("GPU is needed for the inference")
+
+
+@hydra_runner(config_path="conf", config_name="megatron_mm_ar_inference_vision_understanding")
+def main(cfg) -> None:
+    """Main function"""
+
+    callbacks = []
+    # enable_progress_bar is True by default. If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks
+    if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar:
+        callbacks.append(CustomProgressBar())
+    # trainer required for restoring model parallel models
+    trainer = Trainer(
+        strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)),
+        **cfg.trainer,
+        callbacks=callbacks,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(EMU_HUB, trust_remote_code=True)
+    image_tokenizer = AutoModel.from_pretrained(VQ_HUB, device_map="cuda", trust_remote_code=True).eval()
+
+    model = load_model_from_config(trainer, cfg)
+    model.freeze()
+
+    # Have to turn off activations_checkpoint_method for inference
+    try:
+        model.model.language_model.encoder.activations_checkpoint_method = None
+    except AttributeError:
+        pass
+
+    length_params: LengthParam = {
+        "max_length": cfg.inference.tokens_to_generate,
+        "min_length": cfg.inference.min_tokens_to_generate,
+    }
+
+    sampling_params: SamplingParam = {
+        "use_greedy": cfg.inference.greedy,
+        "temperature": cfg.inference.temperature,
+        "top_k": cfg.inference.top_k,
+        "top_p": cfg.inference.top_p,
+        "repetition_penalty": cfg.inference.repetition_penalty,
+        "add_BOS": cfg.inference.add_BOS,
+        "all_probs": cfg.inference.all_probs,
+        "compute_logprob": cfg.inference.compute_logprob,
+        "end_strings": cfg.inference.end_strings,
+    }
+
+    prompts = []
+    with torch.no_grad():
+        prompts = load_prompts(cfg, image_tokenizer, tokenizer)
+
+    fp8_enabled = hasattr(model.cfg, "fp8") and (model.cfg.fp8 == True)
+    if fp8_enabled and len(prompts) > 0:
+        padded_len = round_to_mult(len(prompts), 8)
+        nb_paddings = padded_len - len(prompts)
+        if nb_paddings > 0:
+            prompts += [''] * nb_paddings
+
+    # First method of running text generation, call model.generate method
+    response = model.generate(inputs=prompts, length_params=length_params, sampling_params=sampling_params)
+
+    if fp8_enabled:
+        response = remove_padded_prompts(response, nb_paddings)
+    print("***************************")
+    print(response)
+    print("***************************")
+
+    # Second method of running text generation, call trainer.predict [recommended]
+    bs = 8 if fp8_enabled else 2
+    ds = RequestDataSet(prompts)
+    request_dl = DataLoader(dataset=ds, batch_size=bs)
+    config = OmegaConf.to_container(cfg.inference)
+    model.set_inference_config(config)
+    response = trainer.predict(model, request_dl)
+
+    if fp8_enabled:
+        response[-1] = remove_padded_prompts(response[-1], nb_paddings)
+    
print("***************************") + print(response) + print("***************************") + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/multimodal_autoregressive/data/README.md b/nemo/collections/multimodal_autoregressive/data/README.md index c4814ad267f8..3f6d5a6c6a81 100644 --- a/nemo/collections/multimodal_autoregressive/data/README.md +++ b/nemo/collections/multimodal_autoregressive/data/README.md @@ -8,27 +8,7 @@ This is an example of how to do autoregressive generation for multiple modalitie ### 1. Vision Understanding using EMU3 Tokenizer #### Download and Extract data -We will be working with coyo dataset which has 700 million images. - -First create credentials for rclone . Create this file at `~/.config/rclone/rclone.conf` -``` -[pbss-team-vfm-share-ro-s3] -type = s3 -env_auth = true -access_key_id = -secret_access_key = -region = us-east-1 -endpoint = https://pdx.s8k.io -``` -To download the images -``` -rclone copy pbss-team-vfm-share-ro-s3:webdataset_images/webdataset_edify_image_v3/coyo_700m/resolution_lt_720/aspect_ratio_16_9/images images --transfers=16 --multi-thread-streams=16 --checkers=8 -P --stats 5s -``` - -To download the captions -``` -rclone copy pbss-team-vfm-share-ro-s3:webdataset_images/webdataset_edify_image_v3/coyo_700m/resolution_lt_720/aspect_ratio_16_9/captions_ai_v3p1 captions_ai_v3p1 --transfers=16 --multi-thread-streams=16 --checkers=8 -P --stats 5s -``` +Download the [COYO700M dataset](https://github.com/kakaobrain/coyo-dataset) Once downloaded extract the data using tar utilities. @@ -70,13 +50,13 @@ Follow usual nemo instructions to train any autoregressive model. ``` #### Inference -To run inference edit the [inference config file](examples/multimodal_autoregressive/conf/megatron_mm_ar_inference.yaml) +To run inference edit the [inference config file](examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_vision_understanding.yaml) *NOTE* Make sure you have a .nemo file (checkpoint). If you just have a regular megatron checkpoint you have to do a conversion as shown in [this doc](https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/gpt/checkpointconversion.html?highlight=convert) Run inference as follows ``` -torchrun --nproc-per-node 2 examples/multimodal_autoregressive/megatron_mm_autoregressive_eval.py +torchrun --nproc-per-node 2 examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py ``` @@ -116,13 +96,11 @@ Follow usual nemo instructions to train any autoregressive model. ``` #### Inference -To run inference edit the [inference config file](examples/multimodal_autoregressive/conf/megatron_mm_ar_inference.yaml) +To run inference edit the [inference config file](examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_image_generation.yaml) *NOTE* Make sure you have a .nemo file (checkpoint). If you just have a regular megatron checkpoint you have to do a conversion as shown in [this doc](https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/gpt/checkpointconversion.html?highlight=convert) Run inference as follows ``` -torchrun --nproc-per-node 2 examples/multimodal_autoregressive/megatron_mm_autoregressive_eval.py -``` - -TODO : Instructions to convert visual tokens to images coming soon. 
\ No newline at end of file +torchrun --nproc-per-node 2 examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py +``` \ No newline at end of file From da79f6a04c49baa535b95b01f04613d1827ded65 Mon Sep 17 00:00:00 2001 From: Huy Vu <86480512+huvunvidia@users.noreply.github.com> Date: Mon, 18 Nov 2024 22:23:59 -0500 Subject: [PATCH 19/48] Update T5 attention-mask shapes to be compatible with all attention-backend in new TE versions (#11059) * initial commits * updating cicd test * commit for FlashFused T5 from Mcore * testing CICD * update code for data/mock, update mcore commit for dockerfile * fix error * fix error * fix error in nemo/collections/llm/inference/base.py * update t5/data/mock.py * fix cicd erorr * remove unused libs * address Yu Yao's comments * Apply isort and black reformatting Signed-off-by: huvunvidia --------- Signed-off-by: huvunvidia Co-authored-by: Huy Vu2 Co-authored-by: huvunvidia --- .github/workflows/cicd-main.yml | 28 +++++------ Dockerfile.ci | 2 +- nemo/collections/llm/t5/data/mock.py | 9 ++-- nemo/collections/llm/t5/model/t5.py | 49 +++++++++++-------- .../megatron_lm_encoder_decoder_model.py | 39 ++++++--------- .../collections/llm/megatron_t5_finetuning.py | 4 +- 6 files changed, 65 insertions(+), 66 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index e41e96cfa794..61f809312bef 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -2943,7 +2943,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ trainer.devices=2 \ trainer.log_every_n_steps=1 \ trainer.max_epochs=null \ @@ -2975,7 +2975,7 @@ jobs: +model.data.data_impl_kwargs.workers=null \ +model.data.data_impl_kwargs.sort_dataset_paths=False - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ trainer.devices=2 \ trainer.log_every_n_steps=1 \ trainer.max_epochs=null \ @@ -3398,8 +3398,8 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \ + python examples/nlp/language_modeling/megatron_t5_eval.py \ + --model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \ --prompt "How do I fix my GPU memory issue? I am seeing out of memory." 
\ --tensor_model_parallel_size 1 @@ -3410,7 +3410,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ + python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ trainer.devices=2 \ trainer.log_every_n_steps=1 \ trainer.max_epochs=9999 \ @@ -3421,7 +3421,7 @@ jobs: exp_manager.exp_dir=/tmp/nlp_mcore_t5_lora_tuning_tp2 \ model.pipeline_model_parallel_size=1 \ model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \ + model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \ model.peft.peft_scheme=lora \ model.answer_only_loss=True \ model.micro_batch_size=1 \ @@ -3433,8 +3433,8 @@ jobs: model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ model.data.validation_ds.names=[quarel] - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \ + python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ + model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \ model.peft.restore_from_path=/tmp/nlp_mcore_t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ model.peft.restore_from_ckpt_name=null \ model.peft.restore_from_hparams_path=null \ @@ -3852,14 +3852,14 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \ + python tests/collections/llm/megatron_t5_pretraining.py \ --devices=2 \ --max-steps=3 \ --experiment-dir=tests/collections/llm/t5_pretrain_results/${{ github.run_id }} \ --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document \ --index-mapping-dir=tests/collections/llm/t5_index_mappings/${{ github.run_id }} - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \ + python tests/collections/llm/megatron_t5_pretraining.py \ --devices=2 \ --max-steps=6 \ --experiment-dir=tests/collections/llm/t5_pretrain_results/${{ github.run_id }} \ @@ -3876,11 +3876,11 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \ + python tests/collections/llm/megatron_t5_finetuning.py \ --devices=2 \ --max-steps=250 \ --experiment-dir=tests/collections/llm/t5_finetune_results/${{ github.run_id }} \ - --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps + --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_padding_attnmasktype_150steps AFTER_SCRIPT: | rm -rf tests/collections/llm/t5_finetune_results/${{ github.run_id }} @@ -3891,12 +3891,12 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \ + python tests/collections/llm/megatron_t5_finetuning.py \ --devices=2 \ --max-steps=250 \ --peft=lora \ --experiment-dir=tests/collections/llm/t5_peft_results/${{ github.run_id }} \ - --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps + --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_padding_attnmasktype_150steps AFTER_SCRIPT: | rm -rf 
tests/collections/llm/t5_peft_results/${{ github.run_id }} diff --git a/Dockerfile.ci b/Dockerfile.ci index 5858f0aadf5b..e1b78547325a 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -54,7 +54,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.19.0 -ARG MCORE_TAG=aded519cfb1de2abf96f36ca059f992294b7876f +ARG MCORE_TAG=c1728c12f1f1cdbb786e52f1ffe512295d76bef3 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ diff --git a/nemo/collections/llm/t5/data/mock.py b/nemo/collections/llm/t5/data/mock.py index 25dcf4d560fc..31198a4446e9 100644 --- a/nemo/collections/llm/t5/data/mock.py +++ b/nemo/collections/llm/t5/data/mock.py @@ -125,13 +125,11 @@ def __init__( self.seed = seed self.create_attention_mask = create_attention_mask - self.mask_encoder = torch.ones((self.seq_length, self.seq_length), device='cpu') - self.mask_decoder = torch.tril(torch.ones((self.seq_length_dec, self.seq_length_dec), device='cpu')) - self.mask_encoder_decoder = torch.ones((self.seq_length_dec, self.seq_length), device='cpu') + # update for T5 now use FlashFused attention (b11s) + self.mask_encoder = torch.ones(self.seq_length, device='cpu') + self.mask_decoder = torch.ones(self.seq_length_dec, device='cpu') self.mask_encoder = self.mask_encoder < 0.5 self.mask_decoder = self.mask_decoder < 0.5 - self.mask_encoder_decoder = self.mask_encoder_decoder < 0.5 - self.loss_mask = torch.ones(self.seq_length_dec, dtype=torch.float) def __len__(self) -> int: @@ -156,7 +154,6 @@ def __getitem__(self, idx) -> Dict[str, torch.Tensor]: "truncated": 0, "enc_mask": self.mask_encoder, "dec_mask": self.mask_decoder, - "enc_dec_mask": self.mask_encoder_decoder, } return batch diff --git a/nemo/collections/llm/t5/model/t5.py b/nemo/collections/llm/t5/model/t5.py index 7d8ff65e8d99..8160fb729229 100644 --- a/nemo/collections/llm/t5/model/t5.py +++ b/nemo/collections/llm/t5/model/t5.py @@ -21,6 +21,7 @@ import torch.distributed from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import T5InferenceWrapper +from megatron.core.models.T5.t5_model import T5Model as MCoreT5Model from megatron.core.optimizer import OptimizerConfig from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_config import TransformerConfig @@ -38,8 +39,6 @@ HAVE_TE = False if TYPE_CHECKING: - from megatron.core.models.T5.t5_model import T5Model as MCoreT5Model - from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -58,22 +57,32 @@ def t5_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: else: _batch = batch - # if Dataset object is NeMo 1.0's T5SFTDataset (e.g. when finetuning with SQUAD) - if 'enc_dec_mask' not in _batch: - encoder_attn_mask_3d = build_attention_mask_3d(_batch['enc_mask'], _batch['enc_mask'], AttnMaskType.padding) - decoder_attn_mask_3d = build_attention_mask_3d(_batch['dec_mask'], _batch['dec_mask'], AttnMaskType.causal) - enc_dec_attn_mask_3d = build_attention_mask_3d(_batch['dec_mask'], _batch['enc_mask'], AttnMaskType.padding) - _batch['enc_mask'] = encoder_attn_mask_3d - _batch['dec_mask'] = decoder_attn_mask_3d - _batch['enc_dec_mask'] = enc_dec_attn_mask_3d - - # if Dataset object is Mcore T5 dataset (e.g. 
pretraining) - else: - # convert attention mask values from int to True/False - _batch['enc_mask'] = _batch['enc_mask'] < 0.5 - _batch['dec_mask'] = _batch['dec_mask'] < 0.5 - _batch['enc_dec_mask'] = _batch['enc_dec_mask'] < 0.5 - + # work for both mcore's T5 pre-train dataset object, and NeMo's T5SFTDataset dataset + enc_mask = _batch['enc_mask'] < 0.5 + dec_mask = _batch['dec_mask'] < 0.5 + # process for Flash/Fused + enc_mask = enc_mask.unsqueeze(1).unsqueeze(1) + dec_mask = dec_mask.unsqueeze(1).unsqueeze(1) + enc_dec_mask = ( + dec_mask, + enc_mask, + ) + # set dec_mask to None because decoder uses AttnMaskType.causal + dec_mask = None + _batch['enc_mask'] = enc_mask + _batch['dec_mask'] = dec_mask + _batch['enc_dec_mask'] = enc_dec_mask + + # bring to device + for key in _batch.keys(): + if key == "enc_dec_mask": # because enc_dec_mask is a tuple + _batch[key] = (_batch[key][0].cuda(non_blocking=True), _batch[key][1].cuda(non_blocking=True)) + elif key == "dec_mask": # because dec_mask is a None since decoder uses AttnMaskType.causal + continue + else: + _batch[key] = _batch[key].cuda(non_blocking=True) + + # set up forward arguments for pipeline parallelism required_keys = set() required_keys.update(["enc_mask", "dec_mask", "enc_dec_mask"]) if parallel_state.is_pipeline_first_stage(): @@ -81,7 +90,7 @@ def t5_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: if parallel_state.is_pipeline_last_stage(): required_keys.update(("labels", "loss_mask")) - output = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in _batch.items()} + output = {key: val if key in required_keys else None for key, val in _batch.items()} return output @@ -139,6 +148,7 @@ class T5Config(TransformerConfig, io.IOMixin): share_embeddings_and_output_weights: bool = True make_vocab_size_divisible_by: int = 128 position_embedding_type: Literal["learned_absolute", "rope"] = "learned_absolute" + apply_rope_fusion: bool = True max_position_embeddings: int = 512 rotary_percent: float = 1.0 seq_len_interpolation_factor: Optional[float] = None @@ -168,7 +178,6 @@ def configure_model(self, tokenizer) -> "MCoreT5Model": ) % vp_size == 0, "Make sure the number of model chunks is the same across all pipeline stages." 
from megatron.core import parallel_state - from megatron.core.models.T5.t5_model import T5Model as MCoreT5Model encoder_config = copy.deepcopy(self) encoder_config.num_layers = self.encoder_num_layers diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index f5b675c94891..e530a40d8aaa 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -32,12 +32,10 @@ from nemo.collections.nlp.modules.common.megatron.build_model import build_model from nemo.collections.nlp.modules.common.megatron.module import Float16Module from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import ( - AttnMaskType, MegatronTokenLevelEncoderDecoderModule, ) from nemo.collections.nlp.modules.common.megatron.utils import ( average_losses_across_data_parallel_group, - build_attention_mask_3d, get_params_for_weight_decay_optimization, ) from nemo.collections.nlp.modules.common.text_generation_utils import ( @@ -683,14 +681,13 @@ def fwd_output_and_loss_func(dataloader_iter, model): if self.mcore_t5: # attn mask logic follows megatron.data.t5_dataset.py in Megatron-LM - encoder_attn_mask_3d = build_attention_mask_3d( - encoder_attn_mask, encoder_attn_mask, AttnMaskType.padding - ) - decoder_attn_mask_3d = build_attention_mask_3d( - decoder_attn_mask, decoder_attn_mask, AttnMaskType.causal - ) - enc_dec_attn_mask_3d = build_attention_mask_3d( - decoder_attn_mask, encoder_attn_mask, AttnMaskType.padding + encoder_attn_mask = encoder_attn_mask < 0.5 + decoder_attn_mask = decoder_attn_mask < 0.5 + encoder_attn_mask_3d = encoder_attn_mask.unsqueeze(1).unsqueeze(1) + decoder_attn_mask_3d = decoder_attn_mask.unsqueeze(1).unsqueeze(1) + enc_dec_attn_mask_3d = ( + decoder_attn_mask_3d, + encoder_attn_mask_3d, ) output = model( # model is MCoreT5Model @@ -816,10 +813,8 @@ def fwd_output_only_func(dataloader_iter, model): encoder_attn_mask, ) = batch - # attn mask logic follows megatron.data.t5_dataset.py in Megatron-LM - encoder_attn_mask_3d = build_attention_mask_3d( - encoder_attn_mask, encoder_attn_mask, AttnMaskType.padding - ) + encoder_attn_mask = encoder_attn_mask < 0.5 + encoder_attn_mask_3d = encoder_attn_mask.unsqueeze(1).unsqueeze(1) output = model( encoder_input_ids=encoder_input_ids, @@ -841,15 +836,13 @@ def fwd_output_only_func(dataloader_iter, model): decoder_attn_mask, ) = batch - # attn mask logic follows megatron.data.t5_dataset.py in Megatron-LM - encoder_attn_mask_3d = build_attention_mask_3d( - encoder_attn_mask, encoder_attn_mask, AttnMaskType.padding - ) - decoder_attn_mask_3d = build_attention_mask_3d( - decoder_attn_mask, decoder_attn_mask, AttnMaskType.causal - ) - enc_dec_attn_mask_3d = build_attention_mask_3d( - decoder_attn_mask, encoder_attn_mask, AttnMaskType.padding + encoder_attn_mask = encoder_attn_mask < 0.5 + decoder_attn_mask = decoder_attn_mask < 0.5 + encoder_attn_mask_3d = encoder_attn_mask.unsqueeze(1).unsqueeze(1) + decoder_attn_mask_3d = decoder_attn_mask.unsqueeze(1).unsqueeze(1) + enc_dec_attn_mask_3d = ( + decoder_attn_mask_3d, + encoder_attn_mask_3d, ) # re-transpose encoder_hidden_states from [batch, seq_len, hidden] to [seq_len, batch, hidden] diff --git a/tests/collections/llm/megatron_t5_finetuning.py b/tests/collections/llm/megatron_t5_finetuning.py index 9b8195b0895a..976ad5c48053 100644 --- 
a/tests/collections/llm/megatron_t5_finetuning.py +++ b/tests/collections/llm/megatron_t5_finetuning.py @@ -103,7 +103,7 @@ def get_args(): optimizer='adam', lr=2.0e-5, use_distributed_optimizer=False, - bf16=False, + bf16=True, weight_decay=0.1, ) opt = MegatronOptimizerModule( @@ -124,7 +124,7 @@ def get_args(): log_every_n_steps=1, limit_val_batches=2, val_check_interval=50, - plugins=nl.MegatronMixedPrecision(precision="32"), + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), ) if args.wandb_project is not None: From e98106e3064da6d0fa8cee7c80bfa6b7711cbd6c Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 19 Nov 2024 01:47:50 -0800 Subject: [PATCH 20/48] Add HF untrusted code toggle (#11313) * add trust_remote_code toggle Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- .../llm/gpt/model/hf_auto_model_for_causal_lm.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py index 4d9e7ae026d5..6922f38cfb26 100644 --- a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py +++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py @@ -39,6 +39,7 @@ def __init__( tokenizer=None, loss_fn=masked_cross_entropy, model_transform=None, + trust_remote_code=False, ): super().__init__() self.save_hyperparameters() @@ -49,11 +50,14 @@ def __init__( self.load_pretrained_weights = load_pretrained_weights self.is_hf_model = True self.model_transform = model_transform + self.trust_remote_code = trust_remote_code @property def tokenizer(self): if self._tokenizer is None: - self._tokenizer = HfAutoModelForCausalLM.configure_tokenizer(self.model_name) + self._tokenizer = HfAutoModelForCausalLM.configure_tokenizer( + self.model_name, trust_remote_code=self.trust_remote_code + ) return self._tokenizer @tokenizer.setter @@ -68,12 +72,14 @@ def configure_tokenizer(model_name): def configure_model(self): # create all your layers here if self.load_pretrained_weights: - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype='auto') + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, torch_dtype='auto', trust_remote_code=self.trust_remote_code + ) else: from transformers import AutoConfig - config = AutoConfig.from_pretrained(self.model_name) - self.model = AutoModelForCausalLM.from_config(config) + config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=self.trust_remote_code) + self.model = AutoModelForCausalLM.from_config(config, trust_remote_code=self.trust_remote_code) self.model.train() def forward(self, input_ids, attention_mask=None, labels=None, loss_mask=None): From 425a4dd0f31ca10b83ad42bb540e8007f2f9bf1b Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 19 Nov 2024 19:02:46 +0900 Subject: [PATCH 21/48] P2p chunk size setting in nemo 2.0 (#11312) * NCCL P2P communication chunk size Signed-off-by: Sangkug Lym * NCCL P2P communication chunk size Signed-off-by: Sangkug Lym --------- Signed-off-by: Sangkug Lym --- nemo/lightning/run/plugins.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index 4130272bfc90..7366843d5a86 100644 --- a/nemo/lightning/run/plugins.py +++ 
b/nemo/lightning/run/plugins.py @@ -260,6 +260,7 @@ class PerfEnvPlugin(run.Plugin): enable_layernorm_sm_margin: bool = True layernorm_sm_margin: int = 16 enable_vboost: bool = False + nccl_pp_comm_chunksize: int = None def get_vboost_srun_cmd(self, nodes, job_dir): "Create the vboost `sudo nvidia-smi boost-slider --vboost 1` command" @@ -297,6 +298,13 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor): executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin) executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin) + # Set the chunk size of P2P communications. Using a large chunk size reduces the + # buffering overhead from the communication kernel execution time + pp_size = task.trainer.strategy.pipeline_model_parallel_size + if pp_size > 1 and self.nccl_pp_comm_chunksize is not None: + assert isinstance(self.nccl_pp_comm_chunksize, int) and self.nccl_pp_comm_chunksize > 1 + executor.env_vars["NCCL_P2P_NET_CHUNKSIZE"] = str(self.nccl_pp_comm_chunksize) + # Improve perf by steering power to tensor cores, may not work on all systems if self.enable_vboost and isinstance(executor, run.SlurmExecutor): vboost_cmd = self.get_vboost_srun_cmd(executor.nodes, executor.job_dir) From b817158c86958bc1d49d15efb2b117a96efef7eb Mon Sep 17 00:00:00 2001 From: Huiying Date: Tue, 19 Nov 2024 10:11:44 -0800 Subject: [PATCH 22/48] Nemo2 batcheval (#11158) * initial draft for eval api Signed-off-by: HuiyingLi * add dp to generate Signed-off-by: HuiyingLi * Apply isort and black reformatting Signed-off-by: HuiyingLi * add top_k=1 to defaul inf param to get deterministic output Signed-off-by: HuiyingLi * change name Signed-off-by: HuiyingLi * add eval ds and write to file to llm.generate Signed-off-by: HuiyingLi * support standalone input jsonl Signed-off-by: HuiyingLi --------- Signed-off-by: HuiyingLi Signed-off-by: HuiyingLi Co-authored-by: HuiyingLi --- nemo/collections/llm/api.py | 57 ++++++++++++++++++++++++-- nemo/collections/llm/inference/base.py | 2 +- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index e7e660060f54..aaef714ef738 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -20,7 +20,9 @@ import lightning.pytorch as pl import nemo_run as run import torch +from megatron.core import parallel_state from rich.console import Console +from torch.distributed import all_gather_object from typing_extensions import Annotated import nemo.lightning as nl @@ -36,6 +38,8 @@ from nemo.lightning.base import NEMO_MODELS_CACHE from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform from nemo.utils import logging +from nemo.utils.get_rank import is_global_rank_zero + if TYPE_CHECKING: from megatron.core.inference.common_inference_params import CommonInferenceParams @@ -662,9 +666,10 @@ def export_ckpt( @run.cli.entrypoint(name="generate", namespace="llm") def generate( path: Union[Path, str], - prompts: list[str], trainer: nl.Trainer, + prompts: Optional[list[str]] = None, encoder_prompts: Optional[list[str]] = None, + input_dataset: Optional[Union[pl.LightningDataModule, str]] = None, params_dtype: torch.dtype = torch.bfloat16, add_BOS: bool = False, max_batch_size: int = 4, @@ -672,6 +677,7 @@ def generate( inference_batch_times_seqlen_threshold: int = 1000, inference_params: Optional["CommonInferenceParams"] = None, text_only: bool = False, + output_path: Optional[Union[Path, str]] = None, ) -> list[Union["InferenceRequest", 
str]]: """ Generates text using a NeMo LLM model. @@ -725,6 +731,8 @@ def generate( prompts (list[str]): The list of prompts to generate text for. trainer (nl.Trainer): The trainer object. encoder_prompts (Optional[list[str]], optional): The list of encoder prompts. Defaults to None. + input_dataset (Optional[Union[pl.LightningDataModule, str]], optional): The input data module or jsonl file. + Test set will be used for generation for data modules. Defaults to None. params_dtype (torch.dtype, optional): The data type of the model parameters. Defaults to torch.bfloat16. add_BOS (bool, optional): Whether to add the beginning of sequence token. Defaults to False. max_batch_size (int, optional): The maximum batch size. Defaults to 4. @@ -734,6 +742,8 @@ def generate( inference_params (Optional["CommonInferenceParams"], optional): The inference parameters defined in Mcore's CommonInferenceParams. Defaults to None. text_only (bool, optional): Whether to return only the generated text as a string. Defaults to False. + output_path (Optional[Union[Path, str]], optional): The path to save the generated text or test dataset + predictions. Defaults to None. Returns: list[Union["InferenceRequest", str]]: A list of generated text, @@ -741,24 +751,63 @@ def generate( """ from nemo.collections.llm import inference + if input_dataset is not None: + input_path = input_dataset if isinstance(input_dataset, str) else input_dataset.test_path + with open(input_path) as f: + dataset = [json.loads(sample) for sample in f.readlines()] + inputs = [sample["input"] for sample in dataset] + elif prompts is not None: + inputs = prompts + else: + raise ValueError("Either prompts or input_dataset must be provided.") + inference_wrapped_model, mcore_tokenizer = inference.setup_model_and_tokenizer( path=path, trainer=trainer, params_dtype=params_dtype, inference_batch_times_seqlen_threshold=inference_batch_times_seqlen_threshold, ) - results = inference.generate( + + dp_size = trainer.strategy.distributed_sampler_kwargs['num_replicas'] + dp_rank = trainer.strategy.distributed_sampler_kwargs['rank'] + chunk_size = (len(inputs) + dp_size - 1) // dp_size + start_idx = dp_rank * chunk_size + end_idx = min(start_idx + chunk_size, len(inputs)) + inputs_on_this_dp_rank = inputs[start_idx:end_idx] + + results_on_this_dp_rank = inference.generate( model=inference_wrapped_model, tokenizer=mcore_tokenizer, - prompts=prompts, + prompts=inputs_on_this_dp_rank, encoder_prompts=encoder_prompts, add_BOS=add_BOS, max_batch_size=max_batch_size, random_seed=random_seed, inference_params=inference_params, ) + gathered_results = [None] * dp_size - return [r.generated_text if text_only else r for r in results] + all_gather_object( + gathered_results, + [r.generated_text if text_only else r for r in results_on_this_dp_rank], + group=parallel_state.get_data_parallel_group(), + ) + gathered_results = [result for sublist in gathered_results for result in sublist] + + assert len(gathered_results) == len(inputs) + + if output_path is not None and is_global_rank_zero(): + with open(output_path, "w") as f: + for sample, pred in zip(dataset if input_dataset else inputs, gathered_results): + if type(sample) == dict: + sample["label"] = sample.pop("output", None) + sample["prediction"] = pred if text_only else pred.generated_text + elif type(sample) == str: + sample = {"input": sample, "prediction": pred if text_only else pred.generated_text} + f.write(json.dumps(sample) + "\n") + logging.info(f"Predictions written to {output_path}") + + return 
gathered_results def _use_tokenizer(model: pl.LightningModule, data: pl.LightningDataModule, tokenizer: TokenizerType) -> None: diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py index 8a3cbc925dad..085d686afa4e 100644 --- a/nemo/collections/llm/inference/base.py +++ b/nemo/collections/llm/inference/base.py @@ -242,7 +242,7 @@ def generate( text_generation_controller=text_generation_controller, max_batch_size=max_batch_size, random_seed=random_seed ) - common_inference_params = inference_params or CommonInferenceParams(num_tokens_to_generate=512) + common_inference_params = inference_params or CommonInferenceParams(num_tokens_to_generate=512, top_k=1) results = mcore_engine.generate( prompts=prompts, From b11ca61ab736c3a4e84af61022de57c3b66a15a9 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Tue, 19 Nov 2024 14:32:21 -0500 Subject: [PATCH 23/48] DoRA (#11104) * initial commit for DoRA Signed-off-by: Chen Cui * clean up code Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * clean up Signed-off-by: Chen Cui * fix TP Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * add dropout correction term Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * add copyright and doc strings Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * fix Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * docstrings Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * docstrings Signed-off-by: Chen Cui * add ci test Signed-off-by: Chen Cui * add ci test Signed-off-by: Chen Cui * typo Signed-off-by: Chen Cui * remove unused code Signed-off-by: Chen Cui * remove commented out code Signed-off-by: Chen Cui * fix Signed-off-by: Chen Cui * bug Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- .github/workflows/cicd-main.yml | 29 +++ nemo/collections/llm/inference/base.py | 8 +- nemo/collections/llm/peft/__init__.py | 10 +- nemo/collections/llm/peft/dora.py | 261 +++++++++++++++++++++++ nemo/collections/llm/peft/lora.py | 25 +-- nemo/lightning/pytorch/callbacks/peft.py | 97 +++++++-- tests/collections/llm/gpt_finetuning.py | 4 +- 7 files changed, 383 insertions(+), 51 deletions(-) create mode 100644 nemo/collections/llm/peft/dora.py diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 61f809312bef..4fcafea82d07 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4199,6 +4199,34 @@ jobs: --pp_size 1 \ --mbs 1 --packed + L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 3 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft dora \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 6 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft dora \ + 
--tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4428,6 +4456,7 @@ jobs: - L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2 - L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2 - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED + - L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED - L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2 - L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1 - L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1 diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py index 085d686afa4e..795d6efadd3a 100644 --- a/nemo/collections/llm/inference/base.py +++ b/nemo/collections/llm/inference/base.py @@ -34,10 +34,10 @@ from megatron.core.transformer.module import MegatronModule import nemo.lightning as nl -from nemo.collections.llm.peft import LoRA from nemo.lightning import io from nemo.lightning.ckpt_utils import ADAPTER_META_FILENAME, ckpt_to_context_subdir from nemo.lightning.io.pl import ckpt_to_weights_subdir +from nemo.lightning.pytorch.callbacks import PEFT from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy from nemo.lightning.pytorch.strategies.utils import RestoreConfig @@ -161,9 +161,9 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. trainer.strategy.trainer = trainer trainer.strategy.selective_restore() - lora: Union[io.TrainerContext, LoRA] = io.load_context(ckpt_to_context_subdir(path), "model.model_transform") - if isinstance(lora, LoRA): - model = lora(model) + peft: Union[io.TrainerContext, PEFT] = io.load_context(ckpt_to_context_subdir(path), "model.model_transform") + if isinstance(peft, PEFT): + model = peft(model) adapter_sharded_state_dict = {k: v for k, v in model.sharded_state_dict().items() if ".adapter." in k} adapter_state = trainer.strategy.checkpoint_io.load_checkpoint( ckpt_to_weights_subdir(path, is_saving=False), sharded_state_dict=adapter_sharded_state_dict diff --git a/nemo/collections/llm/peft/__init__.py b/nemo/collections/llm/peft/__init__.py index 3dae5622b733..11511ffe72b7 100644 --- a/nemo/collections/llm/peft/__init__.py +++ b/nemo/collections/llm/peft/__init__.py @@ -13,6 +13,14 @@ # limitations under the License. from nemo.collections.llm.peft.api import gpt_lora +from nemo.collections.llm.peft.dora import DoRA from nemo.collections.llm.peft.lora import LoRA -__all__ = ["LoRA", "gpt_lora"] +PEFT_STR2CLS = { + "LoRA": LoRA, + "lora": LoRA, + "DoRA": DoRA, + "dora": DoRA, +} + +__all__ = ["LoRA", "DoRA", "gpt_lora", "PEFT_STR2CLS"] diff --git a/nemo/collections/llm/peft/dora.py b/nemo/collections/llm/peft/dora.py new file mode 100644 index 000000000000..d77d2a4dc0d4 --- /dev/null +++ b/nemo/collections/llm/peft/dora.py @@ -0,0 +1,261 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
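# Illustrative sketch only (not part of the patch): the weight decomposition that the
# DoRA adapter below implements. A frozen pre-trained weight W0 gets a low-rank
# directional update B @ A, while a separate vector m carries the per-output-channel
# magnitude. Shapes, the alpha/r scaling, and variable names here are assumptions
# chosen for clarity, not NeMo defaults.
import torch

d_out, d_in, r, alpha = 8, 16, 4, 8
scaling = alpha / r
W0 = torch.randn(d_out, d_in)            # frozen pre-trained weight
A = torch.randn(r, d_in) * 0.01          # low-rank "in" projection (trainable)
B = torch.zeros(d_out, r)                # low-rank "out" projection (trainable, zero-init)
W_adapted = W0 + scaling * B @ A
m = torch.linalg.norm(W_adapted, dim=1)  # trainable magnitude, shape (d_out,)

# Equation 5 of the DoRA paper: rescale the adapted direction by the learned magnitude.
W_dora = m.unsqueeze(1) * W_adapted / torch.linalg.norm(W_adapted, dim=1, keepdim=True)
# At initialization B == 0, so W_dora equals W0 and training starts from the base weights.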
+ + +import re +from dataclasses import dataclass, field +from typing import List, Literal, Optional + +import torch +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.tensor_parallel import ( + ColumnParallelLinear, + RowParallelLinear, + gather_from_tensor_model_parallel_region, +) +from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_tp_sharded_tensor_for_checkpoint +from torch import nn + +from nemo.collections.llm.peft.lora import LinearAdapter +from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ParallelLinearAdapter +from nemo.lightning.pytorch.callbacks.peft import PEFT, AdapterWrapper +from nemo.utils import logging +from nemo.utils.import_utils import safe_import_from + +TEColumnParallelLinear, HAVE_TE_COL_LINEAR = safe_import_from( + "megatron.core.extensions.transformer_engine", "TEColumnParallelLinear" +) +TELayerNormColumnParallelLinear, HAVE_TE_LN_COL_LINEAR = safe_import_from( + "megatron.core.extensions.transformer_engine", + "TELayerNormColumnParallelLinear", +) +TERowParallelLinear, HAVE_TE_ROW_LINEAR = safe_import_from( + "megatron.core.extensions.transformer_engine", "TERowParallelLinear" +) +HAVE_TE = all((HAVE_TE_COL_LINEAR, HAVE_TE_LN_COL_LINEAR, HAVE_TE_ROW_LINEAR)) + + +class ParallelLinearDoRAAdapter(ParallelLinearAdapter): + """ + Adapter class for DoRA to handle the additional weight_magnitude parameter + """ + + def init_weight_magnitude(self, value): + """ + Initialize weight_magnitude with shape (d,), where d is the output dim of the linear layer + """ + self.weight_magnitude = nn.Parameter(value, requires_grad=True) + + def get_weight_magnitude(self): + """ + Public function to get the weight magnitude parameter + """ + return self.weight_magnitude + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[dict] = None + ) -> ShardedStateDict: + """ + Sharded state dict implementation for DoRA adapter. + Weight magnitude is TP sharded for linear_qkv and linear_fc1 only. + """ + sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata) + + magnitude_key = f"{prefix}weight_magnitude" + if self.input_is_parallel: + # RPL output is gathered, so weight_magnitude is not sharded for TP + magnitude_sharded_tensor = make_sharded_tensor_for_checkpoint( + self.weight_magnitude, magnitude_key, prepend_offsets=sharded_offsets + ) + else: + # CPL output is sharded, so weight_magnitude is sharded for TP + magnitude_sharded_tensor = make_tp_sharded_tensor_for_checkpoint( + self.weight_magnitude, magnitude_key, 0, prepend_offsets=sharded_offsets + ) + sharded_state_dict[magnitude_key] = magnitude_sharded_tensor + + return sharded_state_dict + + +class DoRALinear(AdapterWrapper): + """ + An adapter wrapper that is designed to be used with DoRA + It extends the AdapterWrapper class to provide a specific implementation of the forward method. 
+ """ + + def __init__(self, to_wrap: nn.Module, adapter: ParallelLinearDoRAAdapter): + super().__init__(to_wrap, adapter) + self.adapter: ParallelLinearDoRAAdapter + self.scaling = adapter.alpha / adapter.dim + self.adapter.init_weight_magnitude(self._get_weight_norm()) + + def _get_weight_norm(self): + if self.adapter.input_is_parallel: + linear_out_weight = gather_from_tensor_model_parallel_region(self.adapter.linear_out.weight.T).T + linear_in_weight = self.adapter.linear_in.weight + else: + linear_out_weight = self.adapter.linear_out.weight + linear_in_weight = gather_from_tensor_model_parallel_region(self.adapter.linear_in.weight.T).T + + weight = self.to_wrap.weight + self.scaling * linear_out_weight @ linear_in_weight + return torch.linalg.norm(weight, dim=1).to(weight.dtype).detach() + + def forward(self, x): + """ + Forward method for DoRA + + mag_norm_scale * (linear_output + adapter_output) + = ||W_0 + B_0 A_0|| / ||W_0 + B A|| * (W_0 x + B A x) + = ||W_0 + B_0 A_0|| ((W_0 + B A) / ||W_0 + B A||) x + = m ((W_0 + B A) / ||W_0 + B A||) x + = equation 5 in DoRA paper + + When dropout is used, equation becomes + W_0 x + (m /||W_0 + B A|| - 1) W_0 dropout(x) + m /||W_0 + B A|| B A dropout(x) + = ... + = m /||W_0 + B A|| (W_0 x + B A dropout(x)) + (m /||W_0 + B A|| - 1) W_0 (dropout(x) - x) + + """ + linear_output, bias, layernorm_output = self.base_linear_forward(x) + adapter_output = self.adapter(layernorm_output.contiguous()) + + # mag_norm_scale is ||W_0 + B_0 A_0|| / ||W_0 + B A|| (scaling in front of BA not shown) + mag_norm_scale = (self.adapter.get_weight_magnitude() / self._get_weight_norm()).view(1, 1, -1) + if self.adapter.dropout is None or not self.training: + dropout_correction = 0 + else: + dropout_correction = (mag_norm_scale - 1) * self.base_linear_forward( + self.adapter.dropout(layernorm_output) - layernorm_output + )[0] + + return mag_norm_scale * (linear_output + adapter_output) + dropout_correction, bias + + +@dataclass +class DoRA(PEFT): + """ + Implements the DoRA (Weight-Decomposed LowRank Adaptation) module for parameter-efficient fine-tuning. + + DoRA decomposes pre-trained weight into magnitude and direction, and uses a low-rank projection in the + directional component to adapt the weights of a pre-trained model to a new downstream task. + This class facilitates the application of DoRA to specific modules within the model architecture. + + Args: + See LoRA class for a detailed explanation of the arguments. + + Example: + -------- + >>> from nemo.collections import llm + >>> lora = llm.peft.DoRA(target_modules=['linear_qkv', 'linear_proj'], dim=32, alpha=64) + >>> model = llm.Mistral7BModel(model_transform=lora) + >>> # (set up trainer and data) + >>> trainer.fit(model, data) + + References: + ----------- + Shih-Yang Liu, Chien-Yi Wang, Hongxu Yin, Pavlo Molchanov, Yu-Chiang Frank Wang, Kwang-Ting Cheng, + Min-Hung Chen (2024). DoRA: Weight-Decomposed Low-Rank Adaptation. arXiv preprint arXiv:2402.09353. + https://arxiv.org/abs/2402.09353 + ) + """ + + target_modules: List[str] = field( + default_factory=lambda: ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'] + ) + dim: int = 32 + alpha: int = 64 + dropout: float = 0.0 + dropout_position: Literal['pre', 'post'] = 'pre' + lora_A_init_method: str = "xavier" + lora_B_init_method: str = "zero" + + def __post_init__(self): + assert self.dropout_position == "pre", ( + "DoRA only supports pre-adapter dropout at this time." 
"Please set DoRA(..., dropout_position='pre')" + ) + + def transform(self, m: nn.Module, name=None, prefix=None): + """ + Applies DoRA to a specific module within the model architecture. + + Args: + m (nn.Module): The module to apply DoRA to. + name (str, optional): Name of the module (if applicable). Defaults to None. + prefix (str, optional): Prefix for the module name (if applicable). Defaults to None. + + Returns: + nn.Module: The modified module with DoRA applied, or the original module if not a target. + """ + + def wildcard_match(pattern, key): + if key is None: + return None + regex_pattern = re.compile("^" + pattern.replace("*", "(.*)") + "$") + match = regex_pattern.match(key) + return match is not None + + full_name = f"{prefix}.{name}" if prefix else name + if name in self.target_modules or any(wildcard_match(pattern, full_name) for pattern in self.target_modules): + if HAVE_TE and isinstance(m, TEColumnParallelLinear) or isinstance(m, TELayerNormColumnParallelLinear): + input_is_parallel = False + # m.in_features and m.out_features are divided by tp_size already, + # but in_features and out_features passed to ParallelLinearAdapter are not. + tp_size = parallel_state.get_tensor_model_parallel_world_size() + in_features = m.in_features + out_features = m.out_features * tp_size + # DoRA is applied after layernorm, so layernorm output must be returned + m.return_layernorm_output = True + # perf optimization for DoRA + SP (to check!) + if m.config.sequence_parallel and not m.ub_overlap_ag: + m.return_layernorm_output_gathered = True + elif HAVE_TE and isinstance(m, TERowParallelLinear): + input_is_parallel = True + tp_size = parallel_state.get_tensor_model_parallel_world_size() + in_features = m.in_features * tp_size + out_features = m.out_features + elif isinstance(m, ColumnParallelLinear): + input_is_parallel = False + in_features = m.input_size + out_features = m.output_size + elif isinstance(m, RowParallelLinear): + input_is_parallel = True + in_features = m.input_size + out_features = m.output_size + elif isinstance(m, nn.Linear): + return LinearAdapter( + m, dim=self.dim, alpha=self.alpha, dropout=self.dropout, lora_A_init_method=self.lora_A_init_method + ) + else: + raise NotImplementedError(f"Layer type is unrecognized for LoRA: {type(m)}") + + logging.info(f"Adding DoRA to: {full_name}") + adapter = ParallelLinearDoRAAdapter( + in_features, + out_features, + self.dim, + activation='identity', + norm_position=None, + norm_type=None, + column_init_method=self.lora_A_init_method, + row_init_method=self.lora_B_init_method, + gather_output=False, + input_is_parallel=input_is_parallel, + dropout=self.dropout, + dropout_position=self.dropout_position, + model_parallel_config=getattr(m, "config", None), + alpha=self.alpha, + ) + return DoRALinear(m, adapter) + return m diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index 77063b9d7e98..55f807bf1f9c 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -48,25 +48,8 @@ class to provide a specific implementation of the forward method. """ def forward(self, x): - linear_output = self.to_wrap(x) - assert isinstance( - linear_output, tuple - ), f"{self.to_wrap} should return a tuple but instead returns {linear_output}" - """ Four cases for the wrapped module's return values - 1. nothing: (out, None) - 2. return_bias: (out, bias) - 2. return_layernorm_output: ((out, ln_out), None) - 3. 
both: (out, bias, ln_out) - """ - if len(linear_output) == 2: - linear_output, bias = linear_output - if isinstance(linear_output, tuple) and len(linear_output) == 2: - linear_output, layernorm_output = linear_output - x = layernorm_output - elif len(linear_output) == 3: - linear_output, bias, layernorm_output = linear_output - x = layernorm_output - adapter_output = self.adapter(x.contiguous()) + linear_output, bias, layernorm_output = self.base_linear_forward(x) + adapter_output = self.adapter(layernorm_output.contiguous()) return linear_output + adapter_output, bias @@ -129,8 +112,8 @@ class LoRA(PEFT): target_modules (List[str], optional): A list of module names to apply LoRA to. Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2']. - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections - in self-attention modules. - - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention modules. + in self-attention. + - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention. - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP. - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP. Target modules can also contain wildcards. For example, you can specify diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index 3790b7788419..fb846043c8aa 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -105,6 +105,8 @@ def __call__(self, model: nn.Module) -> nn.Module: else: model.walk(self.transform) + if hasattr(model, "trainer") and model.trainer.state.fn != TrainerFn.FITTING: + self.freeze_model(model) return model def freeze_model(self, model: nn.Module) -> None: @@ -126,9 +128,11 @@ def freeze_model(self, model: nn.Module) -> None: model.module.freeze() else: model.freeze() - model.train(mode=True) + if hasattr(model, "trainer") and model.trainer.state.fn == TrainerFn.FITTING: + model.train(mode=True) def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) -> None: + """PTL callback setup function.""" from nemo.lightning.pytorch.strategies.utils import create_checkpoint_io super().setup(trainer, pl_module, stage=stage) @@ -160,6 +164,13 @@ def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) trainer.strategy._setup_optimizers = False def apply_transform(self, trainer): + """ + This function does the following: + 1. Apply PEFT model transform. + 2. Set up model parallel and optimizer, which were skipped in setup + 3. Load weights and optimizer state dict + 4. Set up `finalize_model_grads` from mcore. + """ super().apply_transform(trainer) self.trainable_params = set( name for name, param in trainer.lightning_module.named_parameters() if param.requires_grad @@ -205,6 +216,10 @@ def apply_transform(self, trainer): ) def adapter_key_filter(self, key: str) -> bool: + """ + Given a key in the state dict, return whether the key is an adapter (or base model). + This function can be subclassed in each PEFT method class. + """ return key in self.trainable_params or ".adapter." in key or key.endswith(".adapters") @@ -241,6 +256,36 @@ def __init__(self, to_wrap: nn.Module, adapter: nn.Module): self.to_wrap = to_wrap self.adapter = adapter + def base_linear_forward(self, x): + """ + Run the forward method of the linear module `to_wrap`. 
+ Return a tuple of three elements: linear_output, bias, layernorm_output + + x -> [layernorm/identity] -> layernorm_output -> [linear] -> linear_output, bias + + layernorm_output is different from input x only when linear layer is LayerNormColumnParallelLinear. + """ + linear_output = self.to_wrap(x) + assert isinstance( + linear_output, tuple + ), f"{self.to_wrap} should return a tuple but instead returns {linear_output}" + """ Four cases for the wrapped module's return values + 1. nothing: (out, None) + 2. return_bias: (out, bias) + 2. return_layernorm_output: ((out, ln_out), None) + 3. both: (out, bias, ln_out) + """ + bias = None + layernorm_output = x + if len(linear_output) == 2: + linear_output, bias = linear_output + if isinstance(linear_output, tuple) and len(linear_output) == 2: + linear_output, layernorm_output = linear_output + elif len(linear_output) == 3: + linear_output, bias, layernorm_output = linear_output + + return linear_output, bias, layernorm_output + def state_dict(self, destination=None, prefix='', keep_vars=False): """Retrieve the state dictionary of the wrapped module and adapter. @@ -295,33 +340,38 @@ def sharded_state_dict( sharded_state_dict.update(self.adapter.sharded_state_dict(f"{prefix}adapter.", sharded_offsets, metadata)) return sharded_state_dict - def load_state_dict(self, state_dict, strict=True): - """Load a state dictionary into the wrapped module and adapter. - This method overrides the default load_state_dict behavior to handle - loading states for both the main module and the adapter. +class WrappedAdapterIO(_WrappingCheckpointIO, AsyncCompatibleCheckpointIO): + """ + A wrapper class for checkpoint I/O operations, specifically designed for PEFT (Parameter-Efficient Fine-Tuning). - Args: - state_dict (dict): The state dictionary to load. - strict (bool): Whether to strictly enforce that the keys in state_dict - match the keys returned by this module's state_dict() - function. Defaults to True. - """ - # Check if the 'adapters' key is present in the state_dict - if 'adapters' in state_dict: - adapter_state_dict = state_dict.pop('adapters') - else: - adapter_state_dict = {} + This class handles the complexities of saving and loading checkpoints for both initial PEFT training and resuming + PEFT training. It ensures that only the necessary adapter weights are saved and loaded, while also preserving the + base model weights. - # Load the main module state dict - self.to_wrap.load_state_dict(state_dict, strict) + **Usage:** - # Load the adapter module state dict if present - if adapter_state_dict: - self.adapter.load_state_dict(adapter_state_dict, strict) + 1. **Initial PEFT Training:** + - The class handles the saving of only adapter weights. + - Metadata about the base model checkpoint is stored for future reference. + 2. **PEFT Resume:** + - The class loads both base model and adapter weights. + - The previously stored metadata is used to locate the correct base model checkpoint. + + **Attributes:** + + - `peft`: The PEFT instance associated with the wrapped checkpoint I/O. + - `model_ckpt_path`: The path to the base model checkpoint. + - `adapter_ckpt_path`: The path to the adapter checkpoint. + Note that the paths are set by save/load functions and users do not need to set them. + + **Methods:** + + - `save_checkpoint`: Saves the adapter weights and metadata to the specified path. + - `load_checkpoint`: Loads the base model and adapter weights based on the specified path and metadata. 
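    **Sketch (illustrative, not the exact implementation):** when saving, only tensors that
    pass `PEFT.adapter_key_filter` are kept. Assuming `full_state_dict` is the wrapped model's
    complete state dict, the selection roughly amounts to:

        adapter_state = {
            k: v for k, v in full_state_dict.items()
            if ".adapter." in k or k.endswith(".adapters")  # plus explicitly trainable params
        }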
+ """ -class WrappedAdapterIO(_WrappingCheckpointIO, AsyncCompatibleCheckpointIO): peft: Optional[PEFT] = None model_ckpt_path: Optional[Path] = None adapter_ckpt_path: Optional[Path] = None @@ -374,7 +424,8 @@ def load_checkpoint( As such, this function will be entered twice during PEFT training resume. For the FIRST TIME this function is called by trainer._checkpoint_connector._restore_modules_and_callbacks. - `path = AdapterPath(, base_model_path=)`, and sharded_state_dict contains only base model weights + `path = AdapterPath(, base_model_path=)`, and sharded_state_dict contains only base + model weights For the SECOND TIME this function is called by PEFT.apply_transform (above, in the same file). `path = PosixPath()`, and sharded_state_dict contains only adapter weights. diff --git a/tests/collections/llm/gpt_finetuning.py b/tests/collections/llm/gpt_finetuning.py index 7eaa7744729c..91796585bf96 100644 --- a/tests/collections/llm/gpt_finetuning.py +++ b/tests/collections/llm/gpt_finetuning.py @@ -94,8 +94,8 @@ def get_args(): ), ) - if args.peft == 'lora': - peft = llm.peft.LoRA() + if args.peft in ['lora', 'dora']: + peft = llm.peft.PEFT_STR2CLS[args.peft]() else: peft = None From f3f200b8707475db03a163428e485dd99d64c8c5 Mon Sep 17 00:00:00 2001 From: lilyw97 Date: Wed, 20 Nov 2024 04:20:02 +0800 Subject: [PATCH 24/48] Profiling - support Chakra & Kineto trace dumping (#11115) * Support chakra trace dumping by cfg Signed-off-by: Lily Wang remove the manual recording of process::init Signed-off-by: Lily Wang 1. Remove unnecessary kineto config 2. Fix typo Signed-off-by: Lily Wang Change warning to exception when nsys is enabled with chakra profiling Signed-off-by: Lily Wang * Apply isort and black reformatting Signed-off-by: pablo-garay * fix bug in identifying profiling start step Signed-off-by: Lily Wang * Update baseline Signed-off-by: lilyw97 * [1]remove unused import [2]switch to use isinstance instead of type() [3]move torch.profiling to function Signed-off-by: Lily Wang * Apply isort and black reformatting Signed-off-by: lilyw97 --------- Signed-off-by: Lily Wang Signed-off-by: pablo-garay Signed-off-by: lilyw97 Signed-off-by: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Co-authored-by: Lily Wang Co-authored-by: Pablo Garay Co-authored-by: pablo-garay Co-authored-by: lilyw97 Co-authored-by: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> --- .secrets.baseline | 2 +- nemo/core/classes/modelPT.py | 107 ++++++++++++++++++++++++++++++++++- 2 files changed, 105 insertions(+), 4 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 1e4832b40075..09fc7a78a6ca 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -1933,7 +1933,7 @@ "filename": "tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb", "hashed_secret": "80903ddedcf4ec0a2ee5911cefa7e1ad52419dcc", "is_verified": false, - "line_number": 989 + "line_number": 990 } ], "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb": [ diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 308f5cbb8bee..88ff47caf8c2 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -211,6 +211,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self._memory_profile_started = False self._memory_profile_complete = False + # Setup chakra profiling if it has been enabled in the model config + self._setup_chakra_profiling() + + # A flag for the profile generation + self._chakra_profile_in_progress = False + def 
__init_subclass__(cls) -> None: cls._save_restore_connector = SaveRestoreConnector() @@ -1744,6 +1750,78 @@ def update_save_restore_connector(cls, save_restore_connector): else: setattr(cls, '_save_restore_connector', save_restore_connector) + def _setup_chakra_profiling(self): + """Enables chakra profiling + To use, add the following options to the model config: + ## Chakra profiling options + chakra_profile: + enabled: False + start_step: 2 # Global batch to start profiling + end_step: 2 # Global batch to end profiling + warmup_steps: 0 # Global batch to start profiling + active_steps: 1 # Global batch to start profiling + trace_dir: None # Path to store the profile output file + """ + if self.cfg.get('chakra_profile', None) is not None: + if self.cfg.chakra_profile.get('enabled', False): + + from torch.profiler import ExecutionTraceObserver + from nemo.utils.env_var_parsing import get_envint + + self._chakra_profile_enabled = True + self._chakra_profile_start_step = self.cfg.chakra_profile.get('start_step', 0) + self._chakra_profile_end_step = self.cfg.chakra_profile.get('end_step', 0) + trace_dir = self.cfg.chakra_profile.get('trace_dir', None) + + if trace_dir is None or not os.path.isdir(trace_dir): + raise ValueError(f'chakra profile output path ({trace_dir}) is not set or does not exist.') + + trace_dir = Path(trace_dir) + warmup_steps = self.cfg.chakra_profile.get('warmup_steps', 0) + active_steps = self.cfg.chakra_profile.get('active_steps', 1) + + job_id = get_envint("SLURM_JOB_ID", 0) + + self._chakra_trace_dir = trace_dir / f'{job_id}_chakra' + self._kineto_trace_dir = trace_dir / f'{job_id}_kineto' + + self._chakra_trace_dir.mkdir(parents=True, exist_ok=True) + self._kineto_trace_dir.mkdir(parents=True, exist_ok=True) + + if isinstance(self._chakra_profile_start_step, int): + logging.info(f'chakra profiling setup with start_step: {self._chakra_profile_start_step}') + else: + raise ValueError( + f'chakra start_step must be of type int. Found: {type(self._chakra_profile_start_step)}' + ) + + if isinstance(self._chakra_profile_end_step, int): + logging.info(f'chakra profiling setup with end_step: {self._chakra_profile_end_step}') + else: + raise ValueError( + f'chakra end_step must be of type int. Found: {type(self._chakra_profile_end_step)}' + ) + + if self._chakra_profile_end_step >= self._chakra_profile_start_step: + pass + else: + raise ValueError(f'chakra end_step must be greater than or equal to chakra start_step') + + if self.cfg.nsys_profile.get('enabled', False): + raise Exception( + f"Profiler conflict: Chakra profiling and Nsys profiling cannot be enabled at the same time." + ) + + self._et = ExecutionTraceObserver() + self._prof = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + schedule=torch.profiler.schedule(wait=0, warmup=warmup_steps, active=active_steps), + execution_trace_observer=self._et, + ) + def _setup_profiling(self): """Enables nsys profiling To use, add the following optoins to the model config: @@ -1848,11 +1926,22 @@ def on_train_start(self): def on_train_batch_start(self, batch: Any, batch_idx: int, unused: int = 0) -> Optional[int]: """PyTorch Lightning hook: https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-start - We use it here to enable nsys profiling and dynamic freezing. + We use it here to enable profiling and dynamic freezing. 
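# Minimal standalone sketch of the Chakra (execution trace) + Kineto dumping pattern wired up
# above. It assumes a recent PyTorch that ships torch.profiler.ExecutionTraceObserver; the
# output file names and the single-step schedule are placeholders, not NeMo defaults.
import torch
from torch.profiler import ExecutionTraceObserver, ProfilerActivity, profile, schedule

et = ExecutionTraceObserver()
et.register_callback("chakra_rank0.json")  # where the Chakra execution trace is written

activities = [ProfilerActivity.CPU] + ([ProfilerActivity.CUDA] if torch.cuda.is_available() else [])
prof = profile(
    activities=activities,
    schedule=schedule(wait=0, warmup=0, active=1),
    execution_trace_observer=et,
)
prof.start()
torch.matmul(torch.randn(256, 256), torch.randn(256, 256))  # stand-in for one training step
prof.step()
prof.stop()
prof.export_chrome_trace("kineto_rank0.json")  # Kineto trace, viewable in chrome://tracing
et.unregister_callback()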
""" - - # nsys profiling if self.device.type == 'cuda': + if hasattr(self, '_chakra_profile_enabled'): + if self._chakra_profile_enabled and not self._chakra_profile_in_progress: + if ( + self.trainer.global_step >= self._chakra_profile_start_step + and self.trainer.global_step <= self._chakra_profile_end_step + ): + logging.info( + f"====== Start chakra profiling from global_step {self.trainer.global_step} ======" + ) + self._et.register_callback(str(self._chakra_trace_dir / f'rank-{get_rank()}.json')) + self._prof.start() + self._chakra_profile_in_progress = True + if hasattr(self, '_nsys_profile_enabled'): if self._nsys_profile_enabled and not self._nsys_profile_started: if batch_idx >= self._nsys_profile_start_step and get_rank() in self._nsys_profile_ranks: @@ -1898,6 +1987,18 @@ def on_train_batch_end(self, outputs, batch: Any, batch_idx: int, unused: int = """ if self.device.type == 'cuda': + if hasattr(self, '_chakra_profile_enabled'): + # self.trainer.global_step is increaeasd before on_train_batch_end + if self._chakra_profile_enabled and self._chakra_profile_in_progress: + if self.trainer.global_step - 1 >= self._chakra_profile_end_step: + logging.info(f"====== End chakra profiling at global_step {self.trainer.global_step} ======") + self._prof.stop() + self._prof.export_chrome_trace(str(self._kineto_trace_dir / f'rank-{get_rank()}.json')) + self._et.unregister_callback() + self._chakra_profile_in_progress = False + elif self.trainer.global_step - 1 >= self._chakra_profile_start_step: + self._prof.step() + if hasattr(self, '_nsys_profile_enabled'): if self._nsys_profile_enabled and not self._nsys_profile_complete: if batch_idx >= self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks: From 7d3253769fa9560dfb23d8336f31f72fab02443c Mon Sep 17 00:00:00 2001 From: Huiying Date: Tue, 19 Nov 2024 12:23:45 -0800 Subject: [PATCH 25/48] NeMo 2.0 SFT PEFT notebooks (#10874) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * nemo2-sft notebook initial draft Signed-off-by: HuiyingLi * remove mixtral info Signed-off-by: HuiyingLi * minor fixes Signed-off-by: HuiyingLi * minor fixes Signed-off-by: HuiyingLi * minor fixes Signed-off-by: HuiyingLi * add import_ckpt script and minor changes Signed-off-by: HuiyingLi * Random read for tarr files in lhotse dataloaders (#10536) * Random read for tarr files in lhotse dataloaders Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Solve failled tests Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Adding a testcase Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Some changs in tests Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * removing import Signed-off-by: Nune --------- Signed-off-by: Nune Signed-off-by: nune-tadevosyan Co-authored-by: nune-tadevosyan * training code for hybrid-autoregressive inference model (#10841) * training code for hybrid-autoregressive inference model Signed-off-by: Hainan Xu * Apply isort and black reformatting Signed-off-by: hainan-xv --------- Signed-off-by: Hainan Xu Signed-off-by: hainan-xv Co-authored-by: Hainan Xu Co-authored-by: hainan-xv * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 772faca ! 
(#10871) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * Use trainer.local_rank/global_rank (#10860) * fix global_rank calculation Signed-off-by: Alexandros Koumparoulis * use trainer's global/local rank Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis * remove stacking operation from batched functions (#10524) * remove stacking operations Signed-off-by: lilithgrigoryan * fixes im base class Signed-off-by: lilithgrigoryan * clean up Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * remove potentially uninitialized local variable Signed-off-by: lilithgrigoryan * restore batch_intilize states funcname Signed-off-by: lilithgrigoryan * fix typo Signed-off-by: lilithgrigoryan * fix potentially uninitialized local variable Signed-off-by: lilithgrigoryan * fix potentially uninitialized local variable in stateless transduser Signed-off-by: lilithgrigoryan * fix test Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * fix docstring, rm comment Signed-off-by: lilithgrigoryan * fix dosctrings Signed-off-by: lilithgrigoryan --------- Signed-off-by: lilithgrigoryan Signed-off-by: lilithgrigoryan Co-authored-by: lilithgrigoryan Co-authored-by: lilithgrigoryan * [NeMo-UX] Add llm.generate to nemo.collections.llm (#10471) * Add llm.generate Signed-off-by: Hemil Desai * Remove comment Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Fix launching with python Signed-off-by: Hemil Desai * PR feedback Signed-off-by: Hemil Desai * PR feedback Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add assert cp Signed-off-by: Hemil Desai * Add example script Signed-off-by: Hemil Desai * Fix Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai * Adding support for LightningDataModule inside Fabric-API (#10879) * Make FabricMegatronMixedPrecision match MegatronMixedPrecision Signed-off-by: Marc Romeijn * Apply isort and black reformatting Signed-off-by: marcromeyn * Supporting DataModule in fabric-API Signed-off-by: Marc Romeijn * Adding support for LightningDataModule inside Fabric-API Signed-off-by: Marc Romeijn * Apply isort and black reformatting Signed-off-by: marcromeyn * Remove import in mock.py Signed-off-by: Marc Romeijn --------- Signed-off-by: Marc Romeijn Signed-off-by: marcromeyn Co-authored-by: marcromeyn * initial draft Signed-off-by: smajumdar * Initial local run Signed-off-by: smajumdar * Initial local run Signed-off-by: smajumdar * Initial local run Signed-off-by: smajumdar * Initial local run Signed-off-by: smajumdar * Save yaml config for model in nemo.lightning.io (#10765) * Save yaml config for model in nemo.lightning.io Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Fix bug Signed-off-by: Hemil Desai * Fix bug Signed-off-by: Hemil Desai * fix bug Signed-off-by: Hemil Desai * Add explicit yaml comparison Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * relax test Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai * Move collectiob.nlp imports inline for t5 (#10877) * Move collectiob.nlp imports inline for t5 Signed-off-by: Marc Romeyn * Apply 
isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: marcromeyn * add world_size/pp_size runtime check (#10842) * add world_size/pp_size runtime check Signed-off-by: Alexandros Koumparoulis * fix msg precision Signed-off-by: Alexandros Koumparoulis * fix test_init_parallel_ranks ws=3 pp=3 Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis * fix peft resume (#10887) Signed-off-by: Chen Cui * Update engine build step for TRT-LLM 0.13.0 (#10880) * Setting use_fused_mlp for TRT-LLM >= 0.13.0 Signed-off-by: Jan Lasek * Unused import removal Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek * Akoumparouli/nemo ux moe loss logging (#10128) * Move across pipeline loss reduction to a separate function Signed-off-by: Alexandros Koumparoulis * Add support for MoE loss logging Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * remove unused function Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * enable vboost and set LM SM margin (#10853) * enable vboost Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * env vars Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * add perf plugin Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * revert default executor Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * fix typo Signed-off-by: Jimmy Zhang * fix more typo Signed-off-by: Jimmy Zhang * ln margin knob Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * specify lm margin Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Signed-off-by: Jimmy Zhang Signed-off-by: JimmyZhang12 Co-authored-by: malay-nagda Co-authored-by: Jimmy Zhang Co-authored-by: JimmyZhang12 * use _get_extra_te_kwargs_meta in fabric (call mcore's _get_extra_te_k… (#10608) * use _get_extra_te_kwargs_meta in fabric (call mcore's _get_extra_te_kwargs & overwrite device) Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Use torch sdpa implementation in ASR mha (#9590) * use pytorch sdpa Signed-off-by: WoodieDudy * sdpa work Signed-off-by: WoodieDudy * Apply isort and black reformatting Signed-off-by: titu1994 * sdpa flag to false & sdpa_backend arg Signed-off-by: WoodieDudy * Apply isort and black reformatting Signed-off-by: WoodieDudy * change arg name Signed-off-by: WoodieDudy * Apply isort and black reformatting Signed-off-by: WoodieDudy * fix config args Signed-off-by: WoodieDudy * Apply isort and black reformatting Signed-off-by: WoodieDudy * add condition on version Signed-off-by: WoodieDudy * Apply isort and black reformatting Signed-off-by: WoodieDudy * update condition on version Signed-off-by: WoodieDudy * remove condition on torch version Signed-off-by: WoodieDudy * Apply isort and black reformatting Signed-off-by: WoodieDudy * move code to init 
Signed-off-by: WoodieDudy * Apply isort and black reformatting Signed-off-by: WoodieDudy * refactor Signed-off-by: WoodieDudy * Apply isort and black reformatting Signed-off-by: WoodieDudy * refactor Signed-off-by: WoodieDudy --------- Signed-off-by: WoodieDudy Signed-off-by: titu1994 Signed-off-by: WoodieDudy Co-authored-by: Somshubra Majumdar Co-authored-by: titu1994 Co-authored-by: WoodieDudy Co-authored-by: Nithin Rao * Add registry to register all needed classes with artifacts in nemo.lightning.io (#10861) * Add registry to register all needed classes with artifacts in nemo.lightning.io Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Fixes Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Fix Signed-off-by: Hemil Desai * comments Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Apply isort and black reformatting Signed-off-by: artbataev * Remove cyclic import Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: artbataev Co-authored-by: hemildesai Co-authored-by: artbataev * call __post_init__ after altering config values (#10885) * call __post_init__ after altering config values Signed-off-by: Alexandros Koumparoulis * test fix Signed-off-by: Alexandros Koumparoulis * turn off SP Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis * Nemo 2.0 ckpt support in TRT-LLM export (#10891) * fix minor import bug Signed-off-by: Onur Yilmaz * Add registry to register all needed classes with artifacts in nemo.lightning.io Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Fixes Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Fix Signed-off-by: Hemil Desai * nemo 2.0 support in export to trt-llm Signed-off-by: Onur Yilmaz * get mixing from main Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * fix style Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: oyilmaz-nvidia Co-authored-by: Hemil Desai Co-authored-by: hemildesai Co-authored-by: oyilmaz-nvidia * [Docs] Fix doc warnings, focus on feature and multimodal sections (#10171) * various simple docs source fixes Signed-off-by: Elena Rastorgueva * fix docstrings and typing with forward reference Signed-off-by: Elena Rastorgueva * Apply isort and black reformatting Signed-off-by: erastorgueva-nv * fix typing forward reference for PromptedAudioToTextLhotseDataset Signed-off-by: Elena Rastorgueva * fix feature warnings Signed-off-by: yaoyu-33 * Try fix some model part errors Signed-off-by: yaoyu-33 * try add requirements Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * try add requirements Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix indent in docstring Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * update Signed-off-by: yaoyu-33 * handle duplicate issue Signed-off-by: yaoyu-33 * handle duplicate issue Signed-off-by: yaoyu-33 * fix imagen cite * fix ratio issues Signed-off-by: yaoyu-33 * fix Dreambooth Signed-off-by: yaoyu-33 * Fix activation recomputation Signed-off-by: yaoyu-33 * fix sequence packing Signed-off-by: yaoyu-33 
* fix asr_language_modeling_and_customization Signed-off-by: yaoyu-33 * fixes wip Signed-off-by: Huiying Li --------- Signed-off-by: Elena Rastorgueva Signed-off-by: erastorgueva-nv Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Signed-off-by: Huiying Li Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: Elena Rastorgueva Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Co-authored-by: erastorgueva-nv Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: yaoyu-33 Co-authored-by: Ao Tang Co-authored-by: Huiying Li * calculate step time batch end-batch end (#10202) * log step time at end Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * use nemo logging Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * cleanup Signed-off-by: Malay Nagda * check remove Signed-off-by: Malay Nagda * delta timing callback Signed-off-by: Malay Nagda * comment and name change Signed-off-by: Malay Nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Co-authored-by: malay-nagda * late import prettytable (#10912) Signed-off-by: Maanu Grover * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 0d89fc4 ! (#10919) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Warning for missing FP8 checkpoint support for vLLM deployment (#10906) Signed-off-by: Jan Lasek * Add lhotse fixes for rnnt model training and WER hanging issue with f… (#10821) * Add lhotse fixes for rnnt model training and WER hanging issue with f… (#10787) * Add lhotse fixes for rnnt model training and WER hanging issue with fuse batching Signed-off-by: Nithin Rao Koluguri * Apply isort and black reformatting Signed-off-by: nithinraok --------- Signed-off-by: Nithin Rao Koluguri Signed-off-by: nithinraok Co-authored-by: Nithin Rao Koluguri Co-authored-by: nithinraok * Apply isort and black reformatting Signed-off-by: nithinraok * Apply isort and black reformatting Signed-off-by: artbataev --------- Signed-off-by: Nithin Rao Koluguri Signed-off-by: nithinraok Signed-off-by: artbataev Co-authored-by: nithinraok Co-authored-by: artbataev * Fix ASR tests (#10794) * Make tests required Signed-off-by: Vladimir Bataev * Debug torch.load issue Signed-off-by: Vladimir Bataev * Apply isort and black reformatting Signed-off-by: artbataev * Run only necessary tests Signed-off-by: Vladimir Bataev * Try fix loading Signed-off-by: Vladimir Bataev * Apply isort and black reformatting Signed-off-by: artbataev * Avoid caching fixture Signed-off-by: Vladimir Bataev * Try restore model several times Signed-off-by: Vladimir Bataev * Try customize temporary directory Signed-off-by: Vladimir Bataev * Apply isort and black reformatting Signed-off-by: artbataev * Reorder tests Signed-off-by: Vladimir Bataev * Disable one test Signed-off-by: Vladimir Bataev * Apply isort and black reformatting Signed-off-by: artbataev * Avoid xxlarge model Signed-off-by: Vladimir Bataev * Disable test Signed-off-by: Vladimir Bataev * Revert changes Signed-off-by: Vladimir Bataev * Apply isort and black reformatting Signed-off-by: artbataev * Magic fix Signed-off-by: Vladimir Bataev * Revert unnecessary changes Signed-off-by: Vladimir Bataev * Clean up Signed-off-by: Vladimir Bataev * Disable all jobs except L0 Signed-off-by: Vladimir Bataev * RNNT alignments - merge with unit tests Signed-off-by: Vladimir Bataev * Fix CUDA graph 
frame-looping decoder to handle non-CUDA inputs Signed-off-by: Vladimir Bataev * Fix config Signed-off-by: Vladimir Bataev * Log test results Signed-off-by: Vladimir Bataev * Apply isort and black reformatting Signed-off-by: artbataev * Use less audio files for tests Signed-off-by: Vladimir Bataev --------- Signed-off-by: Vladimir Bataev Signed-off-by: artbataev Co-authored-by: artbataev * Integrating mcore export (#10238) * Integrating mcore export * Integrating mcore export * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * Move trt imports in nemo.collections.llm inside respective functions (#10234) Signed-off-by: Hemil Desai * Add tests for LazyNeMoIterator and fix case with metadata_only=True and offsets in manifest (#10198) * Add tests for LazyNeMoIterator and fix case with manifest_only=True and offsets in manifest Signed-off-by: Piotr Żelasko * Address code review Signed-off-by: Piotr Żelasko * fix tests Signed-off-by: Piotr Żelasko * fix tests Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * [NeMo-UX] Fix a serialization bug that prevents users from moving checkpoints (#9939) * perfor serialization using relative paths to allow users to move checkpoints after they're saved Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * remove unused import Signed-off-by: ashors1 * fix artifact load Signed-off-by: ashors1 * fix path artifact Signed-off-by: ashors1 * remove unused import Signed-off-by: ashors1 --------- Signed-off-by: ashors1 Signed-off-by: ashors1 Co-authored-by: ashors1 * Add MemoryProfileCallback (#10166) * Add MemoryProfileCallback Signed-off-by: Shriya Palsamudram * Apply isort and black reformatting Signed-off-by: ShriyaPalsamudram * Remove reference cycles, save snapshot on specific ranks Signed-off-by: Shriya Palsamudram * Remove unnecessary imports Signed-off-by: Shriya Palsamudram * Apply isort and black reformatting Signed-off-by: ShriyaPalsamudram * Update docstring Signed-off-by: Shriya Palsamudram --------- Signed-off-by: Shriya Palsamudram Signed-off-by: ShriyaPalsamudram Signed-off-by: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Co-authored-by: ShriyaPalsamudram * Lower bound transformers to support nemotron (#10240) Signed-off-by: Dong Hyuk Chang Co-authored-by: Dong Hyuk Chang * [Audio] SSL Pretraining framework for flow-matching model for audio processing (#10052) Flow matching generative model with SSL pretraining framework Signed-off-by: Pin-Jui Ku Co-authored-by: Kuray107 * Revert torchrun fix for model import (#10251) Signed-off-by: Alexandros Koumparoulis * [NeMo-UX[ Move nemotron imports inline (#10255) * Move nemotron transformers + tokenizer imports inline to reduce number of required deps Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: marcromeyn * Wrap CPU model init with megatron_lazy_init_context (#10219) * Wrap CPU model init with megatron_lazy_init_context Signed-off-by: Alexandros Koumparoulis * Cleanup checkpoint-dir if saving fails Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Bump `Dockerfile.ci` (2024-08-22) (#10227) * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 124bcff ! 
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * fix bert flags Signed-off-by: Oliver Koenig --------- Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Oliver Koenig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * salm export trtllm (#10245) Signed-off-by: slyne deng Co-authored-by: slyne deng * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to ef85bc9 ! (#10250) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 01ca03f ! (#10266) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * Load model in the target export precision by default in PTQ (#10267) * Load model in the target export precision by default Signed-off-by: Jan Lasek * Enable megatron_amp_O2=true to actually use half-precision Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek Signed-off-by: Jan Lasek * Add WandbPlugin, NsysPlugin and PreemptionPlugin to nemo.lightning.run.plugins (#10223) * Add WandbPlugin, NsysPlugin and PreemptionPlugin to nemo.lightning.run.plugins Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Remove duplicate Signed-off-by: Hemil Desai * Add entity to wandb logger Signed-off-by: Hemil Desai * Add documentation Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add warning Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * PR feedback Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add comments Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai * [NeMo-UX] Handle absolute logger directories in nemo_logger (#10259) * handle absolute and relative logger directories Signed-off-by: Anna Shors * merge lines Signed-off-by: ashors1 --------- Signed-off-by: Anna Shors Signed-off-by: ashors1 * Add sdxl notebook (#10139) * Add sdxl notebook Signed-off-by: mingyuanm * Rename Signed-off-by: mingyuanm * final Update SDXL notebook Signed-off-by: mingyuanm --------- Signed-off-by: mingyuanm * Updating some coments * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * Updating some coments * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * Updating some coments * Small change * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * ADD support for layernorm1p * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * Update Dockerfile.ci Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Dockerfile.ci Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Dockerfile.ci Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> --------- Signed-off-by: shanmugamr1992 Signed-off-by: Hemil Desai Signed-off-by: Piotr Żelasko Signed-off-by: ashors1 Signed-off-by: ashors1 Signed-off-by: Shriya Palsamudram Signed-off-by: ShriyaPalsamudram Signed-off-by: Shriya Rishab 
<69161273+ShriyaPalsamudram@users.noreply.github.com> Signed-off-by: Dong Hyuk Chang Signed-off-by: Pin-Jui Ku Signed-off-by: Alexandros Koumparoulis Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Signed-off-by: akoumpa Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Oliver Koenig Signed-off-by: slyne deng Signed-off-by: oliver könig Signed-off-by: Jan Lasek Signed-off-by: Jan Lasek Signed-off-by: hemildesai Signed-off-by: Anna Shors Signed-off-by: mingyuanm Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: shanmugamr1992 Co-authored-by: Hemil Desai Co-authored-by: Piotr Żelasko Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com> Co-authored-by: ashors1 Co-authored-by: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Co-authored-by: ShriyaPalsamudram Co-authored-by: Dong Hyuk Chang Co-authored-by: Dong Hyuk Chang Co-authored-by: Kuray107 Co-authored-by: Kuray107 Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: Marc Romeyn Co-authored-by: marcromeyn Co-authored-by: akoumpa Co-authored-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Co-authored-by: Slyne Deng Co-authored-by: slyne deng Co-authored-by: Jan Lasek Co-authored-by: hemildesai Co-authored-by: Ming <111467530+Victor49152@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy * Fix artifact saving (#10914) Signed-off-by: Hemil Desai * Lora improvement (#10918) * pull out freeze model Signed-off-by: Chen Cui * add wildcard match to lora target modules Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui * Huvu/t5 nemo2.0 peft (#10916) * adding peft test and cicd * add setting mcore model to train in peft.py * adding test for T5 lora * fix follow Chen's fix * restore cicd-main.yml --------- Co-authored-by: Huy Vu2 * Add tie_word_embeddings=True (#10710) Signed-off-by: Yoshi Suhara * Use a context-manager when opening files (#10895) * Use a context-manager when opening files Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Apply isort and black reformatting Signed-off-by: artbataev --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: artbataev Co-authored-by: akoumpa Co-authored-by: artbataev * long context performance numbers in doc (#10784) * long context perf Signed-off-by: Youngeun Kwon * update the long context perf Signed-off-by: Youngeun Kwon * Akoumparouli/mcore microbatch calculator fix (#10780) * move tests/lightning/{,_}io Signed-off-by: Alexandros Koumparoulis * add microbatch calculator context manager Signed-off-by: Alexandros Koumparoulis * use microbatch calculator context manager Signed-off-by: Alexandros Koumparoulis * add on_load_checkpoint test to ValidateModelRestoration; use ctx manager to reconfigure microbatch calculator; update save/restore path; add cleanup step at the end Signed-off-by: Alexandros Koumparoulis * remove unused var Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa Signed-off-by: Youngeun Kwon * remove 8x3b recipes (#10764) * remove 8x3b recipes Signed-off-by: Alexandros Koumparoulis * remove 8x3b from test_nemo_run Signed-off-by: 
Alexandros Koumparoulis * rm from __init__ Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Youngeun Kwon * change the figure file name Signed-off-by: Youngeun Kwon * Accommodating the reviewer's comment Signed-off-by: Youngeun Kwon * update the y-axis title Signed-off-by: Youngeun Kwon * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 3f90b98 ! (#10789) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Signed-off-by: Youngeun Kwon * Add ModelOpt transformer model pruning example for Llama models, default to llama3.1-8b-base (#10294) * Add ModelOpt transformer model pruning example for Llama3 model Signed-off-by: Shengliang Xu * Apply isort and black reformatting Signed-off-by: shengliangxu Signed-off-by: Shengliang Xu * examples code is at wrong dir, move them Signed-off-by: Shengliang Xu * changes as suggested in comment remove some logging and unused config code, update example model to llama3.1 Signed-off-by: Shengliang Xu * Add pruning of hidden_size into example Signed-off-by: Shengliang Xu * Apply isort and black reformatting Signed-off-by: shengliangxu Signed-off-by: Shengliang Xu * Update examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Add pruning test to cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --------- Signed-off-by: Shengliang Xu Signed-off-by: shengliangxu Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: shengliangxu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Youngeun Kwon * Update mamba.rst after dist ckpt addition (#10800) Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Signed-off-by: Youngeun Kwon * fix chunked infer (#10581) Signed-off-by: stevehuang52 Signed-off-by: Youngeun Kwon * fix state transform (#10728) Signed-off-by: Chen Cui Signed-off-by: Youngeun Kwon * use ckpt_to_weights_subdir in restore (#10786) * use ckpt_to_weights_subdir in restore Signed-off-by: Alexandros Koumparoulis * make ckpt_to_{weight,context}_subdir idempotent Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa Signed-off-by: Youngeun Kwon * Mixtral set seq_length=4k (#10704) * enable SP & set seq_lenght=4k Signed-off-by: Alexandros Koumparoulis * update test expected values Signed-off-by: Alexandros Koumparoulis * 8x22b 4k Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Youngeun Kwon * Fix for crashes with tensorboard_logger=false and VP + LoRA (#10792) * Fix for crashes with tensorboard_logger=false and virtual pipeline parallel + LoRA Signed-off-by: 
Valerie Sarge * Apply isort and black reformatting Signed-off-by: vysarge --------- Signed-off-by: Valerie Sarge Signed-off-by: vysarge Co-authored-by: vysarge Signed-off-by: Youngeun Kwon * Disable checkpoint conversion inside AutoResume (#10645) * Disable checkpoint conversion inside AutoResume Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Update resume docstrings Signed-off-by: Hemil Desai * fix Signed-off-by: Hemil Desai * add default finetuning recipe and refactor llama3 8b recipe Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * address comment Signed-off-by: Chen Cui * refactor other recipes Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * remove 8x3b finetuning recipe for now because HF version not available Signed-off-by: Chen Cui * add copyright header Signed-off-by: Chen Cui * adjust unit tests based on recipe fixes Signed-off-by: Chen Cui * fix failed unit test Signed-off-by: Chen Cui --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: hemildesai Co-authored-by: Chen Cui Co-authored-by: cuichenx Signed-off-by: Youngeun Kwon * replace png file to github assets Signed-off-by: Youngeun Kwon * change image url to github release Signed-off-by: Youngeun Kwon --------- Signed-off-by: Youngeun Kwon Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Shengliang Xu Signed-off-by: shengliangxu Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Signed-off-by: stevehuang52 Signed-off-by: Chen Cui Signed-off-by: Valerie Sarge Signed-off-by: vysarge Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: cuichenx Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: akoumpa Co-authored-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Co-authored-by: Shengliang Xu <106840466+shengliangxu@users.noreply.github.com> Co-authored-by: shengliangxu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Chen Cui Co-authored-by: Valerie Sarge Co-authored-by: vysarge Co-authored-by: Hemil Desai Co-authored-by: hemildesai Co-authored-by: cuichenx * perf recipes and Mcore DistOpt params (#10883) * 175b gpt3 recipe Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * dist opt params Signed-off-by: Malay Nagda * 405b dist opt params Signed-off-by: Malay Nagda * perf recipes and dist opt params Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * MoE dist opt params Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * gpt bias fusion params Signed-off-by: Malay Nagda * 175b recipe Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * perf params comments Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * MoE perf params comments Signed-off-by: Malay Nagda * Apply isort and black reformatting 
Signed-off-by: malay-nagda * perf recipes suffix Signed-off-by: Malay Nagda * specific models fusion params Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Co-authored-by: malay-nagda * ci: Fix cherry pick team (#10945) Signed-off-by: Oliver Koenig * Packed sequence bug fixes (#10898) * save prepared dataset to different folders according to tokenizer name Signed-off-by: Chen Cui * fix hang Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * Apply isort and black reformatting Signed-off-by: artbataev * fix hang Signed-off-by: Chen Cui * raise mbs>1 error and provide suggestion to user instead of automatically changing config Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * add ci for packed seq Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * fix bug Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Signed-off-by: artbataev Co-authored-by: cuichenx Co-authored-by: artbataev * Fix requirements for MacOS (#10930) Signed-off-by: Vladimir Bataev * Fix nemo 2.0 recipes (#10915) * Fix recipe num_nodes and long context docstring * Fix typo * Fix PP issue * Fix unit test * Change recipes * fix test * Fix unit tests * Fix recipes * Add general legal test on parallelization settings * Rename test * Apply isort and black reformatting Signed-off-by: BoxiangW --------- Signed-off-by: BoxiangW Co-authored-by: BoxiangW * Akoumparouli/nemo ux fix dir or string artifact (#10936) * Add __repr__ to Artifact Signed-off-by: Alexandros Koumparoulis * nemo.lightning.io.artifact: represent strings as fdl.Config to avoid path adjustment during restoration Signed-off-by: Alexandros Koumparoulis * t5 test minification Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * ckpt convert bug fixes (#10878) * Mistral-NeMo-12B recipe Signed-off-by: Alexandros Koumparoulis * rename mistral to mistral_7b Signed-off-by: Alexandros Koumparoulis * include mistral_nemo_12b in __init__ Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * add to __init__ Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Remove stale imports Signed-off-by: Alexandros Koumparoulis * TP=2 Signed-off-by: Alexandros Koumparoulis * remove finetune_reci[e Signed-off-by: Alexandros Koumparoulis * Rename MistralNeMo2407Config12B to MistralNeMoConfig12B per review's suggestion Signed-off-by: Alexandros Koumparoulis * update config names in tests Signed-off-by: Alexandros Koumparoulis * mistral-nemo-12b from llama_8b Signed-off-by: Alexandros Koumparoulis * TP=2; SP=True Signed-off-by: Alexandros Koumparoulis * fix overlap value Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * update mistral-nemo-base-12b finetune recipe Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * bug fix Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * remove extra file Signed-off-by: dimapihtar * remove extra changes Signed-off-by: dimapihtar * revert changes Signed-off-by: dimapihtar * add ckpt_format configurable Signed-off-by: dimapihtar * Apply isort and 
black reformatting Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: artbataev * revert changes Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: dimapihtar Signed-off-by: dimapihtar Signed-off-by: artbataev Co-authored-by: Alexandros Koumparoulis Co-authored-by: akoumpa Co-authored-by: dimapihtar Co-authored-by: artbataev * fix typo in docstring (#10955) Signed-off-by: ashors1 * remove deprecated ci tests (#10922) * remove deprecated tutorial Signed-off-by: dimapihtar * remove deprecated ci tests Signed-off-by: dimapihtar * add deprecation note Signed-off-by: dimapihtar * add deprecation note Signed-off-by: dimapihtar * remove bart tests Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar * [Nemo CICD] Remove deprecated tests (#10960) * remove deprecated tutorial Signed-off-by: dimapihtar * remove deprecated ci tests Signed-off-by: dimapihtar * add deprecation note Signed-off-by: dimapihtar * add deprecation note Signed-off-by: dimapihtar * remove bart tests Signed-off-by: dimapihtar * Remove deleted CI tests --------- Signed-off-by: dimapihtar Signed-off-by: Pablo Garay Co-authored-by: dimapihtar * Adithyare/oai chat completion (#10785) * updates Signed-off-by: adithyare * open ai chat completion wip Signed-off-by: adithyare * responding with model responses Signed-off-by: adithyare * Apply isort and black reformatting Signed-off-by: arendu * also support general completion Signed-off-by: adithyare * Apply isort and black reformatting Signed-off-by: arendu --------- Signed-off-by: adithyare Signed-off-by: arendu Co-authored-by: arendu * Update megatron_t5_pretraining.py (#10952) Signed-off-by: Huy Vu <86480512+huvunvidia@users.noreply.github.com> * Convert perf plugin env vars to strings (#10947) Signed-off-by: Hemil Desai * disable dynamo for ddp checker (#10961) Signed-off-by: Alexandros Koumparoulis * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to db7d37b ! 
(#10965) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * Mistral-NeMo-12B recipe (#10607) * Mistral-NeMo-12B recipe Signed-off-by: Alexandros Koumparoulis * rename mistral to mistral_7b Signed-off-by: Alexandros Koumparoulis * include mistral_nemo_12b in __init__ Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * add to __init__ Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Remove stale imports Signed-off-by: Alexandros Koumparoulis * TP=2 Signed-off-by: Alexandros Koumparoulis * remove finetune_reci[e Signed-off-by: Alexandros Koumparoulis * Rename MistralNeMo2407Config12B to MistralNeMoConfig12B per review's suggestion Signed-off-by: Alexandros Koumparoulis * update config names in tests Signed-off-by: Alexandros Koumparoulis * mistral-nemo-12b from llama_8b Signed-off-by: Alexandros Koumparoulis * TP=2; SP=True Signed-off-by: Alexandros Koumparoulis * fix overlap value Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * update mistral-nemo-base-12b finetune recipe Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Make nemo text processing optional in TTS (#10584) * move TN guard to better location; make guard print error message rather than throwing error Signed-off-by: Jason * Apply isort and black reformatting Signed-off-by: blisc * Forgot to add the actual normalizer Signed-off-by: Jason * Apply isort and black reformatting Signed-off-by: blisc --------- Signed-off-by: Jason Signed-off-by: blisc Co-authored-by: blisc * respect warnings' filters (#10953) * respect warnings' filters Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Update T5 tokenizer (adding additional tokens to tokenizer config) (#10972) * initial commit * restore t5_pretraining * Apply isort and black reformatting Signed-off-by: huvunvidia --------- Signed-off-by: huvunvidia Co-authored-by: Huy Vu2 Co-authored-by: huvunvidia * Alit/mamba recipe (#10935) * add some mamba recipe * add 130m * add the rest of the recipes * add tokenizer * add tokenizer * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * add fixes to ssm for nemorun recipes * add hybrid tokenizer * updating some recipes * Apply isort and black reformatting Signed-off-by: JRD971000 * remove comments * update gbs * fix ckpt resume * fix ckpt resume * fix ckpt resume * update recipes final * Apply isort and black reformatting Signed-off-by: JRD971000 * remove redundant imports * ckpt convertor dtype fix * Apply isort and black reformatting Signed-off-by: JRD971000 --------- Signed-off-by: JRD971000 Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: JRD971000 * Long context performance doc hot fix (#10946) * long context perf Signed-off-by: Youngeun Kwon * update the long context perf Signed-off-by: Youngeun Kwon * Akoumparouli/mcore microbatch calculator fix (#10780) * move tests/lightning/{,_}io Signed-off-by: Alexandros Koumparoulis * add microbatch calculator context manager 
Signed-off-by: Alexandros Koumparoulis * use microbatch calculator context manager Signed-off-by: Alexandros Koumparoulis * add on_load_checkpoint test to ValidateModelRestoration; use ctx manager to reconfigure microbatch calculator; update save/restore path; add cleanup step at the end Signed-off-by: Alexandros Koumparoulis * remove unused var Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa Signed-off-by: Youngeun Kwon * remove 8x3b recipes (#10764) * remove 8x3b recipes Signed-off-by: Alexandros Koumparoulis * remove 8x3b from test_nemo_run Signed-off-by: Alexandros Koumparoulis * rm from __init__ Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Youngeun Kwon * change the figure file name Signed-off-by: Youngeun Kwon * Accommodating the reviewer's comment Signed-off-by: Youngeun Kwon * update the y-axis title Signed-off-by: Youngeun Kwon * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 3f90b98 ! (#10789) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Signed-off-by: Youngeun Kwon * Add ModelOpt transformer model pruning example for Llama models, default to llama3.1-8b-base (#10294) * Add ModelOpt transformer model pruning example for Llama3 model Signed-off-by: Shengliang Xu * Apply isort and black reformatting Signed-off-by: shengliangxu Signed-off-by: Shengliang Xu * examples code is at wrong dir, move them Signed-off-by: Shengliang Xu * changes as suggested in comment remove some logging and unused config code, update example model to llama3.1 Signed-off-by: Shengliang Xu * Add pruning of hidden_size into example Signed-off-by: Shengliang Xu * Apply isort and black reformatting Signed-off-by: shengliangxu Signed-off-by: Shengliang Xu * Update examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Add pruning test to cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --------- Signed-off-by: Shengliang Xu Signed-off-by: shengliangxu Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: shengliangxu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Youngeun Kwon * Update mamba.rst after dist ckpt addition (#10800) Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Signed-off-by: Youngeun Kwon * fix chunked infer (#10581) Signed-off-by: stevehuang52 Signed-off-by: Youngeun Kwon * fix state transform (#10728) Signed-off-by: Chen Cui Signed-off-by: Youngeun Kwon * use ckpt_to_weights_subdir in restore (#10786) * use ckpt_to_weights_subdir in restore Signed-off-by: Alexandros 
Koumparoulis * make ckpt_to_{weight,context}_subdir idempotent Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa Signed-off-by: Youngeun Kwon * Mixtral set seq_length=4k (#10704) * enable SP & set seq_lenght=4k Signed-off-by: Alexandros Koumparoulis * update test expected values Signed-off-by: Alexandros Koumparoulis * 8x22b 4k Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Youngeun Kwon * Fix for crashes with tensorboard_logger=false and VP + LoRA (#10792) * Fix for crashes with tensorboard_logger=false and virtual pipeline parallel + LoRA Signed-off-by: Valerie Sarge * Apply isort and black reformatting Signed-off-by: vysarge --------- Signed-off-by: Valerie Sarge Signed-off-by: vysarge Co-authored-by: vysarge Signed-off-by: Youngeun Kwon * Disable checkpoint conversion inside AutoResume (#10645) * Disable checkpoint conversion inside AutoResume Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Update resume docstrings Signed-off-by: Hemil Desai * fix Signed-off-by: Hemil Desai * add default finetuning recipe and refactor llama3 8b recipe Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * address comment Signed-off-by: Chen Cui * refactor other recipes Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * remove 8x3b finetuning recipe for now because HF version not available Signed-off-by: Chen Cui * add copyright header Signed-off-by: Chen Cui * adjust unit tests based on recipe fixes Signed-off-by: Chen Cui * fix failed unit test Signed-off-by: Chen Cui --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: hemildesai Co-authored-by: Chen Cui Co-authored-by: cuichenx Signed-off-by: Youngeun Kwon * replace png file to github assets Signed-off-by: Youngeun Kwon * change image url to github release Signed-off-by: Youngeun Kwon * hot fix on table style Signed-off-by: Youngeun Kwon --------- Signed-off-by: Youngeun Kwon Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Shengliang Xu Signed-off-by: shengliangxu Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Signed-off-by: stevehuang52 Signed-off-by: Chen Cui Signed-off-by: Valerie Sarge Signed-off-by: vysarge Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: cuichenx Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: akoumpa Co-authored-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Co-authored-by: Shengliang Xu <106840466+shengliangxu@users.noreply.github.com> Co-authored-by: shengliangxu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Chen Cui Co-authored-by: Valerie Sarge Co-authored-by: vysarge Co-authored-by: Hemil Desai Co-authored-by: hemildesai Co-authored-by: cuichenx * Performance mode (#10926) * llama3 
performance mode Signed-off-by: Malay Nagda * llama3 performance mode tests Signed-off-by: Malay Nagda * mixtral performance mode Signed-off-by: Malay Nagda * remove unused Signed-off-by: Malay Nagda * nemotron perf mode Signed-off-by: Malay Nagda * 405b, 174b perf mode Signed-off-by: Malay Nagda * perf mode comment Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Signed-off-by: malay-nagda Co-authored-by: malay-nagda * Add flux inference pipeline (#10752) * Vae added and matched flux checkpoint Signed-off-by: mingyuanm * Flux model added. Signed-off-by: mingyuanm * Copying FlowMatchEulerScheduler over Signed-off-by: mingyuanm * WIP: Start to test the pipeline forward pass Signed-off-by: mingyuanm * Vae added and matched flux checkpoint Signed-off-by: mingyuanm * Inference pipeline runs with offloading function Signed-off-by: mingyuanm * Start to test image generation Signed-off-by: mingyuanm * Decoding with VAE part has been verified. Still need to check the denoising loop. Signed-off-by: mingyuanm * The inference pipeline is verified. Signed-off-by: mingyuanm * Add arg parsers and refactoring Signed-off-by: mingyuanm * Tested on multi batch sizes and prompts. Signed-off-by: mingyuanm * Add headers Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Renaming Signed-off-by: mingyuanm * Move shceduler to sampler folder Signed-off-by: mingyuanm * Merging folders. Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Tested after path changing. Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Move MMDIT block to NeMo Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Add joint attention and single attention to NeMo Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Joint attention updated Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Remove redundant importing Signed-off-by: mingyuanm * Refactor to inherit megatron module Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 --------- Signed-off-by: mingyuanm Signed-off-by: Victor49152 Co-authored-by: Victor49152 * Add assertion for always save nemo add model parallel size (#10690) * Add assertion for always save nemo add model parallel size Signed-off-by: Boxiang Wang * Add assertions Signed-off-by: Boxiang Wang * Fix typo Signed-off-by: Boxiang Wang * Apply isort and black reformatting Signed-off-by: BoxiangW * Revert nemo_model_checkpoint.py changes Signed-off-by: Boxiang Wang * Add test Signed-off-by: Boxiang Wang * Fix typo * Fix test bug Signed-off-by: Boxiang Wang * Fix test Signed-off-by: Boxiang Wang --------- Signed-off-by: Boxiang Wang Signed-off-by: BoxiangW Co-authored-by: BoxiangW * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 563d5d1 ! 
(#10979) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * Reflect CLI change nemorun -> nemo (#10443) Signed-off-by: Marc Romeijn Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> * minor fix (#10990) Co-authored-by: Ali Taghibakhshi * Fixed sampler override and audio_key in prepare_audio_data (#10980) Signed-off-by: Ante Jukić * Add more recipes (#10957) * add recipes Signed-off-by: Chen Cui * adjust finetuning recipe Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx * Fix parallel_embedding (#10975) * Upgrade transformers (#10854) Signed-off-by: Chen Cui * Add support and recipes for HF models via AutoModelForCausalLM (#10962) * initial hf_lit_module Signed-off-by: Alexandros Koumparoulis * make sft gpt dataset sanity check optional Signed-off-by: Alexandros Koumparoulis * HF sft example Signed-off-by: Alexandros Koumparoulis * Rename HfLitModule to HfAutoModel Signed-off-by: Alexandros Koumparoulis * update default model id Signed-off-by: Alexandros Koumparoulis * move rank&world_size as params Signed-off-by: Alexandros Koumparoulis * fix mbs in example Signed-off-by: Alexandros Koumparoulis * fix for fsdp and logger Signed-off-by: Alexandros Koumparoulis * make loss_fn configurable Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * remove optim from HfAutoModel Signed-off-by: Alexandros Koumparoulis * add pytorch native optim Signed-off-by: Alexandros Koumparoulis * add hfAutoModel pretrain nemorun recipe Signed-off-by: Alexandros Koumparoulis * remove debug Signed-off-by: Alexandros Koumparoulis * remove stale imports Signed-off-by: Alexandros Koumparoulis * remove stale import Signed-off-by: Alexandros Koumparoulis * rm stale imports Signed-off-by: Alexandros Koumparoulis * rm stale imports Signed-off-by: Alexandros Koumparoulis * tokenizer fix Signed-off-by: Alexandros Koumparoulis * update example Signed-off-by: Alexandros Koumparoulis * rename pytorch_adam to pytorch_adam_with_cosine_annealing Signed-off-by: Alexandros Koumparoulis * small refactor Signed-off-by: Alexandros Koumparoulis * fix no_weight_decay_cond Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * switch to flat_lr optim for example Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * remove imports & update docstrings Signed-off-by: Alexandros Koumparoulis * add a tokenizer setter to allow it to work with nemo/collections/llm/api.py::_use_tokenizer Signed-off-by: Alexandros Koumparoulis * remove unused import Signed-off-by: Alexandros Koumparoulis * allow loss_mask to be none Signed-off-by: Alexandros Koumparoulis * Add HF-dataset lightning module Signed-off-by: Alexandros Koumparoulis * check if pad_token_id is None Signed-off-by: Alexandros Koumparoulis * rename hf_lit_module.py to hf_auto_model.py Signed-off-by: Alexandros Koumparoulis * class rename Signed-off-by: Alexandros Koumparoulis * rename Signed-off-by: Alexandros Koumparoulis * update example Signed-off-by: Alexandros Koumparoulis * HfAutoModelForCausalLM Signed-off-by: Alexandros Koumparoulis * rm stale import Signed-off-by: Alexandros Koumparoulis * add option to start with random weights Signed-off-by: 
Alexandros Koumparoulis * add check in megatron-strategy Signed-off-by: Alexandros Koumparoulis * rename param Signed-off-by: Alexandros Koumparoulis * drop mcore sampler from squadmodule Signed-off-by: Alexandros Koumparoulis * make megatron_sampler optional in HfDatasetDataModule Signed-off-by: Alexandros Koumparoulis * copyright Signed-off-by: Alexandros Koumparoulis * use is_hf_model to mark hf classes Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * ci: Update tests (#10987) * ci: Re-enable `L0_Unit_Tests_GPU_Lightning` Signed-off-by: Oliver Koenig * ci: Disable `L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2` Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 425cdd4 ! (#11001) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * gpt3 175b cli (#10985) * gpt3 175b cli Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * Apply isort and black reformatting Signed-off-by: malay-nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Co-authored-by: malay-nagda * Fix for crash with LoRA + tp_overlap_comm=false + sequence_parallel=true (#10920) * Add fusion defaults for llama2 Signed-off-by: Valerie Sarge * Alter ParallelLinearAdapter condition to account for tp_comm_overlap=false Signed-off-by: Valerie Sarge * Apply isort and black reformatting Signed-off-by: vysarge * Clean up unneeded defaults Signed-off-by: Valerie Sarge * gpt3 175b cli Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * Apply isort and black reformatting Signed-off-by: vysarge --------- Signed-off-by: Valerie Sarge Signed-off-by: vysarge Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Signed-off-by: Eric Harper Co-authored-by: vysarge Co-authored-by: Malay Nagda Co-authored-by: malay-nagda Co-authored-by: Eric Harper * llm.generate fixes (#10983) * fix context path, disable optimizer init, add tp Signed-off-by: HuiyingLi * format Signed-off-by: HuiyingLi * address comments, require user to provide trainer Signed-off-by: HuiyingLi * minor fix Signed-off-by: HuiyingLi * minor fixes Signed-off-by: HuiyingLi --------- Signed-off-by: HuiyingLi * use __dict__ in check (#11012) * check is_hf_model in leaf module Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * disable getattr alternative path Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * undo; Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * LoRA support for HF::AutoModelForCausalLM (#10982) * add LinearAdapter Signed-off-by: Alexandros Koumparoulis * add hf lora example Signed-off-by: Alexandros Koumparoulis * remove unused imports Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * subclass mixin Signed-off-by: Alexandros Koumparoulis * remove stale imports Signed-off-by: Alexandros Koumparoulis * undo Signed-off-by: Alexandros Koumparoulis * fix scale Signed-off-by: Alexandros Koumparoulis * regex selector for peft 
Signed-off-by: Alexandros Koumparoulis * move lora Signed-off-by: Alexandros Koumparoulis * fmt Signed-off-by: Alexandros Koumparoulis * hf_auto_model_for_causal_lm finetune recipe Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Change default for always_save_context to True (#11014) Signed-off-by: Abhishree Co-authored-by: Pablo Garay * Add a build option to load_context (#10713) * Add a build option to load_context Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis * Adding test Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis * Trying to fix failing CPU test Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis * cherry-pick fix Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis Co-authored-by: Alexandros Koumparoulis * Fix pip install (#11026) * Move AutoTokenizer inline Signed-off-by: Marc Romeyn * Move einops to common requirements Signed-off-by: Marc Romeyn * Move AutoTokenizer import to top-level again in fine_tuning Signed-off-by: Marc Romeyn * Move megatron init inside nemo.lightning Signed-off-by: Marc Romeyn * Make megatron_lazy_init_context work when transformer-engine is not installed Signed-off-by: Marc Romeyn * Only import get_nmt_tokenizer when needed Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: marcromeyn * [WIP] Add docs for NEST SSL (#10804) * add docs Signed-off-by: stevehuang52 * update doc and fix missing param Signed-off-by: stevehuang52 --------- Signed-off-by: stevehuang52 * Change dist ckpt defaults (#10913) * Enable ckpt features by default (async ckpt), ckpt every 15mins and reduce preemption time to 1min Signed-off-by: Shriya Palsamudram * fix ssm tests Signed-off-by: Shriya Palsamudram * Make note that ckpt_async_save is disabled for SSMs Signed-off-by: Shriya Palsamudram * Enable async ckpt for SSMs with fix Signed-off-by: Shriya Palsamudram * Disable async ckpt in the peft test as it is a known bug, add note. Signed-off-by: Shriya Palsamudram * Fix failing unit tests Signed-off-by: Shriya Palsamudram * Ashors/peft async ckpt (#11010) * [WIP] prototype for supporting async checkpointing with peft Signed-off-by: ashors1 Signed-off-by: Shriya Palsamudram * Enable async ckpt for the peft test Signed-off-by: Shriya Palsamudram * Fix peft setup test Signed-off-by: Shriya Palsamudram --------- Signed-off-by: Shriya Palsamudram Signed-off-by: ashors1 Co-authored-by: ataghibakhsh * Akoumparouli/mixtral recipe fix r2.0.0 (#10994) * Mixtral TP8 EP1 Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Fix _strategy_lib tests (#11033) * fix world size and don't mock Signed-off-by: Maanu Grover * cleanup global state Signed-off-by: Maanu Grover * check app state instead Signed-off-by: Maanu Grover * fix syntax nemo logger test Signed-off-by: Maanu Grover --------- Signed-off-by: Maanu Grover * Update `BaseMegatronSampler` for compatibility with PTL's `_BatchProgress` (#11016) * Revert "[NeMo-UX] Use custom `BatchProgress` class which does not restore states (#10383)" This reverts commit b5798ded9f27168db9d7d77cbe4f9da80bf49268. 
* make megatron sampler return the total number of batches in the dataset Signed-off-by: ashors1 --------- Signed-off-by: ashors1 * PTQ example for NeMo 2.0 (#10642) * initial commit Signed-off-by: Piotr Kaminski * create Quantizer for NeMo 2.0 Signed-off-by: Piotr Kaminski * refactor Signed-off-by: Piotr Kaminski * Call quantize on an unwrapped mcore model Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Add tests, adjust unwrapping Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * fix export Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Apply isort and black reformatting Signed-off-by: artbataev * Fix output_path argument for HF import Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> * fix fabric ckpt loading Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * code review suggestions Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * remove unused import Signed-off-by: Piotr Kaminski * use cnn dataset in github ci Signed-off-by: Piotr Kaminski * applied code review Signed-off-by: Piotr Kaminski * code review changes Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * simplify interface for data iterator Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * (partial) PP fix Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Signed-off-by: artbataev Co-authored-by: Piotr Kaminski Co-authored-by: Laplasjan107 Co-authored-by: artbataev * TDT compute timestamps option and Extra Whitespace handling for SPE (#10875) * add token duration Signed-off-by: monica-sekoyan * revert rnnt change Signed-off-by: monica-sekoyan * add remove_extra_whitespaces arg to spe tokenizer Signed-off-by: monica-sekoyan * add token duration retrieval Signed-off-by: monica-sekoyan * add ignore_extra_whitespace to spe Signed-off-by: monica-sekoyan * add compute_timestamp support for tdt Signed-off-by: monica-sekoyan * fix config field name Signed-off-by: monica-sekoyan * add refinement for tdt timestamps Signed-off-by: monica-sekoyan * add segments timestamp support and refinement for ctc Signed-off-by: monica-sekoyan * modify tests for ctc decoding timestamps Signed-off-by: monica-sekoyan * add rnnt timestamp tests Signed-off-by: monica-sekoyan * updated doc Signed-off-by: monica-sekoyan * fix in test Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan * fix of unicode char Signed-off-by: monica-sekoyan * fix rnnt_decoding test Signed-off-by: monica-sekoyan * workaround for tesst tokenizer Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan * modify segments formation Signed-off-by: monica-sekoyan * modify segments for ctc Signed-off-by: monica-sekoyan * fix in ctc refinement Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan * minor changes Signed-off-by: monica-sekoyan * reverse offset change Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan * warning mode=once Signed-off-by: monica-sekoyan * Apply 
isort and black reformatting Signed-off-by: monica-sekoyan * make ignore_extrawhitespaces false Signed-off-by: monica-sekoyan * minor changes Signed-off-by: monica-sekoyan * adjust changes to the tests Signed-off-by: monica-sekoyan * modify prompt_formatter tests Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan --------- Signed-off-by: monica-sekoyan Signed-off-by: monica-sekoyan Co-authored-by: monica-sekoyan * Basic online dynamic FP8 quantization with vLLM (#10904) * Basic online dynamic quantization with vLLM Signed-off-by: Jan Lasek * Apply isort and black reformatting Signed-off-by: janekl * vllm 0.6.3 updates Signed-off-by: Jan Lasek * Pass quantization param in deploy_vllm_triton.py script Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek Signed-off-by: janekl Co-authored-by: janekl * ci: Improve VM maintenance (#10758) * ci: Improve VM maintenance Signed-off-by: Oliver Koenig * rename stuff Signed-off-by: Oliver Koenig * title Signed-off-by: Oliver Koenig * use team Signed-off-by: Oliver Koenig * run on failure too Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * yrdy Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig * Expose packed seq in finetuning recipes (#11006) * add packed seq recipes Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * extraneous prefix Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx Co-authored-by: Pablo Garay * PEFT Inference (#11030) * adapter inference first commit Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * Fix yaml serialization Signed-off-by: Hemil Desai * add copyright header Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * revert accidental commit Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Signed-off-by: Hemil Desai Co-authored-by: cuichenx Co-authored-by: Hemil Desai Co-authored-by: Pablo Garay * ci: Switch to CPU only runner (#11035) * ci: Switch to CPU only runner Signed-off-by: Oliver Koenig * runner Signed-off-by: Oliver Koenig * Raise EnvironmentError when CUDA is unavailable to prevent cuda-python failure Signed-off-by: Vladimir Bataev * Fix error message Signed-off-by: Vladimir Bataev * update Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * finalize Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig Signed-off-by: Vladimir Bataev Co-authored-by: Vladimir Bataev * added Lhotse online augmentation tutorial for SE (#10944) Signed-off-by: Rauf * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 397e9da ! 
(#11051) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * ci: Send team alerts on specific keywords (#10986) * ci: Send team alerts on specific keywords Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig * Fix timestamps tests (#11053) * change timestamps tests Signed-off-by: Monica Sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan --------- Signed-off-by: Monica Sekoyan Signed-off-by: monica-sekoyan Co-authored-by: monica-sekoyan * Qwen2 Recipe (#10974) * Add qwen recipe * Apply isort and black reformatting Signed-off-by: suiyoubi * change to TP1 for small models Signed-off-by: Ao Tang --------- Signed-off-by: suiyoubi Signed-off-by: Ao Tang Co-authored-by: suiyoubi * Add copyright notice (#11050) * Add copyright notice * Apply isort and black reformatting Signed-off-by: pablo-garay --------- Signed-off-by: pablo-garay Co-authored-by: pablo-garay * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to d357c18 ! (#11054) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * add deprecation note (#11024) * add deprecation note Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Signed-off-by: dimapihtar Co-authored-by: dimapihtar * Generalizing Inference pipeline in NeMo 2.0 to support encoder-decoder models (#10924) * initial commit * adding example t5_generate.py * workable inference code * updating code * update cpde * workable solution for T5 tokenizer (we add 100 sentinel tokens when initializing tokenizer throug setting config, instead of adding after initialization) * separate autokenizer's changes to another PR * cleaning code * addressing Marc's comments * addressing Marc's reviews * update code after merge * small fix * Apply isort and black reformatting Signed-off-by: huvunvidia --------- Signed-off-by: huvunvidia Co-authored-by: Huy Vu2 Co-authored-by: root Co-authored-by: huvunvidia * [Bug fix] use default_factory to provide default value in dataclasses in energon MultiModalSampleConfig (#11041) Signed-off-by: Guyue Huang Co-authored-by: Guyue Huang * Update ModelOpt Width Pruning example defaults (#10902) * update width pruning example defaults Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update Dockerfile.ci Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Undo CI version update Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --------- Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * fix: Resolve mutable default issue in MultiModalSampleConfig dataclass (#11061) * SC1/SC2 Recipe (#10971) * Add recipe for sc1/sc2 * Apply isort and black reformatting Signed-off-by: suiyoubi * typo fixed --------- Signed-off-by: suiyoubi Co-authored-by: suiyoubi * Add copyright notice (#11066) * Add copyright notice * Add copyright notice * Wrap batch_sampler with _IndexBatchSamplerWrapper (#10934) * wrap batch_sampler Signed-off-by: Farhad Ramezanghorbani * Apply isort and black reformatting Signed-off-by: farhadrgh * pass dataloader mode * Apply isort and black reformatting Signed-off-by: farhadrgh * pass dataloader mode Signed-off-by: Farhad Ramezanghorbani * pass dataloader mode 
Signed-off-by: Farhad Ramezanghorbani * resolve conflict Signed-off-by: Farhad Ramezanghorbani * change import Signed-off-by: Farhad Ramezanghorbani --------- Signed-off-by: Farhad Ramezanghorbani Signed-off-by: farhadrgh Signed-off-by: Chen Cui Co-authored-by: Chen Cui * Performance fine-tuning recipes for llama3 8b + 70b (#11046) * llama3 finetuning perf recipes progress capture Signed-off-by: Valerie Sarge * Small syntax fix Signed-off-by: Valerie Sarge * syntax Signed-off-by: Valerie Sarge * Apply isort and black reformatting Signed-off-by: vysarge * Correct ddp setting Signed-off-by: Valerie Sarge * Fix hasattr check Signed-off-by: Valerie Sarge * bf16 grad Signed-off-by: Valerie Sarge * Update configs for 8b + 70b Signed-off-by: Valerie Sarge * Set wgrad_deferral_limit Signed-off-by: Valerie Sarge --------- Signed-off-by: Valerie Sarge Signed-off-by: vysarge Co-authored-by: vysarge * Set TE spec name for NeMo to HF checkpoint converters (#11036) * Set TE spec name for NeMo to HF checkpoint converters Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Apply isort and black reformatting Signed-off-by: kevalmorabia97 * Update convert_falcon_nemo_to_hf.py Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --------- Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: kevalmorabia97 Co-authored-by: kevalmorabia97 * ci: Re-add secrets detector (#11038) Signed-off-by: Oliver Koenig * switch to NeMo 2.0 recipes (#10948) * switch to NeMo 2.0 recipes Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * add NeMo 2.0 recipe support Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * fix unit tests Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * fix auto conf ci test Signed-off-by: dimapihtar * remove unused imports Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * fix stlye Signed-off-by: dimapihtar * fix typo Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Signed-off-by: dimapihtar Co-authored-by: dimapihtar * Adding nemo-run recipes for NeMo 2.0 T5 (#10964) * initial commit * adding all recipes and tests * add tests for 220m, 3b, 11b; runnable with example_nemorun_t5.py * restore test_llama3_70b.py * remove unused libraties * remove unused libraries * address Ali's comments * address Ali's comments * edit test_t5_3b.py * Apply isort and black reformatting Signed-off-by: huvunvidia --------- Signed-off-by: huvunvidia Co-authored-by: Huy Vu2 Co-authored-by: huvunvidia * Minor fixes for NeMo 2.0 PTQ (#11079) * initial commit Signed-off-by: Piotr Kaminski * cleanup Signed-off-by: Piotr Kaminski * fix fabric num nodes Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * make style coherent Signed-off-by: Piotr Kaminski --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Co-authored-by: Piotr Kaminski Co-authored-by: Laplasjan107 * Add copyright notice (#11083) * Add L2 llama test (#10824) * use llama recipe with some changes Signed-off-by: Maanu Grover * copy max_steps Signed-off-by: Maanu Grover * optional parallelisms Signed-off-by: Maanu Grover * precision options Signed-off-by: Maanu Grover * add ci job Signed-off-by: Maanu Grover * typehints Signed-off-by: Maanu Grover * fix Signed-off-by: Maanu Grover * sp arg as int Signed-off-by: Maanu Grover * change interval 
Signed-off-by: Maanu Grover
* verify checkpointing
Signed-off-by: Maanu Grover
* address CI
Signed-off-by: Maanu Grover
* doc
Signed-off-by: Maanu Grover
* fix precision verification
Signed-off-by: Maanu Grover
* add callback to verify model attributes
Signed-off-by: Maanu Grover
* extract to common
Signed-off-by: Maanu Grover
* import fixes
Signed-off-by: Maanu Grover
* fix checkpointing check
Signed-off-by: Maanu Grover
* add callback for stopping before end
Signed-off-by: Maanu Grover
* unused
Signed-off-by: Maanu Grover
* disable for now
Signed-off-by: Maanu Grover
* add distcp dir check
Signed-off-by: Maanu Grover
* change early stop timing
Signed-off-by: Maanu Grover
* update ckpts verification
Signed-off-by: Maanu Grover
* update test
Signed-off-by: Maanu Grover
* fix import
Signed-off-by: Maanu Grover
* disable for now
Signed-off-by: Maanu Grover
* fix
Signed-off-by: Maanu Grover
* use tmp dir
Signed-off-by: Maanu Grover
* add misc check for max_steps
Signed-off-by: Maanu Grover
* check end batch progress state
Signed-off-by: Maanu Grover
* more ckpt dir assertions
Signed-off-by: Maanu Grover
* disable time-based ckpting
Signed-off-by: Maanu Grover
* val takes too long
Signed-off-by: Maanu Grover
---------
Signed-off-by: Maanu Grover

* Add copyright check (#11048)
* check for copyright
* Copyright check
* Copyright check
* Copyright check
* Exclude path
* Exclude path
* account for other license types
* account for other license types
* account for other license types
* account for other license types
* Ignore __init__.py files
* Ignore __init__.py files
* Ignore __init__.py files
* Ignore __init__.py files
* Change job name

* Fix finalize model grad for PEFT (#11065)
* fix finalize model grad for PEFT
Signed-off-by: Chen Cui
* Apply isort and black reformatting
Signed-off-by: cuichenx
* change to warning
Signed-off-by: Chen Cui
* Apply isort and black reformatting
Signed-off-by: cuichenx
---------
Signed-off-by: Chen Cui
Signed-off-by: cuichenx
Co-authored-by: cuichenx

* ci: Less verbose infra alerts (#11080)
Signed-off-by: Oliver Koenig

* Add copyright notice (#11085)

* ci: Fix cron schedule (#11076)
* ci: Fix cron
Signed-off-by: Oliver Koenig
* fix
Signed-off-by: Oliver Koenig
---------
Signed-off-by: Oliver Koenig

* Export & deploy updates (part I) (#10941)
* Update vLLMExporter docstring
Signed-off-by: Jan Lasek
* No need to create empty kwargs here
Signed-off-by: Jan Lasek
* Use debug from command line
Signed-off-by: Jan Lasek
* Param save_engine for both both vLLM and TRT-LLM
Signed-off-by: Jan Lasek
* Unused backend param in run_trt_llm_inference
Signed-off-by: Jan Lasek
* Reindent files for non-existent checkpoint check
Signed-off-by: Jan Lasek
* Docs for lora_checkpoints
Signed-off-by: Jan Lasek
* Improve config readability
Signed-off-by: Jan Lasek
* Raise error directly in get_vllm_deployable
Signed-off-by: Jan Lasek
* Apply isort and black reformatting
Signed-off-by: janekl
* Revert "Reindent files for non-existent checkpoint check"
This reverts commit 8499d501fc8da8781bc7651b928946efad3ef46d.
Signed-off-by: Jan Lasek * Cut off prompt for real Signed-off-by: Jan Lasek * Apply isort and black reformatting Signed-off-by: janekl --------- Signed-off-by: Jan Lasek Signed-off-by: janekl Co-authored-by: janekl * Add doc-strings to import & export + improve logging (#11078) * Add doc-strings to import & export + improve logging Signed-off-by: Marc Romeijn * Apply isort and black reformatting Signed-off-by: marcromeyn * Apply isort and black reformatting Signed-off-by: artbataev --------- Signed-off-by: Marc Romeijn Signed-off-by: marcromeyn Signed-off-by: artbataev Co-authored-by: marcromeyn Co-authored-by: artbataev * ci: Use code-freeze via Nemo-FW-Templates (#11073) * ci: Use FW-templates Signed-off-by: Oliver Koenig * rename Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * use slack_endpoint Signed-off-by: Oliver Koenig * will this work? Signed-off-by: Oliver Koenig * SLACK_RELEASE_ENDPOINT Signed-off-by: Oliver Koenig * secret Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig * Akoumparouli/hf lit module peft ckpt bugfix (#11022) * Add sharded_state_dict Signed-off-by: Alexandros Koumparoulis * remove sharded_state_dict Signed-off-by: Alexandros Koumparoulis * fallback to state_dict if ckpt does not have sharded_state_dict Signed-off-by: Alexandros Koumparoulis * remove Optional since it was optional Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis * NeMo 1.0: upcycle dense to moe (#11002) * upcycle dense to moe Signed-off-by: Alexandros Koumparoulis * fix(?) path when saving Signed-off-by: Alexandros Koumparoulis * bot happy Signed-off-by: Alexandros Koumparoulis * bot happy #2 Signed-off-by: Alexandros Koumparoulis * add unwrap method Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * move file Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Update mcore parallelism initialization in nemo2 (#10643) * update mcore parallelism initialization Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Update megatron_init.py Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * add encoder parallel default config Signed-off-by: yaoyu-33 * Fix _strategy_lib.py Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * update megatron_init.py inside lightning Signed-off-by: yaoyu-33 * fix test Signed-off-by: yaoyu-33 * try fix test Signed-off-by: yaoyu-33 * try fix test Signed-off-by: yaoyu-33 * Fix megatron megatron_init.py dp Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * Update lightning megatron_init.py dp Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: yaoyu-33 Co-authored-by: Pablo Garay * Gemma2 in Nemo2 with Recipes (#11037) * add gemma2 in nemo2.0 and 2b recipe * Apply isort and black reformatting Signed-off-by: suiyoubi * Fix gemma1 inference bug * add more recipe * minor fix * recipe fix * Apply isort and black reformatting Signed-off-by: suiyoubi * merge fix --------- Signed-off-by: suiyoubi Co-authored-by: suiyoubi * PEFT perf and TE spec fixes (#11070) * lora perf fixes Signed-off-by: Jimmy Zhang * fix vpp Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * gpt dropout Signed-off-by: Jimmy 
Zhang * support full te layer spec Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * import fix Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * Update llama3_70b.py Signed-off-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> --------- Signed-off-by: Jimmy Zhang Signed-off-by: JimmyZhang12 Signed-off-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Jimmy Zhang Co-authored-by: JimmyZhang12 * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 213c8a2 ! (#11092) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * NeMo-UX: fix nemo-ux export path (#11081) * only make optim config if model has optim and setup_optimizers is True Signed-off-by: Alexandros Koumparoulis * pass setup_optimizers=False in nemo_load Signed-off-by: Alexandros Koumparoulis * fix backwards compatibility in load_context Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * add torch_dtype_from_mcore_config Signed-off-by: Alexandros Koumparoulis * fix hf model dtype & prune embedding size Signed-off-by: Alexandros Koumparoulis * propagate changes: mistral Signed-off-by: Alexandros Koumparoulis * propagate changes: mixtral Signed-off-by: Alexandros Koumparoulis * propagate changes: nemotron Signed-off-by: Alexandros Koumparoulis * propagate changes: qwen2 Signed-off-by: Alexandros Koumparoulis * propagate changes: startcoder Signed-off-by: Alexandros Koumparoulis * propagate changes: startcoder2 Signed-off-by: Alexandros Koumparoulis * propagate chatglm Signed-off-by: Alexandros Koumparoulis * remove commented code Signed-off-by: Alexandros Koumparoulis * rm rename Signed-off-by: Alexandros Koumparoulis * rm rename Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * NeMorun for NeMo 2.0 T5 finetuning (#11040) * workable code commit * small edits for t5_3b.py and t5_11b.py * Apply isort and black reformatting Signed-off-by: huvunvidia --------- Signed-off-by: huvunvidia Co-authored-by: Huy Vu2 Co-authored-by: huvunvidia * fix model_checkpoint.py (#11057) support the case where monitor is the only metric in checkpoint name. 
for example 'global_step=1000' Signed-off-by: Ethan He * Update PTQ tests and ModelOpt version (#11095) * Deprecate NeMo 1 PTQ tests except FP8 Signed-off-by: Jan Lasek * Convert model right before testing it for FP8 PTQ Signed-off-by: Jan Lasek * Bump modelopt version Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek * Fix TRTLLM nemo2 activation parsing (#11062) * Fix TRTLLM nemo2 activation parsing * Apply isort and black reformatting Signed-off-by: meatybobby --------- Signed-off-by: meatybobby Co-authored-by: meatybobby Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> * Fix datasets in CLI (#11097) * Fix datasets in CLI Signed-off-by: Marc Romeyn * Fix wrong import Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn * Fix wrong import Signed-off-by: Marc Romeyn --------- Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: marcromeyn * Fix yaml serialization in io mixin (#11106) * Fix yaml serialization in io mixin Signed-off-by: Hemil Desai * Add docstring Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai * disable overlap_param_gather_with_optimizer_step (#11102) * disable overlap_param_gather_with_optimizer_step Signed-off-by: Jimmy Zhang * fix comment Signed-off-by: Jieming Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * fix typo again Signed-off-by: Jieming Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 --------- Signed-off-by: Jimmy Zhang Signed-off-by: Jieming Zhang Signed-off-by: JimmyZhang12 Co-authored-by: Jimmy Zhang Co-authored-by: JimmyZhang12 * Add Packed Seq option to GPT based models (#11100) * add pack seq args/docstr Signed-off-by: Ao Tang * Apply isort and black reformatting Signed-off-by: suiyoubi * reword docstr Signed-off-by: Ao Tang * space Signed-off-by: Ao Tang --------- Signed-off-by: Ao Tang Signed-off-by: suiyoubi Co-authored-by: suiyoubi * Fix MCoreGPTModel import in llm.gpt.model.base (#11109) Signed-off-by: Hemil Desai * TP+MoE peft fix (#11114) * add is_expert option to ParallelLinearAdapter Signed-off-by: Alexandros Koumparoulis * pass is_expert to ParalleLinearAdapter Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * nemo1 to nemo2 checkpoint convert (#10937) * initial draft Signed-off-by: HuiyingLi * add .nemo loading and handle tokenizers Signed-off-by: HuiyingLi * format Signed-off-by: HuiyingLi * remove tmp dir afterwards and minor fixes Signed-off-by: HuiyingLi * add model.yaml save to context to io_dump Signed-off-by: HuiyingLi * change to cpu convert Signed-off-by: HuiyingLi * change to default bf16, add rc2 patch and logging Signed-off-by: HuiyingLi * remove torch.git.script for squared_relu for 24.09 Signed-off-by: Huiying Li * add copyright Signed-off-by: Huiying Li * add more model and minor changes Signed-off-by: Huiying Li * minor fix Signed-off-by: Huiying Li * format Signed-off-by: Huiying Li * format Signed-off-by: Huiying Li --------- Signed-off-by: HuiyingLi Signed-off-by: Huiying Li * fix expert regex filter (#11103) Signed-off-by: Alexandros Koumparoulis * Support exporting Nemotron-340B for TensorRT-LLM (#11015) Signed-off-by: Jinyang Yuan Co-authored-by: Jinyang Yuan Co-authored-by: meatybobby * GPT recipes to use full te spec (#11119) * use full te spec Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: 
JimmyZhang12 * more recipe fix Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * rm dropout_ffn, default num modes Signed-off-by: Jimmy Zhang --------- Signed-off-by: Jimmy Zhang Signed-off-by: JimmyZhang12 Co-authored-by: Jimmy Zhang Co-authored-by: JimmyZhang12 * rm rm (#11116) Signed-off-by: Alexandros Koumparoulis * NeMo-UX: Mistral/mixtral peft ci test (#11094) * add mistral/mixtral peft ci test Signed-off-by: Alexandros Koumparoulis * add mistral/mixtral peft ci test Signed-off-by: Alexandros Koumparoulis * add mistral tp2 Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * add tests to NEMO_CICD_Test Signed-off-by: Alexandros Koumparoulis * Update .github/workflows/cicd-main.yml Co-authored-by: oliver könig Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> * fix params Signed-off-by: Alexandros Koumparoulis * rm devices arg Signed-off-by: Alexandros Koumparoulis * add --dist-opt arg Signed-off-by: Alexandros Koumparoulis * add tp=2 mixtral Signed-off-by: Alexandros Koumparoulis * add ep test Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: akoumpa Co-authored-by: oliver könig * Make nemo.collections.llm PreTrainingDataModule num samples configurable (#11088) * Make nemo.collections.llm PreTrainingDataModule num samples configurable Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Fix Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add explicit method to build pretraining datamodule index mapping Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Fix Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * fix Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * PR feedback Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai * Virtual pipeline parallel support for LoRA in NLPAdapterModelMixin (#11128) * Update NLPAdapterModelMixin to handle model structure for virtual pipeline parallel + LoRA Signed-off-by: Valerie Sarge * Clean up assert guard Signed-off-by: Valerie Sarge * Clean up ValueError raise Signed-off-by: Valerie Sarge * Apply isort and black reformatting Signed-off-by: vysarge * documentation Signed-off-by: Valerie Sarge --------- Signed-off-by: Valerie Sarge Signed-off-by: vysarge Co-authored-by: vysarge * fix path (#11121) Signed-off-by: Chen Cui * Allow arguments passed to dataset class + Gemma recipe fix (#11125) * add dataset kwargs Signed-off-by: Chen Cui * update gemma recipes Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * change arguments from callers Signed-off-by: Chen Cui * address comments Signed-off-by: Chen Cui * add bos to gemma2 as well Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx * Nemotron Recipe (#11118) * add finetune recipe for nemotron * Refactor Nemotron4-22b to Nemotron3-22b, * Apply isort and black reformatting Signed-off-by: suiyoubi * optimize 22b finetune recipe --------- Signed-off-by: suiyoubi Co-authored-by: suiyoubi * NeMo-UX: HF PeFT fix (#11096) * 
add model_transform param Signed-off-by: Alexandros Koumparoulis * fix model freeze with vanilla DDPStrategy; param passing to megatronstrategy Signed-off-by: Alexandros Koumparoulis * remove stale code Signed-off-by: Alexandros Koumparoulis * make vanilla adam config a run.Config Signed-off-by: Alexandros Koumparoulis * update peft recipe for hf_auto_model_for_causal_lm Signed-off-by: Alexandros Koumparoulis * fix typo Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * fix statement condition Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * remove deprecated test (#11134) * ci: Pin release freeze (#11143) * ci: Pin release freeze Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig * Autoconfigurator test nonoptional * Fix RNN-T loss memory usage (#11144) * Fix RNN-T memory usage Signed-off-by: artbataev --------- Signed-off-by: Vladimir Bataev * Recipe Fix for NeMo CI (#11127) * rename starcoder to starcoder_15b * change sc1 TP2 -> PP2 * change sc1 to PP8 * change nemotron4 340b to TP16 * Apply isort and black reformatting Signed-off-by: suiyoubi * nemotron340b and gemma 7b fix * change to PP18 * revert TPPPCP settings for 340b * enable sp * enable sp --------- Signed-off-by: suiyoubi Signed-off-by: Ao Tang Co-authored-by: suiyoubi * vLLM Hugging Face exporter (#11124) * fix minor import bug Signed-off-by: Onur Yilmaz * Add save pretrained to HF model Signed-off-by: Onur Yilmaz * vllm hf exporter added Signed-off-by: Onur Yilmaz * pytriton function added Signed-off-by: Onur Yilmaz * update test Signed-off-by: Onur Yilmaz * Example fixed Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * Remove unused import --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Co-authored-by: oyilmaz-nvidia * update nemo args for mcore flash decode arg change (#11138) * update mcore transformer layer, block, attention forward args for flash decode Signed-off-by: Huiying Li * Apply isort and black reformatting Signed-off-by: HuiyingLi * update mcore tag in Dockerfile.ci Signed-off-by: HuiyingLi --------- Signed-off-by: Huiying Li Signed-off-by: HuiyingLi Signed-off-by: HuiyingLi Co-authored-by: HuiyingLi * Fix freeze_model call in peft (#11146) * fix freeze_model call in peft Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 3d27a9d ! 
(#11159) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * NeMo-UX: Add sgd optim (#11157) * fix Signed-off-by: Alexandros Koumparoulis * add sgd Signed-off-by: Alexandros Koumparoulis * remove stale args Signed-off-by: Alexandros Koumparoulis * remove clip grad arg Signed-off-by: Alexandros Koumparoulis * Pass optimizer as a run.Partial instead Signed-off-by: Alexandros Koumparoulis * move param extraction out of PytorchOptimizerModule Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * Add ParamsT import Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Added deprecation notice (#11133) * Added deprecation notice Signed-off-by: Ssofja * Apply isort and black reformatting Signed-off-by: Ssofja * Updated text in deprecation notice Co-authored-by: Nithin Rao Signed-off-by: Ssofja <78349198+Ssofja@users.noreply.github.com> * Updated text in deprecation notice Co-authored-by: Nithin Rao Signed-off-by: Ssofja <78349198+Ssofja@users.noreply.github.com> * Apply isort and black reformatting Signed-off-by: Ssofja * Update version Co-authored-by: Somshubra Majumdar Signed-off-by: Ssofja <78349198+Ssofja@users.noreply.github.com> * Update nemo/collections/asr/data/audio_to_text.py Co-authored-by: oliver könig Signed-off-by: Ssofja <78349198+Ssofja@users.noreply.github.com> --------- Signed-off-by: Ssofja Signed-off-by: Ssofja Signed-off-by: Ssofja <78349198+Ssofja@users.noreply.github.com> Co-authored-by: Ssofja Co-authored-by: Nithin Rao Co-authored-by: Somshubra Majumdar Co-authored-by: oliver könig * Update copyright check (#11168) * Make/move copyright check template * Adjustment for standard format * Call `ckpt_to_weights_subdir` from `MegatronCheckpointIO` (#10897) * locate weights path within MegatronCheckpointIO Signed-off-by: ashors1 * small refactor Signed-off-by: ashors1 * remove another instance of ckpt_to_weights_subdir Signed-off-by: ashors1 * move ckpt_to_weights_subdir Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: artbataev * add weights path in save_checkpoint Signed-off-by: ashors1 * fix circular import Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * handle saving in ckpt_to_weights_subdir Signed-off-by: ashors1 * fix minor typo Signed-off-by: ashors1 * bug fixes Signed-off-by: ashors1 * fix undefined variable Signed-off-by: ashors1 * move function Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * fix adapter meta file path Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * fix mixtral test Signed-off-by: ashors1 * fix mixtral test Signed-off-by: ashors1 * use function for weights subdir Signed-off-by: Chen Cui * address comments Signed-off-by: ashors1 * move asserts Signed-off-by: ashors1 * fix undefined vars Signed-off-by: ashors1 * bug fix Signed-off-by: ashors1 --------- Signed-off-by: ashors1 Signed-off-by: ashors1 Signed-off-by: artbataev Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: ashors1 Co-authored-by: artbataev Co-authored-by: Chen Cui Co-authored-by: cuichenx * Fix export of configuration parameters to 
Weights and Biases (#10995) * Fix export of configuration parameters to weights and biases Add test to ensure json serializability of hparams_init after restore Signed-off-by: Sam Oluwalana * Function was moved in recent versions Signed-off-by: Sam Oluwalana * Apply isort and black reformatting Signed-off-by: soluwalana --------- Signed-off-by: Sam Oluwalana Signed-off-by: soluwalana Co-authored-by: soluwalana * add lora recipt for 405b (#10991) * add lora recipt for 405b * Apply isort and black reformatting Signed-off-by: JRD971000 * remove pdb :D * Apply isort and black reformatting Signed-off-by: JRD971000 * Update llama31_405b.py Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> * fix typo Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> --------- Signed-off-by: JRD971000 Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: JRD971000 * Change activation parsing in TRTLLM (#11173) * Fix squared relu * Fix openai-gelu * added dit training diagrams (#10873) Signed-off-by: Zeeshan Patel * ci: Switch to FW templates for build (#11077) * ci: Switch to FW templates for build Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix image tag Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * use labels for building and cleaning Signed-off-by: Oliver Koenig * fix tag Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to af4b5de ! (#11174) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * feat: Run PyLint (#11147) Signed-off-by: Oliver Koenig * Fixes for Canary adapters tutorial (#11184) * Fixes for Canary adapters tutorial Signed-off-by: Piotr Żelasko * clear notebook output cells Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * Add Alpaca Finetune Datamodule (#11185) * Add Alpaca Datamodule * Apply isort and black reformatting Signed-off-by: suiyoubi --------- Signed-off-by: suiyoubi Co-authored-by: suiyoubi * add ipython import guard (#11191) * add ipython import guard Signed-off-by: Nithin Rao Koluguri * handle notebooks which uses vad_utils plot function: Signed-off-by: Nithin Rao Koluguri * Apply isort and black reformatting Signed-off-by: nithinraok * decrease line length Signed-off-by: Nithin Rao Koluguri * Apply isort and black reformatting Signed-off-by: nithinraok * small update to doc Signed-off-by: Nithin Rao Koluguri * Apply isort and black reformatting Signed-off-by: nithinraok --------- Signed-off-by: Nithin Rao Koluguri Signed-off-by: nithinraok Co-authored-by: nithinraok * Updated Diffusion Collection README (#11179) * updated model archs + linked data instructions in readme Signed-off-by: Zeeshan Patel * fix bullet points Signed-off-by: Zeeshan Patel * add citation Signed-off-by: Zeeshan Patel * st dit image Signed-off-by: Zeeshan Patel * fix st dit image Signed-off-by: Zeeshan Patel * update feature status Signed-off-by: Zeeshan Patel * added dit training diagrams (#10873) Signed-off-by: Zeeshan Patel * updated features Signed-off-by: Zeeshan Patel * updated image scale Signed-off-by: Zeeshan Patel * removed coming soon Signed-off-by: Zeeshan Patel --------- Signed-off-by: Zeeshan Patel * Add support for Cosmos Tokenizers (#11194) * Add cosmos tokenizer with examples and documentation 
Signed-off-by: Joseph Jennings * Apply isort and black reformatting Signed-off-by: jojennin * Disable pylint checking for tokenizer regex Signed-off-by: Joseph Jennings * Apply isort and black reformatting Signed-off-by: jojennin --------- Signed-off-by: Joseph Jennings Signed-off-by: jojennin Co-authored-by: jojennin * Run formatting only if files changed. Echo message if pylint fails. (#11188) * Run formatting only if files changed (not deleted). Echo message if pylint fails. Signed-off-by: Vladimir Bataev * Change message Signed-off-by: Vladimir Bataev * Remove space Signed-off-by: Vladimir Bataev --------- Signed-off-by: Vladimir Bataev Co-authored-by: lilithgrigoryan <38436437+lilithgrigoryan@users.noreply.github.com> * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 19515ac ! (#11196) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * download from BRANCH (#11206) Signed-off-by: monica-sekoyan * update the return type (#11210) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * Fix rotary_percentage parsing in nemo2 config (#11197) * Fix rotary_percentage parsing in nemo2 config * Add default value * ci: Update cherry pick workflow (#11202) Signed-off-by: Oliver Koenig * ci: Build, test, publish a wheel (#11183) * ci: Build, test, publish a wheel Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * add workspace Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * deployment enviroment Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * finalize Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig Co-authored-by: Pablo Garay * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to bc8c4f3 ! 
(#11222) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * update default pipeline_parallelism_type (#11213) * use pipeline dtype from precision plugin Signed-off-by: Alexandros Koumparoulis * pylint Signed-off-by: Alexandros Koumparoulis * pylint Signed-off-by: Alexandros Koumparoulis * nenikikamen Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * check actual value of vocab_file (#11228) * check actual value of vocab_file Signed-off-by: Alexandros Koumparoulis * pylint Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Fix VP Initialization Issue with Latest MCore (#11209) * docstr reformat * Apply isort and black reformatting Signed-off-by: suiyoubi --------- Signed-off-by: suiyoubi Co-authored-by: suiyoubi * ci: Run Pylint strictly on new files, softly on history (#11212) * ci: Improve PyLint handling Signed-off-by: Oliver Koenig * finalize Signed-off-by: Oliver Koenig * bot Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig * Add LLama32 Vision Model Support in Nemo 2.0 (#10763) * add initial code for llama vlm Signed-off-by: yaoyu-33 * some restructure Signed-off-by: yaoyu-33 * add mock data placeholder Signed-off-by: yaoyu-33 * Fix some importing Signed-off-by: yaoyu-33 * add language component for vlm llama * update code Signed-off-by: yaoyu-33 * now match num of params * update language part and fix vision part Signed-off-by: yaoyu-33 * minor fix Signed-off-by: yaoyu-33 * model can now init Signed-off-by: yaoyu-33 * minor update for llama32 text config Signed-off-by: yaoyu-33 * make checkpoint loading work * missing import * match vision part tensor shapes with configs Signed-off-by: yaoyu-33 * solve some fwd issues and mismatch issues Signed-off-by: yaoyu-33 * add vision import * fixes Signed-off-by: yaoyu-33 * update importer to convert both text and image weights * importer typos and reduce clutter * fix import qkv * some fixes for LLM Signed-off-by: yaoyu-33 * Add embedding * some updates Signed-off-by: yaoyu-33 * enable loading only text or only vision * add example script * TP fix Signed-off-by: yaoyu-33 * update * upload examples Signed-off-by: yaoyu-33 * update generate Signed-off-by: yaoyu-33 * update to newer version Signed-off-by: yaoyu-33 * upload for sharing * update to new pyt ckpt * xattn_caches matches (except small differences due to TE RMSNorm) * cleanup * embeddings match * match precision of weights * update sharded state dict Signed-off-by: yaoyu-33 * change xattn layer num to 3 7 11 etc * upload llama generation * minor fix * fix dummy layer input format * fix vision qkv order * fix shareded state dict Signed-off-by: yaoyu-33 * fix vision precision * fix rope * match cross attn layer * remove nrep * Remove cross attention in ImageTransformerLayer and fix _gate_ffn * PP draft Signed-off-by: yaoyu-33 * Fix intermediate tensor * temp save for pp2 is working Signed-off-by: yaoyu-33 * fix pp issues Signed-off-by: yaoyu-33 * merge * update mcore parallelism initialization Signed-off-by: yaoyu-33 * small update to pretrain script Signed-off-by: yaoyu-33 * update mcore parallelism initialization Signed-off-by: yaoyu-33 * Apply isort and black 
reformatting Signed-off-by: yaoyu-33 * added energon dataloader for neva training (#10451) * added energon dataloader for neva training * Apply isort and black reformatting Signed-off-by: yashaswikarnati * specify global batch size to support grad accumulation * adding neva pretrain example * Apply isort and black reformatting Signed-off-by: yashaswikarnati * change pretraine example to handle new ckpt reloading * fixed code quality warnings and unused imports Signed-off-by: ykarnati * minor changes for PR comments * Apply isort and black reformatting Signed-off-by: yashaswikarnati * refactor conversation template config * Apply isort and black reformatting Signed-off-by: yashaswikarnati * remove optional import --------- Signed-off-by: yashaswikarnati Signed-off-by: ykarnati Co-authored-by: yashaswikarnati (cherry picked from commit 7354740a9788ea838ae1204c84855990baef55ab) * llama energon dataloader * have tokenizer for base task encoder class * Update megatron_init.py Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * Add simple inference * evian3 update Signed-off-by: yaoyu-33 * add encoder parallel default config Signed-off-by: yaoyu-33 * add encoder parallel default config Signed-off-by: yaoyu-33 * clean up Signed-off-by: yaoyu-33 * add aspect ratio in model * support energon dataloader * some pp update Signed-off-by: yaoyu-33 * fixes Signed-off-by: yaoyu-33 * fix kv merging Signed-off-by: yaoyu-33 * fix get_key_value_tensors Signed-off-by: yaoyu-33 * rename files Signed-off-by: yaoyu-33 * update to HF style position embedding Signed-off-by: yaoyu-33 * fix energon dataloader and support batching * update forward args Signed-off-by: yaoyu-33 * clean up and move to aspect_ratio_ids Signed-off-by: yaoyu-33 * rename back to language.py Signed-off-by: yaoyu-33 * fix loss function Signed-off-by: yaoyu-33 * update and fix energon Signed-off-by: yaoyu-33 * Add hf import * Fix type * Change config * update energon pretrain Signed-off-by: yaoyu-33 * clean up * clean up * reformat Signed-off-by: yaoyu-33 * update inference files for new code * update to instruct * update to instruct * update few names Signed-off-by: yaoyu-33 * update generation Signed-off-by: yaoyu-33 * fix importer embedding.weight * few fixes Signed-off-by: yaoyu-33 * add hf script Signed-off-by: yaoyu-33 * fix kv import * remove interleaved * fixes and updates Signed-off-by: yaoyu-33 * lora fixes Signed-off-by: yaoyu-33 * some code clean ups Signed-off-by: yaoyu-33 * update training scripts Signed-off-by: yaoyu-33 * refactors Signed-off-by: yaoyu-33 * add LoRA finetuning * fixes and nemo update Signed-off-by: yaoyu-33 * fix importer registering issue by adding 11B and 90B configs * update `decoder_seq_len` Signed-off-by: yaoyu-33 * science vqa script Signed-off-by: yaoyu-33 * clean up script name Signed-off-by: yaoyu-33 * fix ckpt save serialization issue * fix predefined config classes * add num_chunks in input Signed-off-by: yaoyu-33 * fix format Signed-off-by: yaoyu-33 * update finetuning scripts for PEFT * add 11b recipe (need #10645 to test) * fix mask generation Signed-off-by: yaoyu-33 * minor fix code style Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Support no image inference * add llama svqa eval * fix masking Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix generation Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * add 90b recipe and revise 11b recipe * Apply isort and black 
reformatting Signed-off-by: cuichenx * clean up typing * add option to disable vision padding * Apply isort and black reformatting Signed-off-by: cuichenx * base model finetuning (does not work yet) * Apply isort and black reformatting Signed-off-by: cuichenx * fixed default conversation template config for MLLama * Update svqa * add multinode * bot happy * Apply isort and black reformatting Signed-off-by: cuichenx * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: artbataev * Perf improvements. Mainly from XAttn mask calculation (#10901) * Perf improvements. Mainly from XAttn mask calculation * Apply isort and black reformatting Signed-off-by: parthmannan --------- Signed-off-by: parthmannan Co-authored-by: parthmannan * fix existing issues Signed-off-by: yaoyu-33 * fix scripts Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix lora * few fixes for non image support Signed-off-by: yaoyu-33 * update masking gen Signed-off-by: yaoyu-33 * update lazy dataset Signed-off-by: yaoyu-33 * fix data sampler and loading issue Signed-off-by: yaoyu-33 * Add vlm generation * Apply isort and black reformatting Signed-off-by: meatybobby * Apply isort and black reformatting Signed-off-by: yaoyu-33 * generation update Signed-off-by: yaoyu-33 * update lazy dataset Signed-off-by: yaoyu-33 * Fix _strategy_lib.py Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix warning Signed-off-by: yaoyu-33 * hide vlm examples Signed-off-by: yaoyu-33 * Revert "Add vlm generation" This reverts commit 4711c75a Signed-off-by: yaoyu-33 * Fix VisionEncoder multi-batch bug * update mcore parallelism initialization Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Update megatron_init.py Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * add encoder parallel default config Signed-off-by: yaoyu-33 * Fix _strategy_lib.py Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * llm.generate fixes (#10983) * fix context path, disable optimizer init, add tp Signed-off-by: HuiyingLi * format Signed-off-by: HuiyingLi * address comments, require user to provide trainer Signed-off-by: HuiyingLi * minor fix Signed-off-by: HuiyingLi * minor fixes Signed-off-by: HuiyingLi --------- Signed-off-by: HuiyingLi * use __dict__ in check (#11012) * check is_hf_model in leaf module Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * disable getattr alternative path Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * undo; Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * LoRA support for HF::AutoModelForCausalLM (#10982) * add LinearAdapter Signed-off-by: Alexandros Koumparoulis * add hf lora example Signed-off-by: Alexandros Koumparoulis * remove unused imports Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * subclass mixin Signed-off-by: Alexandros Koumparoulis * remove stale imports Signed-off-by: Alexandros Koumparoulis * undo Signed-off-by: Alexandros Koumparoulis * fix scale Signed-off-by: Alexandros Koumparoulis * regex selector for peft Signed-off-by: Alexandros Koumparoulis * move lora 
Signed-off-by: Alexandros Koumparoulis * fmt Signed-off-by: Alexandros Koumparoulis * hf_auto_model_for_causal_lm finetune recipe Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Change default for always_save_context to True (#11014) Signed-off-by: Abhishree Co-authored-by: Pablo Garay * Add a build option to load_context (#10713) * Add a build option to load_context Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis * Adding test Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis * Trying to fix failing CPU test Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis * cherry-pick fix Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis Co-authored-by: Alexandros Koumparoulis * Fix pip install (#11026) * Move AutoTokenizer inline Signed-off-by: Marc Romeyn * Move einops to common requirements Signed-off-by: Marc Romeyn * Move AutoTokenizer import to top-level again in fine_tuning Signed-off-by: Marc Romeyn * Move megatron init inside nemo.lightning Signed-off-by: Marc Romeyn * Make megatron_lazy_init_context work when transformer-engine is not installed Signed-off-by: Marc Romeyn * Only import get_nmt_tokenizer when needed Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: marcromeyn * [WIP] Add docs for NEST SSL (#10804) * add docs Signed-off-by: stevehuang52 * update doc and fix missing param Signed-off-by: stevehuang52 --------- Signed-off-by: stevehuang52 * Change dist ckpt defaults (#10913) * Enable ckpt features by default (async ckpt), ckpt every 15mins and reduce preemption time to 1min Signed-off-by: Shriya Palsamudram * fix ssm tests Signed-off-by: Shriya Palsamudram * Make note that ckpt_async_save is disabled for SSMs Signed-off-by: Shriya Palsamudram * Enable async ckpt for SSMs with fix Signed-off-by: Shriya Palsamudram * Disable async ckpt in the peft test as it is a known bug, add note. Signed-off-by: Shriya Palsamudram * Fix failing unit tests Signed-off-by: Shriya Palsamudram * Ashors/peft async ckpt (#11010) * [WIP] prototype for supporting async checkpointing with peft Signed-off-by: ashors1 Signed-off-by: Shriya Palsamudram * Enable async ckpt for the peft test Signed-off-by: Shriya Palsamudram * Fix peft setup test Signed-off-by: Shriya Palsamudram --------- Signed-off-by: Shriya Palsamudram Signed-off-by: ashors1 Co-authored-by: ataghibakhsh * Akoumparouli/mixtral recipe fix r2.0.0 (#10994) * Mixtral TP8 EP1 Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Fix _strategy_lib tests (#11033) * fix world size and don't mock Signed-off-by: Maanu Grover * cleanup global state Signed-off-by: Maanu Grover * check app state instead Signed-off-by: Maanu Grover * fix syntax nemo logger test Signed-off-by: Maanu Grover --------- Signed-off-by: Maanu Grover * Update `BaseMegatronSampler` for compatibility with PTL's `_BatchProgress` (#11016) * Revert "[NeMo-UX] Use custom `BatchProgress` class which does not restore states (#10383)" This reverts commit b5798ded9f27168db9d7d77cbe4f9da80bf49268. 
* make megatron sampler return the total number of batches in the dataset Signed-off-by: ashors1 --------- Signed-off-by: ashors1 * PTQ example for NeMo 2.0 (#10642) * initial commit Signed-off-by: Piotr Kaminski * create Quantizer for NeMo 2.0 Signed-off-by: Piotr Kaminski * refactor Signed-off-by: Piotr Kaminski * Call quantize on an unwrapped mcore model Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Add tests, adjust unwrapping Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * fix export Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Apply isort and black reformatting Signed-off-by: artbataev * Fix output_path argument for HF import Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> * fix fabric ckpt loading Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * code review suggestions Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * remove unused import Signed-off-by: Piotr Kaminski * use cnn dataset in github ci Signed-off-by: Piotr Kaminski * applied code review Signed-off-by: Piotr Kaminski * code review changes Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * simplify interface for data iterator Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * (partial) PP fix Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Signed-off-by: artbataev Co-authored-by: Piotr Kaminski Co-authored-by: Laplasjan107 Co-authored-by: artbataev * TDT compute timestamps option and Extra Whitespace handling for SPE (#10875) * add token duration Signed-off-by: monica-sekoyan * revert rnnt change Signed-off-by: monica-sekoyan * add remove_extra_whitespaces arg to spe tokenizer Signed-off-by: monica-sekoyan * add token duration retrieval Signed-off-by: monica-sekoyan * add ignore_extra_whitespace to spe Signed-off-by: monica-sekoyan * add compute_timestamp support for tdt Signed-off-by: monica-sekoyan * fix config field name Signed-off-by: monica-sekoyan * add refinement for tdt timestamps Signed-off-by: monica-sekoyan * add segments timestamp support and refinement for ctc Signed-off-by: monica-sekoyan * modify tests for ctc decoding timestamps Signed-off-by: monica-sekoyan * add rnnt timestamp tests Signed-off-by: monica-sekoyan * updated doc Signed-off-by: monica-sekoyan * fix in test Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan * fix of unicode char Signed-off-by: monica-sekoyan * fix rnnt_decoding test Signed-off-by: monica-sekoyan * workaround for tesst tokenizer Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan * modify segments formation Signed-off-by: monica-sekoyan * modify segments for ctc Signed-off-by: monica-sekoyan * fix in ctc refinement Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan * minor changes Signed-off-by: monica-sekoyan * reverse offset change Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan * warning mode=once Signed-off-by: monica-sekoyan * Apply 
isort and black reformatting Signed-off-by: monica-sekoyan * make ignore_extrawhitespaces false Signed-off-by: monica-sekoyan * minor changes Signed-off-by: monica-sekoyan * adjust changes to the tests Signed-off-by: monica-sekoyan * modify prompt_formatter tests Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan --------- Signed-off-by: monica-sekoyan Signed-off-by: monica-sekoyan Co-authored-by: monica-sekoyan * Basic online dynamic FP8 quantization with vLLM (#10904) * Basic online dynamic quantization with vLLM Signed-off-by: Jan Lasek * Apply isort and black reformatting Signed-off-by: janekl * vllm 0.6.3 updates Signed-off-by: Jan Lasek * Pass quantization param in deploy_vllm_triton.py script Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek Signed-off-by: janekl Co-authored-by: janekl * ci: Improve VM maintenance (#10758) * ci: Improve VM maintenance Signed-off-by: Oliver Koenig * rename stuff Signed-off-by: Oliver Koenig * title Signed-off-by: Oliver Koenig * use team Signed-off-by: Oliver Koenig * run on failure too Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * yrdy Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig * Add comment for vision transpose * update megatron_init.py inside lightning Signed-off-by: yaoyu-33 * rename llama to mllama folder name Signed-off-by: yaoyu-33 * update to attention bias Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * update dropout to 0 Signed-off-by: yaoyu-33 * fix attention bias Signed-off-by: yaoyu-33 * remove disable_vision_padding since we now have a fix Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Update init for mllama Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Address comments Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix copyright title Signed-off-by: yaoyu-33 * fix code scan Signed-off-by: yaoyu-33 * update vision code Signed-off-by: yaoyu-33 * revert attention bias changes until latest MLM code got merged Signed-off-by: yaoyu-33 * fix warning Signed-off-by: yaoyu-33 * Turn off system message check, as it's "" now Signed-off-by: yaoyu-33 * Rolllback megatron_parallel.py Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Signed-off-by: cuichenx Signed-off-by: Chen Cui Signed-off-by: artbataev Signed-off-by: parthmannan Signed-off-by: meatybobby Signed-off-by: HuiyingLi Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: Abhishree Signed-off-by: Marc Romeijn Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Signed-off-by: stevehuang52 Signed-off-by: Shriya Palsamudram Signed-off-by: ashors1 Signed-off-by: Maanu Grover Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Signed-off-by: monica-sekoyan Signed-off-by: monica-sekoyan Signed-off-by: Jan Lasek Signed-off-by: janekl Signed-off-by: Oliver Koenig Co-authored-by: Ao Tang Co-authored-by: Chen Cui Co-authored-by: Bobby Chen Co-authored-by: yaoyu-33 Co-authored-by: Yashaswi Karnati 
<144376261+yashaswikarnati@users.noreply.github.com> Co-authored-by: ykarnati Co-authored-by: cuichenx Co-authored-by: Yashaswi Karnati Co-authored-by: artbataev Co-authored-by: Parth Mannan <38387286+parthmannan@users.noreply.github.com> Co-authored-by: parthmannan Co-authored-by: meatybobby Co-authored-by: Huiying Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: akoumpa Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Pablo Garay Co-authored-by: Marc Romeyn Co-authored-by: Alexandros Koumparoulis Co-authored-by: marcromeyn Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Co-authored-by: ataghibakhsh Co-authored-by: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com> Co-authored-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Co-authored-by: Piotr Kaminski Co-authored-by: Laplasjan107 Co-authored-by: monica-sekoyan <166123533+monica-sekoyan@users.noreply.github.com> Co-authored-by: monica-sekoyan Co-authored-by: Jan Lasek Co-authored-by: janekl Co-authored-by: oliver könig * fix typo (#11234) Signed-off-by: dimapihtar * ci: Add release workflow (#11180) * ci: Add prerelease workflow Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * use pat Signed-off-by: Oliver Koenig * finalize Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * tmp Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * finalize Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig * Fix llm.generate (#11217) * Fix llm.generate Signed-off-by: Hemil Desai * fix Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Fix Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai * Timestamps to transcribe (#10950) * inital version Signed-off-by: Nithin Rao Koluguri * Support for RNNT, TDT, Hybrid Models Signed-off-by: Nithin Rao Koluguri * move change of decoder stratery from mixin to individual model class Signed-off-by: Nithin Rao Koluguri * Apply isort and black reformatting Signed-off-by: nithinraok * update transcribe_speech.py Signed-off-by: Nithin Rao Koluguri * uncomment Signed-off-by: Nithin Rao Koluguri * Apply isort and black reformatting Signed-off-by: nithinraok * add docs Signed-off-by: Nithin Rao Koluguri * fix docs Signed-off-by: Nithin Rao Koluguri * Apply isort and black reformatting Signed-off-by: nithinraok * codeql fixes Signed-off-by: Nithin Rao Koluguri * unit tests Signed-off-by: Nithin Rao Koluguri * minor rebase fix Signed-off-by: Nithin Rao Koluguri * Apply isort and black reformatting Signed-off-by: nithinraok * add None case to restore the state set outside using decoding_stratergy() Signed-off-by: Nithin Rao Koluguri * Apply isort and black 
reformatting Signed-off-by: nithinraok * remove ipdb traces Signed-off-by: Nithin Rao Koluguri * updates doc for transcription.py Signed-off-by: Nithin Rao Koluguri * remove preserve alignment for AED models as it doesn;t support it Signed-off-by: Nithin Rao Koluguri * lint warnings Signed-off-by: Nithin Rao Koluguri * Apply isort and black reformatting Signed-off-by: nithinraok --------- Signed-off-by: Nithin Rao Koluguri Signed-off-by: nithinraok Co-authored-by: Nithin Rao Koluguri Co-authored-by: nithinraok * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 1b8fce7 ! (#11247) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 47ff44e ! (#11254) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Handling tokenizer in PTQ for Nemo 2.0 (#11237) * Handling tokenizer in PTQ for Nemo 2.0 Signed-off-by: Jan Lasek * Print log msg and enable overriding Signed-off-by: Jan Lasek * Warning for legacy tokenizer config Signed-off-by: Jan Lasek * Save HF tokenizer to make tokenizer_config.yaml (almost) redundant Signed-off-by: Jan Lasek * Handle tokenizer in a unified way Signed-off-by: Jan Lasek * Move saving context within export Signed-off-by: Jan Lasek * Fix typo in get_tokenzier Signed-off-by: Jan Lasek * Reduce diff Signed-off-by: Jan Lasek * Drop unused import Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek * Fix finetuning datamodule resume (#11187) * fix datamodule resume Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * fix subclass Signed-off-by: Chen Cui * docstrings and formats Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx * ci: Move `bump mcore` to templates (#11229) * ci: Move `bump mcore` to templates Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * final Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig * fix: Update baseline (#11205) Signed-off-by: Oliver Koenig * Remove deprecated builder_opt param from build command (#11259) Signed-off-by: Jan Lasek * chore(beep boop 🤖): Bump `MCORE_TAG=aded519...` (2024-11-12) (#11260) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * [Doc fixes] update file names, installation instructions, bad links (#11045) * rename eval_beamsearch_ngram.py to eval_beamsearch_ngram_ctc.py in docs Signed-off-by: Elena Rastorgueva * replace out of date installation instructions with pointer to NeMo README installation section Signed-off-by: Elena Rastorgueva * point to user guide instead of readme Signed-off-by: Elena Rastorgueva * some link updates Signed-off-by: Elena Rastorgueva * update more links Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva Signed-off-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> * fix(export): GPT models w/ bias=False convert properly (#11255) Signed-off-by: Terry Kong * ci: Run secrets detector on `pull_request_target` (#11263) Signed-off-by: Oliver Koenig * fix(export): update API for disabling device reassignment in TRTLLM for Aligner (#10863) * fix(export): update API for disabling device reassignment in TRTLLM for Aligner [feat] Upgrade nemo-export path for aligner to TRTLLM-v12 and use python runtime Signed-off-by: Terry Kong fix: 
forgot to always set _disable_torch_cuda_device_set Signed-off-by: Terry Kong Signed-off-by: Terry Kong Apply isort and black reformatting Signed-off-by: terrykong invert torch device set Signed-off-by: Terry Kong * remove comment Signed-off-by: Terry Kong --------- Signed-off-by: Terry Kong * new vfm training features (#11246) Signed-off-by: Zeeshan Patel Co-authored-by: Zeeshan Patel * Update pruning and distillation tutorial notebooks (#11091) * Update pruning and distillation tutorial notebooks Signed-off-by: Gomathy Venkata Krishnan * Update README Signed-off-by: Gomathy Venkata Krishnan * Update batch size in width pruning script Signed-off-by: Gomathy Venkata Krishnan * Update README Signed-off-by: Gomathy Venkata Krishnan --------- Signed-off-by: Gomathy Venkata Krishnan * Beam search algorithm implementation for TDT models (#10903) * initial commit Signed-off-by: lilithgrigoryan * add: default beam search implementation Signed-off-by: lilithgrigoryan * fix: changed to removing duplicate hypothesis in separate function Signed-off-by: lilithgrigoryan * fix: changed to cartesian product in choosing best hyp Signed-off-by: lilithgrigoryan * fix: minor fixes in comments Signed-off-by: lilithgrigoryan * add: maes decoding strategy Signed-off-by: lilithgrigoryan * add: durations filtering in maes, lm fusion in progress Signed-off-by: lilithgrigoryan * fix: refactored, added comments, command line args, finalized Signed-off-by: lilithgrigoryan * fix: removed prints Signed-off-by: lilithgrigoryan * add: docs Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * fix: minor fix Signed-off-by: lilithgrigoryan * fix: rm beam_size=1 exception, rm duplicates check, fix error handling Signed-off-by: lilithgrigoryan * fix: error handling Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * fix: removed evaluations file Signed-off-by: lilithgrigoryan * rn: blank scoring Signed-off-by: lilithgrigoryan * clean up Signed-off-by: lilithgrigoryan * rm: blank scoring and duration beam size Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * fix: removed durations_beam_size from default beam search Signed-off-by: lilithgrigoryan * add: logaddexp Signed-off-by: lilithgrigoryan * rm: prefix search Signed-off-by: lilithgrigoryan * rn: nested loop over extensions Signed-off-by: lilithgrigoryan * fix: bug with caching Signed-off-by: lilithgrigoryan * rm: topk on durations Signed-off-by: lilithgrigoryan * add: restored prefix search Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * clean up Signed-off-by: lilithgrigoryan * fix: fixed comments Signed-off-by: lilithgrigoryan * refactored duplicate merging Signed-off-by: lilithgrigoryan * changes batch scoring Signed-off-by: lilithgrigoryan * refactored rnnt batch scoring Signed-off-by: lilithgrigoryan * alsd first working Signed-off-by: lilithgrigoryan * refactored Signed-off-by: lilithgrigoryan * clean up Signed-off-by: lilithgrigoryan * remove stacking operations Signed-off-by: lilithgrigoryan * fixes im base class Signed-off-by: lilithgrigoryan * clean up Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * remove potentially uninitialized local variable Signed-off-by: lilithgrigoryan * default beam search minor fixes Signed-off-by: lilithgrigoryan * add test, fix maes timesteps Signed-off-by: lilithgrigoryan * rm file 
Signed-off-by: lilithgrigoryan * rm file Signed-off-by: lilithgrigoryan * clean up Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * clean up Signed-off-by: lilithgrigoryan * fix comments Signed-off-by: lilithgrigoryan * add ngram lm test Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * fix maes_num_steps=1 Signed-off-by: lilithgrigoryan * fix kenlm model path Signed-off-by: lilithgrigoryan * fix kenlm model full path Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * made requested changes Signed-off-by: lilithgrigoryan * merge after isort Signed-off-by: lilithgrigoryan * add prints to test Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * add Kenlm to asr requirements Signed-off-by: lilithgrigoryan * remove prints in tests Signed-off-by: lilithgrigoryan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add kenlm to test requirements Signed-off-by: lilithgrigoryan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rm kenlm from link, add package-name Signed-off-by: lilithgrigoryan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rm second kenlm installation Signed-off-by: lilithgrigoryan * rm kenlm from dependencies make test optional Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * fix in test Signed-off-by: lilithgrigoryan * fix in test Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * fix comments Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * add comments Signed-off-by: lilithgrigoryan * add comments Signed-off-by: lilithgrigoryan * splitted docstrings Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * add comments Signed-off-by: lilithgrigoryan * splitted docstrings Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * add comments Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * fixes to python3 type annotations Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * merging Signed-off-by: lilithgrigoryan * merging Signed-off-by: lilithgrigoryan * fix in return type Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * fix test Signed-off-by: lilithgrigoryan * Apply isort and black reformatting Signed-off-by: lilithgrigoryan * rm time_idx Signed-off-by: lilithgrigoryan * fix comments to python3 style Signed-off-by: lilithgrigoryan --------- Signed-off-by: lilithgrigoryan Signed-off-by: lilithgrigoryan Co-authored-by: lilithgrigoryan Co-authored-by: lilithgrigoryan Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * update nemo1->2 conversion according to changes in main (#11253) * update nemo1->2 conversion according to changes in main Signed-off-by: Huiying Li * Apply isort and black reformatting Signed-off-by: HuiyingLi * format fix Signed-off-by: Huiying Li * add docstrings Signed-off-by: Huiying Li --------- Signed-off-by: Huiying Li Signed-off-by: HuiyingLi Co-authored-by: HuiyingLi * 
Add llama 3.1 recipes (#11273) * add llama 3.1 recipes Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * fix pylint Signed-off-by: Chen Cui * Fix llama3.1 wrong config in io.json --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx Co-authored-by: Ao Tang * Fix Finetune Recipe (#11267) * Fix Starcoder_15 SFT recipe * Fix PP type SFT recipe * Fix PP type SFT recipe * Fix Gemma2b SFT TP=1 * Fix more sft recipe * Fix more sft recipe * Fix more sft recipe * Fix more sft recipe * Fix more sft recipe * Fix more sft recipe * Fix more sft recipe * Fix more sft recipe * Fix more sft recipe * remove pp dtype * remove pp dtype * Configure no restart validation loop in nl.Trainer (#11029) * Configure no restart validation loop in nl.Trainer Signed-off-by: Hemil Desai * fix Signed-off-by: Hemil Desai * Skip validation whenever restarting=True Signed-off-by: Hemil Desai * PR feedback Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai * Handle _io_unflatten_object when _thread_local.output_dir is not available (#11199) Signed-off-by: Hemil Desai * change default ckpt name (#11277) Signed-off-by: Maanu Grover * Use MegatronDataSampler in HfDatasetDataModule (#11274) * Use MegatronDataSampler in HfDataset Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Remove opencc upperbound (#10909) Signed-off-by: Dong Hyuk Chang * Fix head_size in NeMo to HF checkpoint converters for width pruned model support (#11230) * update attn head_size to kv_channels for width pruning support Signed-off-by: Joosung * Update llama ckpt converter usage about tokenizer args Signed-off-by: Joosung --------- Signed-off-by: Joosung Co-authored-by: Joosung * Add openai-gelu in gated activation (#11293) * Fixes per comments (#11280) * Fixes per comments Signed-off-by: Gomathy Venkata Krishnan * Update README Signed-off-by: Gomathy Venkata Krishnan --------- Signed-off-by: Gomathy Venkata Krishnan * update sft peft notebook with batch generate Signed-off-by: HuiyingLi * address comments Signed-off-by: HuiyingLi * minor fix Signed-off-by: HuiyingLi * address comments Signed-off-by: HuiyingLi * remove extra imports Signed-off-by: HuiyingLi * simply strategy parameters Signed-off-by: Huiying Li --------- Signed-off-by: HuiyingLi Signed-off-by: Nune Signed-off-by: nune-tadevosyan Signed-off-by: Hainan Xu Signed-off-by: hainan-xv Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Alexandros Koumparoulis Signed-off-by: lilithgrigoryan Signed-off-by: lilithgrigoryan Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: Marc Romeijn Signed-off-by: marcromeyn Signed-off-by: smajumdar Signed-off-by: Marc Romeyn Signed-off-by: Chen Cui Signed-off-by: Jan Lasek Signed-off-by: akoumpa Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Signed-off-by: Jimmy Zhang Signed-off-by: JimmyZhang12 Signed-off-by: WoodieDudy Signed-off-by: titu1994 Signed-off-by: WoodieDudy Signed-off-by: artbataev Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Signed-off-by: Elena Rastorgueva Signed-off-by: erastorgueva-nv Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 
Signed-off-by: Huiying Li Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Signed-off-by: Maanu Grover Signed-off-by: Nithin Rao Koluguri Signed-off-by: nithinraok Signed-off-by: Vladimir Bataev Signed-off-by: shanmugamr1992 Signed-off-by: Piotr Żelasko Signed-off-by: ashors1 Signed-off-by: ashors1 Signed-off-by: Shriya Palsamudram Signed-off-by: ShriyaPalsamudram Signed-off-by: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Signed-off-by: Dong Hyuk Chang Signed-off-by: Pin-Jui Ku Signed-off-by: Oliver Koenig Signed-off-by: slyne deng Signed-off-by: oliver könig Signed-off-by: Jan Lasek Signed-off-by: Anna Shors Signed-off-by: mingyuanm Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Signed-off-by: Yoshi Suhara Signed-off-by: Youngeun Kwon Signed-off-by: Shengliang Xu Signed-off-by: shengliangxu Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Signed-off-by: stevehuang52 Signed-off-by: Valerie Sarge Signed-off-by: vysarge Signed-off-by: cuichenx Signed-off-by: BoxiangW Signed-off-by: dimapihtar Signed-off-by: dimapihtar Signed-off-by: Pablo Garay Signed-off-by: adithyare Signed-off-by: arendu Signed-off-by: Huy Vu <86480512+huvunvidia@users.noreply.github.com> Signed-off-by: Jason Signed-off-by: blisc Signed-off-by: huvunvidia Signed-off-by: JRD971000 Signed-off-by: Victor49152 Signed-off-by: Boxiang Wang Signed-off-by: Ante Jukić Signed-off-by: Eric Harper Signed-off-by: Abhishree Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Signed-off-by: monica-sekoyan Signed-off-by: monica-sekoyan Signed-off-by: janekl Signed-off-by: Rauf Signed-off-by: Monica Sekoyan Signed-off-by: suiyoubi Signed-off-by: Ao Tang Signed-off-by: pablo-garay Signed-off-by: Guyue Huang Signed-off-by: Farhad Ramezanghorbani Signed-off-by: farhadrgh Signed-off-by: kevalmorabia97 Signed-off-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Signed-off-by: Ethan He Signed-off-by: meatybobby Signed-off-by: Jieming Zhang Signed-off-by: Jinyang Yuan Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Signed-off-by: HuiyingLi Signed-off-by: Ssofja Signed-off-by: Ssofja Signed-off-by: Ssofja <78349198+Ssofja@users.noreply.github.com> Signed-off-by: Sam Oluwalana Signed-off-by: soluwalana Signed-off-by: Zeeshan Patel Signed-off-by: Piotr Żelasko Signed-off-by: Nithin Rao Koluguri Signed-off-by: Joseph Jennings Signed-off-by: jojennin Signed-off-by: parthmannan Signed-off-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Signed-off-by: Terry Kong Signed-off-by: Zeeshan Patel Signed-off-by: Gomathy Venkata Krishnan Signed-off-by: Joosung Co-authored-by: nune-tadevosyan <152167970+nune-tadevosyan@users.noreply.github.com> Co-authored-by: nune-tadevosyan Co-authored-by: Hainan Xu Co-authored-by: Hainan Xu Co-authored-by: hainan-xv Co-authored-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: lilithgrigoryan <38436437+lilithgrigoryan@users.noreply.github.com> Co-authored-by: lilithgrigoryan Co-authored-by: lilithgrigoryan Co-authored-by: Hemil Desai Co-authored-by: hemildesai Co-authored-by: Marc Romeyn Co-authored-by: marcromeyn Co-authored-by: 
Somshubra Majumdar Co-authored-by: Chen Cui Co-authored-by: Jan Lasek Co-authored-by: akoumpa Co-authored-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Co-authored-by: malay-nagda Co-authored-by: Jimmy Zhang Co-authored-by: JimmyZhang12 Co-authored-by: Georgy Kramarenko Co-authored-by: titu1994 Co-authored-by: WoodieDudy Co-authored-by: Nithin Rao Co-authored-by: artbataev Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Co-authored-by: oyilmaz-nvidia Co-authored-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: Elena Rastorgueva Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Co-authored-by: erastorgueva-nv Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: yaoyu-33 Co-authored-by: Ao Tang Co-authored-by: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Co-authored-by: nithinraok Co-authored-by: Vladimir Bataev Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: shanmugamr1992 Co-authored-by: Piotr Żelasko Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com> Co-authored-by: ashors1 Co-authored-by: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Co-authored-by: ShriyaPalsamudram Co-authored-by: Dong Hyuk Chang Co-authored-by: Dong Hyuk Chang Co-authored-by: Kuray107 Co-authored-by: Kuray107 Co-authored-by: Slyne Deng Co-authored-by: slyne deng Co-authored-by: Ming <111467530+Victor49152@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Huy Vu <86480512+huvunvidia@users.noreply.github.com> Co-authored-by: Huy Vu2 Co-authored-by: Yoshi Suhara Co-authored-by: Youngeun Kwon Co-authored-by: Shengliang Xu <106840466+shengliangxu@users.noreply.github.com> Co-authored-by: shengliangxu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Valerie Sarge Co-authored-by: vysarge Co-authored-by: cuichenx Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> Co-authored-by: BoxiangW Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Alexandros Koumparoulis Co-authored-by: dimapihtar Co-authored-by: Pablo Garay Co-authored-by: dimapihtar Co-authored-by: Adi Renduchintala Co-authored-by: arendu Co-authored-by: Jason Co-authored-by: blisc Co-authored-by: huvunvidia Co-authored-by: JRD971000 Co-authored-by: Victor49152 Co-authored-by: Ali Taghibakhshi Co-authored-by: anteju <108555623+anteju@users.noreply.github.com> Co-authored-by: meatybobby Co-authored-by: Malay Nagda Co-authored-by: Eric Harper Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Pablo Garay Co-authored-by: ataghibakhsh Co-authored-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Co-authored-by: Piotr Kaminski Co-authored-by: Laplasjan107 Co-authored-by: monica-sekoyan <166123533+monica-sekoyan@users.noreply.github.com> Co-authored-by: monica-sekoyan Co-authored-by: janekl Co-authored-by: nasretdinovr Co-authored-by: suiyoubi Co-authored-by: pablo-garay Co-authored-by: root Co-authored-by: guyueh1 <140554423+guyueh1@users.noreply.github.com> Co-authored-by: Guyue Huang Co-authored-by: Michal 
Futrega Co-authored-by: Farhad Ramezanghorbani Co-authored-by: kevalmorabia97 Co-authored-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Ethan He Co-authored-by: meatybobby Co-authored-by: Jinyang Yuan <154768711+jinyangyuan-nvidia@users.noreply.github.com> Co-authored-by: Jinyang Yuan Co-authored-by: HuiyingLi Co-authored-by: Ssofja <78349198+Ssofja@users.noreply.github.com> Co-authored-by: Ssofja Co-authored-by: Sam O Co-authored-by: soluwalana Co-authored-by: Zeeshan Patel Co-authored-by: Joseph Jennings <106725747+jojennin@users.noreply.github.com> Co-authored-by: jojennin Co-authored-by: Yashaswi Karnati <144376261+yashaswikarnati@users.noreply.github.com> Co-authored-by: ykarnati Co-authored-by: Yashaswi Karnati Co-authored-by: Parth Mannan <38387286+parthmannan@users.noreply.github.com> Co-authored-by: parthmannan Co-authored-by: Terry Kong Co-authored-by: Zeeshan Patel Co-authored-by: gvenkatakris Co-authored-by: Joosung Yoon <7252598+eagle705@users.noreply.github.com> Co-authored-by: Joosung --- .../llm/llama-3/nemo2-sft-peft/README.rst | 61 ++ .../llama-3/nemo2-sft-peft/nemo2-peft.ipynb | 547 +++++++++++++++ .../llama-3/nemo2-sft-peft/nemo2-sft.ipynb | 632 ++++++++++++++++++ 3 files changed, 1240 insertions(+) create mode 100644 tutorials/llm/llama-3/nemo2-sft-peft/README.rst create mode 100644 tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb create mode 100644 tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/README.rst b/tutorials/llm/llama-3/nemo2-sft-peft/README.rst new file mode 100644 index 000000000000..74a581c52f44 --- /dev/null +++ b/tutorials/llm/llama-3/nemo2-sft-peft/README.rst @@ -0,0 +1,61 @@ +Llama 3 Supervised Fine-Tuning and Parameter Efficient Fine-Tuning with NeMo 2.0 +================================================================================ + +`Llama 3 `_ is an open-source large language model by Meta that delivers state-of-the-art performance on popular industry benchmarks. It has been pretrained on over 15 trillion tokens and supports an 8K token context length. It is available in two sizes, 8B and 70B, and each size has two variants: base pretrained and instruction tuned. + +Supervised Fine-Tuning (SFT) refers to unfreezing all the weights and layers in our model and training on a newly labeled set of examples. We can fine-tune to incorporate new, domain-specific knowledge, or teach the foundation model what type of response to provide. + +`Low-Rank Adaptation (LoRA) `__ has emerged as a popular Parameter-Efficient Fine-Tuning (PEFT) technique that tunes a very small number of additional parameters compared to full fine-tuning, thereby reducing the compute required. + +`NVIDIA NeMo +Framework `__ provides tools to perform SFT and LoRA on Llama 3 to fit your use case. + +Requirements +------------ + +* System Configuration + * For SFT: access to at least 2 NVIDIA GPUs with a cumulative memory of at least 80GB, for example: 2 x H100-80GB or 2 x A100-80GB. + * For LoRA: access to at least 1 NVIDIA GPU with a cumulative memory of at least 80GB, for example: 1 x H100-80GB or 1 x A100-80GB. + * A Docker-enabled environment, with `NVIDIA Container Runtime `_ installed, which will make the container GPU-aware. + +* Software Requirements + * Use the latest `NeMo Framework Container <https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags>`_. Note that you must be logged in to the container registry to view this page.
+ * This notebook uses the container: `nvcr.io/nvidia/nemo:dev`. + * Get your Hugging Face `access token <https://huggingface.co/docs/hub/en/security-tokens>`_, which will be used to obtain the tokenizer required during training. + +* NeMo 2.0 and NeMo-Run + * We will use NeMo 2.0 and NeMo-Run to perform SFT and LoRA on Llama 3. Both are already available in the NeMo Framework Container. + + +Start the NeMo Framework Container +---------------------------------- + +1. You can start and enter the dev container by running: + +.. code:: bash + + docker run \ + --gpus device=1 \ + --shm-size=2g \ + --net=host \ + --ulimit memlock=-1 \ + --rm -it \ + -v ${PWD}:/workspace \ + -w /workspace \ + nvcr.io/nvidia/nemo:dev bash + + +2. You need to request download permission from Meta and Hugging Face. Then, from within the container, log in through `huggingface-cli` using your Hugging Face token. + +.. code:: bash + + huggingface-cli login + + +3. From within the container, start JupyterLab: + +.. code:: bash + + jupyter lab --ip 0.0.0.0 --port=8888 --allow-root + +4. Then, navigate to `the SFT notebook <./nemo2-sft.ipynb>`__ or `the LoRA notebook <./nemo2-peft.ipynb>`__ to perform SFT or LoRA on Llama 3, respectively. diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb new file mode 100644 index 000000000000..cd3bae1cc627 --- /dev/null +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb @@ -0,0 +1,547 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Learning Goals\n", + "\n", + "## Optimizing Foundation Models with Parameter-Efficient Fine-Tuning (PEFT)\n", + "\n", + "This notebook aims to demonstrate how to adapt or customize foundation models to improve performance on specific tasks using NeMo 2.0.\n", + "\n", + "This optimization process is known as fine-tuning, which involves adjusting the weights of a pre-trained foundation model with custom data.\n", + "\n", + "Considering that foundation models can be significantly large, a variant of fine-tuning, known as PEFT, has recently gained traction. PEFT encompasses several methods, including P-Tuning, LoRA, Adapters, IA3, etc. NeMo 2.0 currently supports the [Low-Rank Adaptation (LoRA)](https://arxiv.org/pdf/2106.09685) method.\n", + "\n", + "NeMo 2.0 introduces Python-based configurations, PyTorch Lightning’s modular abstractions, and NeMo-Run for scaling experiments across multiple GPUs. In this notebook, we will use NeMo-Run to streamline the configuration and execution of our experiments.\n", + "\n", + "## Data\n", + "This notebook uses the SQuAD dataset. For more details about the data, refer to [SQuAD: 100,000+ Questions for Machine Comprehension of Text](https://arxiv.org/abs/1606.05250).\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Step 1. Import the Hugging Face Checkpoint\n", + "We use the `llm.import_ckpt` API to download the specified model using the \"hf://\" URL format. It will then convert the model into NeMo 2.0 format. For all models supported in NeMo 2.0, refer to the [Large Language Models](https://docs.nvidia.com/nemo-framework/user-guide/24.09/llms/index.html#large-language-models) section of the NeMo Framework User Guide."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import nemo_run as run\n", + "from nemo import lightning as nl\n", + "from nemo.collections import llm\n", + "from megatron.core.optimizer import OptimizerConfig\n", + "from nemo.collections.llm.peft.lora import LoRA\n", + "import torch\n", + "import pytorch_lightning as pl\n", + "from pathlib import Path\n", + "from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed\n", + "\n", + "\n", + "# llm.import_ckpt is the nemo2 API for converting Hugging Face checkpoint to NeMo format\n", + "# example usage:\n", + "# llm.import_ckpt(model=llm.llama3_8b.model(), source=\"hf://meta-llama/Meta-Llama-3-8B\")\n", + "#\n", + "# We use run.Partial to configure this function\n", + "def configure_checkpoint_conversion():\n", + " return run.Partial(\n", + " llm.import_ckpt,\n", + " model=llm.llama3_8b.model(),\n", + " source=\"hf://meta-llama/Meta-Llama-3-8B\",\n", + " overwrite=False,\n", + " )\n", + "\n", + "# configure your function\n", + "import_ckpt = configure_checkpoint_conversion()\n", + "# define your executor\n", + "local_executor = run.LocalExecutor()\n", + "\n", + "# run your experiment\n", + "run.run(import_ckpt, executor=local_executor)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2. Prepare the Data\n", + "\n", + "We will be using SQuAD for this notebook. NeMo 2.0 already provides a `SquadDataModule`. Example usage:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def squad() -> run.Config[pl.LightningDataModule]:\n", + " return run.Config(llm.SquadDataModule, seq_length=2048, micro_batch_size=1, global_batch_size=8, num_workers=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To learn how to use your own data to create a custom `DataModule` for performing PEFT, refer to [NeMo 2.0 SFT notebook](./nemo2-sft.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3.1: Configure PEFT with NeMo 2.0 API and NeMo-Run\n", + "\n", + "The following Python script utilizes the NeMo 2.0 API to perform PEFT. In this script, we are configuring the following components for training. These components are similar between SFT and PEFT. SFT and PEFT both use `llm.finetune` API. To switch from SFT to PEFT, you just need to add `peft` with the LoRA adapter to the API parameter.\n", + "\n", + "### Configure the Trainer\n", + "The NeMo 2.0 Trainer works similarly to the PyTorch Lightning trainer.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def trainer() -> run.Config[nl.Trainer]:\n", + " strategy = run.Config(\n", + " nl.MegatronStrategy,\n", + " tensor_model_parallel_size=1\n", + " )\n", + " trainer = run.Config(\n", + " nl.Trainer,\n", + " devices=1,\n", + " max_steps=20,\n", + " accelerator=\"gpu\",\n", + " strategy=strategy,\n", + " plugins=bf16_mixed(),\n", + " log_every_n_steps=1,\n", + " limit_val_batches=2,\n", + " val_check_interval=2,\n", + " num_sanity_val_steps=0,\n", + " )\n", + " return trainer\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure the Logger\n", + "Configure your training steps, output directories and logging through `NeMoLogger`. 
In the following example, the experiment output will be saved at `./results/nemo2_peft`.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def logger() -> run.Config[nl.NeMoLogger]:\n", + " ckpt = run.Config(\n", + " nl.ModelCheckpoint,\n", + " save_last=True,\n", + " every_n_train_steps=10,\n", + " monitor=\"reduced_train_loss\",\n", + " save_top_k=1,\n", + " save_on_train_epoch_end=True,\n", + " save_optim_on_train_end=True,\n", + " )\n", + "\n", + " return run.Config(\n", + " nl.NeMoLogger,\n", + " name=\"nemo2_peft\",\n", + " log_dir=\"./results\",\n", + " use_datetime_version=False,\n", + " ckpt=ckpt,\n", + " wandb=None\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Configure the Optimizer\n", + "In the following example, we will be using the distributed adam optimizer and pass in the optimizer configuration through `OptimizerConfig`: " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def adam_with_cosine_annealing() -> run.Config[nl.OptimizerModule]:\n", + " opt_cfg = run.Config(\n", + " OptimizerConfig,\n", + " optimizer=\"adam\",\n", + " lr=0.0001,\n", + " adam_beta2=0.98,\n", + " use_distributed_optimizer=True,\n", + " clip_grad=1.0,\n", + " bf16=True,\n", + " )\n", + " return run.Config(\n", + " nl.MegatronOptimizerModule,\n", + " config=opt_cfg\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pass in the LoRA Adapter\n", + "We need to pass in the LoRA adapter to our fine-tuning API to perform LoRA fine-tuning. We can configure the adapter as follows. The target module we support includes: `linear_qkv`, `linear_proj`, `linear_fc1` and `linear_fc2`. In the final script, we used the default configurations for LoRA (`llm.peft.LoRA()`), which will use the full list with `dim=32`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def lora() -> run.Config[nl.pytorch.callbacks.PEFT]:\n", + " return run.Config(LoRA)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure the Base Model\n", + "We will perform PEFT on top of Llama-3-8b, so we create a `LlamaModel` to pass to the NeMo 2.0 finetune API." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def llama3_8b() -> run.Config[pl.LightningModule]:\n", + " return run.Config(llm.LlamaModel, config=run.Config(llm.Llama3Config8B))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Auto Resume\n", + "In NeMo 2.0, we can directly pass in the Llama3-8b Hugging Face ID to start PEFT without manually converting it into the NeMo checkpoint, as required in NeMo 1.0." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def resume() -> run.Config[nl.AutoResume]:\n", + " return run.Config(\n", + " nl.AutoResume,\n", + " restore_config=run.Config(nl.RestoreConfig,\n", + " path=\"nemo://meta-llama/Meta-Llama-3-8B\"\n", + " ),\n", + " resume_if_exists=True,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Configure the NeMo 2.0 finetune API\n", + "Using all the components we created above, we can call the NeMo 2.0 finetune API. 
The python example usage is as below:\n", + "```\n", + "llm.finetune(\n", + " model=llama3_8b(),\n", + " data=squad(),\n", + " trainer=trainer(),\n", + " peft=lora(),\n", + " log=logger(),\n", + " optim=adam_with_cosine_annealing(),\n", + " resume=resume(),\n", + ")\n", + "```\n", + "We configure the `llm.finetune` API as below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def configure_finetuning_recipe():\n", + " return run.Partial(\n", + " llm.finetune,\n", + " model=llama3_8b(),\n", + " trainer=trainer(),\n", + " data=squad(),\n", + " log=logger(),\n", + " peft=lora(),\n", + " optim=adam_with_cosine_annealing(),\n", + " resume=resume(),\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3.2: Run PEFT with NeMo 2.0 API and NeMo-Run\n", + "\n", + "We use `LocalExecutor` for executing our configured finetune function. For more details on the NeMo-Run executor, refer to [Execute NeMo Run](https://github.com/NVIDIA/NeMo-Run/blob/main/docs/source/guides/execution.md) of NeMo-Run Guides. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def local_executor_torchrun(nodes: int = 1, devices: int = 1) -> run.LocalExecutor:\n", + " # Env vars for jobs are configured here\n", + " env_vars = {\n", + " \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n", + " \"NCCL_NVLS_ENABLE\": \"0\",\n", + " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", + " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", + " \"NVTE_FUSED_ATTN\": \"0\",\n", + " }\n", + "\n", + " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", + "\n", + " return executor\n", + "\n", + "if __name__ == '__main__':\n", + " run.run(configure_finetuning_recipe(), executor=local_executor_torchrun())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4. Generate Results from Trained PEFT Checkpoints \n", + "\n", + "We use the `llm.generate` API in NeMo 2.0 to generate results from the trained PEFT checkpoint. Find your last saved checkpoint from your experiment dir: `results/nemo2_peft/checkpoints`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "peft_ckpt_path=str(next((d for d in Path(\"./results/nemo2_peft/checkpoints/\").iterdir() if d.is_dir() and d.name.endswith(\"-last\")), None))\n", + "print(\"We will load PEFT checkpoint from:\", peft_ckpt_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The SQuAD test set contains over 10,000 samples. For a quick demonstration, we will use the first 100 lines as an example input. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%bash\n", + "head -n 100 /root/.cache/nemo/datasets/squad/test.jsonl > toy_testset.jsonl\n", + "head -n 3 /root/.cache/nemo/datasets/squad/test.jsonl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should see something like:\n", + "```\n", + "{\"input\": \"Context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24\\u201310 to earn their third Super Bowl title. 
The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the \\\"golden anniversary\\\" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as \\\"Super Bowl L\\\"), so that the logo could prominently feature the Arabic numerals 50. Question: Which NFL team represented the AFC at Super Bowl 50? Answer:\", \"output\": \"Denver Broncos\", \"original_answers\": [\"Denver Broncos\", \"Denver Broncos\", \"Denver Broncos\"]}\n", + "{\"input\": \"Context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24\\u201310 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the \\\"golden anniversary\\\" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as \\\"Super Bowl L\\\"), so that the logo could prominently feature the Arabic numerals 50. Question: Which NFL team represented the NFC at Super Bowl 50? Answer:\", \"output\": \"Carolina Panthers\", \"original_answers\": [\"Carolina Panthers\", \"Carolina Panthers\", \"Carolina Panthers\"]}\n", + "{\"input\": \"Context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24\\u201310 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the \\\"golden anniversary\\\" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as \\\"Super Bowl L\\\"), so that the logo could prominently feature the Arabic numerals 50. Question: Where did Super Bowl 50 take place? Answer:\", \"output\": \"Santa Clara, California\", \"original_answers\": [\"Santa Clara, California\", \"Levi's Stadium\", \"Levi's Stadium in the San Francisco Bay Area at Santa Clara, California.\"]}\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will pass the string `toy_testset.jsonl` to the `input_dataset` parameter of `llm.generate`. To evaluate the entire test set, you can instead pass the SQuAD data module directly, using `input_dataset=squad()`. The input JSONL file should follow the format shown above, containing `input` and `output` fields (additional keys are optional)." 
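For reference, such an input JSONL can be assembled with a few lines of Python. The sketch below is not part of the original notebook; the file name `my_testset.jsonl` and the example row are hypothetical, but the `input`/`output` fields follow the format described above.

```python
import json

# Hypothetical rows in the format expected by llm.generate: each line carries an
# "input" prompt and an "output" reference answer (additional keys are allowed).
rows = [
    {
        "input": "Context: NeMo 2.0 introduces Python-based configurations. Question: What does NeMo 2.0 introduce? Answer:",
        "output": "Python-based configurations",
    },
]

# Write one JSON object per line, mirroring the toy_testset.jsonl file built above.
with open("my_testset.jsonl", "w", encoding="utf-8") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```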
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from megatron.core.inference.common_inference_params import CommonInferenceParams\n", + "\n", + "\n", + "def trainer() -> run.Config[nl.Trainer]:\n", + " strategy = run.Config(\n", + " nl.MegatronStrategy,\n", + " tensor_model_parallel_size=1\n", + " )\n", + " trainer = run.Config(\n", + " nl.Trainer,\n", + " accelerator=\"gpu\",\n", + " devices=1,\n", + " num_nodes=1,\n", + " strategy=strategy,\n", + " plugins=bf16_mixed(),\n", + " )\n", + " return trainer\n", + "\n", + "def configure_inference():\n", + " return run.Partial(\n", + " llm.generate,\n", + " path=str(peft_ckpt_path),\n", + " trainer=trainer(),\n", + " input_dataset=\"toy_testset.jsonl\",\n", + " inference_params=CommonInferenceParams(num_tokens_to_generate=20, top_k=1),\n", + " output_path=\"peft_prediction.jsonl\",\n", + " )\n", + "\n", + "\n", + "def local_executor_torchrun(nodes: int = 1, devices: int = 1) -> run.LocalExecutor:\n", + " # Env vars for jobs are configured here\n", + " env_vars = {\n", + " \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n", + " \"NCCL_NVLS_ENABLE\": \"0\",\n", + " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", + " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", + " \"NVTE_FUSED_ATTN\": \"0\",\n", + " }\n", + "\n", + " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", + "\n", + " return executor\n", + "\n", + "if __name__ == '__main__':\n", + " run.run(configure_inference(), executor=local_executor_torchrun())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the inference is complete, you will see results similar to the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%bash\n", + "head -n 3 peft_prediction.jsonl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should see outputs similar to the following:\n", + "```\n", + "{\"input\": \"Context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24\\u201310 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the \\\"golden anniversary\\\" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as \\\"Super Bowl L\\\"), so that the logo could prominently feature the Arabic numerals 50. Question: Which NFL team represented the AFC at Super Bowl 50? Answer:\", \"original_answers\": [\"Denver Broncos\", \"Denver Broncos\", \"Denver Broncos\"], \"label\": \"Denver Broncos\", \"prediction\": \" Denver Broncos\"}\n", + "{\"input\": \"Context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24\\u201310 to earn their third Super Bowl title. 
The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the \\\"golden anniversary\\\" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as \\\"Super Bowl L\\\"), so that the logo could prominently feature the Arabic numerals 50. Question: Which NFL team represented the NFC at Super Bowl 50? Answer:\", \"original_answers\": [\"Carolina Panthers\", \"Carolina Panthers\", \"Carolina Panthers\"], \"label\": \"Carolina Panthers\", \"prediction\": \" Carolina Panthers\"}\n", + "{\"input\": \"Context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24\\u201310 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the \\\"golden anniversary\\\" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as \\\"Super Bowl L\\\"), so that the logo could prominently feature the Arabic numerals 50. Question: Where did Super Bowl 50 take place? Answer:\", \"original_answers\": [\"Santa Clara, California\", \"Levi's Stadium\", \"Levi's Stadium in the San Francisco Bay Area at Santa Clara, California.\"], \"label\": \"Santa Clara, California\", \"prediction\": \" Levi's Stadium\"}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NeMo Tools and Resources\n", + "1. [NeMo GitHub repo](https://github.com/NVIDIA/NeMo)\n", + "\n", + "2. [NeMo-Run GitHub repo](https://github.com/NVIDIA/NeMo-Run/)\n", + "\n", + "3. NeMo Framework Container: `nvcr.io/nvidia/nemo:dev`\n", + "\n", + "\n", + "\n", + "# Educational Resources\n", + "1. Blog: [Mastering LLM Techniques: Customization](https://developer.nvidia.com/blog/selecting-large-language-model-customization-techniques/)\n", + "\n", + "2. Whitepaper: [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685)\n", + "\n", + "3. [NeMo 2.0 Overview](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/index.html)\n", + "\n", + "4. 
Blog: [Tune and Deploy LoRA LLMs with NVIDIA TensorRT-LLM](https://developer.nvidia.com/blog/tune-and-deploy-lora-llms-with-nvidia-tensorrt-llm/)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb new file mode 100644 index 000000000000..479d81928e98 --- /dev/null +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb @@ -0,0 +1,632 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Learning Goals\n", + "\n", + "## Optimizing Foundation Models with Supervised Fine-Tuning (SFT)\n", + "\n", + "Often, we want to adapt or customize foundation models to be more performant on our specific task. Fine-tuning refers to how we can modify the weights of a pre-trained foundation model with additional custom data. Supervised Fine-Tuning (SFT) refers to unfreezing all the weights and layers in our model and training on a newly labeled set of examples. We can fine-tune to incorporate new, domain-specific knowledge, or teach the foundation model what type of response to provide. One specific type of SFT is also referred to as “instruction tuning”, where we use SFT to teach a model to follow instructions better. In this tutorial, we will demonstrate how to perform SFT with Llama3-8b using NeMo 2.0.\n", + "\n", + "NeMo 2.0 introduces Python-based configurations, PyTorch Lightning’s modular abstractions, and NeMo-Run for scaling experiments across multiple GPUs. In this notebook, we will use NeMo-Run to streamline the configuration and execution of our experiments.\n", + "\n", + "## Data\n", + "Databricks-dolly-15k is an open-source dataset created by the collaborative efforts of Databricks employees. It consists of high-quality, human-generated prompt/response pairs specifically designed for instruction tuning LLMs. These pairs cover a diverse range of behaviors, from brainstorming and content generation to information extraction and summarization. \n", + "\n", + "For more information, refer to [databricks-dolly-15k | Hugging Face](https://huggingface.co/datasets/databricks/databricks-dolly-15k)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Step 1. Import the Hugging Face Checkpoint\n", + "We use the `llm.import_ckpt` API to download the specified model using the \"hf://\" URL format. It will then convert the model into NeMo 2.0 format. For all models supported in NeMo 2.0, refer to the [Large Language Models](https://docs.nvidia.com/nemo-framework/user-guide/24.09/llms/index.html#large-language-models) section of the NeMo Framework User Guide."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import nemo_run as run\n", + "from nemo import lightning as nl\n", + "from nemo.collections import llm\n", + "from megatron.core.optimizer import OptimizerConfig\n", + "import torch\n", + "import pytorch_lightning as pl\n", + "from pathlib import Path\n", + "from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed\n", + "\n", + "\n", + "# llm.import_ckpt is the nemo2 API for converting Hugging Face checkpoint to NeMo format\n", + "# example python usage:\n", + "# llm.import_ckpt(model=llm.llama3_8b.model(), source=\"hf://meta-llama/Meta-Llama-3-8B\")\n", + "#\n", + "# We use run.Partial to configure this function\n", + "def configure_checkpoint_conversion():\n", + " return run.Partial(\n", + " llm.import_ckpt,\n", + " model=llm.llama3_8b.model(),\n", + " source=\"hf://meta-llama/Meta-Llama-3-8B\",\n", + " overwrite=False,\n", + " )\n", + "\n", + "# configure your function\n", + "import_ckpt = configure_checkpoint_conversion()\n", + "# define your executor\n", + "local_executor = run.LocalExecutor()\n", + "\n", + "# run your experiment\n", + "run.run(import_ckpt, executor=local_executor)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2. Prepare the Data and Customize the DataModule\n", + "\n", + "We will be using Databricks-dolly-15k for this notebook. NeMo 2.0 already provides a `DollyDataModule`. For all data modules that are included in NeMo 2.0, refer to the [data module directory](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/llm/gpt/data). Example usage:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def dolly() -> run.Config[pl.LightningDataModule]:\n", + " return run.Config(llm.DollyDataModule, seq_length=2048, micro_batch_size=1, global_batch_size=8, num_workers=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use your own data, you will need to create a custom `DataModule`. This involves extending the base class `FineTuningDataModule` so that you have access to existing data handling logic, such as packed sequences. Here we walk you through the process step by step, using the already existing [`DollyDataModule`](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/llm/gpt/data/dolly.py) as an example. \n", + "\n", + "### Subclass the FineTuningDataModule\n", + "You need to extend the `FineTuningDataModule` if you're fine-tuning NeMo models. This provides access to existing data handling logic, such as packed sequences. The `data_root` parameter is where you store your generated `train/validation/test.jsonl` in NeMo format. 
Below is how `DollyDataModule` does it:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import List, Optional\n", + "from nemo.lightning.io.mixin import IOMixin\n", + "from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule\n", + "\n", + "class DollyDataModule(FineTuningDataModule, IOMixin):\n", + " def __init__(\n", + " self,\n", + " seq_length: int = 2048,\n", + " tokenizer: Optional[\"TokenizerSpec\"] = None,\n", + " micro_batch_size: int = 4,\n", + " global_batch_size: int = 8,\n", + " rampup_batch_size: Optional[List[int]] = None,\n", + " force_redownload: bool = False,\n", + " delete_raw: bool = True,\n", + " seed: int = 1234,\n", + " memmap_workers: int = 1,\n", + " num_workers: int = 8,\n", + " pin_memory: bool = True,\n", + " persistent_workers: bool = False,\n", + " pad_to_max_length: bool = False,\n", + " packed_sequence_size: int = -1,\n", + " ):\n", + " self.force_redownload = force_redownload\n", + " self.delete_raw = delete_raw\n", + "\n", + " super().__init__(\n", + " dataset_root=get_dataset_root(\"dolly\"),\n", + " seq_length=seq_length,\n", + " tokenizer=tokenizer,\n", + " micro_batch_size=micro_batch_size,\n", + " global_batch_size=global_batch_size,\n", + " rampup_batch_size=rampup_batch_size,\n", + " seed=seed,\n", + " memmap_workers=memmap_workers,\n", + " num_workers=num_workers,\n", + " pin_memory=pin_memory,\n", + " persistent_workers=persistent_workers,\n", + " pad_to_max_length=pad_to_max_length,\n", + " packed_sequence_size=packed_sequence_size,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Override the `prepare_data` Method\n", + "\n", + "The `prepare_data` method is responsible for downloading and preprocessing data if needed. If the dataset is already downloaded, you can skip this step.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def prepare_data(self) -> None:\n", + " # if train file is specified, no need to do anything\n", + " if not self.train_path.exists() or self.force_redownload:\n", + " dset = self._download_data()\n", + " self._preprocess_and_split_data(dset)\n", + " super().prepare_data()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Implement Data Download and Preprocessing Logic\n", + "\n", + "If your dataset requires downloading or preprocessing, implement this logic within the helper methods. Skip the download part if it's not needed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def _download_data(self):\n", + " logging.info(f\"Downloading {self.__class__.__name__}...\")\n", + " return load_dataset(\n", + " \"databricks/databricks-dolly-15k\",\n", + " cache_dir=str(self.dataset_root),\n", + " download_mode=\"force_redownload\" if self.force_redownload else None,\n", + " )\n", + "\n", + "def _preprocess_and_split_data(self, dset, train_ratio: float = 0.80, val_ratio: float = 0.15):\n", + " logging.info(f\"Preprocessing {self.__class__.__name__} to jsonl format and splitting...\")\n", + "\n", + " test_ratio = 1 - train_ratio - val_ratio\n", + " save_splits = {}\n", + " dataset = dset.get('train')\n", + " split_dataset = dataset.train_test_split(test_size=val_ratio + test_ratio, seed=self.seed)\n", + " split_dataset2 = split_dataset['test'].train_test_split(\n", + " test_size=test_ratio / (val_ratio + test_ratio), seed=self.seed\n", + " )\n", + " save_splits['training'] = split_dataset['train']\n", + " save_splits['validation'] = split_dataset2['train']\n", + " save_splits['test'] = split_dataset2['test']\n", + "\n", + " for split_name, dataset in save_splits.items():\n", + " output_file = self.dataset_root / f\"{split_name}.jsonl\"\n", + " with output_file.open(\"w\", encoding=\"utf-8\") as f:\n", + " for example in dataset:\n", + " context = example[\"context\"].strip()\n", + " if context != \"\":\n", + " # Randomize context and instruction order.\n", + " context_first = np.random.randint(0, 2) == 0\n", + " if context_first:\n", + " instruction = example[\"instruction\"].strip()\n", + " assert instruction != \"\"\n", + " _input = f\"{context}\\n\\n{instruction}\"\n", + " _output = example[\"response\"]\n", + " else:\n", + " instruction = example[\"instruction\"].strip()\n", + " assert instruction != \"\"\n", + " _input = f\"{instruction}\\n\\n{context}\"\n", + " _output = example[\"response\"]\n", + " else:\n", + " _input = example[\"instruction\"]\n", + " _output = example[\"response\"]\n", + "\n", + " f.write(json.dumps({\"input\": _input, \"output\": _output, \"category\": example[\"category\"]}) + \"\\n\")\n", + "\n", + " logging.info(f\"{split_name} split saved to {output_file}\")\n", + "\n", + " if self.delete_raw:\n", + " for p in self.dataset_root.iterdir():\n", + " if p.is_dir():\n", + " shutil.rmtree(p)\n", + " elif '.jsonl' not in str(p.name):\n", + " p.unlink()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The original example in Dolly dataset looks like:\n", + "```\n", + "{'instruction': 'Extract all the movies from this passage and the year they were released out. Write each movie as a separate sentence', 'context': \"The genre has existed since the early years of silent cinema, when Georges Melies' A Trip to the Moon (1902) employed trick photography effects. The next major example (first in feature length in the genre) was the film Metropolis (1927). From the 1930s to the 1950s, the genre consisted mainly of low-budget B movies. After Stanley Kubrick's landmark 2001: A Space Odyssey (1968), the science fiction film genre was taken more seriously. In the late 1970s, big-budget science fiction films filled with special effects became popular with audiences after the success of Star Wars (1977) and paved the way for the blockbuster hits of subsequent decades.\", 'response': 'A Trip to the Moon was released in 1902. Metropolis came out in 1927. 2001: A Space Odyssey was released in 1968. 
Star Wars came out in 1977.', 'category': 'information_extraction'}\n", + "```\n", + "After the preprocessing logic, the data examples are transformed into NeMo format, as below:\n", + "```\n", + "{'input': \"Extract all the movies from this passage and the year they were released out. Write each movie as a separate sentence\\n\\nThe genre has existed since the early years of silent cinema, when Georges Melies' A Trip to the Moon (1902) employed trick photography effects. The next major example (first in feature length in the genre) was the film Metropolis (1927). From the 1930s to the 1950s, the genre consisted mainly of low-budget B movies. After Stanley Kubrick's landmark 2001: A Space Odyssey (1968), the science fiction film genre was taken more seriously. In the late 1970s, big-budget science fiction films filled with special effects became popular with audiences after the success of Star Wars (1977) and paved the way for the blockbuster hits of subsequent decades.\", 'output': 'A Trip to the Moon was released in 1902. Metropolis came out in 1927. 2001: A Space Odyssey was released in 1968. Star Wars came out in 1977.', 'category': 'information_extraction'}\n", + "```\n", + "Each data example is saved as a JSON string on one line in the `train/validation/test.jsonl` file, under the `data_root` directory you specified earlier." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3.1: Configure SFT with the NeMo 2.0 API \n", + "\n", + "In this notebook, we use the NeMo 2.0 API to perform SFT. First, we configure the following components for training. These components are similar between SFT and PEFT. SFT and PEFT both use the `llm.finetune` API. To switch from PEFT to SFT, you just need to remove the `peft` parameter.\n", + "\n", + "### Configure the Trainer\n", + "The NeMo 2.0 Trainer works similarly to the PyTorch Lightning trainer." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def trainer() -> run.Config[nl.Trainer]:\n", + " strategy = run.Config(\n", + " nl.MegatronStrategy,\n", + " tensor_model_parallel_size=2\n", + " )\n", + " trainer = run.Config(\n", + " nl.Trainer,\n", + " devices=2,\n", + " max_steps=20,\n", + " accelerator=\"gpu\",\n", + " strategy=strategy,\n", + " plugins=bf16_mixed(),\n", + " log_every_n_steps=1,\n", + " limit_val_batches=2,\n", + " val_check_interval=2,\n", + " num_sanity_val_steps=0,\n", + " )\n", + " return trainer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Configure the Logger\n", + "Configure your training steps, output directories and logging through `NeMoLogger`. In the following example, the experiment output will be saved at `./results/nemo2_sft`."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def logger() -> run.Config[nl.NeMoLogger]:\n", + " ckpt = run.Config(\n", + " nl.ModelCheckpoint,\n", + " save_last=True,\n", + " every_n_train_steps=10,\n", + " monitor=\"reduced_train_loss\",\n", + " save_top_k=1,\n", + " save_on_train_epoch_end=True,\n", + " save_optim_on_train_end=True,\n", + " )\n", + "\n", + " return run.Config(\n", + " nl.NeMoLogger,\n", + " name=\"nemo2_sft\",\n", + " log_dir=\"./results\",\n", + " use_datetime_version=False,\n", + " ckpt=ckpt,\n", + " wandb=None\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Configure the Optimizer\n", + "In the following example, we will be using the distributed adam optimizer and pass in the optimizer configuration through `OptimizerConfig`: " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def adam_with_cosine_annealing() -> run.Config[nl.OptimizerModule]:\n", + " opt_cfg = run.Config(\n", + " OptimizerConfig,\n", + " optimizer=\"adam\",\n", + " lr=5e-6,\n", + " adam_beta2=0.98,\n", + " use_distributed_optimizer=True,\n", + " clip_grad=1.0,\n", + " bf16=True,\n", + " )\n", + " return run.Config(\n", + " nl.MegatronOptimizerModule,\n", + " config=opt_cfg\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure the Base Model\n", + "We will perform SFT on top of Llama3-8B, so we create a `LlamaModel` to pass to the finetune API." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def llama3_8b() -> run.Config[pl.LightningModule]:\n", + " return run.Config(llm.LlamaModel, config=run.Config(llm.Llama3Config8B))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Auto Resume\n", + "In NeMo 2.0, we can directly pass in the Llama3-8b Hugging Face ID to start SFT without manually converting it into the NeMo checkpoint format, as required in NeMo 1.0." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def resume() -> run.Config[nl.AutoResume]:\n", + " return run.Config(\n", + " nl.AutoResume,\n", + " restore_config=run.Config(nl.RestoreConfig,\n", + " path=\"nemo://meta-llama/Meta-Llama-3-8B\"\n", + " ),\n", + " resume_if_exists=True,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Configure NeMo 2.0 finetune API\n", + "Using all the components we created above, we can call the NeMo 2.0 finetune API. 
The python example usage is as below:\n", + "```\n", + "llm.finetune(\n", + " model=llama3_8b(),\n", + " data=dolly(),\n", + " trainer=trainer(),\n", + " log=logger(),\n", + " optim=adam_with_cosine_annealing(),\n", + " resume=resume(),\n", + ")\n", + "```\n", + "We configure the `llm.finetune` API as below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def configure_finetuning_recipe():\n", + " return run.Partial(\n", + " llm.finetune,\n", + " model=llama3_8b(),\n", + " trainer=trainer(),\n", + " data=dolly(),\n", + " log=logger(),\n", + " optim=adam_with_cosine_annealing(),\n", + " resume=resume(),\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3.2: Run SFT with NeMo 2.0 API and NeMo-Run\n", + "\n", + "We use `LocalExecutor` for executing our configured finetune function. For more details on the NeMo-Run executor, refer to [Execute NeMo Run](https://github.com/NVIDIA/NeMo-Run/blob/main/docs/source/guides/execution.md) of NeMo-Run Guides. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:\n", + " # Env vars for jobs are configured here\n", + " env_vars = {\n", + " \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n", + " \"NCCL_NVLS_ENABLE\": \"0\",\n", + " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", + " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", + " \"NVTE_FUSED_ATTN\": \"0\",\n", + " }\n", + "\n", + " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", + "\n", + " return executor\n", + "\n", + "if __name__ == '__main__':\n", + " run.run(configure_finetuning_recipe(), executor=local_executor_torchrun())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4. Generate Results from Trained SFT Checkpoints\n", + "\n", + "We use the `llm.generate` API in NeMo 2.0 to generate results from the trained SFT checkpoint. Find your last saved checkpoint from your experiment dir: `results/nemo2_sft/checkpoints`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sft_ckpt_path=str(next((d for d in Path(\"./results/nemo2_sft/checkpoints/\").iterdir() if d.is_dir() and d.name.endswith(\"-last\")), None))\n", + "print(\"We will load SFT checkpoint from:\", sft_ckpt_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When using `llm.generate` API, you can pass a data module such as dolly: `input_dataset=dolly()`. This will use the test set from the specified data module to generate predictions. In the following example, the generated predictions are saved to the `sft_predictions.txt` file. Note that while fine-tuning required a minimum of 2 GPUs with `tensor_model_parallel_size=2`, generating predictions only requires `tensor_model_parallel_size=1`. However, using multiple GPUs can speed up the inference." 
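Once the `llm.generate` run configured in the next cell has written `sft_prediction.jsonl`, the file can also be inspected programmatically. The helper below is a rough sketch: it assumes the `input`/`label`/`prediction` fields shown in the sample output later in this notebook, and the exact-match rate is only a quick sanity check, not a proper evaluation metric.
```python
# Sketch: load the generated predictions and compute a crude exact-match rate.
import json
from pathlib import Path


def load_predictions(path: str = "sft_prediction.jsonl") -> list[dict]:
    # Each line of the output file is one JSON object.
    with Path(path).open(encoding="utf-8") as f:
        return [json.loads(line) for line in f]


def exact_match_rate(rows: list[dict]) -> float:
    # Fraction of examples whose stripped prediction equals the reference label.
    hits = sum(r["prediction"].strip() == r["label"].strip() for r in rows)
    return hits / max(len(rows), 1)


rows = load_predictions()
print(f"{len(rows)} examples, exact-match rate: {exact_match_rate(rows):.2%}")
```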
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from megatron.core.inference.common_inference_params import CommonInferenceParams\n", + "\n", + "\n", + "def trainer() -> run.Config[nl.Trainer]:\n", + " strategy = run.Config(\n", + " nl.MegatronStrategy,\n", + " tensor_model_parallel_size=1,\n", + " )\n", + " trainer = run.Config(\n", + " nl.Trainer,\n", + " accelerator=\"gpu\",\n", + " devices=1,\n", + " num_nodes=1,\n", + " strategy=strategy,\n", + " plugins=bf16_mixed(),\n", + " )\n", + " return trainer\n", + "\n", + "def configure_inference():\n", + " return run.Partial(\n", + " llm.generate,\n", + " path=str(sft_ckpt_path),\n", + " trainer=trainer(),\n", + " input_dataset=dolly(),\n", + " inference_params=CommonInferenceParams(num_tokens_to_generate=20, top_k=1),\n", + " output_path=\"sft_prediction.jsonl\",\n", + " )\n", + "\n", + "\n", + "def local_executor_torchrun(nodes: int = 1, devices: int = 1) -> run.LocalExecutor:\n", + " # Env vars for jobs are configured here\n", + " env_vars = {\n", + " \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n", + " \"NCCL_NVLS_ENABLE\": \"0\",\n", + " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", + " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", + " \"NVTE_FUSED_ATTN\": \"0\",\n", + " }\n", + "\n", + " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", + "\n", + " return executor\n", + "\n", + "if __name__ == '__main__':\n", + " run.run(configure_inference(), executor=local_executor_torchrun())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the inference is complete, you will see results similar to the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%bash\n", + "head -n 3 sft_prediction.jsonl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should see output similar to the following:\n", + "```\n", + "{\"input\": \"What is best creator's platform\", \"category\": \"brainstorming\", \"label\": \"Youtube. Youtube should be best creator platform\", \"prediction\": \" for video content creators. YouTube is best creator's platform for video content creators.\"}\n", + "{\"input\": \"When was the last time the Raiders won the Super Bowl?\", \"category\": \"open_qa\", \"label\": \"The Raiders have won three Super Bowl championships (1977, 1981, and 1984), one American Football League (AFL) championship (1967), and four American Football Conference (AFC) titles. The most recent Super Bowl ring was won in 1984 against the Washington Redskins of the NFC.\", \"prediction\": \" 2003\"}\n", + "{\"input\": \"Muckle Water is a long, narrow fresh water loch on Ward Hill on Rousay, Orkney, Scotland. It is the biggest loch on the island and is popular for fishing. It can be reached by a track from the roadside. 
The Suso Burn on the north eastern shore drains the loch into the Sound of Rousay.\\n\\nWhere is Muckle Water?\", \"category\": \"closed_qa\", \"label\": \"Muckle water is located in Rousay, Orkney, Scotland.\", \"prediction\": \" Muckle Water is a long, narrow fresh water loch on Ward Hill on Rousay,\"}\n", + "```" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From a273ddd4991027d6280e24c9f73fa626425f0296 Mon Sep 17 00:00:00 2001 From: Paul Gibbons <87940629+paul-gibbons@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:26:49 -0800 Subject: [PATCH 26/48] update symlink option for save_last in ModelCheckpoint (#11319) Signed-off-by: paul-gibbons --- nemo/lightning/pytorch/callbacks/model_checkpoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index ca4ca08cab08..455022b1ba44 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -17,7 +17,7 @@ import shutil from datetime import timedelta from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Literal, Optional, Union import lightning.pytorch import torch @@ -63,7 +63,7 @@ def __init__( self, monitor: Optional[str] = "val_loss", verbose: bool = True, - save_last: Optional[bool] = True, + save_last: Optional[Union[bool, Literal["link"]]] = True, save_top_k: int = 3, save_weights_only: bool = False, ## TODO: check support mode: str = "min", From 1c59ce3ed5a090eb1b44ee91812f2493bcae4712 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 19 Nov 2024 21:39:56 +0100 Subject: [PATCH 27/48] ci: Pass-through of `workflow_event` (#11322) * ci: Pass-through of `workflow_event` Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * bump Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 4fcafea82d07..d1cc1b910f29 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -38,6 +38,7 @@ jobs: outputs: test_to_run: ${{ steps.test_to_run.outputs.main }} all: ${{ steps.all.outputs.main }} + event_name: ${{ steps.github-event.outputs.main }} steps: - name: Parse test_to_run id: test_to_run @@ -47,11 +48,16 @@ jobs: - name: Parse all id: all run: | - echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT" + echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT" + - name: Infer github event + id: github-event + run: | + echo "main=${{ github.event_name }}" | tee -a "$GITHUB_OUTPUT" cicd-test-container-build: - if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }} - uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.1.0 + if: ${{ 
github.event.label.name == 'Run CICD' || needs.pre-flight.outputs.event_name == 'workflow_dispatch' }} + uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.14.0 + needs: pre-flight with: image-name: nemo_container dockerfile: Dockerfile.ci From 4b93e7f67592a2b58c212f656db962a1447e3488 Mon Sep 17 00:00:00 2001 From: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Date: Wed, 20 Nov 2024 04:07:42 +0530 Subject: [PATCH 28/48] Add StragglerDetection and auto-relaunch (#11328) Signed-off-by: Shriya Palsamudram --- .../llm/recipes/callbacks/__init__.py | 13 +++++ .../llm/recipes/callbacks/common.py | 53 ++++++++++++++++++ nemo/lightning/run/plugins.py | 56 ++++++++++++++++++- 3 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 nemo/collections/llm/recipes/callbacks/__init__.py create mode 100644 nemo/collections/llm/recipes/callbacks/common.py diff --git a/nemo/collections/llm/recipes/callbacks/__init__.py b/nemo/collections/llm/recipes/callbacks/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/llm/recipes/callbacks/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/recipes/callbacks/common.py b/nemo/collections/llm/recipes/callbacks/common.py new file mode 100644 index 000000000000..72a1b3a0c640 --- /dev/null +++ b/nemo/collections/llm/recipes/callbacks/common.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from nemo_run import Config, cli + +from nemo.utils.import_utils import safe_import + +res_module, HAVE_RES = safe_import('nvidia_resiliency_ext.ptl_resiliency') + + +@cli.factory(is_target_default=True) +def straggler_det_callback( + straggler_report_time_interval: Optional[int] = 300, stop_if_detected_straggler: Optional[bool] = True +) -> Config[res_module.StragglerDetectionCallback]: + """ + This callback is used to detect slower ranks participating in a PyTorch distributed workload. + This callback is obtained from nvidia-resiliency-ext. + Performance scores are scalar values from 0.0 (worst) to 1.0 (best), reflecting each rank's performance. + A performance score can be interpreted as the ratio of current performance to reference performance. 
+ Depending on the reference used, there are two types of performance scores: + Relative performance score: The best-performing rank in the workload is used as a reference. + Individual performance score: The best historical performance of the rank is used as a reference. + If the performance score drops below the threshold which is set to 0.7, it is deemed as a straggler. + To detect the stragglers, users can enable this callback which reports the performance scores every 5mins. + Args: + straggler_report_time_interval (int): Performance score reporting frequency in seconds, Default is 300 seconds. + stop_if_detected_straggler (bool): Whether to stop training if a straggler is detection. Default is True. + """ + + return Config( + res_module.StragglerDetectionCallback, + report_time_interval=straggler_report_time_interval, + calc_relative_gpu_perf=True, + calc_individual_gpu_perf=True, + num_gpu_perf_scores_to_print=5, + gpu_relative_perf_threshold=0.7, + gpu_individual_perf_threshold=0.7, + stop_if_detected=stop_if_detected_straggler, + enable_ptl_logging=True, + ) diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index 7366843d5a86..645665723706 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -28,9 +28,14 @@ from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy from nemo.utils import logging +from nemo.utils.import_utils import safe_import + +res_module, HAVE_RES = safe_import('nvidia_resiliency_ext.ptl_resiliency') + # This file contains plugins based on NeMo-Run's run.Plugin API. # Plugins operate both on a configured task and an executor at the same time, and are specific to NeMo-Run. -# If you are adding functionality that goes directly into the Pytorch Lightning trainer, you may consider adding a callback instead of a plugin. +# If you are adding functionality that goes directly into the Pytorch Lightning trainer, +# you may consider adding a callback instead of a plugin. def _merge_callbacks(partial: run.Partial, callbacks: list[run.Config[Callback]]): @@ -79,6 +84,55 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor): _merge_callbacks(task, callbacks=self.callbacks) +@dataclass(kw_only=True) +class FaultTolerancePlugin(run.Plugin): + """ + A plugin for setting up the fault tolerance callback from nvidia-resiliency-ext. + This plugin enables workload hang detection, automatic calculation of timeouts used for hang detection, detection of rank(s) terminated due to an error and workload respawning in case of a failure. + Note: FaultTolerancePlugin does not work with the NsysPlugin. + Args: + num_in_process_restarts (int): Max number of restarts on failure, within the same job. Default is 3. + num_job_retries_on_failure (int): Max number of new job restarts on failure. Default is 2. + initial_rank_heartbeat_timeout (int): Timeouts are time intervals used by a rank monitor to detect that a rank is not alive. This is the max timeout for the initial heartbeat. Default is 1800. + rank_heartbeat_timeout (int): This is the timeout for subsequent hearbeats after the initial heartbeat. Default is 300. + """ + + num_in_process_restarts: int = 3 + num_job_retries_on_failure: int = 2 + initial_rank_heartbeat_timeout: int = 1800 + rank_heartbeat_timeout: int = 300 + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + + assert HAVE_RES, "nvidia-resiliency-ext.ptl_resiliency is required to use the FaultTolerancePlugin." 
+ + executor.launcher = run.FaultTolerance( + max_restarts=self.num_in_process_restarts, + initial_rank_heartbeat_timeout=self.initial_rank_heartbeat_timeout, + rank_heartbeat_timeout=self.rank_heartbeat_timeout, + ) + executor.retries = self.num_job_retries_on_failure + + assert isinstance(task, run.Partial) + + callbacks = [ + run.Config( + res_module.FaultToleranceCallback, autoresume=True, calculate_timeouts=True, exp_dir=task.log.log_dir + ) + ] + + assert not executor.launcher.nsys_profile, "Nsys not supported with the FaultTolerancePlugin." + if hasattr(task, "trainer") and hasattr(task.trainer, "callbacks"): + assert all( + map( + lambda cb: not cb.__fn_or_cls__ == NsysCallback if "__fn_or_cls__" in dir(cb) else True, + task.trainer.callbacks, + ) + ), "Nsys not supported with FaultTolerancePlugin." + + _merge_callbacks(task, callbacks=callbacks) + + @dataclass(kw_only=True) class NsysPlugin(run.Plugin): """ From 341580e61deebfe6a43edf813d615fd1579018ca Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Wed, 20 Nov 2024 07:01:21 +0100 Subject: [PATCH 29/48] Fix patching of NeMo tokenizers for correct Lambada evaluation (#11326) * Fix patching of NeMo tokenizers for correct Lambada evaluation Signed-off-by: Jan Lasek * Apply isort and black reformatting Signed-off-by: janekl --------- Signed-off-by: Jan Lasek Signed-off-by: janekl Co-authored-by: janekl --- .../trt_llm/nemo_ckpt_loader/nemo_file.py | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py index 23d227d32acf..9ace6425f533 100644 --- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py +++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py @@ -317,24 +317,31 @@ def build_tokenizer(tokenizer): if tokenizer.eos_token_id is None: tokenizer.add_special_tokens({"eos_token": ""}) else: - try: - # If NeMo tokenizer, monkey patch interface - from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec - - if isinstance(tokenizer, TokenizerSpec): - - def batch_encode_patch(self, ids): + # For NeMo tokenizers, monkey patch encode & batch_decode methods for unified interface + from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer, TokenizerSpec + + if isinstance(tokenizer, TokenizerSpec): + if isinstance(tokenizer, AutoTokenizer): + # Unwrap the original methods of HF tokenizer + batch_decode = tokenizer.tokenizer.batch_decode + encode = tokenizer.tokenizer.encode + elif isinstance(tokenizer, SentencePieceTokenizer): + # Define HF equivalents based on available SP methods + def batch_decode(self, ids): if torch.is_tensor(ids): ids = ids.cpu().numpy() - ids = ids[0] if len(ids.shape) > 1 else ids - return self.ids_to_text(ids) - - tokenizer.bos_token_id = tokenizer.bos_id - tokenizer.eos_token_id = tokenizer.eos_id - tokenizer.encode = tokenizer.text_to_ids - TokenizerSpec.batch_decode = batch_encode_patch - except: - raise TypeError(f'Unsupported tokenizer build input: {type(tokenizer)}') + if isinstance(ids, np.ndarray): + ids = ids.tolist() + return self.tokenizer.decode(ids) + + encode = tokenizer.tokenizer.encode_as_ids + else: + raise NotImplementedError(f"Patching tokenizer methods for {type(tokenizer)} is not available") + + tokenizer.bos_token_id = tokenizer.bos_id + tokenizer.eos_token_id = tokenizer.eos_id + TokenizerSpec.encode = encode + TokenizerSpec.batch_decode = batch_decode return tokenizer From cae18e60690facc683d51d8cca0e56af66059c2b Mon 
Sep 17 00:00:00 2001 From: Huy Vu <86480512+huvunvidia@users.noreply.github.com> Date: Wed, 20 Nov 2024 10:02:54 -0500 Subject: [PATCH 30/48] Huvu/t5 nemo2.0 nemoci (#11291) * workable code * debugging * adding debugging code * adding debugging code * adding debugging code * restore /huggingface/auto_tokenizer.py * Apply isort and black reformatting Signed-off-by: huvunvidia --------- Signed-off-by: huvunvidia Co-authored-by: Huy Vu2 Co-authored-by: huvunvidia --- nemo/collections/llm/__init__.py | 9 +++++++++ nemo/collections/llm/recipes/t5_220m.py | 8 +++++--- nemo/collections/llm/t5/data/__init__.py | 3 ++- nemo/collections/llm/t5/model/t5.py | 2 ++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 64246cf80ec7..c5c2e007bc1e 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -115,10 +115,15 @@ gpt_forward_step, ) from nemo.collections.llm.quantization import Quantizer, get_calib_data_iter +from nemo.collections.llm.t5.data import FineTuningDataModule as T5FineTuningDataModule +from nemo.collections.llm.t5.data import MockDataModule as T5MockDataModule +from nemo.collections.llm.t5.data import PreTrainingDataModule as T5PreTrainingDataModule +from nemo.collections.llm.t5.data import SquadDataModule as T5SquadDataModule from nemo.collections.llm.t5.model import T5Config, T5Model, t5_data_step, t5_forward_step __all__ = [ "MockDataModule", + "T5MockDataModule", "GPTModel", "GPTConfig", "gpt_data_step", @@ -198,6 +203,10 @@ "PreTrainingDataModule", "FineTuningDataModule", "SquadDataModule", + "T5PreTrainingDataModule", + "T5FineTuningDataModule", + "T5SquadDataModule", + "T5MockDataModule", "DollyDataModule", "tokenizer", "mock", diff --git a/nemo/collections/llm/recipes/t5_220m.py b/nemo/collections/llm/recipes/t5_220m.py index 77b68f43f0e1..27feb43837fb 100644 --- a/nemo/collections/llm/recipes/t5_220m.py +++ b/nemo/collections/llm/recipes/t5_220m.py @@ -175,7 +175,8 @@ def pretrain_recipe( guide in the `examples/llm/pretrain/` directory. 
""" - opt_config = OptimizerConfig( + opt_config = run.Config( + OptimizerConfig, optimizer='adam', lr=0.0001, use_distributed_optimizer=True, @@ -183,7 +184,8 @@ def pretrain_recipe( weight_decay=0.01, ) - lr_scheduler = WarmupAnnealingScheduler( + lr_scheduler = run.Config( + WarmupAnnealingScheduler, warmup_steps=None, warmup_ratio=0.01, max_steps=1000000, @@ -200,7 +202,7 @@ def pretrain_recipe( ), data=run.Config(MockDataModule, seq_length=512, seq_length_dec=128, global_batch_size=512, micro_batch_size=1), log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), - optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler), + optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler), resume=default_resume(), ) diff --git a/nemo/collections/llm/t5/data/__init__.py b/nemo/collections/llm/t5/data/__init__.py index d65f6923033f..e4f879b76a06 100644 --- a/nemo/collections/llm/t5/data/__init__.py +++ b/nemo/collections/llm/t5/data/__init__.py @@ -1,5 +1,6 @@ from nemo.collections.llm.t5.data.fine_tuning import FineTuningDataModule +from nemo.collections.llm.t5.data.mock import MockDataModule from nemo.collections.llm.t5.data.pre_training import PreTrainingDataModule from nemo.collections.llm.t5.data.squad import SquadDataModule -__all__ = ["FineTuningDataModule", "PreTrainingDataModule", "SquadDataModule"] +__all__ = ["FineTuningDataModule", "PreTrainingDataModule", "SquadDataModule", "MockDataModule"] diff --git a/nemo/collections/llm/t5/model/t5.py b/nemo/collections/llm/t5/model/t5.py index 8160fb729229..940c0e51ee92 100644 --- a/nemo/collections/llm/t5/model/t5.py +++ b/nemo/collections/llm/t5/model/t5.py @@ -152,6 +152,8 @@ class T5Config(TransformerConfig, io.IOMixin): max_position_embeddings: int = 512 rotary_percent: float = 1.0 seq_len_interpolation_factor: Optional[float] = None + seq_length: int = 512 + seq_length_dec: int = 128 encoder_pipeline_model_parallel_size: int = 0 attention_softmax_in_fp32: float = False bias_activation_fusion: bool = True From fcd128edec9fc709dcaaa13afc96e33dc60dfe84 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Wed, 20 Nov 2024 10:45:02 -0500 Subject: [PATCH 31/48] TE acceleration using callbacks (#11261) * TE acceleration using callbacks Signed-off-by: Onur Yilmaz * TE accelerator example added Signed-off-by: Onur Yilmaz * fp8 autocast added Signed-off-by: Onur Yilmaz * single GPU support with TE Signed-off-by: Onur Yilmaz * generalized model callback added Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * remove te_transform Signed-off-by: Onur Yilmaz * DDP with TE is working Signed-off-by: Onur Yilmaz * address feedback Signed-off-by: Onur Yilmaz * updated function name and added export guard Signed-off-by: Onur Yilmaz * move the torch.no_grad Signed-off-by: Onur Yilmaz * gemma hf example added Signed-off-by: Onur Yilmaz * gemma hf example added Signed-off-by: Onur Yilmaz * add docstrings Signed-off-by: Onur Yilmaz * Fix the hf te test param Signed-off-by: Onur Yilmaz * fix minor issue Signed-off-by: Onur Yilmaz * fix the recent callback related issue Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Co-authored-by: oyilmaz-nvidia --- .github/workflows/cicd-main.yml | 44 ++++-- examples/llm/sft/hf.py | 19 +++ .../gpt/model/hf_auto_model_for_causal_lm.py | 4 +- nemo/lightning/pytorch/accelerate/__init__.py | 13 ++ 
.../pytorch/accelerate/transformer_engine.py | 121 +++++++++++++++ nemo/lightning/pytorch/callbacks/__init__.py | 2 + .../pytorch/callbacks/model_callback.py | 145 ++++++++++++++++++ 7 files changed, 330 insertions(+), 18 deletions(-) mode change 100644 => 100755 examples/llm/sft/hf.py create mode 100644 nemo/lightning/pytorch/accelerate/__init__.py create mode 100755 nemo/lightning/pytorch/accelerate/transformer_engine.py mode change 100644 => 100755 nemo/lightning/pytorch/callbacks/__init__.py create mode 100755 nemo/lightning/pytorch/callbacks/model_callback.py diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index d1cc1b910f29..4d788fb866d8 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3457,7 +3457,20 @@ jobs: inference.repetition_penalty=1.0 \ inference.outfile_path=/tmp/nlp_mcore_t5_lora_tuning_tp2/out.jsonl - # L2: Megatron Mock Data Generation + + L2_HF_Transformer_SFT_TE_Acceleration: + needs: [ cicd-test-container-setup ] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_SFT_TE_Acceleration') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/llm/sft/hf.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --model-accelerator te + AFTER_SCRIPT: | + rm -rf nemo_experiments + + + # L2: Megatron Mock Data Generation L2_Megatron_Mock_Data_Generation_MockGPTDataset: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -3578,12 +3591,12 @@ jobs: # timeout-minutes: 10 # container: # image: nemoci.azurecr.io/nemo_container:${{ github.run_id }} - # options: + # options: # # --user 0:128 # --device=/dev/nvidia0 # --gpus all # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 + # --env TRANSFORMERS_OFFLINE=0 # --env HYDRA_FULL_ERROR=1 # --volume /mnt/datadrive/TestData:/home/TestData # steps: @@ -3643,12 +3656,12 @@ jobs: # runs-on: self-hosted-azure # container: # image: nemoci.azurecr.io/nemo_container:${{ github.run_id }} - # options: + # options: # # --user 0:128 # --device=/dev/nvidia0 # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 + # --shm-size=8g + # --env TRANSFORMERS_OFFLINE=0 # --env HYDRA_FULL_ERROR=1 # --volume /mnt/datadrive/TestData:/home/TestData # steps: @@ -4333,7 +4346,7 @@ jobs: rm -rf /tmp/nemo2_ptq_engine Nemo_CICD_Test: - needs: + needs: - pre-flight - cicd-test-container-setup @@ -4348,7 +4361,7 @@ jobs: - L0_Unit_Tests_GPU_Hydra - L0_Unit_Tests_GPU_Lightning - L0_Unit_Tests_GPU_Others - + - L0_Unit_Tests_CPU_ASR - L0_Unit_Tests_CPU_Audio - L0_Unit_Tests_CPU_Common @@ -4446,7 +4459,8 @@ jobs: - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - L2_NeMo_2_GPT_DDP_Param_Parity_check - L2_NeMo_2_HF_MODEL_IMPORT - - L2_NeMo_2_llama3_pretraining_recipe + - L2_NeMo_2_llama3_pretraining_recipe + - L2_HF_Transformer_SFT_TE_Acceleration - L2_NeMo_2_SSM_Pretraining - L2_NeMo_2_SSM_Finetuning - L2_NeMo_2_T5_Pretraining @@ -4485,7 +4499,7 @@ jobs: - L2_NeMo_2_PTQ_Llama2_FP8 if: always() runs-on: ubuntu-latest - steps: + steps: - name: Evaluate conclusion if: ${{ always() }} id: pipeline-conclusion @@ -4499,14 +4513,14 @@ jobs: echo "SUCCESS=$SUCCESS" >> $GITHUB_OUTPUT # This should depend on all the tests so we block/unblock based on all tests passing - - name: Pipeline successful, set exit code to 0 + - name: Pipeline successful, set exit code to 0 if: ${{ always() && 
steps.pipeline-conclusion.outputs.SUCCESS == 'true' }} run: exit 0 - - name: Pipeline successful, add PR comment + - name: Pipeline successful, add PR comment if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' && github.event_name == 'pull_request' && env.SLACK_WEBHOOK != '' }} uses: peter-evans/create-or-update-comment@v4 - env: + env: SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} REPOSITORY: ${{ github.repository }} RUN_ID: ${{ github.run_id }} @@ -4525,7 +4539,7 @@ jobs: - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary" if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' && env.SLACK_WEBHOOK != '' }} - env: + env: SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} SLACK_WEBHOOK_ADMIN: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -4618,4 +4632,4 @@ jobs: - name: "Pipeline not successful, set exit code to 1" if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} - run: exit 1 + run: exit 1 \ No newline at end of file diff --git a/examples/llm/sft/hf.py b/examples/llm/sft/hf.py old mode 100644 new mode 100755 index 20f915f87287..59b8b4ad3491 --- a/examples/llm/sft/hf.py +++ b/examples/llm/sft/hf.py @@ -19,6 +19,8 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.lightning.pytorch.accelerate.transformer_engine import is_te_accelerated, te_accelerate +from nemo.lightning.pytorch.callbacks import ModelCallback class SquadDataModuleWithPthDataloader(llm.SquadDataModule): @@ -53,7 +55,9 @@ def squad(tokenizer) -> pl.LightningDataModule: parser.add_argument('--strategy', type=str, default='auto', choices=['auto', 'ddp', 'fsdp']) parser.add_argument('--devices', default=1) parser.add_argument('--accelerator', default='gpu', choices=['gpu']) + parser.add_argument('--model-accelerator', default=None, choices=['te']) parser.add_argument('--max-steps', type=int, default=100) + parser.add_argument("--fp8-autocast", default=False, action='store_true') parser.add_argument('--wandb-project', type=str, default=None) parser.add_argument('--model-save-path', type=str, default=None) args = parser.parse_args() @@ -74,6 +78,14 @@ def squad(tokenizer) -> pl.LightningDataModule: model = llm.HfAutoModelForCausalLM(args.model) tokenizer = model.tokenizer + callbacks = [] + if args.model_accelerator: + if args.model_accelerator == "te": + model_transform = ModelCallback( + on_train_start=lambda model: te_accelerate(model, fp8_autocast=args.fp8_autocast) + ) + callbacks.append(model_transform) + llm.api.finetune( model=model, data=squad(tokenizer), @@ -88,11 +100,18 @@ def squad(tokenizer) -> pl.LightningDataModule: accumulate_grad_batches=10, gradient_clip_val=grad_clip, use_distributed_sampler=use_dist_samp, + callbacks=callbacks, logger=wandb, ), optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(lr=1e-5)), log=None, ) + if args.model_accelerator: + if args.model_accelerator == "te": + te_acc = is_te_accelerated(model.model) + assert te_acc, "Transformer Engine acceleration was unsuccessful" + print("TE Accelerated: ", te_acc) + if args.model_save_path is not None: model.save_pretrained(args.model_save_path) diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py index 6922f38cfb26..be6b5e708604 100644 --- a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py +++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py @@ -55,9 +55,7 @@ def __init__( @property def tokenizer(self): if 
self._tokenizer is None: - self._tokenizer = HfAutoModelForCausalLM.configure_tokenizer( - self.model_name, trust_remote_code=self.trust_remote_code - ) + self._tokenizer = HfAutoModelForCausalLM.configure_tokenizer(self.model_name) return self._tokenizer @tokenizer.setter diff --git a/nemo/lightning/pytorch/accelerate/__init__.py b/nemo/lightning/pytorch/accelerate/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/lightning/pytorch/accelerate/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/lightning/pytorch/accelerate/transformer_engine.py b/nemo/lightning/pytorch/accelerate/transformer_engine.py new file mode 100755 index 000000000000..a96a9590ea75 --- /dev/null +++ b/nemo/lightning/pytorch/accelerate/transformer_engine.py @@ -0,0 +1,121 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from types import MethodType + +import torch +from nemo.utils import logging +from nemo.utils.import_utils import safe_import_from + +te, HAVE_TE = safe_import_from("transformer_engine", "pytorch") + + +def te_accelerate(model, fp8_autocast=False): + """ + Replaces original model layers with TE's accelerated layers + Args: + model: HF model + fp8_autocast (bool): apply autocast or not + """ + + if not HAVE_TE: + logging.warning("Transformer Engine is not available and the module replacements " "will not be applied.") + else: + _apply_basic_module_replacement(model) + if fp8_autocast: + apply_fp8_autocast(model) + + +@torch.no_grad +def _apply_basic_module_replacement(model): + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear): + has_bias = module.bias is not None + if any(p % 16 != 0 for p in module.weight.shape): + continue + te_module = te.Linear( + module.in_features, module.out_features, bias=has_bias, params_dtype=module.weight.dtype + ) + te_module.weight.copy_(module.weight) + if has_bias: + te_module.bias.copy_(module.bias) + + setattr(module, name.split(".")[-1], te_module) + elif isinstance(module, torch.nn.LayerNorm): + te_module = te.LayerNorm(module.normalized_shape[0], eps=module.eps, params_dtype=module.weight.dtype) + te_module.weight.copy_(module.weight) + te_module.bias.copy_(module.bias) + setattr(module, name.split(".")[-1], te_module) + elif isinstance(module, torch.nn.RMSNorm): + te_module = te.RMSNorm(module.normalized_shape[0], eps=module.eps, dtype=module.weight.dtype) + te_module.weight.copy_(module.weight) + te_module.bias.copy_(module.bias) + setattr(module, name.split(".")[-1], te_module) + + +def is_te_accelerated(model): + """ + Checks whether model has TE layers or not + Args: + model: HF model + """ + + if not HAVE_TE: + logging.warning("Transformer Engine is not available.") + return False + else: + for name, module in model.named_modules(): + if isinstance(module, (te.LayerNorm, te.Linear, te.TransformerLayer)): + return True + + return False + + +def apply_fp8_autocast(model, fp8_recipe_handler=None): + """ + Applies TE's autocast + Args: + model: HF model + fp8_recipe_handler: fpt handler + """ + + if not HAVE_TE: + logging.warning("Transformer Engine is not available and the FP8 autocast " "will not be applied.") + else: + import transformer_engine.common.recipe as te_recipe + + kwargs = fp8_recipe_handler.to_kwargs() if fp8_recipe_handler is not None else {} + if "fp8_format" in kwargs: + kwargs["fp8_format"] = getattr(te_recipe.Format, kwargs["fp8_format"]) + use_during_eval = kwargs.pop("use_autocast_during_eval", False) + fp8_recipe = te_recipe.DelayedScaling(**kwargs) + new_forward = _contextual_fp8_autocast(model.forward, fp8_recipe, use_during_eval) + + if hasattr(model.forward, "__func__"): + model.forward = MethodType(new_forward, model) + else: + model.forward = new_forward + + +def _contextual_fp8_autocast(model_forward, fp8_recipe, use_during_eval=False): + from transformer_engine.pytorch import fp8_autocast + + def forward(self, *args, **kwargs): + enabled = use_during_eval or self.training + with fp8_autocast(enabled=enabled, fp8_recipe=fp8_recipe): + return model_forward(*args, **kwargs) + + forward.__wrapped__ = model_forward + + return forward diff --git a/nemo/lightning/pytorch/callbacks/__init__.py b/nemo/lightning/pytorch/callbacks/__init__.py old mode 100644 new mode 100755 index 8da1a50dcd64..031f027e63b2 --- a/nemo/lightning/pytorch/callbacks/__init__.py +++ 
b/nemo/lightning/pytorch/callbacks/__init__.py @@ -16,6 +16,7 @@ from nemo.lightning.pytorch.callbacks.debugging import ParameterDebugger from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.pytorch.callbacks.memory_profiler import MemoryProfileCallback +from nemo.lightning.pytorch.callbacks.model_callback import ModelCallback from nemo.lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform from nemo.lightning.pytorch.callbacks.nsys import NsysCallback @@ -36,4 +37,5 @@ "DdpParityChecker", "GarbageCollectionCallback", "ParameterDebugger", + "ModelCallback", ] diff --git a/nemo/lightning/pytorch/callbacks/model_callback.py b/nemo/lightning/pytorch/callbacks/model_callback.py new file mode 100755 index 000000000000..0625e3f006c5 --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/model_callback.py @@ -0,0 +1,145 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import inspect +from typing import Callable, Optional +from lightning.pytorch.callbacks import LambdaCallback + + +class ModelCallback(LambdaCallback): + """ + A callback that extends LambdaCallback to intelligently handle function parameters. + Functions can take either (trainer, pl_module), just (pl_module), or just (trainer). + + Supported parameter names: + - trainer, pl_trainer + - model, pl_model, pl_module, module + + Example: + >>> # Using with torch.compile + >>> callback = ModelCallback(on_train_start=torch.compile) + >>> + >>> # Using with thunder_compile + >>> callback = ModelCallback(on_train_start=thunder_compile) + >>> + >>> # Mix different callbacks + >>> callback = ModelCallback( + ... on_train_start=lambda model: torch.compile(model), + ... on_fit_start=lambda trainer, model: print(f"Starting fit with {model}") + ... 
) + """ + + TRAINER_PARAMS = {'trainer', 'pl_trainer'} + MODEL_PARAMS = {'model', 'pl_model', 'pl_module', 'module'} + + def __init__( + self, + setup: Optional[Callable] = None, + teardown: Optional[Callable] = None, + on_fit_start: Optional[Callable] = None, + on_fit_end: Optional[Callable] = None, + on_sanity_check_start: Optional[Callable] = None, + on_sanity_check_end: Optional[Callable] = None, + on_train_batch_start: Optional[Callable] = None, + on_train_batch_end: Optional[Callable] = None, + on_train_epoch_start: Optional[Callable] = None, + on_train_epoch_end: Optional[Callable] = None, + on_validation_epoch_start: Optional[Callable] = None, + on_validation_epoch_end: Optional[Callable] = None, + on_test_epoch_start: Optional[Callable] = None, + on_test_epoch_end: Optional[Callable] = None, + on_validation_batch_start: Optional[Callable] = None, + on_validation_batch_end: Optional[Callable] = None, + on_test_batch_start: Optional[Callable] = None, + on_test_batch_end: Optional[Callable] = None, + on_train_start: Optional[Callable] = None, + on_train_end: Optional[Callable] = None, + on_validation_start: Optional[Callable] = None, + on_validation_end: Optional[Callable] = None, + on_test_start: Optional[Callable] = None, + on_test_end: Optional[Callable] = None, + on_exception: Optional[Callable] = None, + on_save_checkpoint: Optional[Callable] = None, + on_load_checkpoint: Optional[Callable] = None, + on_before_backward: Optional[Callable] = None, + on_after_backward: Optional[Callable] = None, + on_before_optimizer_step: Optional[Callable] = None, + on_before_zero_grad: Optional[Callable] = None, + on_predict_start: Optional[Callable] = None, + on_predict_end: Optional[Callable] = None, + on_predict_batch_start: Optional[Callable] = None, + on_predict_batch_end: Optional[Callable] = None, + on_predict_epoch_start: Optional[Callable] = None, + on_predict_epoch_end: Optional[Callable] = None, + ): + # Create a dictionary of non-None callbacks + callbacks = { + name: self._wrap_func(func) + for name, func in locals().items() + if name != 'self' and name != '__class__' and func is not None + } + + super().__init__(**callbacks) + + def _get_param_type(self, param_name: str) -> Optional[str]: + """Determine if a parameter name refers to trainer or model.""" + param_name = param_name.lower() + if param_name in self.TRAINER_PARAMS: + return 'trainer' + if param_name in self.MODEL_PARAMS: + return 'model' + return None + + def _wrap_func(self, func: Callable) -> Callable: + """Wraps a function to handle parameter inspection and passing.""" + sig = inspect.signature(func) + params = sig.parameters + + def wrapped(trainer, pl_module, *args, **kwargs): + call_args = {} + + for param_name, param in params.items(): + param_type = self._get_param_type(param_name) + + if param_type == 'trainer': + call_args[param_name] = trainer + elif param_type == 'model': + call_args[param_name] = pl_module + else: + # If parameter name is not recognized, use position to determine + if len(params) == 1: + call_args[param_name] = pl_module + elif len(params) == 2: + if len(call_args) == 0: + call_args[param_name] = trainer + else: + call_args[param_name] = pl_module + else: + raise ValueError( + f"Unable to determine parameter mapping for '{param_name}'. " + f"Please use recognized parameter names: " + f"trainer/pl_trainer for trainer, " + f"model/pl_model/pl_module/module for model." 
+ ) + + try: + return func(**call_args) + except TypeError as e: + raise TypeError( + f"Failed to call callback function {func.__name__ if hasattr(func, '__name__') else func}. " + f"Attempted to pass arguments: {call_args.keys()}. Error: {str(e)}" + ) from e + + return wrapped From 3d8bcc892ee2528532a9fa645dfe2478b40fc734 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Wed, 20 Nov 2024 12:03:39 -0500 Subject: [PATCH 32/48] move target module specification to performance recipe only (#11334) Signed-off-by: Chen Cui --- nemo/collections/llm/recipes/llama31_405b.py | 2 +- nemo/collections/llm/recipes/llama31_70b.py | 2 +- nemo/collections/llm/recipes/llama31_8b.py | 3 ++- nemo/collections/llm/recipes/llama3_70b.py | 2 +- nemo/collections/llm/recipes/llama3_8b.py | 3 ++- nemo/collections/llm/recipes/phi3_mini_4k_instruct.py | 1 - 6 files changed, 7 insertions(+), 6 deletions(-) diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index b1ef9719975c..d71f3791a0af 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -311,7 +311,6 @@ def finetune_recipe( recipe.peft = run.Config(LoRA) recipe.peft.dim = 16 recipe.peft.alpha = 32 - recipe.peft.target_modules = ['linear_qkv'] recipe.optim.config.use_distributed_optimizer = False # some settings currently do not function correctly with LoRA @@ -387,6 +386,7 @@ def finetune_performance_optimizations( recipe.trainer.strategy.tensor_model_parallel_size = 4 recipe.trainer.strategy.pipeline_model_parallel_size = 6 recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 7 + recipe.peft.target_modules = ['linear_qkv'] recipe.trainer.strategy.sequence_parallel = True diff --git a/nemo/collections/llm/recipes/llama31_70b.py b/nemo/collections/llm/recipes/llama31_70b.py index 6c0a108e10c7..37e809e5bc8f 100644 --- a/nemo/collections/llm/recipes/llama31_70b.py +++ b/nemo/collections/llm/recipes/llama31_70b.py @@ -314,7 +314,6 @@ def finetune_recipe( recipe.peft = run.Config(LoRA) recipe.peft.dim = 16 recipe.peft.alpha = 32 - recipe.peft.target_modules = ['linear_qkv'] recipe.optim.config.use_distributed_optimizer = False # some settings currently do not function correctly with LoRA @@ -388,6 +387,7 @@ def finetune_performance_optimizations( recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 5 + recipe.peft.target_modules = ['linear_qkv'] recipe.trainer.strategy.sequence_parallel = True diff --git a/nemo/collections/llm/recipes/llama31_8b.py b/nemo/collections/llm/recipes/llama31_8b.py index d2cbd454d483..32a77ce076f2 100644 --- a/nemo/collections/llm/recipes/llama31_8b.py +++ b/nemo/collections/llm/recipes/llama31_8b.py @@ -307,7 +307,6 @@ def finetune_recipe( recipe.peft = run.Config(LoRA) recipe.peft.dim = 8 recipe.peft.alpha = 16 - recipe.peft.target_modules = ['linear_qkv'] recipe.optim.config.use_distributed_optimizer = False # some settings currently do not function correctly with LoRA @@ -372,6 +371,8 @@ def finetune_performance_optimizations( tp_comm_overlap=False, ) ) + else: + recipe.peft.target_modules = ['linear_qkv'] recipe.trainer.callbacks.append(run.Config(TimingCallback)) recipe.trainer.callbacks.append( diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index 2b721b3f71c6..93aeb5c07dc1 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ 
b/nemo/collections/llm/recipes/llama3_70b.py @@ -311,7 +311,6 @@ def finetune_recipe( recipe.peft = run.Config(LoRA) recipe.peft.dim = 16 recipe.peft.alpha = 32 - recipe.peft.target_modules = ['linear_qkv'] recipe.optim.config.use_distributed_optimizer = False # some settings currently do not function correctly with LoRA @@ -385,6 +384,7 @@ def finetune_performance_optimizations( recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 5 + recipe.peft.target_modules = ['linear_qkv'] recipe.trainer.strategy.sequence_parallel = True diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 13e2d13cc21a..3e26f8fe1082 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -291,7 +291,6 @@ def finetune_recipe( recipe.peft = run.Config(LoRA) recipe.peft.dim = 8 recipe.peft.alpha = 16 - recipe.peft.target_modules = ['linear_qkv'] recipe.optim.config.use_distributed_optimizer = False # some settings currently do not function correctly with LoRA @@ -356,6 +355,8 @@ def finetune_performance_optimizations( tp_comm_overlap=False, ) ) + else: + recipe.peft.target_modules = ['linear_qkv'] recipe.trainer.callbacks.append(run.Config(TimingCallback)) recipe.trainer.callbacks.append( diff --git a/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py b/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py index 80ac60d054ce..1cbc877dc33e 100644 --- a/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py +++ b/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py @@ -264,7 +264,6 @@ def finetune_recipe( recipe.peft = run.Config(LoRA) recipe.peft.dim = 8 recipe.peft.alpha = 16 - recipe.peft.target_modules = ['linear_qkv'] recipe.optim.config.use_distributed_optimizer = False # some settings currently do not function correctly with LoRA From 4633db67a6536471e76c295d8bcc850ec5484cd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Wed, 20 Nov 2024 12:49:26 -0500 Subject: [PATCH 33/48] More robust tar file loading from AIStore (#11323) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Żelasko --- .../common/data/lhotse/nemo_adapters.py | 68 ++++++++++--------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index ee623f617e26..a34a2c074a11 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import logging import random import re @@ -398,40 +397,43 @@ def basename(d: dict) -> str: shard_manifest: dict[str, list[dict]] = groupby(basename, self.shard_id_to_manifest[sid]) tar_path = self.shard_id_to_tar_path[sid] - for data, raw_audio, tar_info in iter_fn(tar_path, shard_manifest, manifest_path): - meta = soundfile.info(BytesIO(raw_audio)) - recording = Recording( - id=tar_info.path, - sources=[AudioSource(type="memory", channels=list(range(meta.channels)), source=raw_audio)], - sampling_rate=int(meta.samplerate), - num_samples=meta.frames, - duration=meta.duration, - ) - cuts_for_recording = [] - for data in sorted(shard_manifest[tar_info.name], key=lambda d: d["audio_filepath"]): - # Cut the recording into corresponding segment and discard audio data outside the segment. - cut = make_cut_with_subset_inmemory_recording( - recording, offset=data.get("offset", 0.0), duration=data.get("duration") + try: + for data, raw_audio, tar_info in iter_fn(tar_path, shard_manifest, manifest_path): + meta = soundfile.info(BytesIO(raw_audio)) + recording = Recording( + id=tar_info.path, + sources=[AudioSource(type="memory", channels=list(range(meta.channels)), source=raw_audio)], + sampling_rate=int(meta.samplerate), + num_samples=meta.frames, + duration=meta.duration, ) - cut.supervisions.append( - SupervisionSegment( - id=cut.id, - recording_id=cut.recording_id, - start=0, - duration=cut.duration, - text=data.get(self.text_field), - language=data.get(self.lang_field), + cuts_for_recording = [] + for data in sorted(shard_manifest[tar_info.name], key=lambda d: d["audio_filepath"]): + # Cut the recording into corresponding segment and discard audio data outside the segment. + cut = make_cut_with_subset_inmemory_recording( + recording, offset=data.get("offset", 0.0), duration=data.get("duration") ) - ) - cut.custom = _to_custom_attr_dict(data) - cut.manifest_origin = manifest_path - cut.tar_origin = tar_path - for extra_field in extra_fields: - extra_field.attach_to(cut) - cuts_for_recording.append(cut) - del recording # free the memory - helps with very large audio files - del raw_audio - yield from cuts_for_recording + cut.supervisions.append( + SupervisionSegment( + id=cut.id, + recording_id=cut.recording_id, + start=0, + duration=cut.duration, + text=data.get(self.text_field), + language=data.get(self.lang_field), + ) + ) + cut.custom = _to_custom_attr_dict(data) + cut.manifest_origin = manifest_path + cut.tar_origin = tar_path + for extra_field in extra_fields: + extra_field.attach_to(cut) + cuts_for_recording.append(cut) + del recording # free the memory - helps with very large audio files + del raw_audio + yield from cuts_for_recording + except tarfile.ReadError: + logging.warning(f"Skipping tar file due to read errors (unstable storage or bad file?): {tar_path=}") def __len__(self) -> int: return len(self.source) From 8b7999d64301468a3882c115d1ca0b6c3cae1974 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Thu, 21 Nov 2024 03:32:42 +0900 Subject: [PATCH 34/48] Use NCCL bootsrap backend for TP communication overlaps (#10622) * Use NCCL bootsrap backend for TP communication overlaps Signed-off-by: Sangkug Lym * Apply isort and black reformatting Signed-off-by: erhoo82 * fix for 2.0 Signed-off-by: Sangkug Lym --------- Signed-off-by: Sangkug Lym Signed-off-by: erhoo82 Co-authored-by: erhoo82 --- .../nlp/models/language_modeling/megatron_base_model.py | 4 +++- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 + nemo/collections/nlp/parts/nlp_overrides.py | 9 +++++++-- 
nemo/lightning/_strategy_lib.py | 3 ++- .../lightning/pytorch/callbacks/megatron_comm_overlap.py | 7 +++++++ 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index dee36b255297..37ec8a82cef1 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -200,7 +200,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): global_batch_size=cfg.get('global_batch_size'), rampup_batch_size=cfg.get('rampup_batch_size', None), use_fp8=cfg.get('fp8', False), - init_mpi_proc_group=cfg.get('ub_tp_comm_overlap', False), + init_mpi_proc_group=cfg.get('ub_tp_comm_overlap', False) + and cfg.get('ub_tp_comm_bootstrap_backend', 'nccl') == 'mpi', seed=self.cfg.get('seed', 1234), apex_transformer_log_level=self.cfg.get('apex_transformer_log_level', 30), use_te_rng_tracker=self.cfg.get('use_te_rng_tracker', False), @@ -1173,6 +1174,7 @@ def build_model_parallel_config(self) -> ModelParallelConfig: "grad_sync_func": None, # set dynamically during training "param_sync_func": None, # set dynamically during training "tp_comm_overlap": self.cfg.get('ub_tp_comm_overlap', False), + "tp_comm_bootstrap_backend": self.cfg.get('ub_tp_comm_bootstrap_backend', 'nccl'), } # instantitate ModelParallelConfig from this dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 0e1bd83dfe1c..f71b1ad13c6d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -803,6 +803,7 @@ def initialize_ub_func(self): tp_size=self.cfg.get('tensor_model_parallel_size'), use_fp8=self.cfg.get('fp8'), ub_cfgs=ub_cfgs, + bootstrap_backend=self.cfg.get('ub_tp_comm_bootstrap_backend', 'nccl'), ) self.initialize_ub = False diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index c16116145a12..939f2067d7eb 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -176,9 +176,14 @@ def init_model_parallel( app_state.data_parallel_size = parallel_state.get_data_parallel_world_size() app_state.pipeline_model_parallel_group = parallel_state.get_pipeline_model_parallel_group() - # create MPI process group for UCX-based communication APIs if app_state.init_mpi_proc_group: - torch.distributed.new_group(backend='mpi') + import packaging + + te_version = packaging.version.Version(version('transformer_engine')) + if te_version < packaging.version.Version("1.9"): + # Create MPI process group for bootstrapping at old TE versions. + # From TE version v1.9, the process group is initialized in TE. 
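For context, a minimal sketch of how the new setting interacts with the overlap flag; only the two ub_* keys and the boolean condition come from this patch, every value below is illustrative:

    # Illustrative config fragment (values are examples, not defaults taken from this patch)
    from omegaconf import OmegaConf

    cfg = OmegaConf.create(
        {
            "ub_tp_comm_overlap": True,              # enable TP communication overlap
            "ub_tp_comm_bootstrap_backend": "nccl",  # default; set to "mpi" to keep the old MPI bootstrap
        }
    )
    # Same condition as in the change above: the MPI process group is only created when explicitly requested.
    init_mpi_proc_group = cfg.get("ub_tp_comm_overlap", False) and cfg.get("ub_tp_comm_bootstrap_backend", "nccl") == "mpi"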
+ torch.distributed.new_group(backend='mpi') class NLPDDPStrategy(DDPStrategy): diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index ea6fa4c4d226..182454012d79 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -89,7 +89,8 @@ def init_parallel_ranks( seed=seed, pipeline_model_parallel_split_rank=getattr(parallel_config, "pipeline_model_parallel_split_rank", None), use_fp8=fp8, - init_mpi_proc_group=getattr(parallel_config, "tp_comm_overlap", False), + init_mpi_proc_group=getattr(parallel_config, "tp_comm_overlap", False) + and getattr(parallel_config, "tp_comm_bootstrap_backend", None) == 'mpi', # apex_transformer_log_level=self.cfg.get('apex_transformer_log_level', 30), ) diff --git a/nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py b/nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py index 58173642527a..172aeaeb855d 100644 --- a/nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py +++ b/nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py @@ -43,6 +43,7 @@ class _CommOverlapConfig: # Tensor parallel communication overlap (experimental) tp_comm_overlap: bool = None tp_comm_overlap_cfg: dict = None + tp_comm_bootstrap_backend: str = None # Pipeline parallel communication overlap overlap_p2p_comm: bool = None batch_p2p_comm: bool = None @@ -88,6 +89,7 @@ def __init__( self, tp_comm_overlap: bool = None, tp_comm_overlap_cfg: TransformerLayerTPOverlapCfg = None, + tp_comm_bootstrap_backend: str = None, overlap_p2p_comm: bool = None, batch_p2p_comm: bool = None, overlap_grad_reduce: bool = None, @@ -102,6 +104,7 @@ def __init__( self.user_comm_overlap_cfg = _CommOverlapConfig( tp_comm_overlap=tp_comm_overlap, tp_comm_overlap_cfg=tp_comm_overlap_cfg, + tp_comm_bootstrap_backend=tp_comm_bootstrap_backend, overlap_p2p_comm=overlap_p2p_comm, batch_p2p_comm=batch_p2p_comm, overlap_grad_reduce=overlap_grad_reduce, @@ -114,6 +117,7 @@ def __init__( ) self.tp_comm_overlap_cfg = None + self.tp_comm_bootstrap_backend = None self.need_tp_overlap_ub_init = False def _get_model_comm_overlap_cfgs( @@ -129,6 +133,7 @@ def _get_model_comm_overlap_cfgs( # Optimizations disabled by default, can be overriden by user comm_overlap_cfg.tp_comm_overlap = False comm_overlap_cfg.tp_comm_overlap_cfg = None + comm_overlap_cfg.tp_comm_bootstrap_backend = None comm_overlap_cfg.defer_embedding_wgrad_compute = False comm_overlap_cfg.wgrad_deferral_limit = -1 @@ -216,6 +221,7 @@ def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) if trainer.model.config.tp_comm_overlap: self.tp_comm_overlap_cfg = comm_overlap_cfg.tp_comm_overlap_cfg + self.tp_comm_bootstrap_backend = comm_overlap_cfg.tp_comm_bootstrap_backend self.need_tp_overlap_ub_init = True # Data parallel overlap is only available with the Megatron DDP and Distributed optimizer @@ -258,6 +264,7 @@ def _init_te_userbuffers(self, model_parallel_cfg: ModelParallelConfig): tp_size=parallel_state.get_tensor_model_parallel_world_size(), use_fp8=fp8, ub_cfgs=self.tp_comm_overlap_cfg, + bootstrap_backend=self.tp_comm_bootstrap_backend, ) except Exception as error: raise Exception(f"Tensor parallel overlap: userbuffer initialization failed with {error}") From 4e56c403e57d2f9f9dace8f9c0a758dc1e838255 Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:24:27 -0800 Subject: [PATCH 35/48] fix api (#11337) Signed-off-by: yaoyu-33 --- .../vision/modules/common/megatron/vision_transformer.py | 4 ++++ 1 file 
changed, 4 insertions(+) diff --git a/nemo/collections/vision/modules/common/megatron/vision_transformer.py b/nemo/collections/vision/modules/common/megatron/vision_transformer.py index 80793067128c..2abaf6dfe224 100644 --- a/nemo/collections/vision/modules/common/megatron/vision_transformer.py +++ b/nemo/collections/vision/modules/common/megatron/vision_transformer.py @@ -169,6 +169,8 @@ def forward( self_attention_relative_position_bias=None, cross_attention_relative_position_bias=None, checkpoint_core_attention=False, + decoder_max_sequence_len=None, + encoder_max_sequence_len=None, ): # Self attention. if rotary_pos_emb is not None: @@ -373,6 +375,8 @@ def forward( self_attention_relative_position_bias=None, cross_attention_relative_position_bias=None, checkpoint_core_attention=False, + decoder_max_sequence_len=None, + encoder_max_sequence_len=None, ): kwargs = locals() for key in ["self", "__class__"]: From cbb7b173b32b702c2710c20dabb885362505affb Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Wed, 20 Nov 2024 14:56:44 -0500 Subject: [PATCH 36/48] Making TDT models support all-positive durations (previously duration must contain 0) (#9656) * change kernel file and relevant tests for allowing all positive durations for TDT Signed-off-by: Hainan Xu * Apply isort and black reformatting Signed-off-by: hainan-xv * uncomment numba version check in test code Signed-off-by: Hainan Xu * address review comments --------- Signed-off-by: Hainan Xu Signed-off-by: hainan-xv Co-authored-by: Hainan Xu Co-authored-by: hainan-xv Co-authored-by: Vladimir Bataev Co-authored-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> --- .../utils/cuda_utils/gpu_rnnt_kernel.py | 113 +++++++++++------- .../asr/numba/rnnt_loss/test_rnnt_pytorch.py | 108 ++++++++++------- 2 files changed, 137 insertions(+), 84 deletions(-) diff --git a/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt_kernel.py b/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt_kernel.py index 4153af060941..219e9d0453b2 100644 --- a/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt_kernel.py +++ b/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt_kernel.py @@ -961,7 +961,9 @@ def compute_tdt_alphas_kernel( if t > 0 and t < T: alphas[offset + t * maxU + u] = -INF - for i in range(1, num_durations): # skip 0 since blank emission has to advance by at least one + for i in range(num_durations): + if durations[i] == 0: # skip 0 since blank emission has to advance by at least one + continue if t >= durations[i]: alphas[offset + t * maxU + u] = rnnt_helper.log_sum_exp( alphas[offset + t * maxU + u], # the current alpha value @@ -981,21 +983,26 @@ def compute_tdt_alphas_kernel( elif u < U: # when t == 0, we only consider the non-blank emission. if t == 0: - alphas[offset + u] = ( - alphas[offset + u - 1] # alpha(t, u - 1) - + logp( - denom, acts, maxT, maxU, alphabet_size, b, t, u - 1, labels[u - 1] - ) # logp of token emission - - sigma # logit under-normalization - + logp_duration( - duration_acts, maxT, maxU, num_durations, b, t, u - 1, 0 - ) # t = 0, so it must be duration = 0. Therefore the last argument passed to logp_duration() is 0. 
- ) + if durations[0] == 0: + alphas[offset + u] = ( + alphas[offset + u - 1] # alpha(t, u - 1) + + logp( + denom, acts, maxT, maxU, alphabet_size, b, t, u - 1, labels[u - 1] + ) # logp of token emission + - sigma # logit under-normalization + + logp_duration( + duration_acts, maxT, maxU, num_durations, b, t, u - 1, 0 + ) # t = 0, so it must be duration = 0. Therefore the last argument passed to logp_duration() is 0. + ) + else: + alphas[offset + u] = -INF # now we have t != 0 and u != 0, and we need to consider both non-blank and blank emissions. elif t > 0 and t < T: no_emit = -INF # no_emit stores the score for all blank emissions. - for i in range(1, num_durations): + for i in range(num_durations): + if durations[i] == 0: + continue if t >= durations[i]: no_emit = rnnt_helper.log_sum_exp( no_emit, # current score @@ -1012,7 +1019,7 @@ def compute_tdt_alphas_kernel( break # we can exit the loop early here, same as the case for u == 0 above. emit = -INF # emit stores the score for non-blank emissions. - for i in range(0, num_durations): + for i in range(num_durations): if t >= durations[i]: emit = rnnt_helper.log_sum_exp( emit, # current score @@ -1037,16 +1044,21 @@ def compute_tdt_alphas_kernel( # After final sync, the forward log-likelihood can be computed as the summataion of # alpha(T - duration, U - 1) + logp(blank, duration | t - duration, U - 1), over different durations. if u == 0: - # first we consider duration = 1 - loglike = ( - alphas[offset + (T - 1) * maxU + U - 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_) - - sigma - + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - 1, U - 1, 1) - ) + # initialize with negative infinite and start add terms later + loglike = -INF # then we add the scores for duration > 1, if such durations are possible given the audio lengths. - for i in range(2, num_durations): + for i in range(num_durations): + if durations[i] == 0: + continue + if durations[i] == 1: + loglike = ( + alphas[offset + (T - 1) * maxU + U - 1] + + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_) + - sigma + + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - 1, U - 1, i) + ) + continue if T >= durations[i]: big_blank_loglike = ( alphas[offset + (T - durations[i]) * maxU + U - 1] @@ -1122,11 +1134,18 @@ def compute_tdt_betas_kernel( # Initilize beta[b, t=T-1, u=U-1] for all b in B with log_probs[b, t=T-1, u=U-1, blank] if u == 0: - betas[offset + (T - 1) * maxU + U - 1] = ( - logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_) - - sigma - + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - 1, U - 1, 1) - ) + if durations[0] == 1: + betas[offset + (T - 1) * maxU + U - 1] = ( + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_) + - sigma + + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - 1, U - 1, 0) + ) + elif durations[1] == 1: + betas[offset + (T - 1) * maxU + U - 1] = ( + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_) + - sigma + + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - 1, U - 1, 1) + ) # sync until all betas are initialized cuda.syncthreads() @@ -1140,11 +1159,12 @@ def compute_tdt_betas_kernel( # u == U - 1, we only consider blank emissions. if t >= 0 and t + 1 < T: betas[offset + t * maxU + U - 1] = -INF - for i in range(1, num_durations): + for i in range(num_durations): # although similar, the computation for beta's is slightly more complex for boundary cases. 
# the following two cases correspond to whether t is exactly certain duration away from T. # and they have slightly different update rules. - + if durations[i] == 0: + continue if t + durations[i] < T: betas[offset + t * maxU + U - 1] = rnnt_helper.log_sum_exp( betas[offset + t * maxU + U - 1], @@ -1172,17 +1192,24 @@ def compute_tdt_betas_kernel( elif u < U - 1: if t == T - 1: # t == T - 1, so we only consider non-blank with duration 0. (Note, we can't have blank emissions with duration = 0) - betas[offset + (T - 1) * maxU + u] = ( - betas[offset + (T - 1) * maxU + u + 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, u, labels[u]) # non-blank log prob - + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - 1, u, 0) # log prob of duration 0 - - sigma - ) + if durations[0] == 0: + betas[offset + (T - 1) * maxU + u] = ( + betas[offset + (T - 1) * maxU + u + 1] + + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, u, labels[u]) # non-blank log prob + + logp_duration( + duration_acts, maxT, maxU, num_durations, b, T - 1, u, 0 + ) # log prob of duration 0 + - sigma + ) + else: + betas[offset + (T - 1) * maxU + u] = -INF elif t >= 0 and t < T - 1: # now we need to consider both blank andnon-blanks. Similar to alphas, we first compute them separately with no_emit and emit. no_emit = -INF - for i in range(1, num_durations): + for i in range(num_durations): + if durations[i] == 0: + continue if t + durations[i] < T: no_emit = rnnt_helper.log_sum_exp( no_emit, @@ -1193,7 +1220,7 @@ def compute_tdt_betas_kernel( ) emit = -INF - for i in range(0, num_durations): + for i in range(num_durations): if t + durations[i] < T: emit = rnnt_helper.log_sum_exp( emit, @@ -1304,10 +1331,10 @@ def compute_tdt_grad_kernel( logpk_label = denom[col] + acts[col * alphabet_size + labels[u]] - sigma grad -= math.exp(alphas[col] + betas[col + 1 + durations[idx] * maxU] + logpk_label - logll[mb]) - if t + durations[idx] < T and idx > 0: # for blank in the middle + if t + durations[idx] < T and durations[idx] > 0: # for blank in the middle grad -= math.exp(alphas[col] + betas[col + durations[idx] * maxU] + logpk_blank - logll[mb]) - if t + durations[idx] == T and idx >= 1 and u == U - 1: # for blank as the last symbol + if t + durations[idx] == T and u == U - 1 and durations[idx] > 0: # for blank as the last symbol grad -= math.exp(alphas[col] + logpk_blank - logll[mb]) grad = grad * math.exp(duration_acts[col * num_durations + idx]) @@ -1335,7 +1362,7 @@ def compute_tdt_grad_kernel( if fastemit_lambda > 0.0 and u < U - 1: fastemit_grad = 0.0 - for i in range(0, num_durations): + for i in range(num_durations): if t + durations[i] < T: fastemit_grad += fastemit_lambda * math.exp( alphas[col] # alphas(t, u) @@ -1355,7 +1382,9 @@ def compute_tdt_grad_kernel( # grad to last blank transition # grad[b, T-1, U-1, v=blank] -= exp(alphas[b, t, u] + logpk - sigma - logll[b] + logp(duration) for all possible non-zero durations. 
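As a plain-Python aside (not part of the CUDA kernel), the pattern used throughout this patch -- iterate over every configured duration and skip zeros, instead of assuming durations[0] == 0 and starting the loop at index 1 -- can be sketched as follows; durations and scores are hypothetical inputs, and log_sum_exp is a stand-in for the rnnt_helper routine used in the kernel:

    import math

    def log_sum_exp(a: float, b: float) -> float:
        # numerically stable log(exp(a) + exp(b)) in log space
        if a == -math.inf:
            return b
        if b == -math.inf:
            return a
        m = max(a, b)
        return m + math.log(math.exp(a - m) + math.exp(b - m))

    def accumulate_blank_score(durations, scores):
        # Accumulate blank-transition scores over all *positive* durations.
        # A zero duration is skipped wherever it appears in the list, so
        # duration sets such as [1, 2, 4] (no zero) work like [0, 1, 2, 4].
        total = -math.inf
        for i, d in enumerate(durations):
            if d == 0:
                continue
            total = log_sum_exp(total, scores[i])
        return total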
if idx == blank_ and u == U - 1: - for i in range(1, num_durations): + for i in range(num_durations): + if durations[i] == 0: + continue if t == T - durations[i]: grad -= math.exp( alphas[col] + logpk - sigma - logll[mb] + duration_acts[col * num_durations + i] @@ -1364,7 +1393,9 @@ def compute_tdt_grad_kernel( # grad of blank across t < T; # grad[b, t Date: Wed, 20 Nov 2024 15:37:50 -0800 Subject: [PATCH 37/48] Add support for restoring from 2.0 checkpoint in 1.0 (#11347) * Add support for restoring from 2.0 checkpoint in 1.0 Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai --- nemo/collections/nlp/parts/nlp_overrides.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 939f2067d7eb..73263896af82 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -382,7 +382,7 @@ def save_checkpoint( self, checkpoint: Dict[str, Any], filepath: Union[str, Path], storage_options: Optional[Any] = None ) -> None: app_state = AppState() - """ PTL method which we override to accomodate distributed checkpoints and + """ PTL method which we override to accomodate distributed checkpoints and the legacy model parallel checkpoints. When using megatron core, the distributed checkpointing library expects save functions to be @@ -1275,6 +1275,7 @@ def restore_from( return_config: bool = False, trainer: Trainer = None, validate_access_integrity: bool = True, + replace_sharded_tensor_key: Optional[str] = None, ): """ Restores model instance (weights and configuration) into .nemo file @@ -1362,6 +1363,9 @@ def dummy(): checkpoint = {} sharded_state_dict = instance.sharded_state_dict() checkpoint['state_dict'] = sharded_state_dict + if replace_sharded_tensor_key: + for v in checkpoint["state_dict"].values(): + v.key = v.key.replace("model", replace_sharded_tensor_key) checkpoint_io = DistributedCheckpointIO.from_config(conf) checkpoint = checkpoint_io.load_checkpoint( From d19140e74c678343faf4f21e804a673088394f14 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Date: Wed, 20 Nov 2024 20:12:50 -0800 Subject: [PATCH 38/48] Adding alinger export (#11269) * Adding alinger export * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * Formatting * Formatting * Formatting * Convert * Fix docs --------- Signed-off-by: shanmugamr1992 Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: shanmugamr1992 --- nemo/export/tensorrt_llm.py | 389 ++++++++++++++++++++++++++++++------ 1 file changed, 325 insertions(+), 64 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index a1e6cb0e03c4..8f2b0db20341 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -30,10 +30,11 @@ import wrapt from tensorrt_llm._utils import numpy_to_torch +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.deploy import ITritonDeployable from nemo.export.tarutils import TarPath, unpack_tarball from nemo.export.trt_llm.converter.model_converter import model_to_trtllm_ckpt -from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import dist_model_to_trt_llm_ckpt +from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt 
import dist_model_to_trt_llm_ckpt, get_layer_prefix from nemo.export.trt_llm.converter.utils import init_model_parallel_from_nemo from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import ( build_tokenizer, @@ -65,6 +66,8 @@ @wrapt.decorator def noop_decorator(func): + """No op decorator""" + def wrapper(*args, **kwargs): return func(*args, **kwargs) @@ -80,6 +83,7 @@ def wrapper(*args, **kwargs): use_pytriton = False +# pylint: disable=line-too-long class TensorRTLLM(ITritonDeployable): """ Exports nemo checkpoints to TensorRT-LLM and run fast inference. @@ -347,43 +351,14 @@ def export( DEFAULT_CONVERSION_DICT, ) from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper - from megatron.core.transformer.transformer_config import TransformerConfig from tensorrt_llm.layers import MoeConfig - def get_transformer_config(nemo_model_config): - normalization = nemo_model_config.get('normalization', 'layernorm') - transformer_config_normalization = 'LayerNorm' - layernorm_zero_centered_gamma = False - if normalization == 'layernorm1p': - layernorm_zero_centered_gamma = True - elif normalization == 'rmsnorm': - transformer_config_normalization = 'RMSNorm' - - conf = TransformerConfig( - num_layers=nemo_model_config.get('num_layers'), - moe_router_topk=nemo_model_config.get('moe_router_topk', 0), - num_attention_heads=nemo_model_config.get('num_attention_heads'), - num_query_groups=nemo_model_config.get( - 'num_query_groups', nemo_model_config['num_attention_heads'] - ), - kv_channels=nemo_model_config.get("kv_channels", None), - hidden_size=nemo_model_config.get('hidden_size'), - ffn_hidden_size=nemo_model_config.get('ffn_hidden_size'), - layernorm_epsilon=nemo_model_config.get('layernorm_epsilon'), - add_bias_linear=nemo_model_config.get('bias'), - num_moe_experts=nemo_model_config.get('num_moe_experts', None), - normalization=transformer_config_normalization, - layernorm_zero_centered_gamma=layernorm_zero_centered_gamma, - ) - - return conf - # We build the transformer config using the nemo model config. - transformer_config = get_transformer_config(model_configs) + transformer_config = self.get_transformer_config(model_configs) input_model_type = getattr(ModelType, model_type) # MCore export supports some default conversion dictionaries - mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT[input_model_type] + mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT # All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models start with "model.decoder.layers.4.blahblah". so we append model. 
to the keys nemo_model_conversion_dict = { f'model.{key}': value for key, value in mcore_model_conversion_dict.items() @@ -524,6 +499,34 @@ def get_transformer_config(nemo_model_config): if load_model: self._load() + def get_transformer_config(self, nemo_model_config): + """Given nemo model config get transformer config""" + from megatron.core.transformer.transformer_config import TransformerConfig + + normalization = nemo_model_config.get('normalization', 'layernorm') + transformer_config_normalization = 'LayerNorm' + layernorm_zero_centered_gamma = False + if normalization == 'layernorm1p': + layernorm_zero_centered_gamma = True + elif normalization == 'rmsnorm': + transformer_config_normalization = 'RMSNorm' + + conf = TransformerConfig( + num_layers=nemo_model_config.get('num_layers'), + moe_router_topk=nemo_model_config.get('moe_router_topk', 0), + num_attention_heads=nemo_model_config.get('num_attention_heads'), + num_query_groups=nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']), + kv_channels=nemo_model_config.get("kv_channels", None), + hidden_size=nemo_model_config.get('hidden_size'), + ffn_hidden_size=nemo_model_config.get('ffn_hidden_size'), + layernorm_epsilon=nemo_model_config.get('layernorm_epsilon'), + add_bias_linear=nemo_model_config.get('bias'), + num_moe_experts=nemo_model_config.get('num_moe_experts', None), + normalization=transformer_config_normalization, + layernorm_zero_centered_gamma=layernorm_zero_centered_gamma, + ) + return conf + def convert_to_safe_tensors( self, nemo_checkpoint_path: str, @@ -536,6 +539,7 @@ def convert_to_safe_tensors( use_embedding_sharing: bool = False, dtype: str = "bfloat16", ): + """Convert to safe tensor""" gpus_per_node = tensor_parallelism_size if gpus_per_node is None else gpus_per_node if Path(self.model_dir).exists(): @@ -601,6 +605,167 @@ def convert_to_safe_tensors( if tensorrt_llm.mpi_world_size() > 1: tensorrt_llm.mpi_barrier() + def gather_and_reshard_model(self, model_config, model, storage_dtype): + """ + Accumulate all vp model chunks together, and reshard model (i.e) gather all pp ranks + if required and return the final model state dict + """ + + def _get_layer_index(split_key): + for index, key in enumerate(split_key): + if key == "layers": + return index + 1 + raise ValueError(f"Unknown layer name format: {split_key}") + + def rename_layer_num(param_name, layer_num): + split_key = param_name.split(".") + layer_index = int(_get_layer_index(split_key)) + split_key[layer_index] = str(layer_num) + return ".".join(split_key) + + def get_layer_num(param_name): + split_key = param_name.split(".") + layer_index = int(_get_layer_index(split_key)) + return int(split_key[layer_index]) + + from megatron.core import parallel_state + + tp_size = parallel_state.get_tensor_model_parallel_world_size() + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + pp_first_rank = parallel_state.get_pipeline_model_parallel_first_rank() + pp_last_rank = parallel_state.get_pipeline_model_parallel_last_rank() + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + pp_group = parallel_state.get_pipeline_model_parallel_group() + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + if not vp_size: + vp_size = 1 + + inference_tp_size = self.tp_size + inference_pp_size = self.pp_size + reshard_model = False + if inference_tp_size != tp_size or inference_pp_size != pp_size: + LOGGER.info("Training/Generation model parallelism resharding enabled") + if inference_pp_size == 1 
and pp_size > 1 and inference_tp_size == tp_size: + reshard_model = True + else: + raise NotImplementedError( + f"NeMo currently only supports PP>1 -> PP=1 resharding, other types of resharding will come in future releases." + ) + + num_layers = model_config["num_layers"] + layers_per_pp = num_layers // pp_size + layers_per_chunk = layers_per_pp // vp_size + + tl_params = {} + model_level_params = {} + if vp_size > 1: # consolidate params across model chunks + for idx, model_chunk in enumerate(model): + for key, val in model_chunk.state_dict().items(): + if torch.is_tensor(val): + if 'layers' in key: + key2 = rename_layer_num(key, get_layer_num(key) + idx * pp_size * layers_per_chunk) + tl_params[key2] = val + else: + model_level_params[key] = val + else: + for key, val in model.state_dict().items(): + if torch.is_tensor(val): + if 'decoder.layers' in key: + tl_params[key] = val + else: + model_level_params[key] = val + + if vp_size > 1 or reshard_model: + # gather layers across pp ranks + gathered_params = {} + for key, val in tl_params.items(): + weight_list = [torch.zeros_like(val) for _ in range(pp_size)] + torch.distributed.all_gather(weight_list, val, group=pp_group) + for idx in range(pp_size): + layer_num = get_layer_num(key) + idx * layers_per_chunk + key2 = rename_layer_num(key, layer_num) + if not reshard_model: # Save only layers of 1 single PP stage + layers_start = layers_per_pp * pp_rank + layers_end = layers_per_pp * (pp_rank + 1) - 1 + if layer_num >= layers_start and layer_num <= layers_end: + key2 = rename_layer_num(key, layer_num % layers_per_pp) + gathered_params[key2] = weight_list[idx] + else: + gathered_params[key2] = weight_list[idx] + tl_params = gathered_params + + model_state_dict = model_level_params + model_state_dict.update(tl_params) + + def get_tensor_if_available(key, pp_src_idx, group): + tensor = model_state_dict.get(key) + if tensor is not None: + tensor_shape = [tensor.shape] + else: + tensor_shape = [None] + + torch.distributed.broadcast_object_list(tensor_shape, pp_src_idx, group=group) + + if tensor_shape[0] is None: + return None + if torch.distributed.get_rank() != pp_src_idx: + tensor = torch.empty(tensor_shape[0], dtype=storage_dtype).cuda() + + torch.distributed.broadcast(tensor.contiguous(), pp_src_idx, group=pp_group) + return tensor + + if reshard_model: + key = 'decoder.final_layernorm.weight' + tensor = get_tensor_if_available(key, pp_last_rank, pp_group) + if tensor is not None: + model_state_dict[key] = tensor + + key = 'decoder.final_layernorm.bias' + tensor = get_tensor_if_available(key, pp_last_rank, pp_group) + if tensor is not None: + model_state_dict[key] = tensor + + key = 'embedding.word_embeddings.weight' + tensor = get_tensor_if_available(key, pp_first_rank, pp_group) + if tensor is not None: + model_state_dict[key] = tensor + + key = 'output_layer.weight' + tensor = get_tensor_if_available(key, pp_last_rank, pp_group) + if tensor is not None: + model_state_dict[key] = tensor + + return model_state_dict + + def get_input_dtype(self, storage_dtype): + """ + Return mcore export dtype given torch dtype + """ + from megatron.core.export.data_type import DataType + + if storage_dtype == torch.bfloat16: + return DataType.bfloat16 + elif storage_dtype == torch.float32: + return DataType.float32 + elif storage_dtype == torch.float16: + return DataType.float16 + + def get_nemo_to_trtllm_conversion_dict(self, model_state_dict): + """MCore export supports some default conversion dictionaries + All Mcore conversion dicts start with 
"decoder.layers.4.blah.blah" , while nemo models sometimes start with "model.decoder.layers.4.blahblah". so we append model prefix. to the keys + """ + from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import DEFAULT_CONVERSION_DICT + + model_prefix, _ = get_layer_prefix(layer_names=model_state_dict.keys(), is_mcore=True) + + nemo_model_conversion_dict = {} + for key, value in DEFAULT_CONVERSION_DICT.items(): + if 'layers' in key: + nemo_model_conversion_dict[f'{model_prefix}.{key}'] = value + else: + nemo_model_conversion_dict[key] = value + return nemo_model_conversion_dict + def build( self, model, @@ -613,6 +778,7 @@ def build( max_batch_size: int = 4, use_refit: bool = True, reshard_model: bool = False, + use_mcore_path: bool = True, ): """ Convert a model parallel nemo model to TensorRT-LLM. @@ -627,31 +793,103 @@ def build( if self.dp_size > 1: self.model_dir = os.path.join(self.model_dir, f"dp_rank{self.dp_rank}") - weights, model_config = model_to_trtllm_ckpt( - model=model, - nemo_model_config=model_config, - nemo_export_dir=self.model_dir, - decoder_type=model_type, - tensor_parallel_size=self.tp_size, - pipeline_parallel_size=self.pp_size, - gpus_per_node=gpus_per_node, - use_parallel_embedding=True, - use_distributed_convert=True, - model_parallel_rank=self.mp_rank, - vocab_size=self.tokenizer.vocab_size, - ) + if use_mcore_path: + from megatron.core.export.model_type import ModelType + from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper + from tensorrt_llm.layers import MoeConfig + + storage_dtype = torch_dtype_from_precision(model_config.precision) + model_state_dict = self.gather_and_reshard_model(model_config, model, storage_dtype) + # We build the transformer config using the nemo model config. + transformer_config = self.get_transformer_config(model_config) + input_model_type = getattr(ModelType, model_type) + + nemo_model_conversion_dict = self.get_nemo_to_trtllm_conversion_dict(model_state_dict) + + self.trtllm_helper = TRTLLMHelper( + transformer_config=transformer_config, + model_type=input_model_type, + trtllm_conversion_dict=nemo_model_conversion_dict, + position_embedding_type=model_config.get('position_embedding_type'), + max_position_embeddings=model_config.get('max_position_embeddings'), + rotary_percentage=model_config.get('rotary_percentage', 1.0), + rotary_base=model_config.get('rotary_base', 10000), + moe_tp_mode=model_config.get('moe_tp_mode', 2), + multi_query_mode=model_config.get("multi_query_mode", False), + activation=model_config.get('activation', "gelu"), + seq_len_interpolation_factor=model_config.get("seq_len_interpolation_factor"), + moe_renorm_mode=model_config.get( + 'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE + ), + share_embeddings_and_output_weights=model_config.get("share_embeddings_and_output_weights", False), + ) + + input_dtype = self.get_input_dtype(storage_dtype) + + trtllm_model_weights_list, trtllm_model_config_list = ( + self.trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=model_state_dict, + dtype=input_dtype, + state_dict_split_by_layer_numbers=True, + on_device_distributed_conversion=True, + vocab_size=self.tokenizer.vocab_size, + gpus_per_node=gpus_per_node, + ) + ) + trtllm_model_config = trtllm_model_config_list[0] + trtllm_model_weights = trtllm_model_weights_list[0] + + if reshard_model: + assert self.pp_size == 1, 'Reshard is true, but pp size is not one' + # MCORE Export will use parallel_state to determine pp . 
+ # Since we reshard to pp = 1, we need to modify the config and mapping + world_size = self.tp_size * self.pp_size + trtllm_model_config.pp_size = self.pp_size + trtllm_model_config.world_size = world_size + trtllm_model_config.mapping = tensorrt_llm.Mapping( + world_size=world_size, + rank=self.mp_rank, + tp_size=self.tp_size, + pp_size=self.pp_size, + ) + + engine = self.trtllm_helper.build_and_save_engine( + max_input_len=max_input_len, + max_output_len=max_output_len, + max_seq_len=max_input_len + max_output_len, + max_batch_size=max_batch_size, + trtllm_model_config=trtllm_model_config, + trtllm_model_weights=trtllm_model_weights, + engine_dir=self.model_dir, + use_refit=use_refit, + ) + else: + weights, model_config = model_to_trtllm_ckpt( + model=model, + nemo_model_config=model_config, + nemo_export_dir=self.model_dir, + decoder_type=model_type, + tensor_parallel_size=self.tp_size, + pipeline_parallel_size=self.pp_size, + gpus_per_node=gpus_per_node, + use_parallel_embedding=True, + use_distributed_convert=True, + model_parallel_rank=self.mp_rank, + vocab_size=self.tokenizer.vocab_size, + ) + + engine = build_and_save_engine( + max_input_len=max_input_len, + max_output_len=max_output_len, + max_seq_len=max_input_len + max_output_len, + max_batch_size=max_batch_size, + model_config=model_config[0], + model_weights=weights[0], + model_dir=self.model_dir, + model_type=model_type, + use_refit=use_refit, + ) - engine = build_and_save_engine( - max_input_len=max_input_len, - max_output_len=max_output_len, - max_seq_len=max_input_len + max_output_len, - max_batch_size=max_batch_size, - model_config=model_config[0], - model_weights=weights[0], - model_dir=self.model_dir, - model_type=model_type, - use_refit=use_refit, - ) torch.distributed.barrier() cfg_path = Path(os.path.join(self.model_dir, f'config_{torch.distributed.get_rank()}.json')) @@ -660,18 +898,33 @@ def build( load_distributed(self.model_dir, self.mp_rank, gpus_per_node) - def refit(self, model, model_config): + def refit(self, model, model_config, use_mcore_path=True): """ Refits an TensorRT engine using an instantiated nemo model. 
This function should only be used after calling build() """ - weights_dict = dist_model_to_trt_llm_ckpt( - model=model, - nemo_model_config=model_config, - inference_tp_size=self.tp_size, - inference_pp_size=self.pp_size, - tokenizer_vocab_size=self.tokenizer.vocab_size, - ) + weights_dict = None + if use_mcore_path: + storage_dtype = torch_dtype_from_precision(model_config.precision) + + model_state_dict = self.gather_and_reshard_model(model_config, model, storage_dtype) + + nemo_model_conversion_dict = self.get_nemo_to_trtllm_conversion_dict(model_state_dict) + self.trtllm_helper.weights_converter.convert( + model_state_dict=model_state_dict, + tokenizer_vocab_size=self.tokenizer.vocab_size, + trtllm_conversion_dict=nemo_model_conversion_dict, + ) + weights_dict = self.trtllm_helper.weights_converter.trtllm_model_weights + + else: + weights_dict = dist_model_to_trt_llm_ckpt( + model=model, + nemo_model_config=model_config, + inference_tp_size=self.tp_size, + inference_pp_size=self.pp_size, + tokenizer_vocab_size=self.tokenizer.vocab_size, + ) load_distributed(self.model_dir, self.mp_rank, self.gpus_per_node) gc.collect() torch.cuda.empty_cache() @@ -815,6 +1068,7 @@ def forward( ) def add_prompt_table(self, task_name: str, prompt_embeddings_checkpoint_path: str): + """Add prompt table""" if self.model is None: raise Exception( "A nemo checkpoint should be exported to TensorRT-LLM and " @@ -836,6 +1090,7 @@ def add_prompt_table(self, task_name: str, prompt_embeddings_checkpoint_path: st self._prep_ptuning_table() def remove_prompt_table(self, task_name: str): + """Remove prompt table""" if self.ptuning_tables is not None: for i in range(len(self.ptuning_tables)): if self.ptuning_tables[i]["task_name"] == task_name: @@ -847,11 +1102,13 @@ def remove_prompt_table(self, task_name: str): @property def get_supported_models_list(self): + """Supported model list""" # gpt and gptnext are the same. Keeping the gptnext due to backward compatibility. 
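To make the intended flow concrete, a hypothetical build-then-refit sketch; model, model_config and the engine directory are placeholders, and only the class name, method names and keyword arguments are taken from this file:

    # Hypothetical usage sketch -- assumes an already-initialized model-parallel NeMo model.
    from nemo.export.tensorrt_llm import TensorRTLLM

    def export_and_refit(model, model_config, engine_dir="/tmp/trtllm_engine"):
        exporter = TensorRTLLM(model_dir=engine_dir)     # directory for engines/weights
        exporter.build(
            model=model,                 # instantiated model-parallel NeMo model
            model_config=model_config,   # its config
            model_type="llama",
            max_input_len=1024,
            max_output_len=256,
            max_batch_size=4,
            use_mcore_path=True,         # convert through the Megatron-Core export helper
        )
        # ... update the NeMo model weights in place (e.g. an RLHF / aligner step) ...
        exporter.refit(model, model_config, use_mcore_path=True)  # refit the engine without rebuilding
        return exporter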
return ["gpt", "gptnext", "llama", "falcon", "starcoder", "mixtral", "gemma"] @property def get_hidden_size(self): + """Get hidden size""" if self.config is None: return None else: @@ -859,6 +1116,7 @@ def get_hidden_size(self): @property def get_triton_input(self): + """Get triton input""" inputs = ( Tensor(name="prompts", shape=(-1,), dtype=bytes), Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True), @@ -885,6 +1143,7 @@ def get_triton_output(self): @batch def triton_infer_fn(self, **inputs: np.ndarray): + """Triton infer function for streaming""" output_dict = {} try: infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))} @@ -929,6 +1188,7 @@ def triton_infer_fn(self, **inputs: np.ndarray): @batch def triton_infer_fn_streaming(self, **inputs: np.ndarray): + """Triton infer function for streaming""" try: infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))} if "max_output_len" in inputs: @@ -1138,4 +1398,5 @@ def _load(self): ) from error def unload_engine(self): + """Unload engine""" unload_engine() From 5e5fc4af2156747bafeb7e38a72cca18bc761123 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Thu, 21 Nov 2024 03:53:09 -0800 Subject: [PATCH 39/48] pass trust_remote_code to AutoTokenizer (#11343) * pass trust_remote_code to AutoTokenizer Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis --- .../llm/gpt/model/hf_auto_model_for_causal_lm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py index be6b5e708604..c0f02d706ceb 100644 --- a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py +++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py @@ -55,7 +55,7 @@ def __init__( @property def tokenizer(self): if self._tokenizer is None: - self._tokenizer = HfAutoModelForCausalLM.configure_tokenizer(self.model_name) + self._tokenizer = HfAutoModelForCausalLM.configure_tokenizer(self.model_name, self.trust_remote_code) return self._tokenizer @tokenizer.setter @@ -64,8 +64,8 @@ def tokenizer(self, value): self._tokenizer = value @staticmethod - def configure_tokenizer(model_name): - return AutoTokenizer(model_name) + def configure_tokenizer(model_name, trust_remote_code=False): + return AutoTokenizer(model_name, trust_remote_code=trust_remote_code) def configure_model(self): # create all your layers here From dabd47f357130285cc71f1385aa8499a5a2f6f3c Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Thu, 21 Nov 2024 07:43:34 -0500 Subject: [PATCH 40/48] fix layer replacement (#11356) Signed-off-by: Onur Yilmaz --- .../lightning/pytorch/accelerate/transformer_engine.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nemo/lightning/pytorch/accelerate/transformer_engine.py b/nemo/lightning/pytorch/accelerate/transformer_engine.py index a96a9590ea75..8e621352d099 100755 --- a/nemo/lightning/pytorch/accelerate/transformer_engine.py +++ b/nemo/lightning/pytorch/accelerate/transformer_engine.py @@ -39,7 +39,7 @@ def te_accelerate(model, fp8_autocast=False): @torch.no_grad def _apply_basic_module_replacement(model): - for name, module in model.named_modules(): + for name, module in model.named_children(): if isinstance(module, torch.nn.Linear): has_bias = module.bias is not 
None if any(p % 16 != 0 for p in module.weight.shape): @@ -51,17 +51,19 @@ def _apply_basic_module_replacement(model): if has_bias: te_module.bias.copy_(module.bias) - setattr(module, name.split(".")[-1], te_module) + setattr(model, name, te_module) elif isinstance(module, torch.nn.LayerNorm): te_module = te.LayerNorm(module.normalized_shape[0], eps=module.eps, params_dtype=module.weight.dtype) te_module.weight.copy_(module.weight) te_module.bias.copy_(module.bias) - setattr(module, name.split(".")[-1], te_module) + setattr(model, name, te_module) elif isinstance(module, torch.nn.RMSNorm): te_module = te.RMSNorm(module.normalized_shape[0], eps=module.eps, dtype=module.weight.dtype) te_module.weight.copy_(module.weight) te_module.bias.copy_(module.bias) - setattr(module, name.split(".")[-1], te_module) + setattr(model, name, te_module) + else: + _apply_basic_module_replacement(module) def is_te_accelerated(model): From ed0560015f831b2090aaabdb7af5a03dfcd94c68 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Thu, 21 Nov 2024 18:10:09 +0300 Subject: [PATCH 41/48] fix typo (#11351) --- nemo/collections/llm/recipes/mamba2_hybrid_8b.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py index 09bb88a57089..5a557de46066 100644 --- a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py +++ b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py @@ -39,7 +39,7 @@ def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: return run.Config( get_nmt_tokenizer, - library='megatronNVIDIAMambaConfig8B', + library='megatron', model_name="GPTSentencePieceTokenizer", tokenizer_model=tokenizer_model, use_fast=True, From 34f740822a11802c2f0cea1c8039163b2b8bff92 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Thu, 21 Nov 2024 16:15:54 +0100 Subject: [PATCH 42/48] Add torchrun local executor to recipes (#11342) * Add torchrun local executor to recipes Signed-off-by: Marc Romeijn * Remove code from init Signed-off-by: Marc Romeijn * Fix copyright Signed-off-by: Marc Romeijn * Apply isort and black reformatting Signed-off-by: marcromeyn * Fix failing test Signed-off-by: Marc Romeyn --------- Signed-off-by: Marc Romeijn Signed-off-by: marcromeyn Signed-off-by: Marc Romeyn Co-authored-by: marcromeyn --- nemo/collections/llm/recipes/__init__.py | 2 ++ nemo/collections/llm/recipes/run/__init__.py | 13 ++++++++ nemo/collections/llm/recipes/run/executor.py | 35 ++++++++++++++++++++ 3 files changed, 50 insertions(+) create mode 100644 nemo/collections/llm/recipes/run/__init__.py create mode 100644 nemo/collections/llm/recipes/run/executor.py diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index e76729d5e31a..449592298d41 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -73,6 +73,7 @@ ) from nemo.collections.llm.recipes.log.default import default_log, default_resume from nemo.collections.llm.recipes.optim import adam +from nemo.collections.llm.recipes.run.executor import torchrun __all__ = [ "baichuan2_7b", @@ -134,4 +135,5 @@ "adam", "default_log", "default_resume", + "torchrun", ] diff --git a/nemo/collections/llm/recipes/run/__init__.py b/nemo/collections/llm/recipes/run/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/llm/recipes/run/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, 
NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/recipes/run/executor.py b/nemo/collections/llm/recipes/run/executor.py new file mode 100644 index 000000000000..305fa6b0a3c7 --- /dev/null +++ b/nemo/collections/llm/recipes/run/executor.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import nemo_run as run + + +@run.cli.factory +def torchrun(devices: int = 8) -> run.Config[run.LocalExecutor]: + """Local executor using torchrun.""" + env_vars = { + "TRANSFORMERS_OFFLINE": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", + "NVTE_ASYNC_AMAX_REDUCTION": "1", + } + + executor = run.Config( + run.LocalExecutor, + ntasks_per_node=devices, + launcher="torchrun", + env_vars=env_vars, + ) + + return executor From 773590c88c534727a20c8ae15ca1c92980748cf4 Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Thu, 21 Nov 2024 09:58:32 -0800 Subject: [PATCH 43/48] Add PP support in NeVA along with few bug fixes (#11170) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * evian3 update Signed-off-by: yaoyu-33 * add encoder parallel default config Signed-off-by: yaoyu-33 * add encoder parallel default config Signed-off-by: yaoyu-33 * clean up Signed-off-by: yaoyu-33 * add aspect ratio in model * support energon dataloader * some pp update Signed-off-by: yaoyu-33 * fixes Signed-off-by: yaoyu-33 * fix kv merging Signed-off-by: yaoyu-33 * fix get_key_value_tensors Signed-off-by: yaoyu-33 * rename files Signed-off-by: yaoyu-33 * update to HF style position embedding Signed-off-by: yaoyu-33 * fix energon dataloader and support batching * update forward args Signed-off-by: yaoyu-33 * clean up and move to aspect_ratio_ids Signed-off-by: yaoyu-33 * rename back to language.py Signed-off-by: yaoyu-33 * fix loss function Signed-off-by: yaoyu-33 * update and fix energon Signed-off-by: yaoyu-33 * Add hf import * Fix type * Change config * update energon pretrain Signed-off-by: yaoyu-33 * clean up * clean up * reformat Signed-off-by: yaoyu-33 * update inference files for new code * update to instruct * update to instruct * update few names Signed-off-by: yaoyu-33 * update generation Signed-off-by: yaoyu-33 * fix importer embedding.weight * few fixes Signed-off-by: yaoyu-33 * add hf script Signed-off-by: yaoyu-33 * fix kv import * 
remove interleaved * fixes and updates Signed-off-by: yaoyu-33 * lora fixes Signed-off-by: yaoyu-33 * some code clean ups Signed-off-by: yaoyu-33 * update training scripts Signed-off-by: yaoyu-33 * refactors Signed-off-by: yaoyu-33 * add LoRA finetuning * fixes and nemo update Signed-off-by: yaoyu-33 * fix importer registering issue by adding 11B and 90B configs * update `decoder_seq_len` Signed-off-by: yaoyu-33 * science vqa script Signed-off-by: yaoyu-33 * clean up script name Signed-off-by: yaoyu-33 * fix ckpt save serialization issue * fix predefined config classes * add num_chunks in input Signed-off-by: yaoyu-33 * fix format Signed-off-by: yaoyu-33 * update finetuning scripts for PEFT * add 11b recipe (need #10645 to test) * fix mask generation Signed-off-by: yaoyu-33 * minor fix code style Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Support no image inference * add llama svqa eval * fix masking Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix generation Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * add 90b recipe and revise 11b recipe * Apply isort and black reformatting Signed-off-by: cuichenx * clean up typing * add option to disable vision padding * Apply isort and black reformatting Signed-off-by: cuichenx * base model finetuning (does not work yet) * Apply isort and black reformatting Signed-off-by: cuichenx * fixed default conversation template config for MLLama * Update svqa * add multinode * bot happy * Apply isort and black reformatting Signed-off-by: cuichenx * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: artbataev * Perf improvements. Mainly from XAttn mask calculation (#10901) * Perf improvements. 
Mainly from XAttn mask calculation * Apply isort and black reformatting Signed-off-by: parthmannan --------- Signed-off-by: parthmannan Co-authored-by: parthmannan * fix existing issues Signed-off-by: yaoyu-33 * fix scripts Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix lora * few fixes for non image support Signed-off-by: yaoyu-33 * update masking gen Signed-off-by: yaoyu-33 * update lazy dataset Signed-off-by: yaoyu-33 * fix data sampler and loading issue Signed-off-by: yaoyu-33 * Add vlm generation * Apply isort and black reformatting Signed-off-by: meatybobby * Apply isort and black reformatting Signed-off-by: yaoyu-33 * generation update Signed-off-by: yaoyu-33 * update lazy dataset Signed-off-by: yaoyu-33 * Fix _strategy_lib.py Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix warning Signed-off-by: yaoyu-33 * hide vlm examples Signed-off-by: yaoyu-33 * Revert "Add vlm generation" This reverts commit 4711c75a Signed-off-by: yaoyu-33 * Fix VisionEncoder multi-batch bug * update mcore parallelism initialization Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Update megatron_init.py Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * add encoder parallel default config Signed-off-by: yaoyu-33 * Fix _strategy_lib.py Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * llm.generate fixes (#10983) * fix context path, disable optimizer init, add tp Signed-off-by: HuiyingLi * format Signed-off-by: HuiyingLi * address comments, require user to provide trainer Signed-off-by: HuiyingLi * minor fix Signed-off-by: HuiyingLi * minor fixes Signed-off-by: HuiyingLi --------- Signed-off-by: HuiyingLi * use __dict__ in check (#11012) * check is_hf_model in leaf module Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * disable getattr alternative path Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * undo; Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * LoRA support for HF::AutoModelForCausalLM (#10982) * add LinearAdapter Signed-off-by: Alexandros Koumparoulis * add hf lora example Signed-off-by: Alexandros Koumparoulis * remove unused imports Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * subclass mixin Signed-off-by: Alexandros Koumparoulis * remove stale imports Signed-off-by: Alexandros Koumparoulis * undo Signed-off-by: Alexandros Koumparoulis * fix scale Signed-off-by: Alexandros Koumparoulis * regex selector for peft Signed-off-by: Alexandros Koumparoulis * move lora Signed-off-by: Alexandros Koumparoulis * fmt Signed-off-by: Alexandros Koumparoulis * hf_auto_model_for_causal_lm finetune recipe Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Change default for always_save_context to True (#11014) Signed-off-by: Abhishree Co-authored-by: Pablo Garay * Add a build option to load_context (#10713) * Add a build option to load_context Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis * Adding test Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis * Trying to fix failing 
CPU test Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis * cherry-pick fix Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis Co-authored-by: Alexandros Koumparoulis * Fix pip install (#11026) * Move AutoTokenizer inline Signed-off-by: Marc Romeyn * Move einops to common requirements Signed-off-by: Marc Romeyn * Move AutoTokenizer import to top-level again in fine_tuning Signed-off-by: Marc Romeyn * Move megatron init inside nemo.lightning Signed-off-by: Marc Romeyn * Make megatron_lazy_init_context work when transformer-engine is not installed Signed-off-by: Marc Romeyn * Only import get_nmt_tokenizer when needed Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: marcromeyn * [WIP] Add docs for NEST SSL (#10804) * add docs Signed-off-by: stevehuang52 * update doc and fix missing param Signed-off-by: stevehuang52 --------- Signed-off-by: stevehuang52 * Change dist ckpt defaults (#10913) * Enable ckpt features by default (async ckpt), ckpt every 15mins and reduce preemption time to 1min Signed-off-by: Shriya Palsamudram * fix ssm tests Signed-off-by: Shriya Palsamudram * Make note that ckpt_async_save is disabled for SSMs Signed-off-by: Shriya Palsamudram * Enable async ckpt for SSMs with fix Signed-off-by: Shriya Palsamudram * Disable async ckpt in the peft test as it is a known bug, add note. Signed-off-by: Shriya Palsamudram * Fix failing unit tests Signed-off-by: Shriya Palsamudram * Ashors/peft async ckpt (#11010) * [WIP] prototype for supporting async checkpointing with peft Signed-off-by: ashors1 Signed-off-by: Shriya Palsamudram * Enable async ckpt for the peft test Signed-off-by: Shriya Palsamudram * Fix peft setup test Signed-off-by: Shriya Palsamudram --------- Signed-off-by: Shriya Palsamudram Signed-off-by: ashors1 Co-authored-by: ataghibakhsh * Akoumparouli/mixtral recipe fix r2.0.0 (#10994) * Mixtral TP8 EP1 Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Fix _strategy_lib tests (#11033) * fix world size and don't mock Signed-off-by: Maanu Grover * cleanup global state Signed-off-by: Maanu Grover * check app state instead Signed-off-by: Maanu Grover * fix syntax nemo logger test Signed-off-by: Maanu Grover --------- Signed-off-by: Maanu Grover * Update `BaseMegatronSampler` for compatibility with PTL's `_BatchProgress` (#11016) * Revert "[NeMo-UX] Use custom `BatchProgress` class which does not restore states (#10383)" This reverts commit b5798ded9f27168db9d7d77cbe4f9da80bf49268. 
* make megatron sampler return the total number of batches in the dataset Signed-off-by: ashors1 --------- Signed-off-by: ashors1 * PTQ example for NeMo 2.0 (#10642) * initial commit Signed-off-by: Piotr Kaminski * create Quantizer for NeMo 2.0 Signed-off-by: Piotr Kaminski * refactor Signed-off-by: Piotr Kaminski * Call quantize on an unwrapped mcore model Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Add tests, adjust unwrapping Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * fix export Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Apply isort and black reformatting Signed-off-by: artbataev * Fix output_path argument for HF import Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> * fix fabric ckpt loading Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * code review suggestions Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * remove unused import Signed-off-by: Piotr Kaminski * use cnn dataset in github ci Signed-off-by: Piotr Kaminski * applied code review Signed-off-by: Piotr Kaminski * code review changes Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * simplify interface for data iterator Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * (partial) PP fix Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Signed-off-by: artbataev Co-authored-by: Piotr Kaminski Co-authored-by: Laplasjan107 Co-authored-by: artbataev * TDT compute timestamps option and Extra Whitespace handling for SPE (#10875) * add token duration Signed-off-by: monica-sekoyan * revert rnnt change Signed-off-by: monica-sekoyan * add remove_extra_whitespaces arg to spe tokenizer Signed-off-by: monica-sekoyan * add token duration retrieval Signed-off-by: monica-sekoyan * add ignore_extra_whitespace to spe Signed-off-by: monica-sekoyan * add compute_timestamp support for tdt Signed-off-by: monica-sekoyan * fix config field name Signed-off-by: monica-sekoyan * add refinement for tdt timestamps Signed-off-by: monica-sekoyan * add segments timestamp support and refinement for ctc Signed-off-by: monica-sekoyan * modify tests for ctc decoding timestamps Signed-off-by: monica-sekoyan * add rnnt timestamp tests Signed-off-by: monica-sekoyan * updated doc Signed-off-by: monica-sekoyan * fix in test Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan * fix of unicode char Signed-off-by: monica-sekoyan * fix rnnt_decoding test Signed-off-by: monica-sekoyan * workaround for tesst tokenizer Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan * modify segments formation Signed-off-by: monica-sekoyan * modify segments for ctc Signed-off-by: monica-sekoyan * fix in ctc refinement Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan * minor changes Signed-off-by: monica-sekoyan * reverse offset change Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan * warning mode=once Signed-off-by: monica-sekoyan * Apply 
isort and black reformatting Signed-off-by: monica-sekoyan * make ignore_extrawhitespaces false Signed-off-by: monica-sekoyan * minor changes Signed-off-by: monica-sekoyan * adjust changes to the tests Signed-off-by: monica-sekoyan * modify prompt_formatter tests Signed-off-by: monica-sekoyan * Apply isort and black reformatting Signed-off-by: monica-sekoyan --------- Signed-off-by: monica-sekoyan Signed-off-by: monica-sekoyan Co-authored-by: monica-sekoyan * Basic online dynamic FP8 quantization with vLLM (#10904) * Basic online dynamic quantization with vLLM Signed-off-by: Jan Lasek * Apply isort and black reformatting Signed-off-by: janekl * vllm 0.6.3 updates Signed-off-by: Jan Lasek * Pass quantization param in deploy_vllm_triton.py script Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek Signed-off-by: janekl Co-authored-by: janekl * ci: Improve VM maintenance (#10758) * ci: Improve VM maintenance Signed-off-by: Oliver Koenig * rename stuff Signed-off-by: Oliver Koenig * title Signed-off-by: Oliver Koenig * use team Signed-off-by: Oliver Koenig * run on failure too Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * yrdy Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig * test Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig * neva update Signed-off-by: yaoyu-33 * Add comment for vision transpose * update megatron_init.py inside lightning Signed-off-by: yaoyu-33 * Fix PP Signed-off-by: yaoyu-33 * add examples Signed-off-by: yaoyu-33 * fix test Signed-off-by: yaoyu-33 * try fix test Signed-off-by: yaoyu-33 * try fix test Signed-off-by: yaoyu-33 * Fix megatron megatron_init.py dp Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * Update lightning megatron_init.py dp Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * make it possible to update pre_preprocess and post_process for llm, required in vlm Signed-off-by: yaoyu-33 * Fixes for neva to run with PP Signed-off-by: yaoyu-33 * Add mcore vit support, and checkpoint conversion Signed-off-by: yaoyu-33 * fix checkpoint loading for epp Signed-off-by: yaoyu-33 * update script Signed-off-by: yaoyu-33 * rename llama to mllama folder name Signed-off-by: yaoyu-33 * update to attention bias Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * added datamodule for llava-next * modified state dict transform * neva model changes to support llava-next * remove accidentally checked in files Signed-off-by: Yashaswi Karnati * Apply isort and black reformatting Signed-off-by: yashaswikarnati * remove unused imports * added io_init to not save task_encoder and image_processor * Apply isort and black reformatting Signed-off-by: yashaswikarnati * added scripts for pretrain and finetune Signed-off-by: Yashaswi Karnati * Apply isort and black reformatting Signed-off-by: yashaswikarnati * generation example * Apply isort and black reformatting Signed-off-by: yashaswikarnati * small change in llava next example * llava next end-end train * Apply isort and black reformatting Signed-off-by: yashaswikarnati * finetune changes * Apply isort and black reformatting Signed-off-by: yashaswikarnati * finetune debug changes * update dropout to 0 Signed-off-by: yaoyu-33 * added example generation script * added doc strings, formating, remove debug statemens and unsued imports * remove example scripts * fix attention 
bias Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * remove disable_vision_padding since we now have a fix Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Update init for mllama Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Address comments Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix copyright title Signed-off-by: yaoyu-33 * multiple fixes Signed-off-by: yaoyu-33 * bug fix Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix code scan Signed-off-by: yaoyu-33 * Fix for SP Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * update vision code Signed-off-by: yaoyu-33 * revert attention bias changes until latest MLM code got merged Signed-off-by: yaoyu-33 * fix warning Signed-off-by: yaoyu-33 * Turn off system message check, as it's "" now Signed-off-by: yaoyu-33 * Update layer spec and add siglip support Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * update pretrain script Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Fix scripts Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * add neva training recipes Signed-off-by: yaoyu-33 * fix mllama mock ds Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix recipe Signed-off-by: yaoyu-33 * fix pp Signed-off-by: yaoyu-33 * scripts update Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * scripts update Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * update config api Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * few updates Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * update 70b Signed-off-by: yaoyu-33 * hide examples for pr Signed-off-by: yaoyu-33 * fix few issues Signed-off-by: yaoyu-33 * add docstring layer spec Signed-off-by: yaoyu-33 * add docstring to vit config Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix copyright Signed-off-by: yaoyu-33 * fix Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Signed-off-by: cuichenx Signed-off-by: Chen Cui Signed-off-by: artbataev Signed-off-by: parthmannan Signed-off-by: meatybobby Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Signed-off-by: HuiyingLi Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: Abhishree Signed-off-by: Marc Romeijn Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Signed-off-by: stevehuang52 Signed-off-by: Shriya Palsamudram Signed-off-by: ashors1 Signed-off-by: Maanu Grover Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Signed-off-by: monica-sekoyan Signed-off-by: monica-sekoyan Signed-off-by: Jan Lasek Signed-off-by: janekl Signed-off-by: Oliver Koenig Signed-off-by: Yashaswi Karnati Signed-off-by: yashaswikarnati Signed-off-by: Yashaswi Karnati Co-authored-by: Chen Cui Co-authored-by: Bobby Chen Co-authored-by: yaoyu-33 Co-authored-by: cuichenx Co-authored-by: Yashaswi Karnati Co-authored-by: artbataev Co-authored-by: Parth Mannan 
<38387286+parthmannan@users.noreply.github.com> Co-authored-by: parthmannan Co-authored-by: meatybobby Co-authored-by: Huiying Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: akoumpa Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Pablo Garay Co-authored-by: Marc Romeyn Co-authored-by: Alexandros Koumparoulis Co-authored-by: marcromeyn Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Co-authored-by: ataghibakhsh Co-authored-by: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com> Co-authored-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Co-authored-by: Piotr Kaminski Co-authored-by: Laplasjan107 Co-authored-by: monica-sekoyan <166123533+monica-sekoyan@users.noreply.github.com> Co-authored-by: monica-sekoyan Co-authored-by: Jan Lasek Co-authored-by: janekl Co-authored-by: oliver könig Co-authored-by: Pablo Garay Co-authored-by: ykarnati Co-authored-by: Yashaswi Karnati Co-authored-by: yashaswikarnati --- nemo/collections/llm/fn/activation.py | 6 + nemo/collections/llm/gpt/model/base.py | 6 +- nemo/collections/llm/gpt/model/llama.py | 4 +- nemo/collections/llm/gpt/model/ssm.py | 6 +- nemo/collections/vlm/__init__.py | 11 +- nemo/collections/vlm/layer_specs.py | 131 ++++ nemo/collections/vlm/mllama/data/mock.py | 8 +- nemo/collections/vlm/mllama/model/base.py | 30 +- nemo/collections/vlm/neva/data/__init__.py | 2 + .../collections/vlm/neva/data/conversation.py | 4 +- nemo/collections/vlm/neva/data/lazy.py | 8 +- .../vlm/neva/data/llava_next_energon.py | 161 +++++ nemo/collections/vlm/neva/data/mock.py | 14 +- nemo/collections/vlm/neva/model/__init__.py | 9 +- nemo/collections/vlm/neva/model/api.py | 14 +- nemo/collections/vlm/neva/model/base.py | 561 +++++++++--------- nemo/collections/vlm/neva/model/llava.py | 216 ++++--- nemo/collections/vlm/neva/model/vit_config.py | 79 +++ nemo/collections/vlm/peft/lora.py | 2 +- nemo/collections/vlm/recipes/__init__.py | 4 +- nemo/collections/vlm/recipes/llava15_13b.py | 150 +++++ nemo/collections/vlm/recipes/llava15_7b.py | 152 +++++ nemo/lightning/io/state.py | 7 +- 23 files changed, 1128 insertions(+), 457 deletions(-) create mode 100644 nemo/collections/vlm/layer_specs.py create mode 100644 nemo/collections/vlm/neva/data/llava_next_energon.py create mode 100644 nemo/collections/vlm/neva/model/vit_config.py create mode 100644 nemo/collections/vlm/recipes/llava15_13b.py create mode 100644 nemo/collections/vlm/recipes/llava15_7b.py diff --git a/nemo/collections/llm/fn/activation.py b/nemo/collections/llm/fn/activation.py index 5970846d32b2..db82f95b4bcc 100644 --- a/nemo/collections/llm/fn/activation.py +++ b/nemo/collections/llm/fn/activation.py @@ -13,6 +13,7 @@ # limitations under the License. 
import torch +from megatron.core.jit import jit_fuser @torch.jit.script @@ -25,6 +26,11 @@ def openai_gelu(x): return gelu_impl(x) +@jit_fuser +def quick_gelu(x: torch.Tensor) -> torch.Tensor: + return x * torch.sigmoid(1.702 * x) + + # @torch.jit.script # remove until we have serialization def squared_relu(x): """Squared ReLU activation function.""" diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 8c3b47835ab1..e411077aca31 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -179,7 +179,7 @@ class GPTConfig(TransformerConfig, io.IOMixin): forward_step_fn: Callable = gpt_forward_step data_step_fn: Callable = gpt_data_step - def configure_model(self, tokenizer) -> "MCoreGPTModel": + def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "MCoreGPTModel": vp_size = self.virtual_pipeline_model_parallel_size if vp_size: p_size = self.pipeline_model_parallel_size @@ -214,8 +214,8 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel": rotary_percent=self.rotary_percent, rotary_base=self.rotary_base, seq_len_interpolation_factor=self.seq_len_interpolation_factor, - pre_process=parallel_state.is_pipeline_first_stage(), - post_process=parallel_state.is_pipeline_last_stage(), + pre_process=pre_process or parallel_state.is_pipeline_first_stage(), + post_process=post_process or parallel_state.is_pipeline_last_stage(), ) # If using full TE layer, need to set TP, CP group since the module call diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index a9d18220bcaf..f5f6de6c79e7 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -115,8 +115,8 @@ class Llama31Config(Llama3Config): old_context_len: int = 8192 init_method_std: float = 0.02 - def configure_model(self, tokenizer) -> "MCoreGPTModel": - model = super().configure_model(tokenizer) + def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "MCoreGPTModel": + model = super().configure_model(tokenizer, pre_process, post_process) # Apply rope scaling for Llama3.1 model model.rotary_pos_emb.inv_freq = apply_rope_scaling( model.rotary_pos_emb.inv_freq, diff --git a/nemo/collections/llm/gpt/model/ssm.py b/nemo/collections/llm/gpt/model/ssm.py index d38a690cb4ad..e828d85f2814 100644 --- a/nemo/collections/llm/gpt/model/ssm.py +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -86,7 +86,7 @@ class SSMConfig(TransformerConfig, io.IOMixin): data_step_fn: Callable = gpt_data_step tokenizer_model_path: str = None - def configure_model(self, tokenizer) -> "MCoreMambaModel": + def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "MCoreMambaModel": return MCoreMambaModel( self, @@ -101,8 +101,8 @@ def configure_model(self, tokenizer) -> "MCoreMambaModel": rotary_percent=self.rotary_percent, rotary_base=self.rotary_base, seq_len_interpolation_factor=self.seq_len_interpolation_factor, - pre_process=parallel_state.is_pipeline_first_stage(), - post_process=parallel_state.is_pipeline_last_stage(), + pre_process=pre_process or parallel_state.is_pipeline_first_stage(), + post_process=post_process or parallel_state.is_pipeline_last_stage(), ) diff --git a/nemo/collections/vlm/__init__.py b/nemo/collections/vlm/__init__.py index 7d8cc2c94247..266790f3af71 100644 --- a/nemo/collections/vlm/__init__.py +++ b/nemo/collections/vlm/__init__.py @@ -29,6 +29,7 @@ DataConfig, ImageDataConfig, ImageToken, + 
LlavaNextTaskEncoder, MultiModalToken, NevaLazyDataModule, NevaMockDataModule, @@ -42,7 +43,8 @@ NevaConfig, NevaModel, ) -from nemo.collections.vlm.neva.model.llava import Llava1_5Config7B, Llava1_5Config13B, LlavaConfig, LlavaModel +from nemo.collections.vlm.neva.model.llava import Llava15Config7B, Llava15Config13B, LlavaConfig, LlavaModel +from nemo.collections.vlm.neva.model.vit_config import CLIPViTL_14_336_Config, SigLIPViT400M_14_384_Config from nemo.collections.vlm.peft import LoRA from nemo.collections.vlm.recipes import * @@ -59,13 +61,16 @@ "VideoToken", "CLIPViTConfig", "HFCLIPVisionConfig", + "CLIPViTL_14_336_Config", + "SigLIPViT400M_14_384_Config", "MultimodalProjectorConfig", "NevaConfig", "NevaModel", "LlavaConfig", - "Llava1_5Config7B", - "Llava1_5Config13B", + "Llava15Config7B", + "Llava15Config13B", "LlavaModel", + "LlavaNextTaskEncoder", "MLlamaModel", "MLlamaModelConfig", "CrossAttentionTextConfig", diff --git a/nemo/collections/vlm/layer_specs.py b/nemo/collections/vlm/layer_specs.py new file mode 100644 index 000000000000..11c4d697a5aa --- /dev/null +++ b/nemo/collections/vlm/layer_specs.py @@ -0,0 +1,131 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +try: + from megatron.core.extensions.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TENorm, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +try: + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + + +def get_layer_spec(is_vit, normalization) -> ModuleSpec: + """Transformer Layer Spec""" + attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal + if normalization == "LayerNorm": + norm = LNImpl + elif normalization == "RMSNorm": + norm = TENorm + else: + raise RuntimeError("unknown normalization", normalization) + + mlp = get_mlp_module_spec(use_te=False) # doesn't include norm. 
+ + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=norm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": attn_mask_type}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=norm, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + + +def get_layer_spec_te(is_vit=False) -> ModuleSpec: + """Transformer Layer Spec w/ TE Modules""" + attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal + + mlp = get_norm_mlp_module_spec_te() + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": attn_mask_type}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + + +def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec: + """MLP Submodule Spec""" + # Dense MLP w/ or w/o TE modules. + return ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, + ), + ) + + +def get_norm_mlp_module_spec_te() -> ModuleSpec: + """Norm + MLP Submodule Spec""" + return ModuleSpec( + module=MLP, + submodules=MLPSubmodules(linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear), + ) diff --git a/nemo/collections/vlm/mllama/data/mock.py b/nemo/collections/vlm/mllama/data/mock.py index a88838b0025f..fae92b097200 100644 --- a/nemo/collections/vlm/mllama/data/mock.py +++ b/nemo/collections/vlm/mllama/data/mock.py @@ -34,6 +34,8 @@ def __init__( micro_batch_size: int = 4, global_batch_size: int = 8, rampup_batch_size: Optional[List[int]] = None, + tokenizer: Optional = None, + image_processor: Optional = None, num_train_samples: int = 10_000, num_val_samples: int = 10_000, num_test_samples: int = 10_000, @@ -52,6 +54,8 @@ def __init__( self.persistent_workers = persistent_workers self.vocab_size = vocab_size self.crop_size = crop_size + self.tokenizer = tokenizer + self.image_processor = image_processor self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, @@ -142,8 +146,8 @@ def __getitem__(self, idx) -> Dict[str, torch.Tensor]: return { "images": images, - "masks": [[5, 512]], - "num_chunks": [4], + "masks": torch.tensor([[5, 512]]), + "num_chunks": torch.tensor([4]), "tokens": tokens, "aspect_ratio_ids": aspect_ratio_ids, "loss_mask": self.loss_mask, diff --git a/nemo/collections/vlm/mllama/model/base.py b/nemo/collections/vlm/mllama/model/base.py index 7dd84fefbb18..d417af27aedd 100644 --- a/nemo/collections/vlm/mllama/model/base.py +++ b/nemo/collections/vlm/mllama/model/base.py @@ -40,6 +40,7 @@ from nemo.collections.vlm.mllama.model.language import CrossAttentionTextModel from nemo.collections.vlm.mllama.model.utils import _generate_cross_attention_mask, _pad_attention_masks from nemo.collections.vlm.mllama.model.vision import VisionEncoder +from nemo.collections.vlm.neva.model.base import MODEL_CONFIG_ATTR from nemo.lightning import 
get_vocab_size, io from nemo.lightning.megatron_parallel import MaskedTokenLossReduction from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule @@ -240,35 +241,8 @@ class MLlamaModelConfig(TransformerConfig, io.IOMixin): data_step_fn: Callable = llama_data_step def __post_init__(self): - model_config_attr = [ - 'num_layers', - 'hidden_size', - 'num_attention_heads', - 'num_query_groups', - 'ffn_hidden_size', - 'kv_channels', - 'hidden_dropout', - 'attention_dropout', - 'fp32_residual_connection', - 'apply_residual_connection_post_layernorm', - 'layernorm_epsilon', - 'layernorm_zero_centered_gamma', - 'add_bias_linear', - 'add_qkv_bias', - 'gated_linear_unit', - 'activation_func', - 'activation_func_fp8_input_store', - 'num_moe_experts', - 'rotary_interleaved', - 'window_size', - 'normalization', - 'qk_layernorm', - 'test_mode', - 'calculate_per_token_loss', - ] - if self.language_model_config is not None: - for attr in model_config_attr: + for attr in MODEL_CONFIG_ATTR: setattr(self, attr, getattr(self.language_model_config, attr)) def configure_model(self, tokenizer) -> "MLlamaBaseModel": diff --git a/nemo/collections/vlm/neva/data/__init__.py b/nemo/collections/vlm/neva/data/__init__.py index f210d01a06fd..df9716fe5610 100644 --- a/nemo/collections/vlm/neva/data/__init__.py +++ b/nemo/collections/vlm/neva/data/__init__.py @@ -14,6 +14,7 @@ from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig, VideoDataConfig from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule +from nemo.collections.vlm.neva.data.llava_next_energon import LlavaNextTaskEncoder from nemo.collections.vlm.neva.data.mock import MockDataModule as NevaMockDataModule from nemo.collections.vlm.neva.data.multimodal_tokens import ImageToken, MultiModalToken, VideoToken @@ -26,4 +27,5 @@ "MultiModalToken", "ImageToken", "VideoToken", + "LlavaNextTaskEncoder", ] diff --git a/nemo/collections/vlm/neva/data/conversation.py b/nemo/collections/vlm/neva/data/conversation.py index d78d3bd28acb..58953dc53b7a 100644 --- a/nemo/collections/vlm/neva/data/conversation.py +++ b/nemo/collections/vlm/neva/data/conversation.py @@ -77,7 +77,6 @@ def process_chat_template(self, tokenizer_name_or_path, messages): def get_prompt(self): messages = self.messages - messages = self.process_prompt_with_images(messages) if self.sep_style == SeparatorStyle.SINGLE: ret = self.system + self.sep @@ -100,6 +99,8 @@ def get_prompt(self): if type(message) is tuple: message, _, _ = message ret += role + ": " + message + seps[i % 2] + # Add space to make sure the labels can be correctly generated. 
+ self.messages[i][1] = " " + self.messages[i][1] else: ret += role + ":" @@ -155,7 +156,6 @@ def get_prompt(self): ret = self.process_chat_template(tokenizer_name_or_path, messages) elif self.sep_style == SeparatorStyle.MLLAMA: - """ """ tokenizer_name_or_path = self.tokenizer_name_or_path or "meta-llama/Llama-3.2-11B-Vision-Instruct" ret = self.process_chat_template(tokenizer_name_or_path, messages) diff --git a/nemo/collections/vlm/neva/data/lazy.py b/nemo/collections/vlm/neva/data/lazy.py index fddaca14faeb..5bc2cbe0458e 100644 --- a/nemo/collections/vlm/neva/data/lazy.py +++ b/nemo/collections/vlm/neva/data/lazy.py @@ -251,7 +251,7 @@ def __init__( data_config, tokenizer, image_processor, - sequence_length, + sequence_length=None, ): super().__init__() if data_path is not None: @@ -497,6 +497,7 @@ def __init__( weights: Optional[List[float]] = None, data_config: Optional[DataConfig] = ImageDataConfig, seq_length: int = 2048, + decoder_seq_length: Optional[int] = None, tokenizer: Optional = None, image_processor: Optional = None, micro_batch_size: int = 4, @@ -523,6 +524,7 @@ def __init__( self.weights = weights self.data_config = data_config self.seq_length = seq_length + self.decoder_seq_length = decoder_seq_length self.tokenizer = tokenizer self.image_processor = image_processor self.num_train_samples = num_train_samples @@ -538,13 +540,15 @@ def __init__( if tokenizer is None or image_processor is None: logging.warning(f"Processor and tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.") from transformers import AutoProcessor + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") - self.tokenizer = tokenizer or processor.tokenizer + self.tokenizer = tokenizer or AutoTokenizer("llava-hf/llava-1.5-7b-hf") self.image_processor = image_processor or processor.image_processor self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, + decoder_seq_len=self.decoder_seq_length, micro_batch_size=micro_batch_size, global_batch_size=global_batch_size, dataloader_type="cyclic", diff --git a/nemo/collections/vlm/neva/data/llava_next_energon.py b/nemo/collections/vlm/neva/data/llava_next_energon.py new file mode 100644 index 000000000000..c45ee50e5be3 --- /dev/null +++ b/nemo/collections/vlm/neva/data/llava_next_energon.py @@ -0,0 +1,161 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass, field +from typing import Dict, List + +import torch +from megatron.energon import VQASample, batch_list, batch_pad_stack +from torch.nn.utils.rnn import pad_sequence + +from nemo.collections.multimodal.data.energon.config import ImageTextRawBatch, ImageTextSample, MultiModalSampleConfig +from nemo.collections.multimodal.data.energon.sample_encoder import SampleEncoder, VQASampleEncoder +from nemo.collections.multimodal.data.energon.task_encoder import MultiModalTaskEncoder +from nemo.utils import logging + + +class LlavaNextTextSample(ImageTextSample): + num_media_tiles: int = 0 + + +@dataclass +class LlavaNextTextRawBatch(ImageTextRawBatch): + num_media_tiles: List[int] = field(default_factory=list) + + +class LlavaNextSampleEncoder(VQASampleEncoder): + def __init__(self, tokenizer, image_processor, multimodal_sample_config=MultiModalSampleConfig()): + """ + Initialize the LlavaNextSampleEncoder, inherited from VQASampleEncoder for multimodal samples + focused on VQA-style data to support LLaVANeXT + + Parameters: + tokenizer (Tokenizer): The HF tokenizer used for processing text. + image_processor (ImageProcessor): The HF image processor used for preprocessing images. + multimodal_sample_config (MultiModalSampleConfig, optional): Configuration object for multimodal samples. + Defaults to MultiModalSampleConfig(). + """ + super().__init__(tokenizer, image_processor, multimodal_sample_config) + + def process_image(self, image): + """ + Process and prepare an image sample for encoding. + + This method preprocesses the image using the HF image_processor, converting it to + a tensor. + + Parameters: + image: The input image to be processed. + + Returns: + torch.Tensor: The processed image tensor. + """ + image_array = self.image_processor.preprocess(image, return_tensors='pt', do_rescale=False)['pixel_values'][0] + return image_array + + def encode(self, input_sample: VQASample, output_sample: LlavaNextTextSample): + """ + Encode a single sample into a format suitable for model input. + + This method prepares the conversation prompt, tokenizes it, and processes + the associated image. It fills the output sample with tokens, labels, loss mask, + and other required fields for multimodal processing. + + Parameters: + input_sample (VQASample): The input VQA sample containing an image and conversation text. + output_sample (LlavaNextTextSample): The output sample structure where encoded results are stored. + + Returns: + LlavaNextTextSample: The encoded output sample, containing processed tokens, labels, + images, loss masks, and metadata. 
+ """ + conversation_prompt = self.apply_prompt_template(input_sample) + logging.debug(f"task encoder encode_sample conversation_prompt {conversation_prompt}") + # tokenize prompt + tokens = self.tokenize(conversation_prompt) + labels = self.compute_labels(tokens, input_sample) + + tokens = tokens[:-1].contiguous() + labels = labels[1:].contiguous() + logging.debug(f"task encoder encode_sample after tokenize prompt tokens {tokens}") + logging.debug(f"task encoder encode_sample lables {labels}") + loss_mask = self.compute_loss_mask(labels) + processed_image = self.process_image(input_sample.image) + output_sample.__key__ = input_sample.__key__ + output_sample.images = processed_image + output_sample.tokens = tokens + output_sample.labels = labels + output_sample.loss_mask = loss_mask + output_sample.num_media_tiles = processed_image.shape[0] + return output_sample + + +class LlavaNextTaskEncoder(MultiModalTaskEncoder): + def __init__(self, tokenizer, image_processor, multimodal_sample_config): + """ + Initialize the LlavaNextTaskEncoder. + + This encoder extends MultiModalTaskEncoder to specifically handle LlavaNeXT, + overriding encoders for VQA sample type. + + Parameters: + tokenizer (Tokenizer): The tokenizer for processing text data across sample types. + image_processor (ImageProcessor): The image processor for preprocessing images. + multimodal_sample_config (MultiModalSampleConfig): Configuration settings for multimodal samples. + """ + super().__init__(tokenizer, image_processor, multimodal_sample_config) + self.encoders: Dict[str, SampleEncoder] = { + VQASample.__name__: LlavaNextSampleEncoder(tokenizer, image_processor, multimodal_sample_config) + } + + def batch(self, samples: List[LlavaNextTextSample]) -> LlavaNextTextRawBatch: + """ + Batch multiple encoded samples into a single batch structure for model input. + + This method combines individual sample fields (keys, images, tokens, labels, etc.) and + pads or stacks them as needed to create a unified batch. + + Parameters: + samples (List[LlavaNextTextSample]): A list of LlavaNextTextSample instances to be batched. + + Returns: + LlavaNextTextRawBatch: A batch containing all input samples' images, tokens, labels, + loss masks, and other metadata prepared for model processing. 
+ """ + keys, images, tokens, labels, loss_mask, num_media_tiles = [], [], [], [], [], [] + for sample in samples: + keys.append(sample.__key__) + images.append(sample.images) + tokens.append(sample.tokens) + labels.append(sample.labels) + loss_mask.append(sample.loss_mask) + num_media_tiles.append(sample.num_media_tiles) + + batch_keys = batch_list(keys) + + batch_images = torch.cat(images, dim=0) + + batch_tokens = pad_sequence(tokens, batch_first=True) + batch_labels = pad_sequence(labels, batch_first=True) + + batch_loss_mask = batch_pad_stack(loss_mask) + batch_num_media_tiles = torch.tensor(batch_list(num_media_tiles), dtype=torch.int) + return LlavaNextTextRawBatch( + __keys__=batch_keys, + images=batch_images, + tokens=batch_tokens, + labels=batch_labels, + loss_mask=batch_loss_mask, + num_media_tiles=batch_num_media_tiles, + ) diff --git a/nemo/collections/vlm/neva/data/mock.py b/nemo/collections/vlm/neva/data/mock.py index ede06e9f5778..9e2308752641 100644 --- a/nemo/collections/vlm/neva/data/mock.py +++ b/nemo/collections/vlm/neva/data/mock.py @@ -23,26 +23,29 @@ from nemo.collections.vlm.neva.data.multimodal_tokens import IMAGE_TOKEN_INDEX from nemo.lightning.pytorch.plugins import MegatronDataSampler +from nemo.utils import logging class MockDataModule(pl.LightningDataModule): def __init__( self, seq_length: int = 2048, + decoder_seq_length: Optional[int] = None, tokenizer: Optional = None, image_processor: Optional = None, micro_batch_size: int = 4, global_batch_size: int = 8, rampup_batch_size: Optional[List[int]] = None, - num_train_samples: int = 10_000, - num_val_samples: int = 10_000, - num_test_samples: int = 10_000, + num_train_samples: int = 10_000_000, + num_val_samples: int = 10_000_000, + num_test_samples: int = 10_000_000, num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, ): super().__init__() self.seq_length = seq_length + self.decoder_seq_len = decoder_seq_length self.num_train_samples = num_train_samples self.num_val_samples = num_val_samples self.num_test_samples = num_test_samples @@ -51,13 +54,16 @@ def __init__( self.persistent_workers = persistent_workers if tokenizer is None or image_processor is None: + logging.warning(f"Processor or tokenizer are not provided! 
Fall back to `llava-hf/llava-1.5-7b-hf`.") from transformers import AutoProcessor + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") - self.tokenizer = tokenizer or processor.tokenizer + self.tokenizer = tokenizer or AutoTokenizer("llava-hf/llava-1.5-7b-hf") self.image_processor = image_processor or processor.image_processor self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, + decoder_seq_len=self.decoder_seq_len, micro_batch_size=micro_batch_size, global_batch_size=global_batch_size, rampup_batch_size=rampup_batch_size, diff --git a/nemo/collections/vlm/neva/model/__init__.py b/nemo/collections/vlm/neva/model/__init__.py index 25842186ecfe..99862f97b9ed 100644 --- a/nemo/collections/vlm/neva/model/__init__.py +++ b/nemo/collections/vlm/neva/model/__init__.py @@ -19,16 +19,19 @@ NevaConfig, NevaModel, ) -from nemo.collections.vlm.neva.model.llava import Llava1_5Config7B, Llava1_5Config13B, LlavaConfig, LlavaModel +from nemo.collections.vlm.neva.model.llava import Llava15Config7B, Llava15Config13B, LlavaConfig, LlavaModel +from nemo.collections.vlm.neva.model.vit_config import CLIPViTL_14_336_Config, SigLIPViT400M_14_384_Config __all__ = [ "CLIPViTConfig", + "CLIPViTL_14_336_Config", + "SigLIPViT400M_14_384_Config", "HFCLIPVisionConfig", "MultimodalProjectorConfig", "NevaConfig", "NevaModel", "LlavaConfig", - "Llava1_5Config7B", - "Llava1_5Config13B", + "Llava15Config7B", + "Llava15Config13B", "LlavaModel", ] diff --git a/nemo/collections/vlm/neva/model/api.py b/nemo/collections/vlm/neva/model/api.py index 19e94c70381e..13444632464e 100644 --- a/nemo/collections/vlm/neva/model/api.py +++ b/nemo/collections/vlm/neva/model/api.py @@ -14,18 +14,18 @@ import lightning.pytorch as pl -from nemo.collections.vlm.neva.model import Llava1_5Config7B, Llava1_5Config13B, LlavaModel +from nemo.collections.vlm.neva.model import Llava15Config7B, Llava15Config13B, LlavaModel -def llava1_5_7b() -> pl.LightningModule: - return LlavaModel(Llava1_5Config7B()) +def llava15_7b() -> pl.LightningModule: + return LlavaModel(Llava15Config7B()) -def llava1_5_13b() -> pl.LightningModule: - return LlavaModel(Llava1_5Config13B()) +def llava15_13b() -> pl.LightningModule: + return LlavaModel(Llava15Config13B()) __all__ = [ - "llava1_5_7b", - "llava1_5_13b", + "llava15_7b", + "llava15_13b", ] diff --git a/nemo/collections/vlm/neva/model/base.py b/nemo/collections/vlm/neva/model/base.py index d4e578218ed2..360874152cf7 100644 --- a/nemo/collections/vlm/neva/model/base.py +++ b/nemo/collections/vlm/neva/model/base.py @@ -22,17 +22,20 @@ import torch.distributed import torch.nn.functional as F from megatron.core import dist_checkpointing +from megatron.core import parallel_state as ps +from megatron.core.enums import ModelType +from megatron.core.extensions.transformer_engine import TEDotProductAttention from megatron.core.inference_params import InferenceParams from megatron.core.models.multimodal.llava_model import LLaVAModel as MCoreLLaVAModel from megatron.core.models.vision.clip_vit_model import CLIPViTModel as MCoreCLIPViTModel from megatron.core.models.vision.multimodal_projector import MultimodalProjector as MCoreMultimodalProjector from megatron.core.optimizer import OptimizerConfig +from megatron.core.tensor_parallel import gather_from_sequence_parallel_region from megatron.core.transformer.custom_layers.transformer_engine import ( TEColumnParallelLinear, TENorm, TERowParallelLinear, ) -from 
megatron.core.transformer.enums import ModelType from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_config import TransformerConfig @@ -41,15 +44,43 @@ from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.collections.llm import fn -from nemo.collections.llm.gpt.model import local_layer_spec, transformer_engine_layer_spec +from nemo.collections.llm.gpt.model import transformer_engine_layer_spec from nemo.collections.llm.gpt.model.base import get_batch_on_this_context_parallel_rank, get_packed_seq_params -from nemo.collections.nlp.modules.common.megatron.module import MegatronModule -from nemo.collections.vlm.neva.data.multimodal_tokens import IGNORE_INDEX, IMAGE_TOKEN_INDEX +from nemo.collections.vlm.neva.data.multimodal_tokens import IMAGE_TOKEN_INDEX from nemo.lightning import io +from nemo.lightning.io.pl import ckpt_to_weights_subdir from nemo.lightning.megatron_parallel import MaskedTokenLossReductionWithLossMask from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule from nemo.utils import logging +MODEL_CONFIG_ATTR = [ + 'num_layers', + 'hidden_size', + 'num_attention_heads', + 'num_query_groups', + 'ffn_hidden_size', + 'kv_channels', + 'hidden_dropout', + 'attention_dropout', + 'fp32_residual_connection', + 'apply_residual_connection_post_layernorm', + 'layernorm_epsilon', + 'layernorm_zero_centered_gamma', + 'add_bias_linear', + 'add_qkv_bias', + 'gated_linear_unit', + 'activation_func', + 'activation_func_fp8_input_store', + 'num_moe_experts', + 'rotary_interleaved', + 'window_size', + 'normalization', + 'qk_layernorm', + 'test_mode', + 'calculate_per_token_loss', + 'seq_length', +] + def get_image_sequence_length(img_h, img_w, patch_dim, add_class_token, class_token_len): """Get image sequence length given image size, patch size, and class token.""" @@ -64,9 +95,7 @@ def neva_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: # Based on: https://github.com/NVIDIA/Megatron-LM/blob/main/pretrain_gpt.py#L87 # https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py#L828-L842 - batch = next(dataloader_iter) - _batch: dict if isinstance(batch, tuple) and len(batch) == 3: _batch = batch[0] @@ -74,11 +103,23 @@ def neva_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: _batch = batch required_keys = set() - required_keys.add("attention_mask") + required_keys.update( + ( + "tokens", + "attention_mask", + "media", + "num_media_tiles", + ) + ) if parallel_state.is_pipeline_first_stage(): - required_keys.update(("media", "tokens", "position_ids")) + required_keys.update(("position_ids",)) if parallel_state.is_pipeline_last_stage(): - required_keys.update(("labels", "loss_mask")) + required_keys.update( + ( + "labels", + "loss_mask", + ) + ) _batch = { key: val.cuda(non_blocking=True) if key in required_keys and val is not None else None @@ -98,6 +139,7 @@ def neva_forward_step(model, batch) -> torch.Tensor: "attention_mask": batch.get("attention_mask", None), "loss_mask": batch.get("loss_mask", None), "labels": batch.get("labels", None), + "num_media_tiles": batch.get("num_media_tiles", None), } if 'cu_seqlens' in batch: @@ -176,10 +218,11 @@ class HFCLIPVisionConfig(CLIPVisionConfig, io.IOMixin): https://github.com/huggingface/transformers/blob/v4.44.0/src/transformers/models/clip/configuration_clip.py#L261 """ + hidden_size: int = 1024 
pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None - def configure_hf_config(self, *args, **kwargs) -> None: - CLIPVisionConfig.__init__(self, *args, **kwargs) + def __post_init__(self, *args, **kwargs) -> None: + CLIPVisionConfig.__init__(self, *args, **kwargs, hidden_size=self.hidden_size) def configure_model(self) -> "CLIPVisionModel": # Monkey patch the method to the vision encoder @@ -198,26 +241,40 @@ def configure_model(self) -> "CLIPVisionModel": @dataclass class CLIPViTConfig(TransformerConfig, io.IOMixin): ln_pre_impl: Union[ModuleSpec, type] = TENorm + ln_post_impl: Union[ModuleSpec, type] = TENorm add_class_token: bool = True class_token_len: int = 1 patch_dim: int = 14 img_h: int = 336 img_w: int = 336 + vision_model_type: str = "clip" # ["clip", "siglip"] transformer_layer_spec: ModuleSpec = transformer_engine_layer_spec - def configure_model(self) -> "MCoreCLIPViTModel": + num_layers: int = 1 # Placeholder, NOT used! + num_attention_heads: int = 8 # Placeholder, NOT used! + + def __post_init__(self): + if self.vision_model_type == "siglip": + self.add_class_token = False + self.class_token_len = 0 + + def configure_model(self) -> "CLIPViTModel": transformer_layer_spec = self.transformer_layer_spec if not isinstance(transformer_layer_spec, ModuleSpec): - transformer_layer_spec = transformer_layer_spec(self) - return MCoreCLIPViTModel( + from nemo.collections.vlm.layer_specs import get_layer_spec_te + + transformer_layer_spec = get_layer_spec_te(is_vit=True) + return CLIPViTModel( self, transformer_layer_spec, ln_pre_impl=self.ln_pre_impl, + ln_post_impl=self.ln_post_impl, add_class_token=self.add_class_token, class_token_len=self.class_token_len, patch_dim=self.patch_dim, img_h=self.img_h, img_w=self.img_w, + model_subtype=self.vision_model_type, ) @@ -226,283 +283,173 @@ class NevaConfig(TransformerConfig, io.IOMixin): language_transformer_config: Optional[TransformerConfig] = None vision_transformer_config: Optional[TransformerConfig] = None vision_projection_config: Optional[TransformerConfig] = None + drop_vision_class_token: bool = True + vision_feature_layer: int = -2 + + encoder_pipeline_model_parallel_size: int = 0 + encoder_tensor_model_parallel_size: int = 1 num_layers: int = 1 # Placeholder, NOT used! num_attention_heads: int = 8 # Placeholder, NOT used! 
- vision_feature_layer: int = -2 + + seq_length: int = 1024 language_model_from_pretrained: Optional[str] = None vision_model_from_pretrained: Optional[str] = None # TODO vision_projection_from_pretrained: Optional[str] = None # TODO - freeze_language_model: bool = True - freeze_vision_model: bool = True + freeze_language_model: bool = False + freeze_vision_model: bool = False freeze_vision_projection: bool = False forward_step_fn: Callable = neva_forward_step data_step_fn: Callable = neva_data_step - def configure_model(self, tokenizer) -> "MCoreLLaVAModel": - language_model = self.language_transformer_config.configure_model(tokenizer=tokenizer) - vision_model = self.vision_transformer_config.configure_model() - vision_projection = self.vision_projection_config.configure_model() - - if self.language_model_from_pretrained is not None: - sharded_state_dict = dict(state_dict=language_model.sharded_state_dict(prefix="module.")) - loaded_state_dict = dist_checkpointing.load( - sharded_state_dict=sharded_state_dict, checkpoint_dir=self.language_model_from_pretrained + def __post_init__(self): + if self.language_transformer_config is not None: + for attr in MODEL_CONFIG_ATTR: + setattr(self, attr, getattr(self.language_transformer_config, attr)) + + def configure_model(self, tokenizer) -> "MCoreNevaModel": + from megatron.core import parallel_state as ps + + self.language_transformer_config.tensor_model_parallel_size = self.tensor_model_parallel_size + self.language_transformer_config.sequence_parallel = self.sequence_parallel + self.vision_transformer_config.tensor_model_parallel_size = self.tensor_model_parallel_size + self.vision_projection_config.tensor_model_parallel_size = self.tensor_model_parallel_size + self.language_transformer_config.pipeline_model_parallel_size = self.pipeline_model_parallel_size + + if self.encoder_pipeline_model_parallel_size > 0: + assert self.encoder_pipeline_model_parallel_size == 1, "ViT can only live on 1 pipeline stage." 
+ self.vision_transformer_config.pipeline_model_parallel_size = self.encoder_pipeline_model_parallel_size + self.vision_projection_config.pipeline_model_parallel_size = self.encoder_pipeline_model_parallel_size + self.language_transformer_config.encoder_pipeline_model_parallel_size = ( + self.encoder_pipeline_model_parallel_size ) - loaded_state_dict = {k.removeprefix("module."): v for k, v in loaded_state_dict["state_dict"].items()} - language_model.load_state_dict(loaded_state_dict) - logging.info(f"Restored language model weights from {self.language_model_from_pretrained}") + if self.encoder_tensor_model_parallel_size > 0: + self.vision_transformer_config.tensor_model_parallel_size = self.encoder_tensor_model_parallel_size + self.vision_projection_config.tensor_model_parallel_size = self.encoder_tensor_model_parallel_size + model = MCoreNevaModel( - transformer_config=self, - language_model=language_model, - vision_model=vision_model, - vision_projection=vision_projection, + config=self, + tokenizer=tokenizer, + pre_process=ps.is_pipeline_first_stage() + or ps.get_pipeline_model_parallel_rank() == self.encoder_pipeline_model_parallel_size, + post_process=ps.is_pipeline_last_stage(), + add_encoder=ps.is_pipeline_first_stage(), + add_decoder=ps.is_pipeline_last_stage() + or ps.get_pipeline_model_parallel_rank() >= self.encoder_pipeline_model_parallel_size, drop_vision_class_token=self.drop_vision_class_token, ) - model.freeze( - freeze_language_model=self.freeze_language_model, - freeze_vision_model=self.freeze_vision_model, - freeze_vision_projection=self.freeze_vision_projection, - ) + return model +class CLIPViTModel(MCoreCLIPViTModel): + """CLIP ViT vision model.""" + + def forward( + self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, num_unused_layers: int = 0 + ) -> torch.Tensor: + if num_unused_layers > 0: + unused_layers = self.decoder.layers[-num_unused_layers:] + self.decoder.layers = self.decoder.layers[:-num_unused_layers] + x = super().forward(x, attention_mask) + self.decoder.layers.append(unused_layers) + return x + + return super().forward(x, attention_mask) + + class MCoreNevaModel(MCoreLLaVAModel): def __init__( self, - transformer_config: TransformerConfig, - language_model: MegatronModule, - vision_model: MegatronModule, - vision_projection: MegatronModule, + config: NevaConfig, + tokenizer: Optional = None, pre_process: bool = True, post_process: bool = True, + add_encoder: bool = True, + add_decoder: bool = True, drop_vision_class_token: bool = False, ) -> None: - super(MCoreLLaVAModel, self).__init__(config=transformer_config) + super(MCoreLLaVAModel, self).__init__(config=config) - logging.warning("LLaVA model is under development and may be missing features.") + language_transformer_config = config.language_transformer_config + vision_transformer_config = config.vision_transformer_config + vision_projection_config = config.vision_projection_config self.pre_process = pre_process self.post_process = post_process + self.add_encoder = add_encoder + self.add_decoder = add_decoder self.encoder_hidden_state = None - self.vision_model = vision_model - self.vision_projection = vision_projection - self.language_model = language_model - self.model_type = ModelType.encoder_or_decoder - # This attribute is needed to check if an all-reduce is required - # on the word embeddings inside `finalize_model_grads._allreduce_word_embedding_grads`. 
- self.share_embeddings_and_output_weights = False - if self.language_model is not None: - self.share_embeddings_and_output_weights = self.language_model.share_embeddings_and_output_weights - self._language_max_sequence_length = self.language_model.max_sequence_length + self.vision_model = None + self.vision_projection = None + self.language_model = None - if self.vision_model is not None: - self._drop_vision_class_token = drop_vision_class_token + self.sequence_parallel_lm = language_transformer_config.sequence_parallel + self.tp_comm_overlap_lm = language_transformer_config.tp_comm_overlap - self.add_encoder = self.vision_model is not None - self.add_decoder = self.language_model is not None - self.vision_model_from_hf = str(self.vision_model.__class__.__module__).startswith("transformers.") + self.share_embeddings_and_output_weights = False if self.add_decoder: - vision_config = self.config.vision_transformer_config - if self.vision_model_from_hf: - # img_h, img_w, patch_dim, add_class_token, class_token_len - self._img_seq_len = get_image_sequence_length( - img_h=vision_config.image_size, - img_w=vision_config.image_size, - patch_dim=vision_config.patch_size, - add_class_token=not drop_vision_class_token, - class_token_len=0 if "siglip" in vision_config.model_type else 1, + self.language_model = language_transformer_config.configure_model( + tokenizer=tokenizer, pre_process=pre_process, post_process=post_process + ) + self.share_embeddings_and_output_weights = self.language_model.share_embeddings_and_output_weights + self._language_max_sequence_length = self.language_model.max_sequence_length + self._language_is_pipeline_parallel = language_transformer_config.pipeline_model_parallel_size > 1 + if config.language_model_from_pretrained is not None: + sharded_state_dict = dict(state_dict=self.language_model.sharded_state_dict(prefix="module.")) + loaded_state_dict = dist_checkpointing.load( + sharded_state_dict=sharded_state_dict, + checkpoint_dir=ckpt_to_weights_subdir(config.language_model_from_pretrained, is_saving=False), + validate_access_integrity=False, ) - else: - self._img_seq_len = 576 # TODO(yuya): Fix hardcode + loaded_state_dict = {k.removeprefix("module."): v for k, v in loaded_state_dict["state_dict"].items()} + self.language_model.load_state_dict(loaded_state_dict) + logging.info(f"Restored language model weights from {config.language_model_from_pretrained}") else: - self._img_seq_len = 0 - - def _preprocess_data( - self, - image_embeddings, - language_embeddings, - input_ids, - loss_mask, - labels, - use_inference_kv_cache, - image_token_index, - num_image_tiles, - ): - # TODO (yuya): remove this and use the mcore method - """Preprocess input data before input to language model. - - This function is adopted from - https://github.com/huggingface/transformers/blob/85817d98fb60977c97e3014196a462b732d2ed1a/src/transformers/models/llava_next/modeling_llava_next.py#L409 - for our input data conventions. - - image_token_index = -200 indicates the image position in the input_ids = [0, 1, -200, 2, 3] and labels = [1, -200, 2, 3, 4], for example. - We want to replace the image position (-200) with image_embeddings and return the following: - - final_embeddings = [0, 1, image_embeddings, 2, 3], - - final_labels = [1, -100, 2, 3, 4] - - final_loss_mask = [1, 0, 0, 1, 1] - - This function also handles the case where the input does not contain an image (text-only sample). It also handles the case where a single input - image is split into multiple tiles. 
- - If pipeline parallelism is not used, then self.pre_process and self.post_process are both True and we update both - input embeddings, labels and loss masks (if available). - - If pipeline parallelism is used, then we do the following - - the first language model chunk has self.pre_process = True and self.post_process = False. We update input embeddings. - - the middle language model chunk(s) has self.pre_process = False and self.post_process = False. We don't need to update anything. - - the last language model chunk has self.pre_process = False and self.post_process = True. We update labels and loss mask. + if config.language_model_from_pretrained is not None: + dist_checkpointing.load( + sharded_state_dict=dict(state_dict={}), + checkpoint_dir=config.language_model_from_pretrained, + validate_access_integrity=False, + ) - TODO: This function should adjust the attention mask too. Currently, we assume the language model uses a causal mask. + if self.add_encoder: + self.vision_model = vision_transformer_config.configure_model() + self.vision_projection = vision_projection_config.configure_model() + self._drop_vision_class_token = drop_vision_class_token - Returns: - final_embedding (torch.Tensor): image and text embeddings concated [combined_seq_len, b, h]. - final_labels (torch.Tensor): labels for image and text positions [b, combined_seq_len]. - final_loss_mask (torch.Tensor): loss mask for image and text positions [b, combined_seq_len]. - """ - assert self.add_decoder, "input text preprocessing is only needed for the language model" + self.freeze( + freeze_language_model=config.freeze_language_model, + freeze_vision_model=config.freeze_vision_model, + freeze_vision_projection=config.freeze_vision_projection, + ) - # No pre- or postprocessing needed. With pipeline parallel > 2, this means a chunk in the middle of the model. - if not self.pre_process and not self.post_process: - return language_embeddings, loss_mask, labels + self.model_type = ModelType.encoder_or_decoder + # This attribute is needed to check if an all-reduce is required + # on the word embeddings inside `finalize_model_grads._allreduce_word_embedding_grads`. - # If using the inference KV cache, the image tokens are already computed. - if use_inference_kv_cache: - return language_embeddings, loss_mask, labels - - img_seq_len = self._img_seq_len - batch_size, text_seq_len = input_ids.shape - - has_labels = labels is not None - if has_labels: - assert ( - labels.shape == loss_mask.shape - ), f"mismatching labels shape {labels.shape} and loss mask shape {loss_mask.shape}" - - # Create indices for new text and label positions. - with torch.no_grad(): - image_token_mask = input_ids == image_token_index - num_image_tokens = torch.sum(image_token_mask, dim=-1) - - # Number of tiles per sample. - num_image_tiles_batch = num_image_tiles.split(num_image_tokens.tolist(), dim=0) - num_image_tiles_batch = torch.tensor([x.sum() for x in num_image_tiles_batch], device=input_ids.device) - - # Sequence length for each sample is the image sequence length multiplied by the number of tiles for that image, minus image token indices, - # plus text sequence length. - seq_lens = num_image_tiles_batch * img_seq_len - num_image_tokens + text_seq_len - max_seq_len = seq_lens.max() - batch_indices, non_image_indices = torch.where(input_ids != image_token_index) - - # New position ids for the text tokens, shifted by the image sequence length. - # E.g. 
for input_ids = [-200, 1, 2, 3] and img_seq_len = 576, we get new_position_ids = [576, 577, 578, 579]. - # text_position_ids are then [577, 578, 579]. - image_token_mask_lens = image_token_mask.int().clone() - # -1 is for the removed image token index. - image_token_mask_lens[image_token_mask] = num_image_tiles * img_seq_len - 1 - # +1 is needed here for the cumulative sum. -1 is adjusting for zero-based indexing. - new_position_ids = torch.cumsum((image_token_mask_lens + 1), dim=-1) - 1 - text_position_ids = new_position_ids[batch_indices, non_image_indices] - - # Labels are shifted to left by one. So, shift text position ids and non-image indices to left by one. - if has_labels: - label_text_position_ids = text_position_ids - 1 - valid_label_text_position_ids = label_text_position_ids >= 0 - label_text_position_ids = label_text_position_ids[valid_label_text_position_ids] - - label_batch_indices = batch_indices[valid_label_text_position_ids] - - label_non_image_indices = non_image_indices - 1 - valid_label_non_image_indices = label_non_image_indices >= 0 - label_non_image_indices = label_non_image_indices[valid_label_non_image_indices] - - # Create a mask for the image embedding positions. - with torch.no_grad(): - images_mask = torch.full((batch_size, max_seq_len), True, dtype=torch.bool, device=input_ids.device) - # No images in the text positions. - images_mask[batch_indices, text_position_ids] = False - # Samples can have different amount of images tokens. new_position_ids[:, -1] gives the last text position id for each sample. - # Padding is needed when the number of image tokens differs. - first_padding_idx = new_position_ids[:, -1] + 1 - images_mask[ - torch.arange(max_seq_len, device=first_padding_idx.device).repeat(batch_size, 1) - >= first_padding_idx.unsqueeze(1) - ] = False - - # Create the final input embedding (if this is the first language model stage). - final_embedding = None - if self.pre_process: - embed_dim = language_embeddings.shape[-1] - final_embedding = torch.zeros( - batch_size, - max_seq_len, - embed_dim, - dtype=image_embeddings.dtype, - device=image_embeddings.device, + self.vision_model_from_hf = hasattr(vision_transformer_config, "image_size") + if self.vision_model_from_hf: + # img_h, img_w, patch_dim, add_class_token, class_token_len + self._img_seq_len = get_image_sequence_length( + img_h=vision_transformer_config.image_size, + img_w=vision_transformer_config.image_size, + patch_dim=vision_transformer_config.patch_size, + add_class_token=not drop_vision_class_token, + class_token_len=0 if "siglip" in vision_transformer_config.model_type else 1, ) - - # Put text embeddings to the text positions in the result tensor. - final_embedding[batch_indices, text_position_ids] = language_embeddings[batch_indices, non_image_indices] - - # Put image embeddings to image positions. - final_embedding[images_mask] = image_embeddings.reshape(-1, embed_dim).contiguous() - - # Create the final labels and loss mask (if this is the last language model stage). 
- final_labels, final_loss_mask = None, None - if has_labels: - final_labels = torch.full( - (batch_size, max_seq_len), IGNORE_INDEX, dtype=labels.dtype, device=labels.device + else: + self._img_seq_len = get_image_sequence_length( + img_h=vision_transformer_config.img_h, + img_w=vision_transformer_config.img_w, + patch_dim=vision_transformer_config.patch_dim, + add_class_token=not drop_vision_class_token, + class_token_len=vision_transformer_config.class_token_len, ) - final_loss_mask = torch.full((batch_size, max_seq_len), 0, dtype=loss_mask.dtype, device=loss_mask.device) - - # Put text labels and loss mask to the text positions. - final_labels[label_batch_indices, label_text_position_ids] = labels[ - label_batch_indices, label_non_image_indices - ] - - final_loss_mask[batch_indices, text_position_ids] = loss_mask[batch_indices, non_image_indices] - - # For labels, we need to pick the last label index that got dropped by the shift to left. - label_extra_text_position_ids = seq_lens - 1 - batch_range = torch.arange(len(label_extra_text_position_ids)) - final_labels[batch_range, label_extra_text_position_ids] = labels[batch_range, -1] - - # Loss mask the image positions. - final_loss_mask[images_mask] = 0 - - # Loss mask last text position just before an image so that text token does not need to predict the first image token. - batch_image_indices, image_indices = torch.where(image_token_mask) - # Indices just before image tokens. If it's -1, skip it. - before_image_indices = image_indices - 1 - valid = before_image_indices >= 0 - valid_batch_image_indices = batch_image_indices[valid] - valid_before_image_indices = before_image_indices[valid] - # Map those indices those position ids. - valid_before_image_indices = new_position_ids[valid_batch_image_indices, valid_before_image_indices] - - final_loss_mask[valid_batch_image_indices, valid_before_image_indices] = 0 - - if final_embedding is not None and has_labels: - assert ( - final_embedding.shape[:2] == final_labels.shape == final_loss_mask.shape - ), "unexpected shapes after data preprocessing" - - if final_embedding is not None: - final_embedding = final_embedding.transpose(1, 0).contiguous() - - # Truncate if exceeding the language model's max sequence length. - if final_embedding is not None and final_embedding.shape[0] > self._language_max_sequence_length: - final_embedding = final_embedding[: self._language_max_sequence_length] - - if has_labels and final_labels.shape[1] > self._language_max_sequence_length: - final_labels = final_labels[:, : self._language_max_sequence_length] - final_loss_mask = final_loss_mask[:, : self._language_max_sequence_length] - - return final_embedding, final_labels, final_loss_mask def forward( self, @@ -515,6 +462,7 @@ def forward( inference_params: Optional[InferenceParams] = None, num_media_tiles: Optional[List[int]] = None, media_token_index: Optional[int] = IMAGE_TOKEN_INDEX, + runtime_gather_output: Optional[bool] = None, ) -> torch.Tensor: """Forward function of the LLaVA model. @@ -533,34 +481,44 @@ def forward( output (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape [b, s, vocab_size]. loss_mask (torch.Tensor): Loss mask expanded to combined sequence length. Shape [b, s]. """ + use_inference_kv_cache = ( inference_params is not None and "image_tokens_count" in inference_params.key_value_memory_dict ) + has_images = media.shape[0] > 0 + # If running inference, we can skip media token computation if they were computed already earlier for this sample. 
-        if use_inference_kv_cache or media is None:
+        if use_inference_kv_cache:
             media_embeddings = None
-        elif self.add_encoder:
+        elif self.add_encoder and not has_images:
+            # If no images provided, use an empty image embeddings tensor.
+            media_embeddings = torch.tensor([], dtype=media.dtype, device=media.device).reshape(0, 0, 0)
+        elif self.add_encoder and has_images:
             # media is in shape of (num_images_in_mbs, c, h, w)
             # note num_images_in_mbs is not mbs but total images in this mbs.
             if self.vision_model_from_hf:
-                media_embeddings = self.vision_model(
-                    media, output_hidden_states=True
-                )  # [num_images, img_seq_len, h_vision]
+                self.vision_model = self.vision_model.eval()
+                media_embeddings = self.vision_model(media, output_hidden_states=True)
                 media_embeddings = media_embeddings[-1][
                     self.config.vision_feature_layer
-                ]  # take second from last layer
+                ]  # [num_images, img_seq_len, h_vision]
             else:
                 # TODO(yuya): MCore Clip path not yet support taking a specific layer hidden states
-                media_embeddings = self.vision_model(media)
+                media = media.to(next(self.vision_model.parameters()).dtype)
+                media_embeddings = self.vision_model(media, num_unused_layers=-self.config.vision_feature_layer - 1)
             if self._drop_vision_class_token:
                 class_token_len = getattr(self.vision_model, "class_token_len", 1)
                 media_embeddings = media_embeddings[:, class_token_len:, :]
+            # contiguous() required as `permute` can make the tensor non-contiguous, which breaks pipelining
+            media_embeddings = media_embeddings.permute(1, 0, 2).contiguous()  # [img_seq_len, num_tiles, h_vision]
+
             # map vision model output size to language model input size.
-            media_embeddings = self.vision_projection(media_embeddings)  # [img_seq_len, num_tiles, h_vision]
+            media_embeddings = self.vision_projection(media_embeddings)  # [img_seq_len, num_tiles, h_language]
 
-            # If running inference, the language model KV cache will be updated for media token positions.
-            # Here we store the media tokens sequence length, which can be used as an offset to the KV cache later.
+            # TODO: Support batched inference.
+            # In inference, the language model KV cache will be updated for image token positions.
+            # Store the image tokens sequence length to be used as an offset to the KV cache later.
             if inference_params is not None:
                 inference_params.key_value_memory_dict["media_tokens_count"] = (
                     media_embeddings.shape[0] * media_embeddings.shape[1]
@@ -569,40 +527,61 @@ def forward(
             media_embeddings = self.encoder_hidden_state
 
         if not self.add_decoder:
-            return media_embeddings, loss_mask
+            return media_embeddings
 
         language_embeddings = None
         if self.pre_process:
             input_ids_text = input_ids.clone()
             # MultiModal Token indices are assumed to be values
             input_ids_text[input_ids_text < 0] = 0
-            # Note: This adds absolute position embedding but not RoPE. Each image is counted as one position.
-            # RoPE is added in language_model forward call. Each image embedding is one position.
+            # Note: This adds absolute position embedding but not RoPE.
+            # Each image embedding is counted as one position.
+            # RoPE is added in the language_model forward call.
+            if self.sequence_parallel_lm:
+                # Pad to nearest multiple of TP world size for embedding.
+ tp_world_size = ps.get_tensor_model_parallel_world_size() + padded_seq_len = ( + int((input_ids_text.shape[1] + tp_world_size - 1) // tp_world_size * tp_world_size) + - input_ids_text.shape[1] + ) + if padded_seq_len != 0: + input_ids_text = torch.nn.functional.pad(input_ids_text, (0, padded_seq_len)) + if position_ids is not None: + position_ids = torch.nn.functional.pad(position_ids, (0, padded_seq_len)) language_embeddings = self.language_model.embedding( input_ids=input_ids_text, position_ids=position_ids ) # [text_seq_len, b, h_language] + if self.sequence_parallel_lm: + # Gather the language embeddings back. + # We use the full embedding to insert image embeddings + # and then scatter to avoid load imbalance. + language_embeddings = gather_from_sequence_parallel_region( + language_embeddings, tensor_parallel_output_grad=False + ) + # Remove the padding done for SP as we'll need new padding calculation + # after image embeddings are inserted. + if padded_seq_len != 0: + language_embeddings = language_embeddings[:-padded_seq_len] language_embeddings = language_embeddings.transpose(1, 0).contiguous() # [b, text_seq_len, h_language] - if media is None: - combined_embeddings = language_embeddings.transpose(1, 0).contiguous() - final_labels = labels - final_loss_mask = loss_mask - else: - # Assume 1 tile per image if the number of tiles is not provided. - if num_media_tiles is None: - num_media_tiles = torch.ones(media.shape[0], dtype=torch.int, device=input_ids.device) - - # Preprocess input, labels and loss mask. - combined_embeddings, final_labels, final_loss_mask = self._preprocess_data( - media_embeddings, - language_embeddings, - input_ids, - loss_mask, - labels, - use_inference_kv_cache, - media_token_index, - num_media_tiles, - ) # [combined_seq_len, b, h_language], [b, combined_seq_len], [b, combined_seq_len] + # Assume 1 tile per image if the number of tiles is not provided. + if num_media_tiles is None: + num_media_tiles = torch.ones(media.shape[0], dtype=torch.int, device=input_ids.device) + elif isinstance(num_media_tiles, list): + num_media_tiles = torch.tensor(num_media_tiles, dtype=torch.int, device=input_ids.device) + + # Preprocess input, labels and loss mask. 
+ combined_embeddings, final_labels, final_loss_mask, final_attention_mask = self._preprocess_data( + media_embeddings, + language_embeddings, + input_ids, + loss_mask, + labels, + use_inference_kv_cache, + media_token_index, + num_media_tiles, + attention_mask, + ) # [combined_seq_len, b, h_language], [b, combined_seq_len], [b, combined_seq_len] output = self.language_model( input_ids=None, @@ -611,6 +590,7 @@ def forward( decoder_input=combined_embeddings, labels=final_labels, inference_params=inference_params, + runtime_gather_output=runtime_gather_output, ) if labels is None or loss_mask is None: @@ -618,6 +598,23 @@ def forward( return output, final_loss_mask.contiguous() + def set_input_tensor(self, input_tensor) -> None: + """Set model chunk input tensor.""" + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for llava' + + if self.add_encoder and self.add_decoder: + self.vision_model.set_input_tensor(input_tensor[0]) + elif self.add_encoder: + self.vision_model.set_input_tensor(input_tensor[0]) + elif self.pre_process: + self.encoder_hidden_state = input_tensor[0] + else: + self.language_model.set_input_tensor(input_tensor[0]) + class NevaModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin): def __init__( @@ -649,6 +646,7 @@ def forward( media: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, inference_params: InferenceParams = None, + num_media_tiles: Optional[List[int]] = None, ) -> torch.Tensor: output_tensor = self.module( media=media, @@ -658,6 +656,7 @@ def forward( attention_mask=attention_mask, labels=labels, inference_params=inference_params, + num_media_tiles=num_media_tiles, ) return output_tensor @@ -697,6 +696,4 @@ def validation_loss_reduction(self) -> MaskedTokenLossReductionWithLossMask: "NevaConfig", "neva_data_step", "neva_forward_step", - "transformer_engine_layer_spec", - "local_layer_spec", ] diff --git a/nemo/collections/vlm/neva/model/llava.py b/nemo/collections/vlm/neva/model/llava.py index 52b55b6f9c2d..5e02b4f9e9d7 100644 --- a/nemo/collections/vlm/neva/model/llava.py +++ b/nemo/collections/vlm/neva/model/llava.py @@ -43,7 +43,7 @@ class LlavaConfig(NevaConfig): @dataclass -class Llava1_5Config7B(LlavaConfig): +class Llava15Config7B(LlavaConfig): from transformers import PretrainedConfig language_transformer_config: TransformerConfig = field(default_factory=lambda: Llama2Config7B()) @@ -56,7 +56,7 @@ class Llava1_5Config7B(LlavaConfig): @dataclass -class Llava1_5Config13B(LlavaConfig): +class Llava15Config13B(LlavaConfig): from transformers import PretrainedConfig language_transformer_config: TransformerConfig = field(default_factory=lambda: Llama2Config13B()) @@ -111,7 +111,6 @@ def convert_state(self, source, target): "language_model.model.layers.*.post_attention_layernorm.weight": "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight", "language_model.model.norm.weight": "language_model.decoder.final_layernorm.weight", "language_model.lm_head.weight": "language_model.output_layer.weight", - "vision_tower.vision_model.**": "vision_model.vision_model.**", } if "vision_projection.encoder.linear_fc1.weight" in target.module.state_dict().keys(): mapping.update( @@ -134,7 +133,45 @@ def convert_state(self, source, target): else: raise KeyError("Unable to map vision projection keys.") - return 
io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) + if "vision_model.vision_model.embeddings.class_embedding" in target.module.state_dict().keys(): + mapping.update( + { + "vision_tower.vision_model.**": "vision_model.vision_model.**", + } + ) + elif "vision_model.class_token" in target.module.state_dict().keys(): + mapping.update( + { + "vision_tower.vision_model.embeddings.patch_embedding.weight": "vision_model.conv1.weight", + "vision_tower.vision_model.embeddings.position_embedding.weight": "vision_model.position_embeddings.weight", + "vision_tower.vision_model.encoder.layers.*.layer_norm1.weight": "vision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "vision_tower.vision_model.encoder.layers.*.layer_norm1.bias": "vision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_bias", + "vision_tower.vision_model.encoder.layers.*.layer_norm2.weight": "vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "vision_tower.vision_model.encoder.layers.*.layer_norm2.bias": "vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_bias", + "vision_tower.vision_model.encoder.layers.*.self_attn.out_proj.weight": "vision_model.decoder.layers.*.self_attention.linear_proj.weight", + "vision_tower.vision_model.encoder.layers.*.self_attn.out_proj.bias": "vision_model.decoder.layers.*.self_attention.linear_proj.bias", + "vision_tower.vision_model.encoder.layers.*.mlp.fc1.weight": "vision_model.decoder.layers.*.mlp.linear_fc1.weight", + "vision_tower.vision_model.encoder.layers.*.mlp.fc1.bias": "vision_model.decoder.layers.*.mlp.linear_fc1.bias", + "vision_tower.vision_model.encoder.layers.*.mlp.fc2.weight": "vision_model.decoder.layers.*.mlp.linear_fc2.weight", + "vision_tower.vision_model.encoder.layers.*.mlp.fc2.bias": "vision_model.decoder.layers.*.mlp.linear_fc2.bias", + "vision_tower.vision_model.pre_layrnorm.weight": "vision_model.ln_pre.weight", + "vision_tower.vision_model.pre_layrnorm.bias": "vision_model.ln_pre.bias", + } + ) + else: + raise KeyError("Unable to map vision encoder keys.") + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[ + _import_language_qkv, + _import_vision_qkv, + _import_vision_qkv_bias, + _import_cls_token, + _import_linear_fc1, + ], + ) @property def tokenizer(self) -> "AutoTokenizer": @@ -183,80 +220,7 @@ def make_vocab_size_divisible_by(vocab_size): return output -@io.model_exporter(LlavaModel, "hf") -class HFLlavaExporter(io.ModelConnector[LlavaModel, "LlavaForConditionalGeneration"]): - def init(self) -> "LlavaForConditionalGeneration": - raise NotImplementedError("Neva Exporter hasn't been verified!") - - from transformers import AutoModelForCausalLM - - return AutoModelForCausalLM.from_config(self.config) - - def apply(self, output_path: Path) -> Path: - target = self.init() - source, _ = self.nemo_load(str(self)) - - target = self.convert_state(source, target) - - target = target.cpu() - target.save_pretrained(output_path) - self.tokenizer.save_pretrained(output_path) - - return output_path - - def convert_state(self, source, target): - mapping = { - "language_model.embedding.word_embeddings.weight": "language_model.model.embed_tokens.weight", - "language_model.decoder.layers.*.self_attention.linear_proj.weight": "language_model.model.layers.*.self_attn.o_proj.weight", - "language_model.decoder.layers.*.mlp.linear_fc2.weight": "language_model.model.layers.*.mlp.down_proj.weight", - 
"language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "language_model.model.layers.*.input_layernorm.weight", - "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "language_model.model.layers.*.post_attention_layernorm.weight", - "language_model.decoder.final_layernorm.weight": "language_model.model.norm.weight", - "language_model.output_layer.weight": "language_model.lm_head.weight", - } - - return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1]) - - @property - def tokenizer(self): - return io.load_context(str(self)).model.tokenizer.tokenizer - - @property - def config(self) -> "HFLlavaConfig": - source: LlavaConfig = io.load_context(str(self)).model.config - - from transformers import LlavaConfig as HFLlavaConfig - - return HFLlavaConfig( - num_hidden_layers=source.num_layers, - hidden_size=source.hidden_size, - intermediate_size=source.ffn_hidden_size, - num_attention_heads=source.num_attention_heads, - max_position_embeddings=source.seq_length, - initializer_range=source.init_method_std, - rms_norm_eps=source.layernorm_epsilon, - num_key_value_heads=source.num_query_groups, - rope_theta=source.rotary_base, - vocab_size=self.tokenizer.vocab_size, - ) - - -@io.state_transform( - source_key=( - "language_model.model.layers.*.self_attn.q_proj.weight", - "language_model.model.layers.*.self_attn.k_proj.weight", - "language_model.model.layers.*.self_attn.v_proj.weight", - ), - target_key="language_model.decoder.layers.*.self_attention.linear_qkv.weight", -) -def _import_qkv(ctx: io.TransformCTX, q, k, v): - megatron_config = ctx.target.config.language_transformer_config - head_num = megatron_config.num_attention_heads - num_query_groups = megatron_config.num_query_groups - heads_per_group = head_num // num_query_groups - hidden_size = megatron_config.hidden_size - head_size = megatron_config.kv_channels - +def import_qkv(q, k, v, head_num, num_query_groups, heads_per_group, hidden_size, head_size): old_tensor_shape = q.size() new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] @@ -282,59 +246,85 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): @io.state_transform( - source_key="language_model.decoder.layers.*.self_attention.linear_qkv.weight", - target_key=( + source_key=( "language_model.model.layers.*.self_attn.q_proj.weight", "language_model.model.layers.*.self_attn.k_proj.weight", "language_model.model.layers.*.self_attn.v_proj.weight", ), + target_key="language_model.decoder.layers.*.self_attention.linear_qkv.weight", ) -def _export_qkv(ctx: io.TransformCTX, linear_qkv): - megatron_config = ctx.source.config - - head_num = megatron_config.num_attention_heads - num_query_groups = megatron_config.num_query_groups - heads_per_group = head_num // num_query_groups - hidden_size = megatron_config.hidden_size - head_size = megatron_config.kv_channels - qkv_total_dim = head_num + 2 * num_query_groups - - linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) - q_slice = torch.cat( - [ - torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) - for i in range(num_query_groups) - ] +def _import_language_qkv(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config.language_transformer_config + return import_qkv( + q, + k, + v, + head_num=megatron_config.num_attention_heads, + num_query_groups=megatron_config.num_query_groups, + 
heads_per_group=megatron_config.num_attention_heads // megatron_config.num_query_groups, + hidden_size=megatron_config.hidden_size, + head_size=megatron_config.kv_channels, ) - k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) - v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) - q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu() - k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu() - v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu() - return q_proj, k_proj, v_proj +@io.state_transform( + source_key=( + "vision_tower.vision_model.encoder.layers.*.self_attn.q_proj.weight", + "vision_tower.vision_model.encoder.layers.*.self_attn.k_proj.weight", + "vision_tower.vision_model.encoder.layers.*.self_attn.v_proj.weight", + ), + target_key="vision_model.decoder.layers.*.self_attention.linear_qkv.weight", +) +def _import_vision_qkv(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config.vision_transformer_config + return import_qkv( + q, + k, + v, + head_num=megatron_config.num_attention_heads, + num_query_groups=megatron_config.num_query_groups, + heads_per_group=megatron_config.num_attention_heads // megatron_config.num_query_groups, + hidden_size=megatron_config.hidden_size, + head_size=megatron_config.kv_channels, + ) @io.state_transform( source_key=( - "language_model.model.layers.*.mlp.gate_proj.weight", - "language_model.model.layers.*.mlp.up_proj.weight", + "vision_tower.vision_model.encoder.layers.*.self_attn.q_proj.bias", + "vision_tower.vision_model.encoder.layers.*.self_attn.k_proj.bias", + "vision_tower.vision_model.encoder.layers.*.self_attn.v_proj.bias", ), - target_key="language_model.decoder.layers.*.mlp.linear_fc1.weight", + target_key="vision_model.decoder.layers.*.self_attention.linear_qkv.bias", ) -def _import_linear_fc1(down, gate): - return torch.cat((down, gate), axis=0) +def _import_vision_qkv_bias(ctx: io.TransformCTX, q_bias, k_bias, v_bias): + megatron_config = ctx.target.config.vision_transformer_config + return import_qkv( + q_bias.unsqueeze(-1), + k_bias.unsqueeze(-1), + v_bias.unsqueeze(-1), + head_num=megatron_config.num_attention_heads, + num_query_groups=megatron_config.num_query_groups, + heads_per_group=megatron_config.num_attention_heads // megatron_config.num_query_groups, + hidden_size=1, + head_size=megatron_config.kv_channels, + ).squeeze(-1) + + +@io.state_transform( + source_key=("vision_tower.vision_model.embeddings.class_embedding",), + target_key="vision_model.class_token", +) +def _import_cls_token(ctx: io.TransformCTX, cls_token): + return cls_token.reshape(1, 1, -1) @io.state_transform( - source_key="language_model.decoder.layers.*.mlp.linear_fc1.weight", - target_key=( + source_key=( "language_model.model.layers.*.mlp.gate_proj.weight", "language_model.model.layers.*.mlp.up_proj.weight", ), + target_key="language_model.decoder.layers.*.mlp.linear_fc1.weight", ) -def _export_linear_fc1(linear_fc1): - gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) - - return gate_proj, up_proj +def _import_linear_fc1(down, gate): + return torch.cat((down, gate), axis=0) diff --git a/nemo/collections/vlm/neva/model/vit_config.py b/nemo/collections/vlm/neva/model/vit_config.py new file mode 100644 index 000000000000..5d60a84313ca --- /dev/null +++ b/nemo/collections/vlm/neva/model/vit_config.py @@ -0,0 +1,79 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass + +from nemo.collections.llm.fn.activation import openai_gelu, quick_gelu + +from nemo.collections.vlm.neva.model.base import CLIPViTConfig + + +@dataclass +class CLIPViTL_14_336_Config(CLIPViTConfig): + """Clip vit large patch14 config""" + + vision_model_type = "clip" + patch_dim = 14 + img_h = 336 + img_w = 336 + num_layers = 24 + num_attention_heads = 16 + add_bias_linear = True + add_qkv_bias = True + hidden_size = 1024 + hidden_dropout = 0.0 + attention_dropout = 0.0 + ffn_hidden_size = 4096 + gated_linear_unit = False + activation_func = quick_gelu + kv_channels = 64 + num_query_groups = 16 + layernorm_zero_centered_gamma = False + apply_query_key_layer_scaling = False + bias_activation_fusion = False + bias_dropout_fusion = False + attention_softmax_in_fp32 = True + normalization = 'LayerNorm' + apply_rope_fusion = False + + +@dataclass +class SigLIPViT400M_14_384_Config(CLIPViTConfig): + """Siglip so400m patch14 384 config""" + + vision_model_type = "siglip" + patch_dim = 14 + img_h = 384 + img_w = 384 + num_layers = 27 + num_attention_heads = 16 + add_bias_linear = True + add_qkv_bias = True + hidden_size = 1152 + hidden_dropout = 0.0 + attention_dropout = 0.0 + ffn_hidden_size = 4304 + gated_linear_unit = False + activation_func = openai_gelu + kv_channels = 72 + num_query_groups = 16 + layernorm_zero_centered_gamma = False + apply_query_key_layer_scaling = False + bias_activation_fusion = False + bias_dropout_fusion = False + attention_softmax_in_fp32 = True + normalization = 'LayerNorm' + apply_rope_fusion = False + qk_layernorm = False + layernorm_epsilon = 1e-6 diff --git a/nemo/collections/vlm/peft/lora.py b/nemo/collections/vlm/peft/lora.py index 1e394daa8ead..7a80b7e06883 100644 --- a/nemo/collections/vlm/peft/lora.py +++ b/nemo/collections/vlm/peft/lora.py @@ -48,7 +48,7 @@ class LoRA(LLMLoRA): """ freeze_language_model: bool = True - freeze_vision_model: bool = False + freeze_vision_model: bool = True def freeze_model(self, model: nn.Module) -> None: modules = [] diff --git a/nemo/collections/vlm/recipes/__init__.py b/nemo/collections/vlm/recipes/__init__.py index 2b71ecc50f8f..ba8706437c56 100644 --- a/nemo/collections/vlm/recipes/__init__.py +++ b/nemo/collections/vlm/recipes/__init__.py @@ -13,9 +13,11 @@ # limitations under the License. -from nemo.collections.vlm.recipes import mllama_11b, mllama_90b +from nemo.collections.vlm.recipes import llava15_7b, llava15_13b, mllama_11b, mllama_90b __all__ = [ + "llava15_7b", + "llava15_13b", "mllama_11b", "mllama_90b", ] diff --git a/nemo/collections/vlm/recipes/llava15_13b.py b/nemo/collections/vlm/recipes/llava15_13b.py new file mode 100644 index 000000000000..97b77b82d3de --- /dev/null +++ b/nemo/collections/vlm/recipes/llava15_13b.py @@ -0,0 +1,150 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional
+
+import nemo_run as run
+import pytorch_lightning as pl
+import torch
+
+from nemo import lightning as nl
+from nemo.collections import llm, vlm
+from nemo.collections.llm.recipes.finetune_default import nemo_resume
+from nemo.collections.llm.recipes.log.default import tensorboard_logger
+from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
+from nemo.collections.vlm.neva.data.mock import MockDataModule
+
+NAME = "llava15_13b"
+
+
+@run.cli.factory(name=NAME)
+def model() -> run.Config[pl.LightningModule]:
+    """
+    Factory function to create a Llava 1.5 13B model configuration.
+
+    Returns:
+        run.Config[pl.LightningModule]: Configuration for the Llava 1.5 13B model.
+
+    Examples:
+        CLI usage:
+            $ nemo llm pretrain model=llava15_13b ...
+
+        Python API usage:
+            >>> model_config = model()
+            >>> print(model_config)
+    """
+    return run.Config(vlm.LlavaModel, config=run.Config(vlm.Llava15Config13B))
+
+
+@run.cli.factory(target=llm.finetune, name=NAME)
+def finetune_recipe(
+    dir: Optional[str] = None,
+    name: str = "default",
+    num_nodes: int = 1,
+    num_gpus_per_node: int = 8,
+    peft_scheme: Optional[str] = 'lora',
+) -> run.Partial:
+    """
+    Create a fine-tuning recipe for the Llava 1.5 13B model.
+
+    This function sets up a complete configuration for fine-tuning, including
+    model, trainer, data, logging, optimization, and resumption settings.
+    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.
+
+    Args:
+        dir (Optional[str]): Directory for saving logs and checkpoints.
+        name (str): Name of the fine-tuning run.
+        num_nodes (int): Number of compute nodes to use.
+        num_gpus_per_node (int): Number of GPUs per node.
+
+    Returns:
+        run.Partial: Partial configuration for fine-tuning.
+
+    Examples:
+        CLI usage:
+            $ nemo llm finetune --factory llava15_13b
+
+        Python API usage:
+            >>> recipe = finetune_recipe(name="llava15_13b_finetune", num_nodes=1)
+            >>> print(recipe)
+
+    Note:
+        This recipe uses a mock data module for fine-tuning. For more information
+        on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
+        `examples/llm/finetune/` directory.
+    """
+
+    strategy = run.Config(
+        nl.MegatronStrategy,
+        tensor_model_parallel_size=1,
+        pipeline_model_parallel_size=1,
+        encoder_pipeline_model_parallel_size=0,
+        pipeline_dtype=torch.bfloat16,
+    )
+
+    trainer = run.Config(
+        nl.Trainer,
+        accelerator="gpu",
+        accumulate_grad_batches=1,
+        devices=num_gpus_per_node,
+        limit_val_batches=10,
+        log_every_n_steps=1,
+        max_steps=5190,
+        num_nodes=num_nodes,
+        plugins=bf16_mixed(),
+        strategy=strategy,
+        val_check_interval=1000,
+    )
+
+    recipe = run.Partial(
+        llm.finetune,
+        model=model(),
+        trainer=trainer,
+        data=run.Config(
+            MockDataModule,
+            seq_length=4096,
+            global_batch_size=128,
+            micro_batch_size=1,
+            tokenizer=None,
+            image_processor=None,
+            num_workers=4,
+        ),
+        log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
+        optim=distributed_fused_adam_with_cosine_annealing(max_lr=2.0e-05, min_lr=2.0e-07, warmup_steps=150),
+        resume=nemo_resume("llava-hf/llava-1.5-13b-hf"),
+    )
+
+    if peft_scheme is None or peft_scheme.lower() == 'none':
+        recipe.trainer.strategy.tensor_model_parallel_size = 2
+        recipe.optim.config.lr = 2e-05
+    elif peft_scheme.lower() == 'lora':
+        recipe.peft = run.Config(
+            vlm.LoRA,
+            freeze_vision_model=False,
+            target_modules=[
+                "*.language_model.*.linear_qkv",
+                "*.language_model.*.linear_q",
+                "*.language_model.*.linear_kv",
+                "*.language_model.*.linear_proj",
+                "*.language_model.*.linear_fc1",
+                "*.language_model.*.linear_fc2",
+            ],
+        )
+        recipe.optim.config.lr = 1e-4
+    else:
+        raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
+
+    return recipe
diff --git a/nemo/collections/vlm/recipes/llava15_7b.py b/nemo/collections/vlm/recipes/llava15_7b.py
new file mode 100644
index 000000000000..04e6bd36f4d4
--- /dev/null
+++ b/nemo/collections/vlm/recipes/llava15_7b.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional
+
+import nemo_run as run
+import pytorch_lightning as pl
+import torch
+
+from nemo import lightning as nl
+from nemo.collections import llm, vlm
+from nemo.collections.llm.recipes.finetune_default import nemo_resume
+from nemo.collections.llm.recipes.log.default import tensorboard_logger
+from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
+from nemo.collections.vlm.neva.data.mock import MockDataModule
+from nemo.utils.exp_manager import TimingCallback
+
+NAME = "llava15_7b"
+
+
+@run.cli.factory(name=NAME)
+def model() -> run.Config[pl.LightningModule]:
+    """
+    Factory function to create a Llava 1.5 7B model configuration.
+
+    Returns:
+        run.Config[pl.LightningModule]: Configuration for the Llava 1.5 7B model.
+
+    Examples:
+        CLI usage:
+            $ nemo llm pretrain model=llava15_7b ...
+
+        Python API usage:
+            >>> model_config = model()
+            >>> print(model_config)
+    """
+    return run.Config(vlm.LlavaModel, config=run.Config(vlm.Llava15Config7B))
+
+
+@run.cli.factory(target=llm.finetune, name=NAME)
+def finetune_recipe(
+    dir: Optional[str] = None,
+    name: str = "default",
+    num_nodes: int = 1,
+    num_gpus_per_node: int = 8,
+    peft_scheme: Optional[str] = 'none',
+) -> run.Partial:
+    """
+    Create a fine-tuning recipe for the Llava 1.5 7B model.
+
+    This function sets up a complete configuration for fine-tuning, including
+    model, trainer, data, logging, optimization, and resumption settings.
+    LoRA (Low-Rank Adaptation) can be enabled for efficient fine-tuning by setting peft_scheme to 'lora'.
+
+    Args:
+        dir (Optional[str]): Directory for saving logs and checkpoints.
+        name (str): Name of the fine-tuning run.
+        num_nodes (int): Number of compute nodes to use.
+        num_gpus_per_node (int): Number of GPUs per node.
+
+    Returns:
+        run.Partial: Partial configuration for fine-tuning.
+
+    Examples:
+        CLI usage:
+            $ nemo llm finetune --factory llava15_7b
+
+        Python API usage:
+            >>> recipe = finetune_recipe(name="llava15_7b_finetune", num_nodes=1)
+            >>> print(recipe)
+
+    Note:
+        This recipe uses a mock data module for fine-tuning. For more information
+        on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
+        `examples/llm/finetune/` directory.
+    """
+
+    strategy = run.Config(
+        nl.MegatronStrategy,
+        tensor_model_parallel_size=1,
+        pipeline_model_parallel_size=1,
+        encoder_pipeline_model_parallel_size=0,
+        pipeline_dtype=torch.bfloat16,
+    )
+
+    trainer = run.Config(
+        nl.Trainer,
+        accelerator="gpu",
+        accumulate_grad_batches=1,
+        devices=num_gpus_per_node,
+        limit_val_batches=10,
+        log_every_n_steps=1,
+        max_steps=5190,
+        num_nodes=num_nodes,
+        plugins=bf16_mixed(),
+        strategy=strategy,
+        val_check_interval=1000,
+        callbacks=[run.Config(TimingCallback)],
+    )
+
+    recipe = run.Partial(
+        llm.finetune,
+        model=model(),
+        trainer=trainer,
+        data=run.Config(
+            MockDataModule,
+            seq_length=4096,
+            global_batch_size=128,
+            micro_batch_size=2,
+            tokenizer=None,
+            image_processor=None,
+            num_workers=4,
+        ),
+        log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
+        optim=distributed_fused_adam_with_cosine_annealing(max_lr=2.0e-05, min_lr=2.0e-07, warmup_steps=150),
+        resume=nemo_resume("llava-hf/llava-1.5-7b-hf"),
+    )
+
+    if peft_scheme is None or peft_scheme.lower() == 'none':
+        recipe.trainer.strategy.tensor_model_parallel_size = 2
+        recipe.optim.config.lr = 2e-05
+    elif peft_scheme.lower() == 'lora':
+        recipe.peft = run.Config(
+            vlm.LoRA,
+            freeze_vision_model=False,
+            target_modules=[
+                "*.language_model.*.linear_qkv",
+                "*.language_model.*.linear_q",
+                "*.language_model.*.linear_kv",
+                "*.language_model.*.linear_proj",
+                "*.language_model.*.linear_fc1",
+                "*.language_model.*.linear_fc2",
+            ],
+        )
+        recipe.optim.config.lr = 1e-4
+    else:
+        raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
+
+    return recipe
diff --git a/nemo/lightning/io/state.py b/nemo/lightning/io/state.py
index 6632768ec8dd..f2c26aa4d495 100644
--- a/nemo/lightning/io/state.py
+++ b/nemo/lightning/io/state.py
@@ -242,7 +242,12 @@ def __call__(self, ctx: TransformCTX) -> TransformCTX:
         source_matches_dict = {k: _match_keys(list(source_dict.keys()), v) for k, v in source_key_dict.items()}
         target_matches = _match_keys(list(target_dict.keys()), target_key)
         param_names = list(filter(lambda x: x in source_matches_dict, fn_params))
-        for layer_names_group in 
zip(*([source_matches_dict[v] for v in param_names] + [target_matches])): + source_matches = [ + source_matches_dict[v] if source_matches_dict[v].ndim > 0 else [source_matches_dict[v].item()] + for v in param_names + ] + target_matches = [target_matches if target_matches.ndim > 0 else [target_matches.item()]] + for layer_names_group in zip(*(source_matches + target_matches)): # Wrap in a list if it's a single layer (ie non-expert) if isinstance(layer_names_group[0], str): layer_names_group = [[x] for x in layer_names_group] From ffdccaf719510e27145b2592eb9c39c620446027 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Thu, 21 Nov 2024 19:08:06 +0100 Subject: [PATCH 44/48] Export & deploy updates (part II) (#11344) * Set debug level accordingly and use logger for some prints Signed-off-by: Jan Lasek * Param model_type is in fact required for vLLM deployment Signed-off-by: Jan Lasek * It's time to remove version check Signed-off-by: Jan Lasek * Start using v2 block manager (default) in vLLM Signed-off-by: Jan Lasek * Rename original NeMo SP tokenizer to solve UnboundLocalError Signed-off-by: Jan Lasek * Add output_generation_logits as Triton input for vLLM exporter Signed-off-by: Jan Lasek * Update requirements_vllm.txt Signed-off-by: Jan Lasek * Refer to NeMo tokenizers explicitely Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek --- .../trt_llm/nemo_ckpt_loader/nemo_file.py | 12 +++++------ .../trt_llm/qnemo/qnemo_to_tensorrt_llm.py | 9 ++------- nemo/export/trt_llm/qnemo/tokenizer_utils.py | 8 +++++--- nemo/export/vllm_exporter.py | 6 +++++- requirements/requirements_multimodal.txt | 2 +- requirements/requirements_vllm.txt | 20 ++++++++++++++++++- scripts/deploy/nlp/deploy_vllm_triton.py | 2 +- tests/export/nemo_export.py | 3 +++ 8 files changed, 42 insertions(+), 20 deletions(-) diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py index 9ace6425f533..f3cb73811af1 100644 --- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py +++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py @@ -318,14 +318,14 @@ def build_tokenizer(tokenizer): tokenizer.add_special_tokens({"eos_token": ""}) else: # For NeMo tokenizers, monkey patch encode & batch_decode methods for unified interface - from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer, TokenizerSpec + import nemo.collections.common.tokenizers as nemo_tokenizers - if isinstance(tokenizer, TokenizerSpec): - if isinstance(tokenizer, AutoTokenizer): + if isinstance(tokenizer, nemo_tokenizers.TokenizerSpec): + if isinstance(tokenizer, nemo_tokenizers.AutoTokenizer): # Unwrap the original methods of HF tokenizer batch_decode = tokenizer.tokenizer.batch_decode encode = tokenizer.tokenizer.encode - elif isinstance(tokenizer, SentencePieceTokenizer): + elif isinstance(tokenizer, nemo_tokenizers.SentencePieceTokenizer): # Define HF equivalents based on available SP methods def batch_decode(self, ids): if torch.is_tensor(ids): @@ -340,8 +340,8 @@ def batch_decode(self, ids): tokenizer.bos_token_id = tokenizer.bos_id tokenizer.eos_token_id = tokenizer.eos_id - TokenizerSpec.encode = encode - TokenizerSpec.batch_decode = batch_decode + nemo_tokenizers.TokenizerSpec.encode = encode + nemo_tokenizers.TokenizerSpec.batch_decode = batch_decode return tokenizer diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py index eac1ab743849..f601c8cb1c5a 100644 --- 
a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
+++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
@@ -18,7 +18,6 @@
 import warnings
 from typing import List, Optional
 
-import tensorrt_llm
 from tensorrt_llm.models import PretrainedConfig
 
 from nemo.export.trt_llm.qnemo.utils import CONFIG_NAME, WEIGHTS_NAME
@@ -51,7 +50,7 @@ def qnemo_to_tensorrt_llm(
 
     warnings.warn(
         "Note that setting tensor_parallel_size, pipeline_parallel_size and use_parallel_embedding "
-        " parameters for quantized models is done on calibration step with nemo.export.quantize module."
+        " parameters for quantized models is done at the calibration step (in the PTQ workflow)."
         " These parameters are ignored when building and running TensorRT-LLM engine below.",
         UserWarning,
         stacklevel=3,
@@ -93,11 +92,7 @@ def qnemo_to_tensorrt_llm(
     build_cmd += f"--remove_input_padding {'enable' if remove_input_padding else 'disable'} "
     build_cmd += f"--multiple_profiles {'enable' if multiple_profiles else 'disable'} "
     build_cmd += f"--reduce_fusion {'enable' if reduce_fusion else 'disable'} "
-    # TODO: resolve version check for setting use_fused_mlp once we move to 0.13.0 in the NeMo container
-    if tensorrt_llm.__version__ >= "0.13.0":
-        build_cmd += f"--use_fused_mlp {'enable' if use_fused_mlp else 'disable'} "
-    else:
-        build_cmd += "--use_fused_mlp " if use_fused_mlp else ""
+    build_cmd += f"--use_fused_mlp {'enable' if use_fused_mlp else 'disable'} "
 
     if not use_qdq:
         build_cmd += f"--gemm_plugin auto "
diff --git a/nemo/export/trt_llm/qnemo/tokenizer_utils.py b/nemo/export/trt_llm/qnemo/tokenizer_utils.py
index beca40bcd3d7..5a6b6280d7c1 100644
--- a/nemo/export/trt_llm/qnemo/tokenizer_utils.py
+++ b/nemo/export/trt_llm/qnemo/tokenizer_utils.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging import os from omegaconf import OmegaConf @@ -24,22 +25,23 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.yaml" TOKENIZER_DIR = "tokenizer" +LOGGER = logging.getLogger("NeMo") def get_nmt_tokenizer(nemo_checkpoint_path: str): """Build tokenizer from Nemo tokenizer config.""" - print(f"Initializing tokenizer from {TOKENIZER_CONFIG_FILE}") + LOGGER.info(f"Initializing tokenizer from {TOKENIZER_CONFIG_FILE}") tokenizer_cfg = OmegaConf.load(os.path.join(nemo_checkpoint_path, TOKENIZER_CONFIG_FILE)) library = tokenizer_cfg.library legacy = tokenizer_cfg.get("sentencepiece_legacy", library == "sentencepiece") if library == "huggingface": - print(f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_cfg.type}") + LOGGER.info(f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_cfg.type}") tokenizer = AutoTokenizer.from_pretrained(tokenizer_cfg["type"], use_fast=tokenizer_cfg.get("use_fast", False)) elif library == "sentencepiece": - print(f"Getting SentencePieceTokenizer with model: {tokenizer_cfg.model}") + LOGGER.info(f"Getting SentencePieceTokenizer with model: {tokenizer_cfg.model}") tokenizer = SentencePieceTokenizer( model_path=os.path.join(nemo_checkpoint_path, tokenizer_cfg.model), legacy=legacy ) diff --git a/nemo/export/vllm_exporter.py b/nemo/export/vllm_exporter.py index 0ce7d49126d3..97575058bd1c 100644 --- a/nemo/export/vllm_exporter.py +++ b/nemo/export/vllm_exporter.py @@ -222,7 +222,6 @@ def export( max_num_seqs=256, # Note: max_model_len can be derived by model_config if the input value is None max_model_len=model_config.max_model_len, - use_v2_block_manager=False, num_lookahead_slots=0, delay_factor=0.0, enable_chunked_prefill=False, @@ -403,6 +402,7 @@ def get_triton_input(self): Tensor(name="top_p", shape=(-1,), dtype=numpy.single, optional=True), Tensor(name="temperature", shape=(-1,), dtype=numpy.single, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), + Tensor(name="output_generation_logits", shape=(-1,), dtype=numpy.bool_, optional=True), ) return inputs @@ -455,6 +455,7 @@ def forward( prompt_embeddings_checkpoint_path: Optional[str] = None, streaming: bool = False, output_log_probs: bool = False, + output_generation_logits: bool = False, ) -> Union[List[List[str]], Iterable[List[List[str]]]]: """ The forward function performs LLM evaluation on the provided array of prompts with other parameters shared, @@ -484,6 +485,9 @@ def forward( if output_log_probs: raise NotImplementedError("output_log_probs is not supported") + if output_generation_logits: + raise NotImplementedError("output_generation_logits is not supported") + request_ids = [] for index in range(len(input_texts)): prompt = input_texts[index] diff --git a/requirements/requirements_multimodal.txt b/requirements/requirements_multimodal.txt index 18abe82c9f96..aa33b3b55127 100644 --- a/requirements/requirements_multimodal.txt +++ b/requirements/requirements_multimodal.txt @@ -5,7 +5,7 @@ diffusers>=0.19.3 einops_exts imageio kornia -megatron-energon +megatron-energon<3.0.0 nerfacc>=0.5.3 open_clip_torch==2.24.0 PyMCubes diff --git a/requirements/requirements_vllm.txt b/requirements/requirements_vllm.txt index 414e05078680..6f5c8880f632 100644 --- a/requirements/requirements_vllm.txt +++ b/requirements/requirements_vllm.txt @@ -1 +1,19 @@ -vllm==0.5.3.post1 +# Minimal set of NeMo requirements to run vLLM export & deployment in /opt/venv in a NeMo container +braceexpand +faiss-cpu +h5py +hydra-core>1.3,<=1.3.2 +ijson +jieba 
+lightning>2.2.1 +matplotlib>=3.3.2 +omegaconf<=2.3 +onnx>=1.7.0 +OpenCC +pangu +rouge_score +sacrebleu +scikit-learn +vllm==0.6.3 +webdataset>=0.2.86 +wget diff --git a/scripts/deploy/nlp/deploy_vllm_triton.py b/scripts/deploy/nlp/deploy_vllm_triton.py index ab9f13a1b8da..a3cf5e8ec762 100755 --- a/scripts/deploy/nlp/deploy_vllm_triton.py +++ b/scripts/deploy/nlp/deploy_vllm_triton.py @@ -41,7 +41,7 @@ def get_args(argv): "-mt", "--model_type", type=str, - required=False, + required=True, choices=["llama", "mistral", "mixtral", "starcoder2", "gemma"], help="Type of the model", ) diff --git a/tests/export/nemo_export.py b/tests/export/nemo_export.py index e929f2601022..16aca9ccea4b 100644 --- a/tests/export/nemo_export.py +++ b/tests/export/nemo_export.py @@ -849,6 +849,9 @@ def run_inference_tests(args): "Use the same value for --min_tps and --max_tps." ) + if args.debug: + LOGGER.setLevel(logging.DEBUG) + result_dic: Dict[int, Tuple[FunctionalResult, Optional[AccuracyResult]]] = {} if args.existing_test_models: From 376558045d0ff3df573145482cdd374da40b5191 Mon Sep 17 00:00:00 2001 From: Huiying Date: Thu, 21 Nov 2024 12:02:57 -0800 Subject: [PATCH 45/48] nemo2 peft merge (#11017) * initial draft Signed-off-by: HuiyingLi * refactor wip Signed-off-by: HuiyingLi * refac v2 WIP Signed-off-by: HuiyingLi * update address comments and add model dump Signed-off-by: HuiyingLi * remove merge script * move driver script Signed-off-by: HuiyingLi * format Signed-off-by: HuiyingLi * format Signed-off-by: HuiyingLi * Apply isort and black reformatting Signed-off-by: HuiyingLi * update with nemo2 main Signed-off-by: HuiyingLi * Apply isort and black reformatting Signed-off-by: HuiyingLi * cleanup import Signed-off-by: HuiyingLi * merge api v3 Signed-off-by: Huiying Li * cleanup Signed-off-by: Huiying Li * refac merge func to transform(by ChenCui) Signed-off-by: Huiying Li * read base model from io instead of user input and bug fix Signed-off-by: Huiying Li * Apply isort and black reformatting Signed-off-by: HuiyingLi * add docstring Signed-off-by: Huiying Li * refac Signed-off-by: Huiying Li * add test Signed-off-by: Huiying Li * Apply isort and black reformatting Signed-off-by: HuiyingLi * add trainer to model Signed-off-by: Huiying Li * add copyright Signed-off-by: Huiying Li * clean up Signed-off-by: Huiying Li --------- Signed-off-by: HuiyingLi Signed-off-by: HuiyingLi Signed-off-by: Huiying Li Co-authored-by: HuiyingLi --- .github/workflows/cicd-main.yml | 11 ++ nemo/collections/llm/peft/__init__.py | 4 +- nemo/collections/llm/peft/api.py | 123 ++++++++++++++++++++++- nemo/collections/llm/peft/lora.py | 40 ++++++++ tests/collections/llm/peft/lora_merge.py | 42 ++++++++ 5 files changed, 216 insertions(+), 4 deletions(-) create mode 100644 tests/collections/llm/peft/lora_merge.py diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 4d788fb866d8..1e148312a49e 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4319,7 +4319,18 @@ jobs: --mbs 1 \ --model mistral \ --dist-opt + + L2_NEMO_2_LoRA_MERGE: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NEMO_2_LoRA_MERGE') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + python tests/collections/llm/peft/lora_merge.py \ + --lora_checkpoint_path=/home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint/ \ + 
--output_path=/tmp/nemo2_lora_merge/${{ github.run_id }} L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact: needs: [cicd-test-container-setup] diff --git a/nemo/collections/llm/peft/__init__.py b/nemo/collections/llm/peft/__init__.py index 11511ffe72b7..1dcc070a5a97 100644 --- a/nemo/collections/llm/peft/__init__.py +++ b/nemo/collections/llm/peft/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo.collections.llm.peft.api import gpt_lora +from nemo.collections.llm.peft.api import gpt_lora, merge_lora from nemo.collections.llm.peft.dora import DoRA from nemo.collections.llm.peft.lora import LoRA @@ -23,4 +23,4 @@ "dora": DoRA, } -__all__ = ["LoRA", "DoRA", "gpt_lora", "PEFT_STR2CLS"] +__all__ = ["LoRA", "DoRA", "gpt_lora", "PEFT_STR2CLS", "merge_lora"] diff --git a/nemo/collections/llm/peft/api.py b/nemo/collections/llm/peft/api.py index 85c0ae6cae41..a089a6d17515 100644 --- a/nemo/collections/llm/peft/api.py +++ b/nemo/collections/llm/peft/api.py @@ -12,9 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo.collections.llm.peft.lora import LoRA +import json +from pathlib import Path +from typing import Tuple, Union + +import pytorch_lightning as pl +from megatron.core import dist_checkpointing +from pytorch_lightning.trainer.states import TrainerFn + +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.collections.llm.peft.lora import LoRA, LoRAMerge from nemo.collections.llm.utils import factory +from nemo.lightning import MegatronStrategy, Trainer, _strategy_lib, io +from nemo.lightning.ckpt_utils import ADAPTER_META_FILENAME, ckpt_to_context_subdir +from nemo.lightning.io.pl import TrainerContext, ckpt_to_weights_subdir +from nemo.lightning.pytorch.callbacks import PEFT from nemo.lightning.pytorch.callbacks.peft import PEFT +from nemo.lightning.pytorch.strategies.utils import RestoreConfig +from nemo.utils import logging @factory @@ -22,4 +37,108 @@ def gpt_lora() -> PEFT: return LoRA() -__all__ = ["gpt_lora"] +def merge_lora( + lora_checkpoint_path: str, + output_path: str, +) -> None: + """ + Merges the LoRA adapter weights into the base model's weights. + + Python Usage: + ```python + if __name__ == '__main__': + llm.peft.merge_lora( + lora_checkpoint_path=your_lora_checkpoint_path, + output_path=your_output_path, + ) + ``` + + Args: + lora_checkpoint_path: The path to the LoRA checkpoint. + output_path: The path to save the merged checkpoint. + + """ + from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed + + trainer = Trainer( + devices=1, + accelerator="cpu", + strategy=MegatronStrategy(ddp="pytorch", setup_optimizers=False, plugins=bf16_mixed()), + ) + + model, lora = _load_base_model_and_lora(lora_checkpoint_path) + _setup_trainer_and_restore_model_and_adapter(Path(lora_checkpoint_path), trainer, model, lora) + + lora_merge = LoRAMerge() + merged_model = lora_merge(trainer.strategy.megatron_parallel) + merged_weights = {k: v for k, v in merged_model.sharded_state_dict().items() if ".adapter." 
not in k} + _save_merged_weight(output_path, merged_weights, model, trainer) + + +def _load_base_model_and_lora(lora_checkpoint_path: Path) -> Tuple[pl.LightningModule, LoRA]: + model = io.load_context(ckpt_to_context_subdir(lora_checkpoint_path), "model") + model.model_transform, model.__io__.model_transform = None, None + model.config.bf16 = False + lora: Union[io.TrainerContext, LoRA] = io.load_context( + ckpt_to_context_subdir(lora_checkpoint_path), "model.model_transform" + ) + assert isinstance(lora, LoRA), "LoRA config not found in checkpoint" + return model, lora + + +def _setup_trainer_and_restore_model_and_adapter( + lora_checkpoint_path: Path, trainer: Trainer, model: pl.LightningModule, lora: LoRA +) -> None: + if ( + adapter_meta_path := ckpt_to_weights_subdir(lora_checkpoint_path, is_saving=False) / ADAPTER_META_FILENAME + ).exists(): + with open(adapter_meta_path, "r") as f: + metadata = json.load(f) + restore_config = RestoreConfig( + path=metadata["model_ckpt_path"], + load_model_state=True, + load_optim_state=False, + ) + else: + raise ValueError(f"Cannot find adapter meta file in {lora_checkpoint_path}") + + trainer.strategy.restore_config = restore_config + trainer.strategy._setup_optimizers = False + trainer.ckpt_path = None + trainer.strategy.connect(model) + trainer.strategy.setup_environment() + + if not model.state_dict(): + with _strategy_lib.megatron_cpu_init_context(model.config): + model.configure_model() + + trainer.strategy.setup(trainer) # load base model ckpt + trainer.state.fn = TrainerFn.TESTING + trainer.strategy.setup_megatron_parallel(trainer=trainer) + trainer.strategy.trainer = trainer + model.trainer = trainer + + lora(model) + adapter_sharded_state_dict = { + k: v for k, v in trainer.strategy.megatron_parallel.sharded_state_dict().items() if ".adapter." in k + } + adapter_state = trainer.strategy.checkpoint_io.load_checkpoint( + ckpt_to_weights_subdir(lora_checkpoint_path, is_saving=False), sharded_state_dict=adapter_sharded_state_dict + ) + trainer.strategy.load_model_state_dict(adapter_state, strict=False) + + +def _save_merged_weight(output_path: str, merged_weights: dict, model: pl.LightningModule, trainer: Trainer): + weight_path = ckpt_to_weights_subdir(output_path, is_saving=True) + Path(weight_path).mkdir(parents=True, exist_ok=True) + dist_checkpointing.save(merged_weights, str(ckpt_to_weights_subdir(output_path, is_saving=True))) + if hasattr(model.tokenizer, "save_pretrained"): + model.tokenizer.save_pretrained("/tmp/nemo_tokenizer") + model.tokenizer = AutoTokenizer("/tmp/nemo_tokenizer") + if hasattr(trainer.model, "__io__") and hasattr(trainer.model.tokenizer, '__io__'): + trainer.model.__io__.tokenizer = trainer.model.tokenizer.__io__ + TrainerContext.from_trainer(trainer).io_dump(ckpt_to_context_subdir(output_path), yaml_attrs=["model"]) + logging.info(f"Merged checkpoint saved to {output_path}") + + +__all__ = ["gpt_lora", "merge_lora"] diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index 55f807bf1f9c..57cdda3a2871 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -227,3 +227,43 @@ def wildcard_match(pattern, key): ) return AdapterParallelAdd(m, adapter) return m + + +class LoRAMerge(PEFT): + """ + Implements the LoRA weight merge for parameter-efficient fine-tuning. 
+ + Example: + -------- + >>> from nemo.collections.llm.peft.lora import LoRAMerge + >>> lora_merge = LoRAMerge() + >>> merged_model = lora_merge(trainer.strategy.megatron_parallel) + """ + + @torch.no_grad() + def transform(self, m: nn.Module, name=None, prefix=None): + """ + Merges the LoRA adapter with the base model weights. + + Args: + m (nn.Module): The module to apply LoRA merge to. + name (str, optional): Name of the module to merge. Defaults to None. + prefix (str, optional): Prefix for the module name. Defaults to None. + + Returns: + nn.Module: The modified module with the LoRA adapter merged into the base model weights. + """ + + if not isinstance(m, AdapterParallelAdd): + return m + logging.info(f'merging {(prefix if prefix else "") + "." + (name if name else "")}') + base_weight = m.to_wrap.weight + lora_weight = ( + m.adapter.alpha + / m.adapter.dim + * m.adapter.linear_out.weight.to(base_weight.device) + @ m.adapter.linear_in.weight.to(base_weight.device) + ) + merged_weight = base_weight + lora_weight + m.to_wrap.weight.data = merged_weight + return m diff --git a/tests/collections/llm/peft/lora_merge.py b/tests/collections/llm/peft/lora_merge.py new file mode 100644 index 000000000000..2ca7390ea7e6 --- /dev/null +++ b/tests/collections/llm/peft/lora_merge.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +from dataclasses import dataclass + +from nemo.collections import llm + + +@dataclass +class Llama3ConfigCI(llm.Llama3Config8B): + seq_length: int = 2048 + num_layers: int = 2 + hidden_size: int = 768 + ffn_hidden_size: int = 3072 + num_attention_heads: int = 8 + + +def get_args(): + parser = argparse.ArgumentParser(description='Merge LoRA weights with base LLM') + parser.add_argument('--lora_checkpoint_path', type=str, help="Path to finetuned LORA checkpoint") + parser.add_argument('--output_path', type=str, help="Path to save merged checkpoint") + return parser.parse_args() + + +if __name__ == '__main__': + args = get_args() + + llm.peft.merge_lora( + lora_checkpoint_path=args.lora_checkpoint_path, + output_path=args.output_path, + ) From ebc6a8898642fec7d2c7489c63f4855c4a04c6a6 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Thu, 21 Nov 2024 15:24:32 -0500 Subject: [PATCH 46/48] Add dora recipes (#11330) * add dora recipes Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * pylint Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- nemo/collections/llm/api.py | 2 +- nemo/collections/llm/recipes/baichuan2_7b.py | 12 +++--- nemo/collections/llm/recipes/chatglm3_6b.py | 12 +++--- .../llm/recipes/finetune_default.py | 40 +++++++++++++++++++ nemo/collections/llm/recipes/gemma2_27b.py | 12 +++--- nemo/collections/llm/recipes/gemma2_2b.py | 12 +++--- nemo/collections/llm/recipes/gemma2_9b.py | 12 +++--- nemo/collections/llm/recipes/gemma_2b.py | 12 +++--- nemo/collections/llm/recipes/gemma_7b.py | 12 +++--- nemo/collections/llm/recipes/llama31_405b.py | 14 ++++--- nemo/collections/llm/recipes/llama31_70b.py | 14 ++++--- nemo/collections/llm/recipes/llama31_8b.py | 12 +++--- nemo/collections/llm/recipes/llama3_70b.py | 14 ++++--- nemo/collections/llm/recipes/llama3_8b.py | 12 +++--- nemo/collections/llm/recipes/mistral_7b.py | 13 +++--- .../llm/recipes/mistral_nemo_12b.py | 15 ++++--- nemo/collections/llm/recipes/mixtral_8x22b.py | 17 ++++---- nemo/collections/llm/recipes/mixtral_8x7b.py | 17 ++++---- nemo/collections/llm/recipes/nemotron3_22b.py | 14 ++++--- nemo/collections/llm/recipes/nemotron3_4b.py | 12 +++--- nemo/collections/llm/recipes/nemotron3_8b.py | 15 +++---- nemo/collections/llm/recipes/nemotron4_15b.py | 14 ++++--- .../collections/llm/recipes/nemotron4_340b.py | 15 +++---- .../llm/recipes/phi3_mini_4k_instruct.py | 10 ++--- nemo/collections/llm/recipes/qwen2_1p5b.py | 12 +++--- nemo/collections/llm/recipes/qwen2_500m.py | 12 +++--- nemo/collections/llm/recipes/qwen2_72b.py | 12 +++--- nemo/collections/llm/recipes/qwen2_7b.py | 12 +++--- .../collections/llm/recipes/starcoder2_15b.py | 12 +++--- nemo/collections/llm/recipes/starcoder2_3b.py | 12 +++--- nemo/collections/llm/recipes/starcoder2_7b.py | 12 +++--- nemo/collections/llm/recipes/starcoder_15b.py | 9 +++-- nemo/collections/llm/recipes/t5_11b.py | 9 +++-- nemo/collections/llm/recipes/t5_220m.py | 9 +++-- nemo/collections/llm/recipes/t5_3b.py | 9 +++-- 35 files changed, 277 insertions(+), 177 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index aaef714ef738..4bafdd97ba21 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - +import json import os from copy import deepcopy from pathlib import Path diff --git a/nemo/collections/llm/recipes/baichuan2_7b.py b/nemo/collections/llm/recipes/baichuan2_7b.py index 823f6e07cd57..1350cbaa7edd 100644 --- a/nemo/collections/llm/recipes/baichuan2_7b.py +++ b/nemo/collections/llm/recipes/baichuan2_7b.py @@ -25,7 +25,7 @@ from nemo.collections.llm import Baichuan2Config7B, Baichuan2Model from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -254,8 +254,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -279,8 +281,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/chatglm3_6b.py b/nemo/collections/llm/recipes/chatglm3_6b.py index b6c640372074..2cd424ce5bf6 100644 --- a/nemo/collections/llm/recipes/chatglm3_6b.py +++ b/nemo/collections/llm/recipes/chatglm3_6b.py @@ -25,7 +25,7 @@ from nemo.collections.llm import ChatGLM3Config6B, ChatGLMModel from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -254,8 +254,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. 
+ peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -279,8 +281,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/finetune_default.py b/nemo/collections/llm/recipes/finetune_default.py index f05fd7cb2d13..e8af7f67bdbd 100644 --- a/nemo/collections/llm/recipes/finetune_default.py +++ b/nemo/collections/llm/recipes/finetune_default.py @@ -21,9 +21,11 @@ import nemo.lightning as nl from nemo.collections import llm from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs +from nemo.collections.llm.peft import DoRA, LoRA from nemo.collections.llm.recipes.log.default import tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks import PEFT def default_finetune_recipe( @@ -158,3 +160,41 @@ def nemo_resume(model_id: str) -> run.Config[nl.AutoResume]: nl.AutoResume, restore_config=run.Config(nl.RestoreConfig, path=f"nemo://{model_id}"), ) + + +@run.cli.factory(name='lora') +def lora() -> run.Config[PEFT]: + """ + Factory function to create a LoRA configuration. + + Returns: + run.Config[PEFT]: Configuration for the LoRA class. + + Examples: + CLI usage: + $ nemo llm finetune -f llama3_8b peft=lora + + Python API usage: + >>> lora_config = lora() + >>> print(lora_config) + """ + return run.Config(LoRA) + + +@run.cli.factory(name='dora') +def dora() -> run.Config[PEFT]: + """ + Factory function to create a DoRA configuration. + + Returns: + run.Config[PEFT]: Configuration for the DoRA class. + + Examples: + CLI usage: + $ nemo llm finetune -f llama3_8b peft=dora + + Python API usage: + >>> dora_config = dora() + >>> print(dora_config) + """ + return run.Config(DoRA) diff --git a/nemo/collections/llm/recipes/gemma2_27b.py b/nemo/collections/llm/recipes/gemma2_27b.py index 2025bd570503..d6b41c0a221c 100644 --- a/nemo/collections/llm/recipes/gemma2_27b.py +++ b/nemo/collections/llm/recipes/gemma2_27b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.gemma2 import gemma2_model, gemma2_trainer from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger @@ -191,8 +191,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. 
- packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -220,8 +222,8 @@ def finetune_recipe( recipe.optim.config.lr = 5e-6 recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.trainer.strategy.pipeline_model_parallel_size = 2 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.trainer.strategy.tensor_model_parallel_size = 4 recipe.optim.config.lr = 1e-4 else: diff --git a/nemo/collections/llm/recipes/gemma2_2b.py b/nemo/collections/llm/recipes/gemma2_2b.py index e1aa3ad4be86..138140d0515d 100644 --- a/nemo/collections/llm/recipes/gemma2_2b.py +++ b/nemo/collections/llm/recipes/gemma2_2b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.gemma2 import gemma2_model, gemma2_trainer from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger @@ -191,8 +191,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
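Every recipe touched by this patch swaps the direct LoRA import for a lookup in PEFT_STR2CLS, so 'dora' is accepted wherever 'lora' already was. The mapping itself lives in nemo.collections.llm.peft and is not shown in this diff; the sketch below is an assumption about its minimal shape, together with the dispatch logic the recipes repeat. Only the 'lora' and 'dora' keys are confirmed by the diffs.

# Illustrative sketch only; the real mapping is defined in nemo.collections.llm.peft
# and may contain more schemes than the two shown here.
from nemo.collections.llm.peft import DoRA, LoRA

PEFT_STR2CLS = {
    "lora": LoRA,
    "dora": DoRA,
}

def resolve_peft(peft_scheme):
    """Mirror of the dispatch used in the finetune recipes."""
    if peft_scheme is None or peft_scheme.lower() == "none":
        return None                         # full fine-tuning, no adapter
    if peft_scheme.lower() in PEFT_STR2CLS:
        return PEFT_STR2CLS[peft_scheme.lower()]
    raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")

Keeping the string-to-class lookup in one place is what lets each recipe's elif branch collapse to a membership test instead of growing a new branch per scheme.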
@@ -218,8 +220,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/gemma2_9b.py b/nemo/collections/llm/recipes/gemma2_9b.py index 8117102f1b75..c49ac0246307 100644 --- a/nemo/collections/llm/recipes/gemma2_9b.py +++ b/nemo/collections/llm/recipes/gemma2_9b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.gemma2 import gemma2_model, gemma2_trainer from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger @@ -191,8 +191,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -219,8 +221,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.optim.config.lr = 5e-6 recipe.trainer.strategy.tensor_model_parallel_size = 4 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/gemma_2b.py b/nemo/collections/llm/recipes/gemma_2b.py index 8798af436a9c..8bdf89696d56 100644 --- a/nemo/collections/llm/recipes/gemma_2b.py +++ b/nemo/collections/llm/recipes/gemma_2b.py @@ -24,7 +24,7 @@ from nemo.collections.llm import GemmaConfig2B, GemmaModel from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -253,8 +253,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. 
- packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -284,8 +286,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.context_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/gemma_7b.py b/nemo/collections/llm/recipes/gemma_7b.py index 0bfd62b33e9e..46c91e27575a 100644 --- a/nemo/collections/llm/recipes/gemma_7b.py +++ b/nemo/collections/llm/recipes/gemma_7b.py @@ -24,7 +24,7 @@ from nemo.collections.llm import GemmaConfig7B, GemmaModel from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -256,8 +256,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -287,8 +289,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index d71f3791a0af..5f08d82bd888 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -26,7 +26,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.llm.gpt.model.llama import Llama31Config405B, LlamaModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -266,7 +266,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. seq_length (int): Maximum number of tokens per microbatch. packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. By default, this value equals performance_mode. @@ -296,7 +297,7 @@ def finetune_recipe( if num_nodes is None: if peft_scheme is None or peft_scheme.lower() == 'none': num_nodes = 12 - elif peft_scheme.lower() == 'lora': + elif peft_scheme.lower() in ['lora', 'dora']: num_nodes = 3 recipe = default_finetune_recipe( @@ -307,8 +308,8 @@ def finetune_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = 14 recipe.data.global_batch_size = 6 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.peft.dim = 16 recipe.peft.alpha = 32 recipe.optim.config.use_distributed_optimizer = False @@ -348,7 +349,8 @@ def finetune_performance_optimizations( Args: recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added - peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for performance-optimized fine-tuning. 
diff --git a/nemo/collections/llm/recipes/llama31_70b.py b/nemo/collections/llm/recipes/llama31_70b.py index 37e809e5bc8f..3120fedd7923 100644 --- a/nemo/collections/llm/recipes/llama31_70b.py +++ b/nemo/collections/llm/recipes/llama31_70b.py @@ -26,7 +26,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.llm.gpt.model.llama import Llama31Config70B, LlamaModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -266,7 +266,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. seq_length (int): Maximum number of tokens per microbatch. packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. By default, this value equals performance_mode. @@ -300,7 +301,7 @@ def finetune_recipe( if num_nodes is None: if peft_scheme is None or peft_scheme.lower() == 'none': num_nodes = 4 - elif peft_scheme.lower() == 'lora': + elif peft_scheme.lower() in ['lora', 'dora']: num_nodes = 1 recipe = default_finetune_recipe( @@ -310,8 +311,8 @@ def finetune_recipe( recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.peft.dim = 16 recipe.peft.alpha = 32 recipe.optim.config.use_distributed_optimizer = False @@ -349,7 +350,8 @@ def finetune_performance_optimizations( Args: recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added - peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for performance-optimized fine-tuning. 
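With the Llama 3.1 recipes above updated, selecting DoRA from Python is a one-argument change. A hedged usage sketch follows; paths and run names are hypothetical, and the argument names follow the finetune_recipe docstrings in this patch:

# Illustrative only: build a Llama 3.1 70B fine-tuning recipe with DoRA instead of LoRA.
from nemo.collections.llm.recipes import llama31_70b

recipe = llama31_70b.finetune_recipe(
    dir="/checkpoints/llama31_70b_dora",  # hypothetical output directory
    name="llama31_70b_dora",
    num_nodes=1,
    num_gpus_per_node=8,
    peft_scheme="dora",                   # previously only 'lora' or 'none' were accepted
)

# The adapter is a run.Config, so its fields can still be overridden afterwards;
# dim=16 and alpha=32 are the defaults this recipe sets, repeated here only to
# show where an override would go.
recipe.peft.dim = 16
recipe.peft.alpha = 32

The same choice is exposed on the CLI through the lora() and dora() factories added to finetune_default.py, e.g. nemo llm finetune -f llama3_8b peft=dora.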
diff --git a/nemo/collections/llm/recipes/llama31_8b.py b/nemo/collections/llm/recipes/llama31_8b.py index 32a77ce076f2..62514940b678 100644 --- a/nemo/collections/llm/recipes/llama31_8b.py +++ b/nemo/collections/llm/recipes/llama31_8b.py @@ -26,7 +26,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.llm.gpt.model.llama import Llama31Config8B, LlamaModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -266,7 +266,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. seq_length (int): Maximum number of tokens per microbatch. packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. By default, this value equals performance_mode. @@ -303,8 +304,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.peft.dim = 8 recipe.peft.alpha = 16 recipe.optim.config.use_distributed_optimizer = False @@ -341,7 +342,8 @@ def finetune_performance_optimizations( Args: recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added - peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for performance-optimized fine-tuning. diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index 93aeb5c07dc1..8b61bff80e01 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -26,7 +26,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.llm.gpt.model.llama import Llama3Config70B, LlamaModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -263,7 +263,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. 
- peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. seq_length (int): Maximum number of tokens per microbatch. packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. By default, this value equals performance_mode. @@ -297,7 +298,7 @@ def finetune_recipe( if num_nodes is None: if peft_scheme is None or peft_scheme.lower() == 'none': num_nodes = 4 - elif peft_scheme.lower() == 'lora': + elif peft_scheme.lower() in ['lora', 'dora']: num_nodes = 1 recipe = default_finetune_recipe( @@ -307,8 +308,8 @@ def finetune_recipe( recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.peft.dim = 16 recipe.peft.alpha = 32 recipe.optim.config.use_distributed_optimizer = False @@ -346,7 +347,8 @@ def finetune_performance_optimizations( Args: recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added - peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for performance-optimized fine-tuning. diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 3e26f8fe1082..36b20c12ddb2 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -26,7 +26,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -250,7 +250,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. seq_length (int): Maximum number of tokens per microbatch. packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. By default, this value equals performance_mode. 
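The packed_sequence flag referenced throughout these docstrings folds several short training examples into one sequence of up to seq_length tokens for better step efficiency; for the Llama recipes it defaults to the value of performance_mode. A minimal, illustrative way to turn it on when building a recipe (the run name is hypothetical):

# Illustrative only: request sequence packing when building the Llama 3 8B recipe.
from nemo.collections.llm.recipes import llama3_8b

recipe = llama3_8b.finetune_recipe(
    name="llama3_8b_lora_packed",  # hypothetical run name
    peft_scheme="lora",
    packed_sequence=True,          # pack multiple short examples per sequence up to seq_length
)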
@@ -287,8 +288,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.peft.dim = 8 recipe.peft.alpha = 16 recipe.optim.config.use_distributed_optimizer = False @@ -325,7 +326,8 @@ def finetune_performance_optimizations( Args: recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added - peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for performance-optimized fine-tuning. diff --git a/nemo/collections/llm/recipes/mistral_7b.py b/nemo/collections/llm/recipes/mistral_7b.py index 3bc1e568185a..9e2d2e256fbe 100644 --- a/nemo/collections/llm/recipes/mistral_7b.py +++ b/nemo/collections/llm/recipes/mistral_7b.py @@ -24,9 +24,8 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -207,8 +206,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -237,8 +238,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/mistral_nemo_12b.py b/nemo/collections/llm/recipes/mistral_nemo_12b.py index 7d9fa1d792e9..a10f8ae804b8 100644 --- a/nemo/collections/llm/recipes/mistral_nemo_12b.py +++ b/nemo/collections/llm/recipes/mistral_nemo_12b.py @@ -24,9 +24,8 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mistral import MistralModel, MistralNeMoConfig12B -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -255,8 +254,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -285,8 +286,10 @@ def finetune_recipe( ) if peft_scheme is None or peft_scheme.lower() == 'none': recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config( + PEFT_STR2CLS[peft_scheme.lower()], target_modules=['linear_qkv', 'linear_proj'], dim=32 + ) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 16e6168e649b..ec1641a08d80 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -24,9 +24,8 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x22B, MixtralModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -227,7 +226,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: ), run.Config( MegatronCommOverlapCallback, - overlap_param_gather_with_optimizer_step=False, # Currently disabled due to an issue with checkpointing + overlap_param_gather_with_optimizer_step=False, # Currently disabled due to issue with checkpointing align_param_gather=True, ), ] @@ -259,8 +258,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given + maximum seq_length for better efficiency. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -286,8 +287,10 @@ def finetune_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 14 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config( + PEFT_STR2CLS[peft_scheme.lower()], target_modules=['linear_qkv', 'linear_proj'], dim=32 + ) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index 5fbb0ac22c61..d06e22fc2180 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -24,9 +24,8 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x7B, MixtralModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -222,7 +221,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: run.Config(MegatronTokenDropCallback), run.Config( MegatronCommOverlapCallback, - overlap_param_gather_with_optimizer_step=False, # Currently disabled due to an issue with checkpointing. + overlap_param_gather_with_optimizer_step=False, # Currently disabled due to issue with checkpointing. align_param_gather=True, ), ] @@ -254,8 +253,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -280,8 +281,10 @@ def finetune_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 8 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config( + PEFT_STR2CLS[peft_scheme.lower()], target_modules=['linear_qkv', 'linear_proj'], dim=32 + ) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/nemotron3_22b.py b/nemo/collections/llm/recipes/nemotron3_22b.py index 2dd9c3ff5205..4c763301bc52 100644 --- a/nemo/collections/llm/recipes/nemotron3_22b.py +++ b/nemo/collections/llm/recipes/nemotron3_22b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable, Optional +from typing import Optional import lightning.pytorch as pl import nemo_run as run @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer @@ -239,8 +239,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -265,8 +267,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/nemotron3_4b.py b/nemo/collections/llm/recipes/nemotron3_4b.py index c208ee740265..fc6f09a09358 100644 --- a/nemo/collections/llm/recipes/nemotron3_4b.py +++ b/nemo/collections/llm/recipes/nemotron3_4b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer @@ -191,8 +191,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -216,8 +218,8 @@ def finetune_recipe( ) if peft_scheme is None or peft_scheme.lower() == 'none': recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/nemotron3_8b.py b/nemo/collections/llm/recipes/nemotron3_8b.py index 7799512c6260..f60463330cad 100644 --- a/nemo/collections/llm/recipes/nemotron3_8b.py +++ b/nemo/collections/llm/recipes/nemotron3_8b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Callable, Optional +from typing import Optional import lightning.pytorch as pl import nemo_run as run @@ -21,8 +21,7 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer @@ -256,8 +255,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -282,8 +283,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/nemotron4_15b.py b/nemo/collections/llm/recipes/nemotron4_15b.py index ad0f884b0d3b..49f92fcc1616 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b.py +++ b/nemo/collections/llm/recipes/nemotron4_15b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable, Optional +from typing import Optional import lightning.pytorch as pl import nemo_run as run @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer @@ -228,8 +228,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. 
+ packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -254,8 +256,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 4 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/nemotron4_340b.py b/nemo/collections/llm/recipes/nemotron4_340b.py index b22abc43d558..14d4c0f32d11 100644 --- a/nemo/collections/llm/recipes/nemotron4_340b.py +++ b/nemo/collections/llm/recipes/nemotron4_340b.py @@ -12,16 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable, Optional +from typing import Optional import lightning.pytorch as pl import nemo_run as run import torch -from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer @@ -240,8 +239,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -268,8 +269,8 @@ def finetune_recipe( recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.trainer.strategy.pipeline_model_parallel_size = 12 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.optim.config.lr = 1e-4 diff --git a/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py b/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py index 1cbc877dc33e..73bbe4735adb 100644 --- a/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py +++ b/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py @@ -25,7 +25,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.llm.gpt.model.phi3mini import Phi3ConfigMini, Phi3Model -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -222,8 +222,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', - 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. seq_length (int): Maximum number of tokens per microbatch. packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. By default, this value equals performance_mode. @@ -260,8 +260,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 1 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.peft.dim = 8 recipe.peft.alpha = 16 recipe.optim.config.use_distributed_optimizer = False diff --git a/nemo/collections/llm/recipes/qwen2_1p5b.py b/nemo/collections/llm/recipes/qwen2_1p5b.py index a3d705c4fb3a..99ba5cd907fc 100644 --- a/nemo/collections/llm/recipes/qwen2_1p5b.py +++ b/nemo/collections/llm/recipes/qwen2_1p5b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -194,8 +194,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. 
- peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -218,8 +220,8 @@ def finetune_recipe( ) if peft_scheme is None or peft_scheme.lower() == 'none': recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/qwen2_500m.py b/nemo/collections/llm/recipes/qwen2_500m.py index 08541ca9e421..96d99c271c85 100644 --- a/nemo/collections/llm/recipes/qwen2_500m.py +++ b/nemo/collections/llm/recipes/qwen2_500m.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -194,8 +194,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -218,8 +220,8 @@ def finetune_recipe( ) if peft_scheme is None or peft_scheme.lower() == 'none': recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/qwen2_72b.py b/nemo/collections/llm/recipes/qwen2_72b.py index c0bc9bf40611..33bb0dd40835 100644 --- a/nemo/collections/llm/recipes/qwen2_72b.py +++ b/nemo/collections/llm/recipes/qwen2_72b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -194,8 +194,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -221,8 +223,8 @@ def finetune_recipe( recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.optim.config.lr = 1e-4 else: diff --git a/nemo/collections/llm/recipes/qwen2_7b.py b/nemo/collections/llm/recipes/qwen2_7b.py index 67bcc5e953bf..2e62176a408e 100644 --- a/nemo/collections/llm/recipes/qwen2_7b.py +++ b/nemo/collections/llm/recipes/qwen2_7b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -194,8 +194,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. 
- packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -219,8 +221,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/starcoder2_15b.py b/nemo/collections/llm/recipes/starcoder2_15b.py index 14b53809111a..e424cb67dba4 100644 --- a/nemo/collections/llm/recipes/starcoder2_15b.py +++ b/nemo/collections/llm/recipes/starcoder2_15b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -194,8 +194,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -219,8 +221,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 4 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/starcoder2_3b.py b/nemo/collections/llm/recipes/starcoder2_3b.py index 3ee81522ebc9..faf0b416c56a 100644 --- a/nemo/collections/llm/recipes/starcoder2_3b.py +++ b/nemo/collections/llm/recipes/starcoder2_3b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -194,8 +194,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -219,8 +221,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/starcoder2_7b.py b/nemo/collections/llm/recipes/starcoder2_7b.py index 96b5ab36b876..091e882cd932 100644 --- a/nemo/collections/llm/recipes/starcoder2_7b.py +++ b/nemo/collections/llm/recipes/starcoder2_7b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -194,8 +194,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. 
- packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -219,8 +221,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/starcoder_15b.py b/nemo/collections/llm/recipes/starcoder_15b.py index d87788be5613..382d0eb4d8ca 100644 --- a/nemo/collections/llm/recipes/starcoder_15b.py +++ b/nemo/collections/llm/recipes/starcoder_15b.py @@ -23,7 +23,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.model.starcoder import StarcoderConfig15B, StarcoderModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -280,7 +280,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for fine-tuning. @@ -302,8 +303,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.pipeline_model_parallel_size = 8 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/t5_11b.py b/nemo/collections/llm/recipes/t5_11b.py index 8baf54b4f42f..ee7323aa044f 100644 --- a/nemo/collections/llm/recipes/t5_11b.py +++ b/nemo/collections/llm/recipes/t5_11b.py @@ -24,7 +24,7 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_trainer, nemo_resume from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed @@ -229,7 +229,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. 
num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for fine-tuning. @@ -279,8 +280,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 4 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/t5_220m.py b/nemo/collections/llm/recipes/t5_220m.py index 27feb43837fb..edc9fdba62d7 100644 --- a/nemo/collections/llm/recipes/t5_220m.py +++ b/nemo/collections/llm/recipes/t5_220m.py @@ -24,7 +24,7 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_trainer, nemo_resume from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed @@ -229,7 +229,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for fine-tuning. @@ -279,8 +280,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 1 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/t5_3b.py b/nemo/collections/llm/recipes/t5_3b.py index 333661d97117..82772e1b865a 100644 --- a/nemo/collections/llm/recipes/t5_3b.py +++ b/nemo/collections/llm/recipes/t5_3b.py @@ -24,7 +24,7 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_trainer, nemo_resume from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed @@ -229,7 +229,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. 
+ peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for fine-tuning. @@ -279,8 +280,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") From 8ab46ffcb0fc8b892d2e35b166772076fb630e8f Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Thu, 21 Nov 2024 16:03:50 -0500 Subject: [PATCH 47/48] update hypothesis when passed through cfg (#11366) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- examples/asr/transcribe_speech.py | 2 ++ nemo/collections/asr/models/ctc_models.py | 1 + nemo/collections/asr/models/rnnt_models.py | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index f1d61edc990e..b3264e7e6d90 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -275,6 +275,8 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis # we will adjust this flag if the model does not support it compute_langs = cfg.compute_langs + if cfg.timestamps: + cfg.return_hypotheses = True # Check whether model and decoder type match if isinstance(asr_model, EncDecCTCModel): diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 3df6a7352c4d..962c7f2902eb 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -160,6 +160,7 @@ def transcribe( A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files """ + timestamps = timestamps or override_config.timestamps if override_config is not None else None if timestamps is not None: # else retain the decoder state (users can set it using change_decoding_strategy) if timestamps or (override_config is not None and override_config.timestamps): diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index a6408b5e935e..4facd59a8c14 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -285,7 +285,7 @@ def transcribe( * A list of greedy transcript texts / Hypothesis * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. 
""" - + timestamps = timestamps or override_config.timestamps if override_config is not None else None if timestamps is not None: if timestamps or (override_config is not None and override_config.timestamps): logging.info( From 9ea442c4e67627857d4361a521091f7ca9991e65 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Thu, 21 Nov 2024 18:44:15 -0500 Subject: [PATCH 48/48] Fix Gemma2 Attention Args (#11365) * fix gemma2 attention args * pylint * kwargs * Apply isort and black reformatting Signed-off-by: suiyoubi --------- Signed-off-by: suiyoubi Co-authored-by: suiyoubi --- .../megatron/gemma2/gemma2_modules.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_modules.py b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_modules.py index 5113ee745895..9ea1b4afe318 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_modules.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_modules.py @@ -49,7 +49,8 @@ class Gemma2DotProductAttention(MegatronModule): Region where selective activation recomputation is applied. This region is memory intensive but less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). - See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + See Reducing Activation Recomputation in Large Transformer Models: + https://arxiv.org/abs/2205.05198 for more details. We use the following notation: h: hidden size @@ -126,7 +127,12 @@ def forward( attention_mask: Tensor, attn_mask_type: AttnMaskType = None, packed_seq_params: PackedSeqParams = None, + **kwargs, ): + """Forward. + Modified from mcore.transformer.dot_product_attention to support Gemma2-specific + final_logit_softcapping. + """ assert packed_seq_params is None, ( "Packed sequence is not supported by DotProductAttention." "Please use TEDotProductAttention instead." ) @@ -243,6 +249,8 @@ def forward( class TERowParallelLinearLayerNorm(TERowParallelLinear): + """Modified From TERowParallelLinear with an additional Post-LN.""" + def __init__( self, input_size: int, @@ -270,12 +278,16 @@ def __init__( self.post_layernorm = TENorm(config, output_size) def forward(self, x): + """Forward with additional Post LN on output""" output, bias = super().forward(x) return self.post_layernorm(output), bias class Gemma2OutputLayer(ColumnParallelLinear): + """Extends from ColumnParallelLinear with logit soft capping.""" + def forward(self, *args, **kwargs): + """Forward with logit soft capping.""" output, bias = super().forward(*args, **kwargs) output = logit_softcapping(output, self.config.final_logit_softcapping) return output, bias