From 387144d063df75e45ed0f2cf4560ec4d70912992 Mon Sep 17 00:00:00 2001 From: guyueh1 <140554423+guyueh1@users.noreply.github.com> Date: Mon, 10 Feb 2025 18:45:44 -0800 Subject: [PATCH 01/14] Bug fix with generation of expert_tensor_parallel_rank (#12125) * Bug fix with generation of expert_tensor_parallel_rank Signed-off-by: Guyue Huang * Fix pylint Signed-off-by: Guyue Huang --------- Signed-off-by: Guyue Huang --- nemo/lightning/megatron_init.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/lightning/megatron_init.py b/nemo/lightning/megatron_init.py index 5f1d744e5b77..fab6d17da3cb 100644 --- a/nemo/lightning/megatron_init.py +++ b/nemo/lightning/megatron_init.py @@ -108,7 +108,7 @@ def initialize_model_parallel_for_nemo( use_tp_pp_dp_mapping=False, use_te_rng_tracker=False, ): - + """Initialize model parallel groups in NeMo.""" if virtual_pipeline_model_parallel_size is not None and not HAVE_INTERLEAVED: raise ValueError("set_virtual_pipeline_model_parallel_world_size is needed in megatron-core for interleaved.") @@ -498,7 +498,7 @@ def generator_wrapper(group_type, is_expert=False, **kwargs): # ETP expert_tensor_parallel_rank = 0 if expert_tensor_parallel_size_ is not None and expert_tensor_parallel_size_ > 1: - for ranks in generator_wrapper('tp-ep', is_expert=True): + for ranks in generator_wrapper('tp', is_expert=True): if rank in ranks: expert_tensor_parallel_rank = list(ranks).index(rank) From 26e2bf9baff1fdcf71fc0650b52a71949cfa5a09 Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 10 Feb 2025 23:07:56 -0700 Subject: [PATCH 02/14] Rename neva datamodule (#12121) * Rename dataset Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Update Signed-off-by: yaoyu-33 * pylink Signed-off-by: yaoyu-33 * fix f string Signed-off-by: yaoyu-33 * fix intern vit default factory Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 --- nemo/collections/vlm/__init__.py | 8 ++++---- nemo/collections/vlm/mllama/data/__init__.py | 4 ++-- .../vlm/mllama/data/{lazy.py => preloaded.py} | 10 ++++++---- nemo/collections/vlm/neva/data/__init__.py | 4 ++-- nemo/collections/vlm/neva/data/api.py | 10 ++++++---- .../vlm/neva/data/{lazy.py => preloaded.py} | 2 +- nemo/collections/vlm/vision/intern_vit.py | 14 ++++++++------ scripts/vlm/mllama_finetune.py | 4 ++-- scripts/vlm/neva_finetune.py | 2 +- 9 files changed, 32 insertions(+), 26 deletions(-) rename nemo/collections/vlm/mllama/data/{lazy.py => preloaded.py} (96%) rename nemo/collections/vlm/neva/data/{lazy.py => preloaded.py} (99%) diff --git a/nemo/collections/vlm/__init__.py b/nemo/collections/vlm/__init__.py index 97b154085f4b..71a605a7da61 100644 --- a/nemo/collections/vlm/__init__.py +++ b/nemo/collections/vlm/__init__.py @@ -26,7 +26,7 @@ from nemo.collections.vlm.llava_next.model.llava_next import LlavaNextConfig7B, LlavaNextConfig13B, LlavaNextModel # MLLAMA -from nemo.collections.vlm.mllama.data import MLlamaLazyDataModule, MLlamaMockDataModule +from nemo.collections.vlm.mllama.data import MLlamaMockDataModule, MLlamaPreloadedDataModule from nemo.collections.vlm.mllama.model.base import ( CrossAttentionTextConfig, CrossAttentionVisionConfig, @@ -46,8 +46,8 @@ ImageDataConfig, ImageToken, MultiModalToken, - NevaLazyDataModule, NevaMockDataModule, + NevaPreloadedDataModule, VideoDataConfig, VideoToken, ) @@ -77,9 +77,9 @@ "HFDatasetDataModule", "HFAutoModelForImageTextToText", 
"NevaMockDataModule", - "NevaLazyDataModule", + "NevaPreloadedDataModule", "MLlamaMockDataModule", - "MLlamaLazyDataModule", + "MLlamaPreloadedDataModule", "DataConfig", "ImageDataConfig", "VideoDataConfig", diff --git a/nemo/collections/vlm/mllama/data/__init__.py b/nemo/collections/vlm/mllama/data/__init__.py index 0e89762a4c9a..5c6b53ec666d 100644 --- a/nemo/collections/vlm/mllama/data/__init__.py +++ b/nemo/collections/vlm/mllama/data/__init__.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo.collections.vlm.mllama.data.lazy import MLlamaLazyDataModule from nemo.collections.vlm.mllama.data.mock import MockDataModule as MLlamaMockDataModule +from nemo.collections.vlm.mllama.data.preloaded import MLlamaPreloadedDataModule __all__ = [ "MLlamaMockDataModule", - "MLlamaLazyDataModule", + "MLlamaPreloadedDataModule", ] diff --git a/nemo/collections/vlm/mllama/data/lazy.py b/nemo/collections/vlm/mllama/data/preloaded.py similarity index 96% rename from nemo/collections/vlm/mllama/data/lazy.py rename to nemo/collections/vlm/mllama/data/preloaded.py index eac29d081a34..2b727d595fa1 100644 --- a/nemo/collections/vlm/mllama/data/lazy.py +++ b/nemo/collections/vlm/mllama/data/preloaded.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# pylint: disable=C0115,C0116 import json import logging @@ -28,7 +29,7 @@ from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids from nemo.collections.vlm.mllama.model.utils import create_vision_mask_tensor from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig -from nemo.collections.vlm.neva.data.lazy import IGNORE_INDEX, LazySupervisedDataset +from nemo.collections.vlm.neva.data.preloaded import IGNORE_INDEX, LazySupervisedDataset from nemo.lightning.pytorch.plugins import MegatronDataSampler @@ -170,7 +171,7 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: return batch -class MLlamaLazyDataModule(pl.LightningDataModule): +class MLlamaPreloadedDataModule(pl.LightningDataModule): def __init__( self, paths: str | List[str], @@ -223,7 +224,7 @@ def __init__( if tokenizer is None or image_processor is None: logging.warning( - f"Processor and tokenizer are not provided! Fall back to `meta-llama/Llama-3.2-11B-Vision-Instruct`." + "Processor and tokenizer are not provided! Fall back to `meta-llama/Llama-3.2-11B-Vision-Instruct`." ) from transformers import AutoProcessor @@ -246,7 +247,8 @@ def setup(self, stage: str = "") -> None: else: # TODO: # rng = torch.Generator().manual_seed(self.seed) - # train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size], generator=rng) + # train_dataset, val_dataset, test_dataset = + # random_split(dataset, [train_size, val_size, test_size], generator=rng) self._train_ds = MLlamaDataset( self.paths[0], self.data_config, self.tokenizer, self.image_processor, self.seq_length ) diff --git a/nemo/collections/vlm/neva/data/__init__.py b/nemo/collections/vlm/neva/data/__init__.py index f210d01a06fd..94fe741b8831 100644 --- a/nemo/collections/vlm/neva/data/__init__.py +++ b/nemo/collections/vlm/neva/data/__init__.py @@ -13,12 +13,12 @@ # limitations under the License. 
from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig, VideoDataConfig -from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule from nemo.collections.vlm.neva.data.mock import MockDataModule as NevaMockDataModule from nemo.collections.vlm.neva.data.multimodal_tokens import ImageToken, MultiModalToken, VideoToken +from nemo.collections.vlm.neva.data.preloaded import NevaPreloadedDataModule __all__ = [ - "NevaLazyDataModule", + "NevaPreloadedDataModule", "NevaMockDataModule", "DataConfig", "ImageDataConfig", diff --git a/nemo/collections/vlm/neva/data/api.py b/nemo/collections/vlm/neva/data/api.py index 15ba45c82fd9..a50c0bdf513d 100644 --- a/nemo/collections/vlm/neva/data/api.py +++ b/nemo/collections/vlm/neva/data/api.py @@ -14,16 +14,18 @@ import lightning.pytorch as pl -from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule from nemo.collections.vlm.neva.data.mock import MockDataModule +from nemo.collections.vlm.neva.data.preloaded import NevaPreloadedDataModule def mock() -> pl.LightningDataModule: + """Mock Neva Data Module""" return MockDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) -def lazy() -> pl.LightningDataModule: - return NevaLazyDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) +def preloaded() -> pl.LightningDataModule: + """Preloaded Llava-like Data Module""" + return NevaPreloadedDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) -__all__ = ["mock", "lazy"] +__all__ = ["mock", "preloaded"] diff --git a/nemo/collections/vlm/neva/data/lazy.py b/nemo/collections/vlm/neva/data/preloaded.py similarity index 99% rename from nemo/collections/vlm/neva/data/lazy.py rename to nemo/collections/vlm/neva/data/preloaded.py index 0076d3439270..40320a3c3799 100644 --- a/nemo/collections/vlm/neva/data/lazy.py +++ b/nemo/collections/vlm/neva/data/preloaded.py @@ -489,7 +489,7 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: return batch -class NevaLazyDataModule(pl.LightningDataModule): +class NevaPreloadedDataModule(pl.LightningDataModule): def __init__( self, paths: str | List[str], diff --git a/nemo/collections/vlm/vision/intern_vit.py b/nemo/collections/vlm/vision/intern_vit.py index 6f718f7258d0..086467a39cb2 100644 --- a/nemo/collections/vlm/vision/intern_vit.py +++ b/nemo/collections/vlm/vision/intern_vit.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from dataclasses import dataclass +from dataclasses import dataclass, field from functools import partial from pathlib import Path from typing import Callable @@ -337,7 +337,7 @@ class InternViTConfig(CLIPViTConfig): normalization: str = 'RMSNorm' layernorm_epsilon: float = 1e-6 apply_rope_fusion: bool = False - transformer_layer_spec: ModuleSpec = get_internvit_layer_spec(use_te=True) + transformer_layer_spec: ModuleSpec = field(default_factory=lambda: get_internvit_layer_spec(use_te=True)) @dataclass @@ -363,10 +363,12 @@ class InternViT_300M_448px_Config(InternViTConfig): attention_dropout: float = 0.0 ffn_hidden_size: int = 4096 normalization: str = 'LayerNorm' - transformer_layer_spec: ModuleSpec = get_internvit_layer_spec( - use_te=True, - add_qk_norm=False, - norm_type='LayerNorm', + transformer_layer_spec: ModuleSpec = field( + default_factory=lambda: get_internvit_layer_spec( + use_te=True, + add_qk_norm=False, + norm_type='LayerNorm', + ) ) diff --git a/scripts/vlm/mllama_finetune.py b/scripts/vlm/mllama_finetune.py index 9e37d9c3fc0c..6191145c2afd 100644 --- a/scripts/vlm/mllama_finetune.py +++ b/scripts/vlm/mllama_finetune.py @@ -22,7 +22,7 @@ from nemo import lightning as nl from nemo.collections import llm, vlm from nemo.collections.vlm import ImageDataConfig -from nemo.collections.vlm.mllama.data.lazy import MLlamaLazyDataModule +from nemo.collections.vlm.mllama.data.preloaded import MLlamaPreloadedDataModule from nemo.lightning.pytorch.optim import CosineAnnealingScheduler from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule from nemo.utils.exp_manager import TimingCallback @@ -71,7 +71,7 @@ def main(args): ) # Data module setup - data = MLlamaLazyDataModule( + data = MLlamaPreloadedDataModule( paths=args.data_path, data_config=data_config, seq_length=seq_length, diff --git a/scripts/vlm/neva_finetune.py b/scripts/vlm/neva_finetune.py index 3d733711a514..e62b2208b6c4 100644 --- a/scripts/vlm/neva_finetune.py +++ b/scripts/vlm/neva_finetune.py @@ -94,7 +94,7 @@ def main(args): ) # Data module setup - data = vlm.NevaLazyDataModule( + data = vlm.NevaPreloadedDataModule( paths=args.data_path, data_config=data_config, seq_length=decoder_seq_length, From 7bb74fa059b3d9b3f843f0ebac60ca7a71a036f8 Mon Sep 17 00:00:00 2001 From: Taejin Park Date: Tue, 11 Feb 2025 02:04:18 -0800 Subject: [PATCH 03/14] fix the issue during batched inference of Sortformer diarizer (#12047) * Added changes that fix the issue during batched inference Signed-off-by: Taejin Park * Adding changes to prevent ghost output Signed-off-by: Taejin Park --------- Signed-off-by: Taejin Park --- .../asr/data/audio_to_diar_label_lhotse.py | 2 +- .../asr/models/sortformer_diar_models.py | 15 ++++++++----- .../asr/modules/sortformer_modules.py | 22 +++++++------------ 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_diar_label_lhotse.py b/nemo/collections/asr/data/audio_to_diar_label_lhotse.py index 927e3887de78..6b9a687013a2 100644 --- a/nemo/collections/asr/data/audio_to_diar_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_diar_label_lhotse.py @@ -76,7 +76,7 @@ def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]: target_fr_len = get_hidden_length_from_sample_length( audio_len, self.num_sample_per_mel_frame, self.num_mel_frame_per_target_frame ) - target_lens_list.append([target_fr_len]) + target_lens_list.append(target_fr_len) target_lens = torch.tensor(target_lens_list) return audio, audio_lens, targets, target_lens diff --git 
a/nemo/collections/asr/models/sortformer_diar_models.py b/nemo/collections/asr/models/sortformer_diar_models.py index e2ac0b09c81b..bf773f1e0006 100644 --- a/nemo/collections/asr/models/sortformer_diar_models.py +++ b/nemo/collections/asr/models/sortformer_diar_models.py @@ -256,21 +256,24 @@ def frontend_encoder(self, processed_signal, processed_signal_length): emb_seq = self.sortformer_modules.encoder_proj(emb_seq) return emb_seq, emb_seq_length - def forward_infer(self, emb_seq): + def forward_infer(self, emb_seq, emb_seq_length): """ The main forward pass for diarization for offline diarization inference. Args: emb_seq (torch.Tensor): tensor containing FastConformer encoder states (embedding vectors). Dimension: (batch_size, diar_frame_count, emb_dim) + emb_seq_length (torch.Tensor): tensor containing lengths of FastConformer encoder states. + Dimension: (batch_size,) Returns: preds (torch.Tensor): Sorted tensor containing Sigmoid values for predicted speaker labels. Dimension: (batch_size, diar_frame_count, num_speakers) """ - encoder_mask = self.sortformer_modules.length_to_mask(emb_seq) + encoder_mask = self.sortformer_modules.length_to_mask(emb_seq_length, emb_seq.shape[1]) trans_emb_seq = self.transformer_encoder(encoder_states=emb_seq, encoder_mask=encoder_mask) - preds = self.sortformer_modules.forward_speaker_sigmoids(trans_emb_seq) + _preds = self.sortformer_modules.forward_speaker_sigmoids(trans_emb_seq) + preds = _preds * encoder_mask.unsqueeze(-1) return preds def _diarize_forward(self, batch: Any): @@ -407,6 +410,8 @@ def process_signal(self, audio_signal, audio_signal_length): processed_signal, processed_signal_length = self.preprocessor( input_signal=audio_signal, length=audio_signal_length ) + if not self.training: + torch.cuda.empty_cache() return processed_signal, processed_signal_length def forward( @@ -434,10 +439,10 @@ def forward( if self._cfg.get("streaming_mode", False): raise NotImplementedError("Streaming mode is not implemented yet.") else: - emb_seq, _ = self.frontend_encoder( + emb_seq, emb_seq_length = self.frontend_encoder( processed_signal=processed_signal, processed_signal_length=processed_signal_length ) - preds = self.forward_infer(emb_seq) + preds = self.forward_infer(emb_seq, emb_seq_length) return preds def _get_aux_train_evaluations(self, preds, targets, target_lens) -> dict: diff --git a/nemo/collections/asr/modules/sortformer_modules.py b/nemo/collections/asr/modules/sortformer_modules.py index d99bf3b93e38..c158b22fe473 100644 --- a/nemo/collections/asr/modules/sortformer_modules.py +++ b/nemo/collections/asr/modules/sortformer_modules.py @@ -67,28 +67,22 @@ def __init__( self.dropout = nn.Dropout(dropout_rate) self.encoder_proj = nn.Linear(self.fc_d_model, self.tf_d_model) - def length_to_mask(self, context_embs): + def length_to_mask(self, lengths, max_length): """ - Convert length values to encoder mask input tensor. 
+ Convert length values to encoder mask input tensor Args: - lengths (torch.Tensor): tensor containing lengths of sequences - max_len (int): maximum sequence length + lengths (torch.Tensor): tensor containing lengths (frame counts) of sequences + max_length (int): maximum length (frame count) of the sequences in the batch Returns: mask (torch.Tensor): tensor of shape (batch_size, max_len) containing 0's in the padded region and 1's elsewhere """ - lengths = torch.tensor([context_embs.shape[1]] * context_embs.shape[0]) - batch_size = context_embs.shape[0] - max_len = context_embs.shape[1] - # create a tensor with the shape (batch_size, 1) filled with ones - row_vector = torch.arange(max_len).unsqueeze(0).expand(batch_size, -1).to(lengths.device) - # create a tensor with the shape (batch_size, max_len) filled with lengths - length_matrix = lengths.unsqueeze(1).expand(-1, max_len).to(lengths.device) - # create a mask by comparing the row vector and length matrix - mask = row_vector < length_matrix - return mask.float().to(context_embs.device) + batch_size = lengths.shape[0] + arange = torch.arange(max_length, device=lengths.device) + mask = arange.expand(batch_size, max_length) < lengths.unsqueeze(1) + return mask def forward_speaker_sigmoids(self, hidden_out): """ From 1e5214246fb50077991ba0838c51cfc92574620a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Kami=C5=84ski?= <67481570+Laplasjan107@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:06:27 +0100 Subject: [PATCH 04/14] Update vLLM to 0.7.2 (#12078) * initial commit Signed-off-by: Piotr Kaminski * vllm bump cleanup Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Flake8 Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * flake should not fail with tensorstore Signed-off-by: Piotr Kaminski * pylint also should not fail Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * local tokenizer load Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * add missing requirements Signed-off-by: Piotr Kaminski * absolute path for sentencepiece tokenizer Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * fix absolute path, add new vllm params Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * bump vllm, fix tokenizer Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * code review + docstrings Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * flake8 Signed-off-by: Piotr Kaminski * fix formatting Signed-off-by: Piotr Kaminski --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Co-authored-by: Laplasjan107 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/export/vllm/model_config.py | 64 +++++++++++++++++++++++--- nemo/export/vllm/model_loader.py | 21 ++++----- nemo/export/vllm/tokenizer_group.py | 19 +++++++- nemo/export/vllm_exporter.py | 71 +++++++++++++++++------------ requirements/requirements_vllm.txt | 7 ++- 5 files changed, 134 insertions(+), 48 deletions(-) diff --git a/nemo/export/vllm/model_config.py b/nemo/export/vllm/model_config.py index 39f6397663b3..989f6c5300ee 100644 --- a/nemo/export/vllm/model_config.py 
+++ b/nemo/export/vllm/model_config.py @@ -17,8 +17,10 @@ import torch import yaml +from hydra.utils import instantiate +from omegaconf import OmegaConf from transformers import AutoConfig -from vllm.config import ModelConfig, _get_and_verify_dtype, _get_and_verify_max_len +from vllm.config import ModelConfig, ModelImpl, PoolerConfig, _get_and_verify_dtype, _get_and_verify_max_len from vllm.transformers_utils.config import get_hf_text_config from nemo.export.tarutils import TarPath @@ -54,6 +56,11 @@ def __init__( max_logprobs: int = 5, disable_sliding_window: bool = False, use_async_output_proc: bool = False, + disable_mm_preprocessor_cache: bool = False, + logits_processor_pattern: Optional[str] = None, + override_pooler_config: Optional[PoolerConfig] = None, + enable_sleep_mode: bool = False, + model_impl: Union[str, ModelImpl] = ModelImpl.AUTO, ) -> None: # Don't call ModelConfig.__init__ because we don't want it to call # transformers.AutoConfig.from_pretrained(...) @@ -75,6 +82,7 @@ def __init__( self.rope_scaling = rope_scaling self.rope_theta = rope_theta self.tokenizer_revision = tokenizer_revision + self.model_impl = model_impl self.quantization = quantization self.quantization_param_path = quantization_param_path self.enforce_eager = enforce_eager @@ -85,21 +93,39 @@ def __init__( self.multimodal_config = None self.mm_processor_kwargs = {} self.use_async_output_proc = use_async_output_proc + self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache + self.logits_processor_pattern = logits_processor_pattern + self.generation_config = None + self.task = "generate" # Only the generate task is supported + self.is_hybrid = False # No hybrid models are supported + + self.encoder_config = self._get_encoder_config() + self.pooler_config = self._init_pooler_config(override_pooler_config) + self.enable_sleep_mode = enable_sleep_mode + + from vllm.platforms import current_platform # vLLM uses local import for current_platform + + if self.enable_sleep_mode and not current_platform.is_cuda(): + raise ValueError("Sleep mode is only supported on CUDA devices.") self.model_converter = get_model_converter(model_type) if self.model_converter is None: raise RuntimeError(f'Unknown model type "{model_type}"') if is_nemo2_checkpoint(nemo_checkpoint): - from nemo.lightning.io import load_context - nemo_checkpoint: Path = Path(nemo_checkpoint) + tokenizer_config = OmegaConf.load(nemo_checkpoint / "context/model.yaml").tokenizer + if ('additional_special_tokens' in tokenizer_config) and len( + tokenizer_config['additional_special_tokens'] + ) == 0: + del tokenizer_config['additional_special_tokens'] + + tokenizer_config = self._change_paths_to_absolute_paths(tokenizer_config, nemo_checkpoint) + tokenizer = instantiate(tokenizer_config) with (nemo_checkpoint / "context/model.yaml").open('r') as config_file: self.nemo_model_config: dict = yaml.load(config_file, Loader=yaml.SafeLoader) - hf_args = self._load_hf_arguments(self.nemo_model_config['config']) - tokenizer = load_context((nemo_checkpoint / "context"), subpath="model.tokenizer") if hasattr(tokenizer, 'bos_id'): tokenizer.tokenizer.bos_token_id = tokenizer.bos_id @@ -134,10 +160,36 @@ def __init__( self.has_inner_state = self._init_has_inner_state() self._verify_tokenizer_mode() - self._verify_embedding_mode() self._verify_quantization() self._verify_cuda_graph() + @staticmethod + def _change_paths_to_absolute_paths(tokenizer_config: Dict[Any, Any], nemo_checkpoint: Path) -> Dict[Any, Any]: + """ + Creates absolute path to the local 
tokenizers. Used for NeMo 2.0. + + Args: + tokenizer_config (dict): Parameters for instantiating the tokenizer. + nemo_checkpoint (path): Path to the NeMo2 checkpoint. + Returns: + dict: Updated tokenizer config. + """ + context_path = nemo_checkpoint / 'context' + + # 'pretrained_model_name' -- huggingface tokenizer case + # 'model_path' -- sentencepiece tokenizer + path_keys = ['pretrained_model_name', 'model_path'] + + for path_key in path_keys: + if path := tokenizer_config.get(path_key, None): + tokenizer_path = context_path / path + if not tokenizer_path.exists(): + continue + + tokenizer_config[path_key] = str(tokenizer_path.resolve()) + + return tokenizer_config + def _load_hf_arguments(self, nemo_config: Dict[str, Any]) -> Dict[str, Any]: """ Maps argument names used in NeMo to their corresponding names in HF. diff --git a/nemo/export/vllm/model_loader.py b/nemo/export/vllm/model_loader.py index 8c867f1bb994..45c86b8e0389 100644 --- a/nemo/export/vllm/model_loader.py +++ b/nemo/export/vllm/model_loader.py @@ -17,14 +17,16 @@ import logging import os.path from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict import numpy import safetensors.torch -import tensorstore # needed to register 'bfloat16' dtype with numpy for zarr compatibility + +# needed to register 'bfloat16' dtype with numpy for zarr compatibility +import tensorstore # noqa: F401 pylint: disable=unused-import import torch import zarr -from vllm.config import CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig +from vllm.config import ModelConfig from vllm.model_executor.model_loader.loader import BaseModelLoader, _initialize_model from vllm.model_executor.model_loader.utils import set_default_torch_dtype @@ -81,29 +83,26 @@ def _load_nemo_checkpoint_state(nemo_file: str): return sharded_state_dict - def download_model(self, model_config: ModelConfig) -> None: + def download_model(self, model_config: ModelConfig) -> None: # pylint: disable=missing-function-docstring raise NotImplementedError def load_model( self, *, - model_config: NemoModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig, + vllm_config: NemoModelConfig, ) -> torch.nn.Module: """ Overrides the load_model function from BaseModelLoader to convert Nemo weights at load time. 
""" + model_config = vllm_config.model_config + device_config = vllm_config.device_config assert isinstance(model_config, NemoModelConfig) state_dict = NemoModelLoader._load_nemo_checkpoint_state(model_config.nemo_checkpoint) with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): - model = _initialize_model(model_config, self.load_config, lora_config, cache_config) + model = _initialize_model(vllm_config) config = model_config.nemo_model_config if 'config' in config: diff --git a/nemo/export/vllm/tokenizer_group.py b/nemo/export/vllm/tokenizer_group.py index 592b784be04b..34d35af352c2 100644 --- a/nemo/export/vllm/tokenizer_group.py +++ b/nemo/export/vllm/tokenizer_group.py @@ -32,29 +32,44 @@ def __init__(self, tokenizer: SentencePieceTokenizer, add_bos_token: bool = Fals @classmethod def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig] = None, **init_kwargs): + """Create a tokenizer group from a config.""" raise NotImplementedError def ping(self) -> bool: + """Check if the tokenizer group is alive.""" return True def get_max_input_len(self, lora_request: Optional[LoRARequest] = None) -> Optional[int]: + """Get the maximum input length for the LoRA request.""" return None def encode( - self, prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None + self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None, ) -> List[int]: + """Tokenizes the prompt.""" ids = self.tokenizer.encode(prompt) if self.add_bos_token: ids = [self.tokenizer.bos_token_id] + ids return ids async def encode_async( - self, prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None + self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None, ) -> List[int]: + """Encode a prompt using the tokenizer group.""" return self.tokenizer.encode(prompt) # TODO: not sure how this is supposed to work def get_lora_tokenizer(self, lora_request: Optional[LoRARequest] = None) -> SentencePieceTokenizer: + """Get a tokenizer for a LoRA request.""" return self.tokenizer async def get_lora_tokenizer_async(self, lora_request: Optional[LoRARequest] = None) -> SentencePieceTokenizer: + """Get a tokenizer for a LoRA request.""" return self.tokenizer diff --git a/nemo/export/vllm_exporter.py b/nemo/export/vllm_exporter.py index 0177adbd3587..1b2f69cc5d95 100644 --- a/nemo/export/vllm_exporter.py +++ b/nemo/export/vllm_exporter.py @@ -20,7 +20,16 @@ import numpy import wrapt from vllm import RequestOutput, SamplingParams -from vllm.config import CacheConfig, DeviceConfig, LoadConfig, LoadFormat, LoRAConfig, ParallelConfig, SchedulerConfig +from vllm.config import ( + CacheConfig, + DeviceConfig, + LoadConfig, + LoadFormat, + LoRAConfig, + ParallelConfig, + SchedulerConfig, + VllmConfig, +) from vllm.executor.ray_utils import initialize_ray_cluster from vllm.lora.request import LoRARequest @@ -36,12 +45,15 @@ @wrapt.decorator def noop_decorator(func): + """Used as batch if pytriton is not supported""" + def wrapper(*args, **kwargs): return func(*args, **kwargs) return wrapper +batch = noop_decorator use_pytriton = True try: from pytriton.decorators import batch @@ -239,42 +251,39 @@ def export( ) # Initialize the cluster and specify the executor class. 
- if device_config.device_type == "neuron": - from vllm.executor.neuron_executor import NeuronExecutor - - executor_class = NeuronExecutor - elif device_config.device_type == "cpu": - from vllm.executor.cpu_executor import CPUExecutor - - executor_class = CPUExecutor - elif parallel_config.distributed_executor_backend == "ray": + if parallel_config.distributed_executor_backend == "ray": initialize_ray_cluster(parallel_config) - from vllm.executor.ray_gpu_executor import RayGPUExecutor + from vllm.executor.ray_distributed_executor import RayDistributedExecutor + + executor_class = RayDistributedExecutor - executor_class = RayGPUExecutor elif parallel_config.distributed_executor_backend == "mp": - from vllm.executor.multiproc_gpu_executor import MultiprocessingGPUExecutor + from vllm.executor.mp_distributed_executor import MultiprocessingDistributedExecutor + + executor_class = MultiprocessingDistributedExecutor - executor_class = MultiprocessingGPUExecutor else: - assert parallel_config.world_size == 1, "Ray is required if parallel_config.world_size > 1." - from vllm.executor.gpu_executor import GPUExecutor + assert parallel_config.distributed_executor_backend == "uni" or parallel_config.world_size == 1 - executor_class = GPUExecutor + from vllm.executor.uniproc_executor import UniProcExecutor + + executor_class = UniProcExecutor # Initialize the engine self.engine = NemoLLMEngine( - model_config=model_config, - cache_config=cache_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - load_config=load_config, - lora_config=lora_config, - speculative_config=None, - decoding_config=None, - observability_config=None, - prompt_adapter_config=None, + vllm_config=VllmConfig( + model_config=model_config, + cache_config=cache_config, + parallel_config=parallel_config, + scheduler_config=scheduler_config, + device_config=device_config, + load_config=load_config, + lora_config=lora_config, + speculative_config=None, + decoding_config=None, + observability_config=None, + prompt_adapter_config=None, + ), executor_class=executor_class, log_stats=log_stats, ) @@ -414,6 +423,9 @@ def get_triton_output(self): @batch def triton_infer_fn(self, **inputs: numpy.ndarray): + """ + This function is used to perform inference on a batch of prompts. + """ request_ids = [] num_requests = len(inputs["prompts"]) for index in range(num_requests): @@ -428,6 +440,9 @@ def triton_infer_fn(self, **inputs: numpy.ndarray): @batch def triton_infer_fn_streaming(self, **inputs: numpy.ndarray): + """ + This function is used to perform streaming inference. + """ request_ids = [] num_requests = len(inputs["prompts"]) for index in range(num_requests): diff --git a/requirements/requirements_vllm.txt b/requirements/requirements_vllm.txt index a72926bee267..8d376785fd9a 100644 --- a/requirements/requirements_vllm.txt +++ b/requirements/requirements_vllm.txt @@ -1,5 +1,8 @@ # Minimal set of NeMo requirements to run vLLM export & deployment in /opt/venv in a NeMo container braceexpand +# datasets and pandas import are triggered by hydra.utils.instantiate in nemo/export/vllm/model_config.py. +# TODO: remove those dependencies by switching to local nemo.export tokenizers. 
+datasets faiss-cpu fiddle h5py @@ -11,10 +14,12 @@ matplotlib>=3.3.2 omegaconf<=2.3 onnx>=1.7.0 OpenCC +pandas pangu rouge_score sacrebleu scikit-learn -vllm==0.6.3 +vllm==0.7.2 webdataset>=0.2.86 wget +zarr>=2.18.2,<3.0.0 From 6b59ab8a7eba7164b753f384f4496c367b387dba Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Tue, 11 Feb 2025 16:23:04 -0500 Subject: [PATCH 05/14] Prevent downloading dataset every time in ci test (#12095) * prevent downloading dataset everytime in ci test Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * newline Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- .github/workflows/cicd-main.yml | 5 ++--- Dockerfile.ci | 1 + tests/collections/llm/gpt_finetuning.py | 26 ++++++++++++++++++------- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 56f72233ac55..b1282ee63d30 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4859,7 +4859,7 @@ jobs: --tp_size 1 \ --pp_size 1 \ --mbs 1 \ - --chat_dataset_path /home/TestData/nemo2_data/chat + --dataset chat python tests/collections/llm/gpt_finetuning.py \ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \ @@ -4870,7 +4870,7 @@ jobs: --tp_size 1 \ --pp_size 1 \ --mbs 1 \ - --chat_dataset_path /home/TestData/nemo2_data/chat + --dataset chat L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2_exclude: needs: [pre-flight, cicd-test-container-build] @@ -4947,7 +4947,6 @@ jobs: --model mistral \ --dist-opt - L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1_exclude: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml diff --git a/Dockerfile.ci b/Dockerfile.ci index 2bff4e0c0821..f035a1207ae5 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -94,3 +94,4 @@ RUN \ EOF ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" +ENV NEMO_HOME="/home/TestData/nemo_home" diff --git a/tests/collections/llm/gpt_finetuning.py b/tests/collections/llm/gpt_finetuning.py index 668109d46b70..e59741ca6f53 100644 --- a/tests/collections/llm/gpt_finetuning.py +++ b/tests/collections/llm/gpt_finetuning.py @@ -20,12 +20,14 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.llm.gpt.data.core import get_dataset_root from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer from tests.collections.llm.common import Llama3ConfigCI - ## NOTE: This script is present for github-actions testing only. +## CI tests that call this script should set max_steps=3 for initial training +## and max_steps=6 for resume testing def get_args(): @@ -39,9 +41,7 @@ def get_args(): parser.add_argument('--tp_size', type=int, default=1, help="tensor parallel size") parser.add_argument('--pp_size', type=int, default=1, help="pipeline parallel size") parser.add_argument('--packed', action='store_true', help="use packed sequence dataset") - parser.add_argument( - '--chat_dataset_path', type=str, default="", help="path to chat dataset. Uses dolly if this is empty." 
- ) + parser.add_argument('--dataset', type=str, default="dolly", choices=['dolly', 'chat'], help="Dataset to use") return parser.parse_args() @@ -54,6 +54,7 @@ def get_args(): pipeline_model_parallel_size=args.pp_size, # Pipeline dtype is coupled with the bf16 mixed precision plugin pipeline_dtype=torch.bfloat16, + ckpt_load_strictness="log_all", # Only for CI tests to use older versions of checkpoint ) trainer = nl.Trainer( @@ -101,10 +102,11 @@ def get_args(): packed_sequence_specs = ( PackedSequenceSpecs(packed_sequence_size=2048, tokenizer_model_name="dummy_tokenizer") if args.packed else None ) - if args.chat_dataset_path: + + if args.dataset == 'chat': assert not args.packed data = llm.ChatDataModule( - dataset_root=args.chat_dataset_path, + dataset_root=get_dataset_root("chat"), seq_length=2048, micro_batch_size=args.mbs, global_batch_size=8, @@ -120,6 +122,9 @@ def get_args(): packed_sequence_specs=packed_sequence_specs, ) + # ensure using cached dir + assert str(data.dataset_root).startswith(os.environ.get("NEMO_HOME")) + tokenizer = get_nmt_tokenizer(tokenizer_model=os.path.join(args.restore_path, "dummy_tokenizer.model")) llama3_8b = llm.LlamaModel(Llama3ConfigCI(), tokenizer=tokenizer) @@ -138,6 +143,13 @@ def get_args(): resume=resume, ) + if args.max_steps == 3: + print("Initial Training Succeeded") if args.max_steps == 6: # assert a resume has happened for CI tests - assert 'reduced_train_loss=' in str(trainer.ckpt_path), "Resume did not happen in this resume test." + msg = ( + "Resume did not happen in this resume test.\n" + "Hint: Scroll up and see whether 'Initial Training Succeeded' is printed out.\n" + "If not, then the issue is not with ckpt resume." + ) + assert 'reduced_train_loss=' in str(trainer.ckpt_path), msg From ee543c24d84e7e84f6b2a868d7f15c71b4c57a03 Mon Sep 17 00:00:00 2001 From: Ssofja <78349198+Ssofja@users.noreply.github.com> Date: Wed, 12 Feb 2025 04:29:31 +0400 Subject: [PATCH 06/14] changed asr models outputs to be consistent (#11818) * changed asr models outputs to be consistent Signed-off-by: Ssofja * Apply isort and black reformatting Signed-off-by: Ssofja Signed-off-by: Ssofja * Apply isort and black reformatting Signed-off-by: Ssofja * adding needed changes Signed-off-by: Ssofja * Apply isort and black reformatting Signed-off-by: Ssofja * Small fixes * Returned previous names of return_hypotheses Signed-off-by: Ssofja * Apply isort and black reformatting Signed-off-by: Ssofja --------- Signed-off-by: Ssofja Signed-off-by: Ssofja Co-authored-by: Ssofja --- nemo/collections/asr/metrics/bleu.py | 8 +- nemo/collections/asr/metrics/wer.py | 10 +- .../asr/models/aed_multitask_models.py | 21 +- nemo/collections/asr/models/ctc_models.py | 32 +-- .../asr/models/hybrid_rnnt_ctc_models.py | 37 +-- nemo/collections/asr/models/rnnt_models.py | 32 +-- .../context_biasing/context_biasing_utils.py | 18 +- nemo/collections/asr/parts/mixins/mixins.py | 7 +- .../asr/parts/submodules/ctc_beam_decoding.py | 24 +- .../asr/parts/submodules/ctc_decoding.py | 38 ++- .../parts/submodules/ctc_greedy_decoding.py | 16 +- .../cuda_graph_rnnt_greedy_decoding.py | 4 +- .../submodules/multitask_beam_decoding.py | 4 +- .../parts/submodules/multitask_decoding.py | 31 +-- .../submodules/multitask_greedy_decoding.py | 4 +- .../parts/submodules/rnnt_beam_decoding.py | 34 +-- .../asr/parts/submodules/rnnt_decoding.py | 52 ++--- .../parts/submodules/rnnt_greedy_decoding.py | 37 ++- .../submodules/rnnt_loop_labels_computer.py | 8 +- .../asr/parts/submodules/tdt_beam_decoding.py | 14 +- 
.../submodules/tdt_loop_labels_computer.py | 8 +- .../collections/asr/parts/utils/rnnt_utils.py | 113 ++++----- .../asr/parts/utils/streaming_utils.py | 4 +- .../asr/parts/utils/transcribe_utils.py | 44 ++-- .../speech_cv/models/visual_ctc_models.py | 24 +- .../models/visual_hybrid_rnnt_ctc_models.py | 30 ++- .../speech_cv/models/visual_rnnt_models.py | 27 +-- nemo/collections/tts/g2p/models/ctc.py | 13 +- .../ngram_lm/eval_beamsearch_ngram_ctc.py | 6 +- .../eval_beamsearch_ngram_transducer.py | 3 +- .../ngram_lm/eval_wfst_decoding_ctc.py | 4 +- .../test_batched_hyps_and_alignments.py | 68 +++--- .../asr/decoding/test_ctc_decoding.py | 54 ++--- .../test_cuda_graph_rnnt_greedy_decoding.py | 20 +- .../asr/decoding/test_rnnt_alignments.py | 2 +- .../asr/decoding/test_rnnt_decoding.py | 61 ++--- .../asr/mixins/test_transcription.py | 20 +- .../asr/test_asr_classification_model.py | 10 +- .../asr/test_asr_context_biasing.py | 4 +- .../asr/test_asr_ctc_encoder_model_bpe.py | 3 +- .../asr/test_asr_ctcencdec_model.py | 3 +- .../asr/test_asr_hybrid_rnnt_ctc_model_bpe.py | 3 +- .../test_asr_hybrid_rnnt_ctc_model_char.py | 10 +- tests/collections/asr/test_asr_metrics.py | 60 ++--- .../asr/test_asr_multitask_model_bpe.py | 8 +- .../asr/test_asr_rnnt_encdec_model.py | 50 ++-- .../asr/test_asr_rnnt_encoder_model_bpe.py | 3 +- tutorials/asr/ASR_Context_Biasing.ipynb | 8 +- tutorials/asr/ASR_with_NeMo.ipynb | 220 +++++++++--------- tutorials/asr/ASR_with_Transducers.ipynb | 2 +- .../asr/Buffered_Transducer_Inference.ipynb | 8 +- 51 files changed, 647 insertions(+), 677 deletions(-) diff --git a/nemo/collections/asr/metrics/bleu.py b/nemo/collections/asr/metrics/bleu.py index 32bd25d952d4..f422f3665561 100644 --- a/nemo/collections/asr/metrics/bleu.py +++ b/nemo/collections/asr/metrics/bleu.py @@ -161,14 +161,16 @@ def update( target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() reference = self.decoding.decode_tokens_to_str(target) references.append(reference) - hypotheses, _ = self.decode(predictions, predictions_lengths, predictions_mask, input_ids, targets) + hypotheses = self.decode(predictions, predictions_lengths, predictions_mask, input_ids, targets) if self.log_prediction: - logging.info(f"\n") + logging.info("\n") logging.info(f"reference:{references[0]}") logging.info(f"predicted:{hypotheses[0]}") - super().update(hypotheses, [references]) # Note: [references] since BLEU allows multiple references. + super().update( + [h.text for h in hypotheses], [references] + ) # Note: [references] since BLEU allows multiple references. 
def compute(self, return_all_metrics=True, prefix="", suffix=""): """ diff --git a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index 7bda3a77b278..07ddb928e966 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -323,19 +323,19 @@ def update( target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() reference = self.decoding.decode_tokens_to_str(target) references.append(reference) - hypotheses, _ = self.decode(predictions, predictions_lengths, predictions_mask, input_ids, targets) + hypotheses = self.decode(predictions, predictions_lengths, predictions_mask, input_ids, targets) if self.log_prediction: - logging.info(f"\n") + logging.info("\n") logging.info(f"reference:{references[0]}") - logging.info(f"predicted:{hypotheses[0]}") + logging.info(f"predicted:{hypotheses[0].text}") for h, r in zip(hypotheses, references): if self.use_cer: - h_list = list(h) + h_list = list(h.text) r_list = list(r) else: - h_list = h.split() + h_list = h.text.split() r_list = r.split() words += len(r_list) # Compute Levenstein's distance diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index a609eeaccf9e..18570e306317 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -43,7 +43,6 @@ from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common import tokenizers from nemo.collections.common.data.lhotse.dataloader import get_lhotse_dataloader_from_config -from nemo.collections.common.data.prompt_fn import get_prompt_format_fn from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init from nemo.collections.common.parts.preprocessing.manifest import get_full_path @@ -60,7 +59,6 @@ SpectrogramType, ) from nemo.utils import logging, model_utils -from nemo.utils.decorators import deprecated __all__ = ['EncDecMultiTaskModel'] @@ -310,7 +308,7 @@ def change_vocabulary( ) if new_tokenizer_type.lower() not in ('bpe', 'wpe'): - raise ValueError(f'New tokenizer type must be either `bpe` or `wpe`') + raise ValueError('New tokenizer type must be either `bpe` or `wpe`') tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type}) @@ -821,7 +819,7 @@ def _transcribe_on_begin(self, audio, trcfg: MultiTaskTranscriptionConfig): if isinstance(audio, list): logging.debug(f"Found 'audio' to be a list of {len(audio)} items.") - logging.debug(f"Assuming each item in 'audio' is a path to audio file.") + logging.debug("Assuming each item in 'audio' is a path to audio file.") if isinstance(self.tokenizer, tokenizers.AggregateTokenizer): if hasattr(trcfg, '_internal') and hasattr(trcfg._internal, 'primary_language'): @@ -929,10 +927,6 @@ def _transcribe_forward( decoder_input_ids=decoder_input_ids, ) - @deprecated( - explanation='The return type of args will be updated in the upcoming release to ensure a consistent \ - output format across all decoder types, such that a Hypothesis object is always returned.' - ) def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionConfig) -> GenericTranscriptionType: """ Internal function to process the model's outputs to return the results to the user. 
This function is called by @@ -944,7 +938,7 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo Returns: The output can be a list of - objects, list of list of objects, tuple of objects, tuple of list of objects, or a dict of list of objects. + objects, list of list of objects. Its type is defined in `TranscriptionReturnType`. """ log_probs = outputs.pop('log_probs') @@ -955,7 +949,7 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo del log_probs, encoded_len - best_hypotheses, all_hypotheses = self.decoding.decode_predictions_tensor( + hypotheses = self.decoding.decode_predictions_tensor( encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, decoder_input_ids=decoder_input_ids, @@ -963,9 +957,8 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo ) del enc_states, enc_mask, decoder_input_ids - if all_hypotheses is None: - return best_hypotheses - return best_hypotheses, all_hypotheses + + return hypotheses def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': """ @@ -1092,7 +1085,7 @@ def predict_step( encoder_input_mask=enc_mask, decoder_input_ids=batch.prompt, return_hypotheses=False, - )[0] + ) if batch.cuts: return list(zip(batch.cuts, text)) else: diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index ae8c35220931..f65a28e85560 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -34,6 +34,7 @@ from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.asr.parts.utils.transcribe_utils import process_timestamp_outputs from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.parts.preprocessing.parsers import make_parser @@ -41,7 +42,6 @@ from nemo.core.classes.mixins import AccessMixin from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType from nemo.utils import logging -from nemo.utils.decorators import deprecated __all__ = ['EncDecCTCModel'] @@ -612,7 +612,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): else: log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len) - transcribed_texts, _ = self.wer.decoding.ctc_decoder_predictions_tensor( + transcribed_texts = self.wer.decoding.ctc_decoder_predictions_tensor( decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, @@ -703,15 +703,11 @@ def _transcribe_forward(self, batch: Any, trcfg: TranscribeConfig): del greedy_predictions return output - @deprecated( - explanation='The return type of args will be updated in the upcoming release to ensure a consistent output \ - format across all decoder types, such that a Hypothesis object is always returned.' 
- ) def _transcribe_output_processing(self, outputs, trcfg: TranscribeConfig) -> GenericTranscriptionType: logits = outputs.pop('logits') logits_len = outputs.pop('logits_len') - current_hypotheses, all_hyp = self.decoding.ctc_decoder_predictions_tensor( + hypotheses = self.decoding.ctc_decoder_predictions_tensor( logits, decoder_lengths=logits_len, return_hypotheses=trcfg.return_hypotheses, @@ -732,30 +728,24 @@ def _transcribe_output_processing(self, outputs, trcfg: TranscribeConfig) -> Gen # cudaMallocHost()-allocated tensor to be floating # around. Were that to be the case, then the pinned # memory cache would always miss. - current_hypotheses[idx].y_sequence = logits_cpu[idx, : logits_len[idx]].clone() - if current_hypotheses[idx].alignments is None: - current_hypotheses[idx].alignments = current_hypotheses[idx].y_sequence + hypotheses[idx].y_sequence = logits_cpu[idx, : logits_len[idx]].clone() + if hypotheses[idx].alignments is None: + hypotheses[idx].alignments = hypotheses[idx].y_sequence del logits_cpu # cleanup memory del logits, logits_len if trcfg.timestamps: - current_hypotheses = process_timestamp_outputs( - current_hypotheses, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] + hypotheses = process_timestamp_outputs( + hypotheses, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] ) - all_hyp = process_timestamp_outputs( - all_hyp, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] - ) - - hypotheses = [] - if all_hyp is None: - hypotheses += current_hypotheses - else: - hypotheses += all_hyp return hypotheses + def get_best_hyptheses(self, all_hypothesis: list[list[Hypothesis]]): + return [hyp[0] for hyp in all_hypothesis] + def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': """ Setup function for a temporary data loader which wraps the provided audio file. diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py index be795b6e4bc4..9b6ef4356559 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py @@ -13,15 +13,11 @@ # limitations under the License. 
import copy -import json -import os -import tempfile -from typing import Any, List, Optional, Tuple +from typing import Any, List, Optional, Union import torch from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict -from tqdm.auto import tqdm from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs from nemo.collections.asr.losses.ctc import CTCLoss @@ -31,6 +27,7 @@ from nemo.collections.asr.parts.mixins.transcription import TranscriptionReturnType from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.asr.parts.utils.transcribe_utils import process_timestamp_outputs from nemo.core.classes.common import PretrainedModelInfo from nemo.core.classes.mixins import AccessMixin @@ -200,7 +197,7 @@ def _transcribe_forward(self, batch: Any, trcfg: TranscribeConfig): def _transcribe_output_processing( self, outputs, trcfg: TranscribeConfig - ) -> Tuple[List['Hypothesis'], List['Hypothesis']]: + ) -> Union[List['Hypothesis'], List[List['Hypothesis']]]: if self.cur_decoder == "rnnt": return super()._transcribe_output_processing(outputs, trcfg) @@ -208,7 +205,7 @@ def _transcribe_output_processing( logits = outputs.pop('logits') encoded_len = outputs.pop('encoded_len') - best_hyp, all_hyp = self.ctc_decoding.ctc_decoder_predictions_tensor( + hypotheses = self.ctc_decoding.ctc_decoder_predictions_tensor( logits, encoded_len, return_hypotheses=trcfg.return_hypotheses, @@ -218,9 +215,9 @@ def _transcribe_output_processing( if trcfg.return_hypotheses: # dump log probs per file for idx in range(logits.shape[0]): - best_hyp[idx].y_sequence = logits[idx][: encoded_len[idx]] - if best_hyp[idx].alignments is None: - best_hyp[idx].alignments = best_hyp[idx].y_sequence + hypotheses[idx].y_sequence = logits[idx][: encoded_len[idx]] + if hypotheses[idx].alignments is None: + hypotheses[idx].alignments = hypotheses[idx].y_sequence # DEPRECATED? 
# if logprobs: @@ -228,25 +225,13 @@ def _transcribe_output_processing( # logits_list.append(logit[:elen]) if trcfg.timestamps: - best_hyp = process_timestamp_outputs( - best_hyp, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] - ) - all_hyp = process_timestamp_outputs( - all_hyp, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] + hypotheses = process_timestamp_outputs( + hypotheses, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] ) del logits, encoded_len - hypotheses = [] - all_hypotheses = [] - - hypotheses += best_hyp - if all_hyp is not None: - all_hypotheses += all_hyp - else: - all_hypotheses += best_hyp - - return (hypotheses, all_hypotheses) + return hypotheses def change_vocabulary( self, @@ -515,7 +500,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) del signal - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + best_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False ) if isinstance(sample_id, torch.Tensor): diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index 78038d404107..b26337b26cba 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -15,7 +15,7 @@ import copy import os from math import ceil -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Union import numpy as np import torch @@ -40,6 +40,7 @@ from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecoding, RNNTDecodingConfig from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.asr.parts.utils.transcribe_utils import process_timestamp_outputs from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.parts.preprocessing.parsers import make_parser @@ -47,7 +48,6 @@ from nemo.core.classes.mixins import AccessMixin from nemo.core.neural_types import AcousticEncodedRepresentation, AudioSignal, LengthsType, NeuralType, SpectrogramType from nemo.utils import logging -from nemo.utils.decorators import deprecated class EncDecRNNTModel(ASRModel, ASRModuleMixin, ExportableEncDecModel, ASRTranscriptionMixin): @@ -814,7 +814,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) del signal - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + best_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False ) @@ -936,17 +936,13 @@ def _transcribe_forward(self, batch: Any, trcfg: TranscribeConfig): output = dict(encoded=encoded, encoded_len=encoded_len) return output - @deprecated( - explanation='The return type of args will be updated in the upcoming release to ensure a consistent \ - output format across all decoder types, such that a "Hypothesis" object is always returned.' 
- ) def _transcribe_output_processing( self, outputs, trcfg: TranscribeConfig - ) -> Tuple[List['Hypothesis'], List['Hypothesis']]: + ) -> Union[List['Hypothesis'], List[List['Hypothesis']]]: encoded = outputs.pop('encoded') encoded_len = outputs.pop('encoded_len') - best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor( + hyp = self.decoding.rnnt_decoder_predictions_tensor( encoded, encoded_len, return_hypotheses=trcfg.return_hypotheses, @@ -956,23 +952,11 @@ def _transcribe_output_processing( del encoded, encoded_len if trcfg.timestamps: - best_hyp = process_timestamp_outputs( - best_hyp, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] + hyp = process_timestamp_outputs( + hyp, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] ) - all_hyp = process_timestamp_outputs( - all_hyp, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] - ) - - hypotheses = [] - all_hypotheses = [] - - hypotheses += best_hyp - if all_hyp is not None: - all_hypotheses += all_hyp - else: - all_hypotheses += best_hyp - return (hypotheses, all_hypotheses) + return hyp def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': """ diff --git a/nemo/collections/asr/parts/context_biasing/context_biasing_utils.py b/nemo/collections/asr/parts/context_biasing/context_biasing_utils.py index 6b3626920a2f..f168d92bdbc7 100644 --- a/nemo/collections/asr/parts/context_biasing/context_biasing_utils.py +++ b/nemo/collections/asr/parts/context_biasing/context_biasing_utils.py @@ -75,7 +75,7 @@ def merge_alignment_with_ws_hyps( if idx + 1 < len(tokens) and not tokens[idx + 1].startswith(bow): tokens[idx + 1] = bow + tokens[idx + 1] continue - alignment_tokens.append([candidate.timestep[idx].item(), token]) + alignment_tokens.append([candidate.timestamp[idx].item(), token]) else: raise ValueError(f"decoder_type {decoder_type} is not supported") @@ -86,20 +86,26 @@ def merge_alignment_with_ws_hyps( # step 2: get word-level alignment [word, start_frame, end_frame] word_alignment = [] word = "" - l, r, = None, None + ( + L, + r, + ) = ( + None, + None, + ) for item in alignment_tokens: if not word: word = item[1][1:] - l = r = item[0] + L = r = item[0] else: if item[1].startswith(bow): - word_alignment.append((word, l, r)) + word_alignment.append((word, L, r)) word = item[1][1:] - l = r = item[0] + L = r = item[0] else: word += item[1] r = item[0] - word_alignment.append((word, l, r)) + word_alignment.append((word, L, r)) initial_text_transcript = " ".join([item[0] for item in word_alignment]) if print_stats: logging.info(f"Word alignment: {word_alignment}") diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index 2de8ec775104..577b6393248c 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -699,10 +699,10 @@ def conformer_stream_step( decoder_lengths=encoded_len[preds_idx : preds_idx + 1], return_hypotheses=False, ) - all_hyp_or_transcribed_texts.append(decoded_out[0][0]) + all_hyp_or_transcribed_texts.append(decoded_out[0]) best_hyp = None else: - best_hyp, all_hyp_or_transcribed_texts = self.decoding.rnnt_decoder_predictions_tensor( + best_hyp = self.decoding.rnnt_decoder_predictions_tensor( encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=True, @@ -710,8 +710,7 @@ def conformer_stream_step( ) greedy_predictions = [hyp.y_sequence for hyp in best_hyp] - if all_hyp_or_transcribed_texts is 
None: - all_hyp_or_transcribed_texts = best_hyp + all_hyp_or_transcribed_texts = best_hyp result = [ greedy_predictions, diff --git a/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py b/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py index 0beab5f54cb1..5328af1b7785 100644 --- a/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import math import os from dataclasses import dataclass, field @@ -20,7 +22,7 @@ import torch from nemo.collections.asr.parts.k2.classes import GraphIntersectDenseConfig -from nemo.collections.asr.parts.submodules.wfst_decoder import RivaDecoderConfig +from nemo.collections.asr.parts.submodules.wfst_decoder import RivaDecoderConfig, WfstNbestHypothesis from nemo.collections.asr.parts.utils import rnnt_utils from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.core.classes import Typing, typecheck @@ -72,7 +74,7 @@ def pack_wfst_hypotheses( y_sequence=[], score=cand.score, text=" ".join(cand.words), - timestep=list(cand.timesteps), + timestamp=list(cand.timesteps), alignments=list(cand.alignment), ) cand_hyp.y_sequence = y_sequence @@ -240,7 +242,7 @@ def __init__( self.compute_timestamps = compute_timestamps if self.compute_timestamps: - raise ValueError(f"Currently this flag is not supported for beam search algorithms.") + raise ValueError("Currently this flag is not supported for beam search algorithms.") self.vocab = None # This must be set by specific method by user before calling forward() ! @@ -387,7 +389,7 @@ def default_beam_search( hypotheses = [] for candidate_idx, candidate in enumerate(beams): hypothesis = rnnt_utils.Hypothesis( - score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None + score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None ) # For subword encoding, NeMo will double encode the subword (multiple tokens) into a @@ -444,8 +446,8 @@ def _pyctcdecode_beam_search( import pyctcdecode except (ImportError, ModuleNotFoundError): raise ImportError( - f"Could not load `pyctcdecode` library. Please install it from pip using :\n" - f"pip install --upgrade pyctcdecode" + "Could not load `pyctcdecode` library. Please install it from pip using :\n" + "pip install --upgrade pyctcdecode" ) if self.pyctcdecode_beam_scorer is None: @@ -477,7 +479,7 @@ def _pyctcdecode_beam_search( for candidate_idx, candidate in enumerate(beams): # Candidate = (text, last_lm_state, text_frames, logit_score, lm_score) hypothesis = rnnt_utils.Hypothesis( - score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None + score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None ) # TODO: Requires token ids to be returned rather than text. @@ -498,7 +500,7 @@ def _pyctcdecode_beam_search( hypothesis.score = candidate[4] # score # Inject word level timestamps - hypothesis.timestep = candidate[2] # text_frames + hypothesis.timestamp = candidate[2] # text_frames if self.preserve_alignments: hypothesis.alignments = torch.from_numpy(x[beams_idx][: out_len[beams_idx]]) @@ -535,7 +537,7 @@ def flashlight_beam_search( if self.kenlm_path is None or not os.path.exists(self.kenlm_path): raise FileNotFoundError( f"KenLM binary file not found at : {self.kenlm_path}. " - f"Please set a valid path in the decoding config." 
+ "Please set a valid path in the decoding config." ) # perform token offset for subword models @@ -575,7 +577,7 @@ def flashlight_beam_search( hypotheses = [] for candidate_idx, candidate in enumerate(beams): hypothesis = rnnt_utils.Hypothesis( - score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None + score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None ) # We preserve the token ids and the score for this hypothesis @@ -730,7 +732,7 @@ def forward( return (packed_result,) - def _prepare_decoding_lm_wfst(self) -> Union[str, 'kaldifst.StdFst', 'k2.Fsa']: + def _prepare_decoding_lm_wfst(self) -> Union[str, 'kaldifst.StdFst', 'k2.Fsa']: # noqa: F821 """TBD""" arpa_lm_path_exists = self.arpa_lm_path is not None and os.path.exists(self.arpa_lm_path) wfst_lm_path_exists = self.wfst_lm_path is not None and os.path.exists(self.wfst_lm_path) diff --git a/nemo/collections/asr/parts/submodules/ctc_decoding.py b/nemo/collections/asr/parts/submodules/ctc_decoding.py index 0603f5f77206..13591a8b113f 100644 --- a/nemo/collections/asr/parts/submodules/ctc_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_decoding.py @@ -16,7 +16,7 @@ import unicodedata from abc import abstractmethod from dataclasses import dataclass, field, is_dataclass -from typing import Callable, Dict, List, Optional, Set, Tuple, Union +from typing import Callable, Dict, List, Optional, Set, Union import numpy as np import torch @@ -360,7 +360,7 @@ def ctc_decoder_predictions_tensor( decoder_lengths: torch.Tensor = None, fold_consecutive: bool = True, return_hypotheses: bool = False, - ) -> Tuple[List[str], Optional[List[List[str]]], Optional[Union[Hypothesis, NBestHypotheses]]]: + ) -> Union[List[Hypothesis], List[List[Hypothesis]]]: """ Decodes a sequence of labels to words @@ -379,8 +379,7 @@ def ctc_decoder_predictions_tensor( transcribe()) Returns: - Either a list of str which represent the CTC decoded strings per sample, - or a list of Hypothesis objects containing additional information. + A list of Hypothesis objects containing additional information. """ if isinstance(decoder_outputs, torch.Tensor): @@ -410,9 +409,7 @@ def ctc_decoder_predictions_tensor( if isinstance(hypotheses_list[0], NBestHypotheses): if self.cfg.strategy == 'wfst': all_hypotheses = [hyp.n_best_hypotheses for hyp in hypotheses_list] - hypotheses = [hyp[0] for hyp in all_hypotheses] else: - hypotheses = [] all_hypotheses = [] for nbest_hyp in hypotheses_list: # type: NBestHypotheses @@ -427,16 +424,14 @@ def ctc_decoder_predictions_tensor( for hyp_idx in range(len(decoded_hyps)): decoded_hyps[hyp_idx] = self.compute_ctc_timestamps(decoded_hyps[hyp_idx], timestamp_type) - hypotheses.append(decoded_hyps[0]) # best hypothesis all_hypotheses.append(decoded_hyps) if return_hypotheses: - return hypotheses, all_hypotheses + return all_hypotheses # type: list[list[Hypothesis]] - best_hyp_text = [h.text for h in hypotheses] # alaptev: The line below might contain a bug. Do we really want all_hyp_text to be flat? 
- all_hyp_text = [h.text for hh in all_hypotheses for h in hh] - return best_hyp_text, all_hyp_text + all_hyp = [[Hypothesis(h.score, h.y_sequence, h.text) for h in hh] for hh in all_hypotheses] + return all_hyp else: if self.cfg.strategy == 'wfst': @@ -460,10 +455,9 @@ def ctc_decoder_predictions_tensor( hypotheses[hyp_idx] = self.compute_ctc_timestamps(hypotheses[hyp_idx], timestamp_type) if return_hypotheses: - return hypotheses, None + return hypotheses - best_hyp_text = [h.text for h in hypotheses] - return best_hyp_text, None + return [Hypothesis(h.score, h.y_sequence, h.text) for h in hypotheses] def decode_hypothesis( self, hypotheses_list: List[Hypothesis], fold_consecutive: bool @@ -686,25 +680,25 @@ def compute_ctc_timestamps(self, hypothesis: Hypothesis, timestamp_type: str = " ) # attach results - if len(hypothesis.timestep) > 0: - timestep_info = hypothesis.timestep + if len(hypothesis.timestamp) > 0: + timestep_info = hypothesis.timestamp else: timestep_info = [] # Setup defaults - hypothesis.timestep = {"timestep": timestep_info} + hypothesis.timestamp = {"timestep": timestep_info} # Add char / subword time stamps if char_offsets is not None and timestamp_type in ['char', 'all']: - hypothesis.timestep['char'] = char_offsets + hypothesis.timestamp['char'] = char_offsets # Add word time stamps if word_offsets is not None and timestamp_type in ['word', 'all']: - hypothesis.timestep['word'] = word_offsets + hypothesis.timestamp['word'] = word_offsets # Add segment time stamps if segment_offsets is not None and timestamp_type in ['segment', 'all']: - hypothesis.timestep['segment'] = segment_offsets + hypothesis.timestamp['segment'] = segment_offsets # Convert the token indices to text hypothesis.text = self.decode_tokens_to_str(hypothesis.text) @@ -731,8 +725,8 @@ def _compute_offsets( # If the exact timestep information is available, utilize the 1st non-ctc blank token timestep # as the start index. 
- if hypothesis.timestep is not None and len(hypothesis.timestep) > 0: - start_index = max(0, hypothesis.timestep[0] - 1) + if hypothesis.timestamp is not None and len(hypothesis.timestamp) > 0: + start_index = max(0, hypothesis.timestamp[0] - 1) # Construct the start and end indices brackets end_indices = np.asarray(token_lengths).cumsum() diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index 74204cf73d8e..bdcb71e9d721 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -224,7 +224,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: Optional[torch.Tenso # out_len: [seq_len] # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) prediction = x.cpu() if out_len is not None: @@ -241,7 +241,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: Optional[torch.Tenso hypothesis.alignments = (prediction.clone(), prediction_labels.clone()) if self.compute_timestamps: - hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist() + hypothesis.timestamp = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist() if self.preserve_frame_confidence: hypothesis.frame_confidence = self._get_confidence(prediction) @@ -254,7 +254,7 @@ def _greedy_decode_labels(self, x: torch.Tensor, out_len: Optional[torch.Tensor] # out_len: [seq_len] # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) prediction_labels = x.cpu() if out_len is not None: @@ -268,7 +268,7 @@ def _greedy_decode_labels(self, x: torch.Tensor, out_len: Optional[torch.Tensor] raise ValueError("Requested for alignments, but predictions provided were labels, not log probabilities.") if self.compute_timestamps: - hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist() + hypothesis.timestamp = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist() if self.preserve_frame_confidence: raise ValueError( @@ -447,7 +447,7 @@ def _greedy_decode_logprobs_batched(self, x: torch.Tensor, out_len: torch.Tensor # This mimics the for loop in GreedyCTCInfer::forward. for i in range(batch_size): - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) hypothesis.score = scores[i] prediction_labels_no_padding = predictions_labels[i, : out_len[i]].tolist() @@ -464,7 +464,7 @@ def _greedy_decode_logprobs_batched(self, x: torch.Tensor, out_len: torch.Tensor # TOOD: Could do this in a vectorized manner... Would # prefer to have nonzero_static, though, for sanity. 
# Or do a prefix sum on out_len - hypothesis.timestep = torch.nonzero(non_blank_ids_mask[i], as_tuple=False)[:, 0].cpu().tolist() + hypothesis.timestamp = torch.nonzero(non_blank_ids_mask[i], as_tuple=False)[:, 0].cpu().tolist() if self.preserve_frame_confidence: hypothesis.frame_confidence = self._get_confidence(predictions[i, : out_len[i], :]) @@ -493,7 +493,7 @@ def _greedy_decode_labels_batched(self, x: torch.Tensor, out_len: torch.Tensor): hypotheses = [] for i in range(batch_size): - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) hypothesis.y_sequence = predictions_labels[i, : out_len[i]].tolist() hypothesis.score = -1.0 @@ -505,7 +505,7 @@ def _greedy_decode_labels_batched(self, x: torch.Tensor, out_len: torch.Tensor): # TOOD: Could do this in a vectorized manner... Would # prefer to have nonzero_static, though, for sanity. # Or do a prefix sum on out_len - hypothesis.timestep = torch.nonzero(non_blank_ids_mask[i], as_tuple=False)[:, 0].cpu().tolist() + hypothesis.timestamp = torch.nonzero(non_blank_ids_mask[i], as_tuple=False)[:, 0].cpu().tolist() if self.preserve_frame_confidence: raise ValueError( "Requested for per-frame confidence, but predictions provided were labels, not log probabilities." diff --git a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py index fc501b3d00de..a0be0e1f4a04 100644 --- a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py @@ -359,13 +359,13 @@ def __call__( labels_packed = self.labels_cpu[valid_labels_mask] hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batch_size) + rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestamp=[], dec_state=None) for _ in range(batch_size) ] timestep_start = 0 labels_start = 0 for i in range(batch_size): - hypotheses[i].timestep = timesteps_packed[timestep_start : timestep_start + timestep_segments[i]].tolist() + hypotheses[i].timestamp = timesteps_packed[timestep_start : timestep_start + timestep_segments[i]].tolist() timestep_start += timestep_segments[i] hypotheses[i].score = float(total_scores[i]) hypotheses[i].y_sequence = labels_packed[labels_start : labels_start + labels_segments[i]].tolist() diff --git a/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py b/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py index e181772b7f18..d49c6e69215f 100644 --- a/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py @@ -195,7 +195,7 @@ def forward( beam_scores = [x.detach().cpu() for x in beam_scores] # each item is [beam,] packed_result = [] for i in range(len(topk_hypotheses)): - hypotheses = [Hypothesis(score=0.0, y_sequence=[], timestep=[]) for _ in range(self.beam_size)] + hypotheses = [Hypothesis(score=0.0, y_sequence=[], timestamp=[]) for _ in range(self.beam_size)] # Pack results into Hypotheses hypotheses = pack_hypotheses(hypotheses, topk_hypotheses[i], beam_scores[i]) self.format_hypotheses(hypotheses, decoder_input_ids) @@ -204,7 +204,7 @@ def forward( beam_scores = [None for _ in range(len(best_hypo))] best_hypo = best_hypo.detach().cpu() hypotheses = [ - Hypothesis(score=0.0, 
y_sequence=[], timestep=[]) for _ in range(encoder_hidden_states.shape[0]) + Hypothesis(score=0.0, y_sequence=[], timestamp=[]) for _ in range(encoder_hidden_states.shape[0]) ] # Pack results into Hypotheses packed_result = pack_hypotheses(hypotheses, best_hypo, beam_scores) diff --git a/nemo/collections/asr/parts/submodules/multitask_decoding.py b/nemo/collections/asr/parts/submodules/multitask_decoding.py index 790c95afbbfb..99010bdc14b8 100644 --- a/nemo/collections/asr/parts/submodules/multitask_decoding.py +++ b/nemo/collections/asr/parts/submodules/multitask_decoding.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re from abc import abstractmethod from dataclasses import dataclass, field, is_dataclass -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Union import torch from omegaconf import OmegaConf @@ -216,7 +215,7 @@ def decode_predictions_tensor( decoder_input_ids: Optional[torch.Tensor] = None, return_hypotheses: bool = False, partial_hypotheses: Optional[List[Hypothesis]] = None, - ) -> Tuple[List[str], Optional[List[List[str]]], Optional[Union[Hypothesis, NBestHypotheses]]]: + ) -> Union[List[Hypothesis], List[List[Hypothesis]]]: """ Decode an encoder output by autoregressive decoding of the Decoder+Joint networks. @@ -226,18 +225,14 @@ def decode_predictions_tensor( return_hypotheses: bool. If set to True it will return list of Hypothesis or NBestHypotheses Returns: - If `return_best_hypothesis` is set: - A tuple (hypotheses, None): - hypotheses - list of Hypothesis (best hypothesis per sample). + If `return_all_hypothesis` is set: + A list[list[Hypothesis]]. Look at rnnt_utils.Hypothesis for more information. - If `return_best_hypothesis` is not set: - A tuple(hypotheses, all_hypotheses) - hypotheses - list of Hypothesis (best hypothesis per sample). + If `return_all_hypothesis` is not set: + A list[Hypothesis]. + List of best hypotheses Look at rnnt_utils.Hypothesis for more information. - all_hypotheses - list of NBestHypotheses. Each NBestHypotheses further contains a sorted - list of all the hypotheses of the model per sample. - Look at rnnt_utils.NBestHypotheses for more information. 
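A minimal caller-side sketch of the new return convention introduced by these hunks, assuming a loaded NeMo RNNT model bound to `model` and already-computed `encoded` / `encoded_len` tensors (the variable names are illustrative and not part of the patch). The same pattern applies to `decode_predictions_tensor` and `ctc_decoder_predictions_tensor`, which previously returned `(best_hyp_text, all_hyp_text)` tuples:

# Previously: best_hyps, all_hyps = model.decoding.rnnt_decoder_predictions_tensor(...)
# Now a single list is returned: list[Hypothesis] with the best hypothesis per utterance,
# or list[list[Hypothesis]] when the strategy is configured to return all hypotheses.
hyps = model.decoding.rnnt_decoder_predictions_tensor(
    encoder_output=encoded,
    encoded_lengths=encoded_len,
    return_hypotheses=True,
)
if hyps and not isinstance(hyps[0], list):
    texts = [h.text for h in hyps]            # flat list: best hypothesis per utterance
else:
    texts = [nbest[0].text for nbest in hyps]  # nested list: N-best per utterance, best first
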
""" # Compute hypotheses with torch.inference_mode(): @@ -265,11 +260,10 @@ def decode_predictions_tensor( all_hypotheses.append(decoded_hyps) if return_hypotheses: - return hypotheses, all_hypotheses + return all_hypotheses - best_hyp_text = [h.text for h in hypotheses] - all_hyp_text = [h.text for hh in all_hypotheses for h in hh] - return best_hyp_text, all_hyp_text + all_hyp = [[Hypothesis(h.score, h.y_sequence, h.text) for h in hh] for hh in all_hypotheses] + return all_hyp else: hypotheses = self.decode_hypothesis(prediction_list) @@ -280,10 +274,9 @@ def decode_predictions_tensor( self.preserve_word_confidence or self.preserve_token_confidence ): hypotheses = self.compute_confidence(hypotheses) - return hypotheses, None + return hypotheses - best_hyp_text = [h.text for h in hypotheses] - return best_hyp_text, None + return [Hypothesis(h.score, h.y_sequence, h.text) for h in hypotheses] def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hypothesis, NBestHypotheses]]: """ diff --git a/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py b/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py index f67cdd9f7944..eeae38ecef30 100644 --- a/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py @@ -201,7 +201,7 @@ def forward( packed_result = [] for i in range(len(topk_hypotheses)): # Pack results into Hypotheses - hypotheses = [Hypothesis(score=0.0, y_sequence=[], timestep=[]) for _ in range(self.n_samples)] + hypotheses = [Hypothesis(score=0.0, y_sequence=[], timestamp=[]) for _ in range(self.n_samples)] self.format_hypotheses(hypotheses, decoder_input_ids) packed_result.append( NBestHypotheses( @@ -212,7 +212,7 @@ def forward( beam_scores = [None for _ in range(len(best_hypo))] best_hypo = best_hypo.cpu() hypotheses = [ - Hypothesis(score=0.0, y_sequence=[], timestep=[]) for _ in range(encoder_hidden_states.shape[0]) + Hypothesis(score=0.0, y_sequence=[], timestamp=[]) for _ in range(encoder_hidden_states.shape[0]) ] # Pack results into Hypotheses packed_result = pack_hypotheses(hypotheses, best_hypo, beam_scores, step_confidence) diff --git a/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py index e0bd47bb8ce0..b34a962d280d 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py @@ -76,8 +76,8 @@ def pack_hypotheses(hypotheses: List[Hypothesis]) -> List[Hypothesis]: hyp.dec_state = _states_to_device(hyp.dec_state) # Remove -1 from timestep - if hyp.timestep is not None and len(hyp.timestep) > 0 and hyp.timestep[0] == -1: - hyp.timestep = hyp.timestep[1:] + if hyp.timestamp is not None and len(hyp.timestamp) > 0 and hyp.timestamp[0] == -1: + hyp.timestamp = hyp.timestamp[1:] return hypotheses @@ -485,7 +485,7 @@ def greedy_search( # Construct initial hypothesis hyp = Hypothesis( - score=0.0, y_sequence=[self.blank], dec_state=dec_state, timestep=[-1], length=encoded_lengths + score=0.0, y_sequence=[self.blank], dec_state=dec_state, timestamp=[-1], length=encoded_lengths ) if partial_hypotheses is not None: @@ -532,7 +532,7 @@ def greedy_search( hyp.y_sequence.append(int(pred)) hyp.score += float(logp) hyp.dec_state = state - hyp.timestep.append(i) + hyp.timestamp.append(i) # Compute next state and token y, state, _ = self.decoder.score_hypothesis(hyp, cache) @@ -582,7 +582,7 @@ def 
default_beam_search( dec_state = self.decoder.initialize_state(h) # Initialize first hypothesis for the beam (blank) - kept_hyps = [Hypothesis(score=0.0, y_sequence=[self.blank], dec_state=dec_state, timestep=[-1], length=0)] + kept_hyps = [Hypothesis(score=0.0, y_sequence=[self.blank], dec_state=dec_state, timestamp=[-1], length=0)] cache = {} if partial_hypotheses is not None: @@ -631,7 +631,7 @@ def default_beam_search( y_sequence=max_hyp.y_sequence[:], dec_state=max_hyp.dec_state, lm_state=max_hyp.lm_state, - timestep=max_hyp.timestep[:], + timestamp=max_hyp.timestamp[:], length=encoded_lengths, ) @@ -645,7 +645,7 @@ def default_beam_search( # if non-blank token was predicted, update state and sequence and then search more hypothesis new_hyp.dec_state = state new_hyp.y_sequence.append(int(k)) - new_hyp.timestep.append(i) + new_hyp.timestamp.append(i) hyps.append(new_hyp) @@ -729,7 +729,7 @@ def time_sync_decoding( y_sequence=[self.blank], score=0.0, dec_state=self.decoder.batch_select_state(beam_state, 0), - timestep=[-1], + timestamp=[-1], length=0, ) ] @@ -775,7 +775,7 @@ def time_sync_decoding( y_sequence=hyp.y_sequence[:], dec_state=hyp.dec_state, lm_state=hyp.lm_state, - timestep=hyp.timestep[:], + timestamp=hyp.timestamp[:], length=encoded_lengths, ) @@ -807,7 +807,7 @@ def time_sync_decoding( y_sequence=(hyp.y_sequence + [int(k)]), dec_state=beam_state[j], lm_state=hyp.lm_state, - timestep=hyp.timestep[:] + [i], + timestamp=hyp.timestamp[:] + [i], length=encoded_lengths, ) @@ -903,7 +903,7 @@ def align_length_sync_decoding( y_sequence=[self.blank], score=0.0, dec_state=beam_state[0], - timestep=[-1], + timestamp=[-1], length=0, ) ] @@ -999,7 +999,7 @@ def align_length_sync_decoding( y_sequence=hyp.y_sequence[:], dec_state=hyp.dec_state, lm_state=hyp.lm_state, - timestep=hyp.timestep[:], + timestamp=hyp.timestamp[:], length=i, ) @@ -1036,7 +1036,7 @@ def align_length_sync_decoding( y_sequence=(hyp.y_sequence[:] + [int(k)]), dec_state=beam_state[h_states_idx], lm_state=hyp.lm_state, - timestep=hyp.timestep[:] + [i], + timestamp=hyp.timestamp[:] + [i], length=i, ) @@ -1116,7 +1116,7 @@ def modified_adaptive_expansion_search( y_sequence=[self.blank], score=0.0, dec_state=self.decoder.batch_select_state(beam_state, 0), - timestep=[-1], + timestamp=[-1], length=0, ) ] @@ -1160,7 +1160,7 @@ def modified_adaptive_expansion_search( dec_out=[beam_dec_out[0]], lm_state=lm_state, lm_scores=lm_scores, - timestep=[-1], + timestamp=[-1], length=0, ) ] @@ -1218,7 +1218,7 @@ def modified_adaptive_expansion_search( dec_state=hyp.dec_state, lm_state=hyp.lm_state, lm_scores=hyp.lm_scores, - timestep=hyp.timestep[:], + timestamp=hyp.timestamp[:], length=t, ) if self.ngram_lm: @@ -1232,7 +1232,7 @@ def modified_adaptive_expansion_search( # new_hyp.y_sequence.append(int(k)) if (new_hyp.y_sequence + [int(k)]) not in duplication_check: new_hyp.y_sequence.append(int(k)) - new_hyp.timestep.append(t) + new_hyp.timestamp.append(t) # Setup ngram LM: if self.ngram_lm: diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index 18fcc57e5184..1a50f10d3ed4 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -17,7 +17,7 @@ import unicodedata from abc import abstractmethod from dataclasses import dataclass, field, is_dataclass -from typing import Callable, Dict, List, Optional, Set, Tuple, Union +from typing import Callable, Dict, List, Optional, Set, Union import 
numpy as np import torch @@ -494,7 +494,7 @@ def rnnt_decoder_predictions_tensor( encoded_lengths: torch.Tensor, return_hypotheses: bool = False, partial_hypotheses: Optional[List[Hypothesis]] = None, - ) -> Tuple[List[str], Optional[List[List[str]]], Optional[Union[Hypothesis, NBestHypotheses]]]: + ) -> Union[List[Hypothesis], List[List[Hypothesis]]]: """ Decode an encoder output by autoregressive decoding of the Decoder+Joint networks. @@ -504,18 +504,14 @@ def rnnt_decoder_predictions_tensor( return_hypotheses: bool. If set to True it will return list of Hypothesis or NBestHypotheses Returns: - If `return_best_hypothesis` is set: - A tuple (hypotheses, None): - hypotheses - list of Hypothesis (best hypothesis per sample). + If `return_all_hypothesis` is set: + A list[list[Hypothesis]]. Look at rnnt_utils.Hypothesis for more information. - If `return_best_hypothesis` is not set: - A tuple(hypotheses, all_hypotheses) - hypotheses - list of Hypothesis (best hypothesis per sample). + If `return_all_hypothesis` is not set: + A list[Hypothesis]. + List of best hypotheses Look at rnnt_utils.Hypothesis for more information. - all_hypotheses - list of NBestHypotheses. Each NBestHypotheses further contains a sorted - list of all the hypotheses of the model per sample. - Look at rnnt_utils.NBestHypotheses for more information. """ # Compute hypotheses with torch.inference_mode(): @@ -546,11 +542,10 @@ def rnnt_decoder_predictions_tensor( all_hypotheses.append(decoded_hyps) if return_hypotheses: - return hypotheses, all_hypotheses + return all_hypotheses # type: list[list[Hypothesis]] - best_hyp_text = [h.text for h in hypotheses] - all_hyp_text = [h.text for hh in all_hypotheses for h in hh] - return best_hyp_text, all_hyp_text + all_hyp = [[Hypothesis(h.score, h.y_sequence, h.text) for h in hh] for hh in all_hypotheses] + return all_hyp else: hypotheses = self.decode_hypothesis(prediction_list) # type: List[str] @@ -567,10 +562,9 @@ def rnnt_decoder_predictions_tensor( self.preserve_word_confidence or self.preserve_token_confidence ): hypotheses = self.compute_confidence(hypotheses) - return hypotheses, None + return hypotheses - best_hyp_text = [h.text for h in hypotheses] - return best_hyp_text, None + return [Hypothesis(h.score, h.y_sequence, h.text) for h in hypotheses] def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hypothesis, NBestHypotheses]]: """ @@ -681,7 +675,7 @@ def compute_confidence(self, hypotheses_list: List[Hypothesis]) -> List[Hypothes hyp.token_confidence = hyp.non_blank_frame_confidence else: for hyp in hypotheses_list: - timestep = hyp.timestep.tolist() if isinstance(hyp.timestep, torch.Tensor) else hyp.timestep + timestep = hyp.timestamp.tolist() if isinstance(hyp.timestamp, torch.Tensor) else hyp.timestamp offset = 0 token_confidence = [] if len(timestep) > 0: @@ -894,25 +888,25 @@ def compute_rnnt_timestamps(self, hypothesis: Hypothesis, timestamp_type: str = ) # attach results - if len(hypothesis.timestep) > 0: - timestep_info = hypothesis.timestep + if len(hypothesis.timestamp) > 0: + timestep_info = hypothesis.timestamp else: timestep_info = [] # Setup defaults - hypothesis.timestep = {"timestep": timestep_info} + hypothesis.timestamp = {"timestep": timestep_info} # Add char / subword time stamps if char_offsets is not None and timestamp_type in ['char', 'all']: - hypothesis.timestep['char'] = char_offsets + hypothesis.timestamp['char'] = char_offsets # Add word time stamps if word_offsets is not None and timestamp_type in ['word', 'all']: 
- hypothesis.timestep['word'] = word_offsets + hypothesis.timestamp['word'] = word_offsets # Add segment time stamps if segment_offsets is not None and timestamp_type in ['segment', 'all']: - hypothesis.timestep['segment'] = segment_offsets + hypothesis.timestamp['segment'] = segment_offsets # Convert the flattened token indices to text hypothesis.text = self.decode_tokens_to_str(hypothesis.text) @@ -939,8 +933,8 @@ def _compute_offsets( # If the exact timestep information is available, utilize the 1st non-rnnt blank token timestep # as the start index. - if hypothesis.timestep is not None and len(hypothesis.timestep) > 0: - first_timestep = hypothesis.timestep[0] + if hypothesis.timestamp is not None and len(hypothesis.timestamp) > 0: + first_timestep = hypothesis.timestamp[0] first_timestep = first_timestep if isinstance(first_timestep, int) else first_timestep.item() start_index = max(0, first_timestep - 1) @@ -980,7 +974,7 @@ def _compute_offsets_tdt(hypothesis: Hypothesis, *args) -> List[Dict[str, Union[ # Merge the results per token into a list of dictionaries offsets = [ {"char": [t, -1], "start_offset": int(s), "end_offset": int(s + d)} - for t, s, d in zip(hypothesis.text[0], hypothesis.timestep, hypothesis.token_duration) + for t, s, d in zip(hypothesis.text[0], hypothesis.timestamp, hypothesis.token_duration) ] return offsets @@ -991,7 +985,7 @@ def _refine_timestamps( supported_punctuation: Optional[Set] = None, ) -> List[Dict[str, Union[str, int]]]: - ## no refinement for rnnt + # no refinement for rnnt return encoded_char_offsets, char_offsets diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index bd169d0d224e..9200e3b0c2da 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -420,7 +420,7 @@ def _greedy_decode( # out_len: [seq_len] # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) if partial_hypotheses is not None: hypothesis.last_token = partial_hypotheses.last_token @@ -492,7 +492,7 @@ def _greedy_decode( # Append token to label set, update RNN state. 
hypothesis.y_sequence.append(k) hypothesis.score += float(v) - hypothesis.timestep.append(time_idx) + hypothesis.timestamp.append(time_idx) hypothesis.dec_state = hidden_prime hypothesis.last_token = k @@ -787,7 +787,7 @@ def _greedy_decode_blank_as_pad_loop_frames( # Initialize list of Hypothesis batchsize = x.shape[0] hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], token_duration=[], dec_state=None) + rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestamp=[], token_duration=[], dec_state=None) for _ in range(batchsize) ] @@ -924,7 +924,7 @@ def _greedy_decode_blank_as_pad_loop_frames( for kidx, ki in enumerate(k): if blank_mask[kidx] == 0: hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) + hypotheses[kidx].timestamp.append(time_idx) hypotheses[kidx].score += float(v[kidx]) symbols_added += 1 @@ -986,7 +986,7 @@ def _greedy_decode_masked( # Initialize state batchsize = x.shape[0] hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) + rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestamp=[], dec_state=None) for _ in range(batchsize) ] # Initialize Hidden state matrix (shared by entire batch) @@ -997,8 +997,6 @@ def _greedy_decode_masked( # alignments is a 3-dimensional dangling list representing B x T x U for hyp in hypotheses: hyp.alignments = [[]] - else: - alignments = None # If confidence scores need to be preserved, register a danling list to hold the values if self.preserve_frame_confidence: @@ -1135,7 +1133,7 @@ def _greedy_decode_masked( for kidx, ki in enumerate(k): if blank_mask[kidx] == 0: hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) + hypotheses[kidx].timestamp.append(time_idx) hypotheses[kidx].score += float(v[kidx]) symbols_added += 1 @@ -1403,7 +1401,7 @@ def __init__(self, encoder_model: str, decoder_joint_model: str, max_symbols_per import onnx import onnxruntime except (ModuleNotFoundError, ImportError): - raise ImportError(f"`onnx` or `onnxruntime` could not be imported, please install the libraries.\n") + raise ImportError("`onnx` or `onnxruntime` could not be imported, please install the libraries.\n") if torch.cuda.is_available(): # Try to use onnxruntime-gpu @@ -1731,7 +1729,7 @@ def _greedy_decode( # out_len: [seq_len] # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) if partial_hypotheses is not None: hypothesis.last_token = partial_hypotheses.last_token @@ -1816,7 +1814,7 @@ def _greedy_decode( # Append token to label set, update RNN state. 
hypothesis.y_sequence.append(k) hypothesis.score += float(v) - hypothesis.timestep.append(time_idx) + hypothesis.timestamp.append(time_idx) hypothesis.dec_state = hidden_prime hypothesis.last_token = k @@ -1952,7 +1950,7 @@ def _greedy_decode_blank_as_pad( # Initialize list of Hypothesis batchsize = x.shape[0] hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) + rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestamp=[], dec_state=None) for _ in range(batchsize) ] # Initialize Hidden state matrix (shared by entire batch) @@ -2112,7 +2110,7 @@ def _greedy_decode_blank_as_pad( for kidx, ki in enumerate(k): if blank_mask[kidx] == 0: hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) + hypotheses[kidx].timestamp.append(time_idx) hypotheses[kidx].score += float(v[kidx]) symbols_added += 1 @@ -2188,7 +2186,7 @@ def _greedy_decode_masked( # Initialize state batchsize = x.shape[0] hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) + rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestamp=[], dec_state=None) for _ in range(batchsize) ] # Initialize Hidden state matrix (shared by entire batch) @@ -2330,7 +2328,7 @@ def _greedy_decode_masked( for kidx, ki in enumerate(k): if blank_mask[kidx] == 0: hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) + hypotheses[kidx].timestamp.append(time_idx) hypotheses[kidx].score += float(v[kidx]) symbols_added += 1 @@ -2564,7 +2562,7 @@ def _greedy_decode( # Initialize blank state and empty label set in Hypothesis hypothesis = rnnt_utils.Hypothesis( - score=0.0, y_sequence=[], dec_state=None, timestep=[], token_duration=[], last_token=None + score=0.0, y_sequence=[], dec_state=None, timestamp=[], token_duration=[], last_token=None ) if partial_hypotheses is not None: @@ -2592,7 +2590,6 @@ def _greedy_decode( f = x.narrow(dim=0, start=time_idx, length=1) # Setup exit flags and counter - not_blank = True symbols_added = 0 need_loop = True @@ -2644,13 +2641,11 @@ def _greedy_decode( del logp # If blank token is predicted, exit inner loop, move onto next timestep t - if k == self._blank_index: - not_blank = False - else: + if k != self._blank_index: # Append token to label set, update RNN state. 
hypothesis.y_sequence.append(k) hypothesis.score += float(v) - hypothesis.timestep.append(time_idx) + hypothesis.timestamp.append(time_idx) hypothesis.dec_state = hidden_prime hypothesis.last_token = k if self.include_duration: diff --git a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py index 13bb0b471ed2..a3b7ac9d3c34 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py @@ -474,9 +474,9 @@ def loop_labels_torch( torch.logical_and( torch.logical_and( labels != self._blank_index, - batched_hyps.last_timestep_lasts >= self.max_symbols, + batched_hyps.last_timestamp_lasts >= self.max_symbols, ), - batched_hyps.last_timestep == time_indices, + batched_hyps.last_timestamp == time_indices, ), ) time_indices += force_blank_mask # emit blank => advance time indices @@ -878,9 +878,9 @@ def _after_inner_loop(self): torch.logical_and( torch.logical_and( self.state.labels != self._blank_index, - self.state.batched_hyps.last_timestep_lasts >= self.max_symbols, + self.state.batched_hyps.last_timestamp_lasts >= self.max_symbols, ), - self.state.batched_hyps.last_timestep == self.state.time_indices, + self.state.batched_hyps.last_timestamp == self.state.time_indices, ), ) self.state.time_indices.add_(force_blank_mask) # emit blank => advance time indices diff --git a/nemo/collections/asr/parts/submodules/tdt_beam_decoding.py b/nemo/collections/asr/parts/submodules/tdt_beam_decoding.py index 908fc1c13d19..7aeb3417b8b2 100644 --- a/nemo/collections/asr/parts/submodules/tdt_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/tdt_beam_decoding.py @@ -349,7 +349,7 @@ def default_beam_search( # Initialize hypothesis array with blank hypothesis. 
start_hyp = Hypothesis( - score=0.0, y_sequence=[self.blank], dec_state=decoder_state, timestep=[-1], length=0, last_frame=0 + score=0.0, y_sequence=[self.blank], dec_state=decoder_state, timestamp=[-1], length=0, last_frame=0 ) kept_hyps = [start_hyp] @@ -394,7 +394,7 @@ def default_beam_search( score=float(max_hyp.score + total_logp_topk), # update score y_sequence=max_hyp.y_sequence + [token_idx], # update hypothesis sequence dec_state=decoder_state, # update decoder state - timestep=max_hyp.timestep + [time_idx + duration], # update timesteps + timestamp=max_hyp.timestamp + [time_idx + duration], # update timesteps length=encoded_lengths, last_frame=max_hyp.last_frame + duration, ) # update frame idx where last token appeared @@ -421,7 +421,7 @@ def default_beam_search( score=float(max_hyp.score + logp[self.blank] + durations_logp[duration_idx]), # update score y_sequence=max_hyp.y_sequence[:], # no need to update sequence dec_state=max_hyp.dec_state, # no need to update decoder state - timestep=max_hyp.timestep[:], # no need to update timesteps + timestamp=max_hyp.timestamp[:], # no need to update timesteps length=encoded_lengths, last_frame=max_hyp.last_frame + duration, ) # update frame idx where last token appeared @@ -482,7 +482,7 @@ def modified_adaptive_expansion_search( y_sequence=[self.blank], score=0.0, dec_state=self.decoder.batch_select_state(beam_state, 0), - timestep=[-1], + timestamp=[-1], length=0, last_frame=0, ) @@ -501,7 +501,7 @@ def modified_adaptive_expansion_search( score=0.0, dec_state=state, dec_out=[beam_decoder_output[0]], - timestep=[-1], + timestamp=[-1], length=0, last_frame=0, ) @@ -580,7 +580,7 @@ def modified_adaptive_expansion_search( y_sequence=hyp.y_sequence[:], dec_out=hyp.dec_out[:], dec_state=hyp.dec_state, - timestep=hyp.timestep[:], + timestamp=hyp.timestamp[:], length=time_idx, last_frame=hyp.last_frame + duration, ) @@ -593,7 +593,7 @@ def modified_adaptive_expansion_search( list_b.append(new_hyp) else: new_hyp.y_sequence.append(k) - new_hyp.timestep.append(time_idx + duration) + new_hyp.timestamp.append(time_idx + duration) if self.ngram_lm: lm_score, new_hyp.ngram_lm_state = self.compute_ngram_score(hyp.ngram_lm_state, int(k)) diff --git a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py index c0fbe5361761..a830bc304691 100644 --- a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py @@ -541,9 +541,9 @@ def loop_labels_torch( torch.logical_and( torch.logical_and( labels != self._blank_index, - batched_hyps.last_timestep_lasts >= self.max_symbols, + batched_hyps.last_timestamp_lasts >= self.max_symbols, ), - batched_hyps.last_timestep == time_indices, + batched_hyps.last_timestamp == time_indices, ), ) time_indices += force_blank_mask # emit blank => advance time indices @@ -996,9 +996,9 @@ def _after_inner_loop(self): torch.logical_and( torch.logical_and( self.state.labels != self._blank_index, - self.state.batched_hyps.last_timestep_lasts >= self.max_symbols, + self.state.batched_hyps.last_timestamp_lasts >= self.max_symbols, ), - self.state.batched_hyps.last_timestep == self.state.time_indices, + self.state.batched_hyps.last_timestamp == self.state.time_indices, ), ) self.state.time_indices.add_(force_blank_mask) # emit blank => advance time indices diff --git a/nemo/collections/asr/parts/utils/rnnt_utils.py b/nemo/collections/asr/parts/utils/rnnt_utils.py index 
8d2755fcc0ae..3e3146ad3901 100644 --- a/nemo/collections/asr/parts/utils/rnnt_utils.py +++ b/nemo/collections/asr/parts/utils/rnnt_utils.py @@ -47,7 +47,7 @@ class Hypothesis: `blank` tokens, and optionally merging word-pieces). Should be used as decoded string for Word Error Rate calculation. - timestep: (Optional) A list of integer indices representing at which index in the decoding + timestamp: (Optional) A list of integer indices representing at which index in the decoding process did the token appear. Should be of same length as the number of non-blank tokens. alignments: (Optional) Represents the CTC / RNNT token alignments as integer tokens along an axis of @@ -94,7 +94,7 @@ class Hypothesis: text: Optional[str] = None dec_out: Optional[List[torch.Tensor]] = None dec_state: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor]]] = None - timestep: Union[List[int], torch.Tensor] = field(default_factory=list) + timestamp: Union[List[int], torch.Tensor] = field(default_factory=list) alignments: Optional[Union[List[int], List[List[int]]]] = None frame_confidence: Optional[Union[List[float], List[List[float]]]] = None token_confidence: Optional[List[float]] = None @@ -111,19 +111,19 @@ class Hypothesis: @property def non_blank_frame_confidence(self) -> List[float]: - """Get per-frame confidence for non-blank tokens according to self.timestep + """Get per-frame confidence for non-blank tokens according to self.timestamp Returns: - List with confidence scores. The length of the list is the same as `timestep`. + List with confidence scores. The length of the list is the same as `timestamp`. """ non_blank_frame_confidence = [] - # self.timestep can be a dict for RNNT - timestep = self.timestep['timestep'] if isinstance(self.timestep, dict) else self.timestep - if len(timestep) != 0 and self.frame_confidence is not None: + # self.timestamp can be a dict for RNNT + timestamp = self.timestamp['timestep'] if isinstance(self.timestamp, dict) else self.timestamp + if len(timestamp) != 0 and self.frame_confidence is not None: if any(isinstance(i, list) for i in self.frame_confidence): # rnnt t_prev = -1 offset = 0 - for t in timestep: + for t in timestamp: if t != t_prev: t_prev = t offset = 0 @@ -131,7 +131,7 @@ def non_blank_frame_confidence(self) -> List[float]: offset += 1 non_blank_frame_confidence.append(self.frame_confidence[t][offset]) else: # ctc - non_blank_frame_confidence = [self.frame_confidence[t] for t in timestep] + non_blank_frame_confidence = [self.frame_confidence[t] for t in timestamp] return non_blank_frame_confidence @property @@ -258,22 +258,22 @@ def __init__( raise ValueError(f"batch_size must be > 0, got {batch_size}") self._max_length = init_length - # batch of current lengths of hypotheses and correspoinding timesteps + # batch of current lengths of hypotheses and correspoinding timestamps self.current_lengths = torch.zeros(batch_size, device=device, dtype=torch.long) # tensor for storing transcripts self.transcript = torch.zeros((batch_size, self._max_length), device=device, dtype=torch.long) - # tensor for storing timesteps corresponding to transcripts - self.timesteps = torch.zeros((batch_size, self._max_length), device=device, dtype=torch.long) + # tensor for storing timestamps corresponding to transcripts + self.timestamps = torch.zeros((batch_size, self._max_length), device=device, dtype=torch.long) # tensor for storing durations corresponding to transcripts tokens self.token_durations = torch.zeros((batch_size, self._max_length), device=device, 
dtype=torch.long) # accumulated scores for hypotheses self.scores = torch.zeros(batch_size, device=device, dtype=float_dtype) - # tracking last timestep of each hyp to avoid infinite looping (when max symbols per frame is restricted) - # last observed timestep (with label) for each hypothesis - self.last_timestep = torch.full((batch_size,), -1, device=device, dtype=torch.long) - # number of labels for the last timestep - self.last_timestep_lasts = torch.zeros(batch_size, device=device, dtype=torch.long) + # tracking last timestamp of each hyp to avoid infinite looping (when max symbols per frame is restricted) + # last observed timestamp (with label) for each hypothesis + self.last_timestamp = torch.full((batch_size,), -1, device=device, dtype=torch.long) + # number of labels for the last timestamp + self.last_timestamp_lasts = torch.zeros(batch_size, device=device, dtype=torch.long) self._batch_indices = torch.arange(batch_size, device=device) self._ones_batch = torch.ones_like(self._batch_indices) @@ -283,11 +283,11 @@ def clear_(self): """ self.current_lengths.fill_(0) self.transcript.fill_(0) - self.timesteps.fill_(0) + self.timestamps.fill_(0) self.token_durations.fill_(0) self.scores.fill_(0.0) - self.last_timestep.fill_(-1) - self.last_timestep_lasts.fill_(0) + self.last_timestamp.fill_(-1) + self.last_timestamp_lasts.fill_(0) def _allocate_more(self): """ @@ -295,7 +295,7 @@ def _allocate_more(self): to maintain O(1) insertion time complexity """ self.transcript = torch.cat((self.transcript, torch.zeros_like(self.transcript)), dim=-1) - self.timesteps = torch.cat((self.timesteps, torch.zeros_like(self.timesteps)), dim=-1) + self.timestamps = torch.cat((self.timestamps, torch.zeros_like(self.timestamps)), dim=-1) self.token_durations = torch.cat((self.token_durations, torch.zeros_like(self.token_durations)), dim=-1) self._max_length *= 2 @@ -353,17 +353,17 @@ def add_results_no_checks_( # accumulate scores self.scores[active_indices] += scores - # store transcript and timesteps + # store transcript and timestamps active_lengths = self.current_lengths[active_indices] self.transcript[active_indices, active_lengths] = labels - self.timesteps[active_indices, active_lengths] = time_indices + self.timestamps[active_indices, active_lengths] = time_indices if token_durations is not None: self.token_durations[active_indices, active_lengths] = token_durations - # store last observed timestep + number of observation for the current timestep - self.last_timestep_lasts[active_indices] = torch.where( - self.last_timestep[active_indices] == time_indices, self.last_timestep_lasts[active_indices] + 1, 1 + # store last observed timestamp + number of observation for the current timestamp + self.last_timestamp_lasts[active_indices] = torch.where( + self.last_timestamp[active_indices] == time_indices, self.last_timestamp_lasts[active_indices] + 1, 1 ) - self.last_timestep[active_indices] = time_indices + self.last_timestamp[active_indices] = time_indices # increase lengths self.current_lengths[active_indices] += 1 @@ -417,27 +417,27 @@ def add_results_masked_no_checks_( # same as self.scores[active_mask] += scores[active_mask], but non-blocking torch.where(active_mask, self.scores + scores, self.scores, out=self.scores) - # store transcript and timesteps + # store transcript and timestamps self.transcript[self._batch_indices, self.current_lengths] = labels - self.timesteps[self._batch_indices, self.current_lengths] = time_indices + self.timestamps[self._batch_indices, self.current_lengths] = 
time_indices if token_durations is not None: self.token_durations[self._batch_indices, self.current_lengths] = token_durations - # store last observed timestep + number of observation for the current timestep - # if last_timestep == time_indices, increase; else set to 1 + # store last observed timestamp + number of observation for the current timestamp + # if last_timestamp == time_indices, increase; else set to 1 torch.where( - torch.logical_and(active_mask, self.last_timestep == time_indices), - self.last_timestep_lasts + 1, - self.last_timestep_lasts, - out=self.last_timestep_lasts, + torch.logical_and(active_mask, self.last_timestamp == time_indices), + self.last_timestamp_lasts + 1, + self.last_timestamp_lasts, + out=self.last_timestamp_lasts, ) torch.where( - torch.logical_and(active_mask, self.last_timestep != time_indices), + torch.logical_and(active_mask, self.last_timestamp != time_indices), self._ones_batch, - self.last_timestep_lasts, - out=self.last_timestep_lasts, + self.last_timestamp_lasts, + out=self.last_timestamp_lasts, ) - # same as: self.last_timestep[active_mask] = time_indices[active_mask], but non-blocking - torch.where(active_mask, time_indices, self.last_timestep, out=self.last_timestep) + # same as: self.last_timestamp[active_mask] = time_indices[active_mask], but non-blocking + torch.where(active_mask, time_indices, self.last_timestamp, out=self.last_timestamp) # increase lengths self.current_lengths += active_mask @@ -479,8 +479,8 @@ def __init__( self.with_alignments = store_alignments self._max_length = init_length - # tensor to store observed timesteps (for alignments / confidence scores) - self.timesteps = torch.zeros((batch_size, self._max_length), device=device, dtype=torch.long) + # tensor to store observed timestamps (for alignments / confidence scores) + self.timestamps = torch.zeros((batch_size, self._max_length), device=device, dtype=torch.long) # current lengths of the utterances (alignments) self.current_lengths = torch.zeros(batch_size, device=device, dtype=torch.long) @@ -508,7 +508,7 @@ def clear_(self): Clears batched hypotheses state. 
""" self.current_lengths.fill_(0) - self.timesteps.fill_(0) + self.timestamps.fill_(0) self.logits.fill_(0.0) self.labels.fill_(0) self.frame_confidence.fill_(0) @@ -518,7 +518,7 @@ def _allocate_more(self): Allocate 2x space for tensors, similar to common C++ std::vector implementations to maintain O(1) insertion time complexity """ - self.timesteps = torch.cat((self.timesteps, torch.zeros_like(self.timesteps)), dim=-1) + self.timestamps = torch.cat((self.timestamps, torch.zeros_like(self.timestamps)), dim=-1) if self.with_alignments: self.logits = torch.cat((self.logits, torch.zeros_like(self.logits)), dim=1) self.labels = torch.cat((self.labels, torch.zeros_like(self.labels)), dim=-1) @@ -553,8 +553,8 @@ def add_results_( self._allocate_more() active_lengths = self.current_lengths[active_indices] - # store timesteps - same for alignments / confidence - self.timesteps[active_indices, active_lengths] = time_indices + # store timestamps - same for alignments / confidence + self.timestamps[active_indices, active_lengths] = time_indices if self.with_alignments and logits is not None and labels is not None: self.logits[active_indices, active_lengths] = logits @@ -609,11 +609,11 @@ def add_results_masked_no_checks_( labels: tensor with decoded labels (can contain blank) confidence: optional tensor with confidence for each item in batch """ - # store timesteps - same for alignments / confidence - self.timesteps[self._batch_indices, self.current_lengths] = time_indices + # store timestamps - same for alignments / confidence + self.timestamps[self._batch_indices, self.current_lengths] = time_indices if self.with_alignments and logits is not None and labels is not None: - self.timesteps[self._batch_indices, self.current_lengths] = time_indices + self.timestamps[self._batch_indices, self.current_lengths] = time_indices self.logits[self._batch_indices, self.current_lengths] = logits self.labels[self._batch_indices, self.current_lengths] = labels @@ -645,7 +645,7 @@ def batched_hyps_to_hypotheses( Hypothesis( score=batched_hyps.scores[i].item(), y_sequence=batched_hyps.transcript[i, : batched_hyps.current_lengths[i]], - timestep=batched_hyps.timesteps[i, : batched_hyps.current_lengths[i]], + timestamp=batched_hyps.timestamps[i, : batched_hyps.current_lengths[i]], token_duration=( durations if not torch.all( @@ -673,17 +673,20 @@ def batched_hyps_to_hypotheses( if alignments.with_frame_confidence: hypotheses[i].frame_confidence = [] _, grouped_counts = torch.unique_consecutive( - alignments.timesteps[i, : alignment_lengths[i]], return_counts=True + alignments.timestamps[i, : alignment_lengths[i]], return_counts=True ) start = 0 - for timestep_cnt in grouped_counts.tolist(): + for timestamp_cnt in grouped_counts.tolist(): if alignments.with_alignments: hypotheses[i].alignments.append( - [(alignment_logits[i, start + j], alignment_labels[i, start + j]) for j in range(timestep_cnt)] + [ + (alignment_logits[i, start + j], alignment_labels[i, start + j]) + for j in range(timestamp_cnt) + ] ) if alignments.with_frame_confidence: hypotheses[i].frame_confidence.append( - [frame_confidence[i, start + j] for j in range(timestep_cnt)] + [frame_confidence[i, start + j] for j in range(timestamp_cnt)] ) - start += timestep_cnt + start += timestamp_cnt return hypotheses diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index cb272e3d0462..6497b4594184 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ 
b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -107,7 +107,7 @@ def longest_common_subsequence_merge(X, Y, filepath=None): # value initially in each cell m = len(X) n = len(Y) - LCSuff = [[0 for k in range(n + 1)] for l in range(m + 1)] + LCSuff = [[0 for _ in range(n + 1)] for _ in range(m + 1)] # To store the length of # longest common substring @@ -1672,7 +1672,7 @@ def transcribe( """ self.infer_logits(keep_logits) - hypothesis = " ".join(self.all_preds) + hypothesis = " ".join([h.text for h in self.all_preds]) if not keep_logits: return hypothesis diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 539d820e0814..f3a258e341fd 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -60,11 +60,11 @@ def get_buffered_pred_feat_rnnt( filepaths = [] with open(manifest, "r", encoding='utf_8') as mfst_f: print("Parsing manifest files...") - for l in mfst_f: - l = l.strip() - if not l: + for L in mfst_f: + L = L.strip() + if not L: continue - row = json.loads(l) + row = json.loads(L) audio_file = get_full_path(audio_file=row['audio_filepath'], manifest_file=manifest) filepaths.append(audio_file) if 'text' in row: @@ -145,19 +145,19 @@ def get_buffered_pred_feat( raise ValueError("Either filepaths or manifest shoud not be None") if filepaths: - for l in tqdm(filepaths, desc="Sample:"): + for L in tqdm(filepaths, desc="Sample:"): asr.reset() - asr.read_audio_file(l, delay, model_stride_in_secs) + asr.read_audio_file(L, delay, model_stride_in_secs) hyp = asr.transcribe(tokens_per_chunk, delay) hyps.append(hyp) else: with open(manifest, "r", encoding='utf_8') as mfst_f: - for l in tqdm(mfst_f, desc="Sample:"): + for L in tqdm(mfst_f, desc="Sample:"): asr.reset() - l = l.strip() - if not l: + L = L.strip() + if not L: continue - row = json.loads(l) + row = json.loads(L) if 'text' in row: refs.append(row['text']) audio_file = get_full_path(audio_file=row['audio_filepath'], manifest_file=manifest) @@ -452,7 +452,7 @@ def write_transcription( item = {'audio_filepath': filepaths[idx], pred_text_attr_name: transcription.text} if timestamps: - timestamps = transcription.timestep + timestamps = transcription.timestamp if timestamps is not None and isinstance(timestamps, dict): timestamps.pop( 'timestep', None @@ -480,7 +480,7 @@ def write_transcription( item[pred_text_attr_name] = best_hyps[idx].text if timestamps: - timestamps = best_hyps[idx].timestep + timestamps = best_hyps[idx].timestamp if timestamps is not None and isinstance(timestamps, dict): timestamps.pop( 'timestep', None @@ -631,19 +631,19 @@ def process_timestamp(timestamp, subsampling_factor, window_stride): return timestamp for idx, hyp in enumerate(outputs): - if not hasattr(hyp, 'timestep'): + if not hasattr(hyp, 'timestamp'): raise ValueError( - f"Expected Hypothesis object to have 'timestep' attribute, when compute_timestamps is \ + f"Expected Hypothesis object to have 'timestamp' attribute, when compute_timestamps is \ enabled but got {hyp}" ) - timestep = hyp.timestep - if 'word' in timestep: - outputs[idx].timestep['word'] = process_timestamp(timestep['word'], subsampling_factor, window_stride) - if 'char' in timestep: - outputs[idx].timestep['char'] = process_timestamp(timestep['char'], subsampling_factor, window_stride) - if 'segment' in timestep: - outputs[idx].timestep['segment'] = process_timestamp( - timestep['segment'], subsampling_factor, window_stride + timestamp = 
hyp.timestamp + if 'word' in timestamp: + outputs[idx].timestamp['word'] = process_timestamp(timestamp['word'], subsampling_factor, window_stride) + if 'char' in timestamp: + outputs[idx].timestamp['char'] = process_timestamp(timestamp['char'], subsampling_factor, window_stride) + if 'segment' in timestamp: + outputs[idx].timestamp['segment'] = process_timestamp( + timestamp['segment'], subsampling_factor, window_stride ) return outputs diff --git a/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py b/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py index 53ae4a2dfb65..54443f36a24d 100644 --- a/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py +++ b/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py @@ -143,10 +143,13 @@ def transcribe( return_hypotheses: (bool) Either return hypotheses or text With hypotheses can do some postprocessing like getting timestamp or rescoring num_workers: (int) number of workers for DataLoader - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. + channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels + from multi-channel audio. If set to `'average'`, it performs averaging across channels. + Disabled if set to `None`. Defaults to `None`. augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. Returns: - A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2video_files + A list of transcriptions (or raw log probabilities if logprobs is True) + in the same order as paths2video_files """ if paths2video_files is None or len(paths2video_files) == 0: return {} @@ -162,7 +165,6 @@ def transcribe( # We will store transcriptions here hypotheses = [] - all_hypotheses = [] # Model's mode and device mode = self.training @@ -206,7 +208,7 @@ def transcribe( lg = logits[idx][: logits_len[idx]] hypotheses.append(lg.cpu().numpy()) else: - current_hypotheses, all_hyp = self.decoding.ctc_decoder_predictions_tensor( + current_hypotheses = self.decoding.ctc_decoder_predictions_tensor( logits, decoder_lengths=logits_len, return_hypotheses=return_hypotheses, @@ -219,10 +221,7 @@ def transcribe( if current_hypotheses[idx].alignments is None: current_hypotheses[idx].alignments = current_hypotheses[idx].y_sequence - if all_hyp is None: - hypotheses += current_hypotheses - else: - hypotheses += all_hyp + hypotheses += current_hypotheses del greedy_predictions del logits @@ -240,9 +239,12 @@ def transcribe( def change_vocabulary(self, new_vocabulary: List[str], decoding_cfg: Optional[DictConfig] = None): """ - Changes vocabulary used during CTC decoding process. Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need + Changes vocabulary used during CTC decoding process. Use this method when + fine-tuning on from pre-trained model. + This method changes only decoder and leaves encoder and pre-processing + modules unchanged. 
For example, you would + use it if you want to use pretrained encoder when fine-tuning on a data in another language, + or when you'd need model to learn capitalization, punctuation and/or special characters. If new_vocabulary == self.decoder.vocabulary then nothing will be changed. diff --git a/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py b/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py index 158bfaddcc96..eb4e4fa52271 100644 --- a/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py +++ b/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py @@ -16,7 +16,7 @@ import json import os import tempfile -from typing import List, Optional +from typing import List, Optional, Union import torch from lightning.pytorch import Trainer @@ -28,6 +28,7 @@ from nemo.collections.asr.parts.mixins import ASRBPEMixin, InterCTCMixin from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.multimodal.speech_cv.models.visual_rnnt_models import VisualEncDecRNNTModel from nemo.core.classes.common import PretrainedModelInfo from nemo.core.classes.mixins import AccessMixin @@ -100,7 +101,7 @@ def transcribe( partial_hypothesis: Optional[List['Hypothesis']] = None, num_workers: int = 0, channel_selector: Optional[ChannelSelectorType] = None, - ) -> (List[str], Optional[List['Hypothesis']]): + ) -> Union[List['Hypothesis'], Optional[List['Hypothesis']]]: """ Uses greedy decoding to transcribe video files. Use this method for debugging and prototyping. @@ -112,12 +113,12 @@ def transcribe( return_hypotheses: (bool) Either return hypotheses or text With hypotheses can do some postprocessing like getting timestamp or rescoring num_workers: (int) number of workers for DataLoader - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. + channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels + from multi-channel audio. If set to `'average'`, it performs averaging across channels. + Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. Returns: - Returns a tuple of 2 items - - * A list of greedy transcript texts / Hypothesis - * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. 
+ Returns a list of greedy transcript Hypothesis or list of all Hypothesis """ if self.use_rnnt_decoder: return super().transcribe( @@ -133,7 +134,6 @@ def transcribe( return {} # We will store transcriptions here hypotheses = [] - all_hypotheses = [] # Model's mode and device mode = self.training device = next(self.parameters()).device @@ -177,7 +177,7 @@ def transcribe( ) logits = self.ctc_decoder(encoder_output=encoded) - best_hyp, all_hyp = self.ctc_decoding.ctc_decoder_predictions_tensor( + best_hyp = self.ctc_decoding.ctc_decoder_predictions_tensor( logits, encoded_len, return_hypotheses=return_hypotheses, @@ -191,10 +191,6 @@ def transcribe( del logits hypotheses += best_hyp - if all_hyp is not None: - all_hypotheses += all_hyp - else: - all_hypotheses += best_hyp del encoded del test_batch @@ -210,7 +206,7 @@ def transcribe( self.joint.unfreeze() if hasattr(self, 'ctc_decoder'): self.ctc_decoder.unfreeze() - return hypotheses, all_hypotheses + return hypotheses def change_vocabulary( self, @@ -220,9 +216,9 @@ def change_vocabulary( ): """ Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning a pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. + This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you + would use it if you want to use pretrained encoder when fine-tuning on data in another language, or when + you'd need model to learn capitalization, punctuation and/or special characters. Args: new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically, \ @@ -473,7 +469,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) del signal - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + best_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False ) diff --git a/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py b/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py index 75202238d2d0..abe9660a7f4e 100644 --- a/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py +++ b/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py @@ -32,6 +32,7 @@ from nemo.collections.asr.parts.mixins import ASRModuleMixin from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecoding, RNNTDecodingConfig +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.multimodal.speech_cv.data import video_to_text_dataset from nemo.core.classes import Exportable from nemo.core.classes.common import PretrainedModelInfo, typecheck @@ -236,18 +237,18 @@ def transcribe( return_hypotheses: (bool) Either return hypotheses or text With hypotheses can do some postprocessing like getting timestamp or rescoring num_workers: (int) number of workers for DataLoader - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. 
Defaults to `None`. Uses zero-based indexing. + channel_selector (int | Iterable[int] | str): select a single channel or a subset + of channels from multi-channel audio. If set to `'average'`, + it performs averaging across channels. + Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. Returns: - Returns a tuple of 2 items - - * A list of greedy transcript texts / Hypothesis - * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. + Returns a list of greedy transcript Hypothesis or list of all Hypothesis """ if paths2video_files is None or len(paths2video_files) == 0: return {} # We will store transcriptions here hypotheses = [] - all_hypotheses = [] # Model's mode and device mode = self.training device = next(self.parameters()).device @@ -289,7 +290,7 @@ def transcribe( encoded, encoded_len = self.forward( input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) ) - best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor( + best_hyp = self.decoding.rnnt_decoder_predictions_tensor( encoded, encoded_len, return_hypotheses=return_hypotheses, @@ -297,10 +298,6 @@ def transcribe( ) hypotheses += best_hyp - if all_hyp is not None: - all_hypotheses += all_hyp - else: - all_hypotheses += best_hyp del encoded del test_batch @@ -314,14 +311,14 @@ def transcribe( self.encoder.unfreeze() self.decoder.unfreeze() self.joint.unfreeze() - return hypotheses, all_hypotheses + return hypotheses def change_vocabulary(self, new_vocabulary: List[str], decoding_cfg: Optional[DictConfig] = None): """ Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning a pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. + This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, + you would use it if you want to use pretrained encoder when fine-tuning on data in another language, + or when you'd need model to learn capitalization, punctuation and/or special characters. Args: new_vocabulary: list with new vocabulary. Must contain at least 2 elements. 
Typically, \ @@ -731,7 +728,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) del signal - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + best_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False ) diff --git a/nemo/collections/tts/g2p/models/ctc.py b/nemo/collections/tts/g2p/models/ctc.py index 1859b09594ff..3248774de571 100644 --- a/nemo/collections/tts/g2p/models/ctc.py +++ b/nemo/collections/tts/g2p/models/ctc.py @@ -27,7 +27,7 @@ from nemo.collections.tts.models.base import G2PModel from nemo.core.classes.common import PretrainedModelInfo from nemo.core.classes.exportable import Exportable -from nemo.core.neural_types import LengthsType, NeuralType, TokenIndex +from nemo.core.neural_types import LengthsType, LossType, NeuralType, TokenIndex from nemo.utils import logging try: @@ -38,7 +38,7 @@ from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig ASR_AVAILABLE = True -except (ModuleNotFoundError, ImportError) as e: +except (ModuleNotFoundError, ImportError): ASR_AVAILABLE = False @@ -356,9 +356,10 @@ def _infer( input_len=input_len.to(device), ) - preds_str, _ = self.decoding.ctc_decoder_predictions_tensor( + preds_hyps = self.decoding.ctc_decoder_predictions_tensor( log_probs, decoder_lengths=encoded_len, return_hypotheses=False ) + preds_str = [hyp.text for hyp in preds_hyps] all_preds.extend(preds_str) del greedy_predictions @@ -396,7 +397,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, name: str): def setup_training_data(self, cfg: DictConfig): if not cfg or cfg.manifest_filepath is None: logging.info( - f"Dataloader config or file_path for the train is missing, so no data loader for train is created!" + "Dataloader config or file_path for the train is missing, so no data loader for train is created!" ) self._train_dl = None return @@ -417,7 +418,7 @@ def setup_multiple_test_data(self, test_data_config: Union[DictConfig, Dict] = N def setup_validation_data(self, cfg: Optional[DictConfig]): if not cfg or cfg.manifest_filepath is None: logging.info( - f"Dataloader config or file_path for the validation is missing, so no data loader for validation is created!" + "Dataloader config or file_path for the validation is missing, so no data loader for validation is created!" ) self._validation_dl = None return @@ -426,7 +427,7 @@ def setup_validation_data(self, cfg: Optional[DictConfig]): def setup_test_data(self, cfg: Optional[DictConfig]): if not cfg or cfg.manifest_filepath is None: logging.info( - f"Dataloader config or file_path for the test is missing, so no data loader for test is created!" + "Dataloader config or file_path for the test is missing, so no data loader for test is created!" 
) self._test_dl = None return diff --git a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py index 3bb4fa4f4846..9735180d2659 100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py @@ -193,7 +193,7 @@ def beam_search_eval( probs_batch[prob_index], device=packed_batch.device, dtype=packed_batch.dtype ) - _, beams_batch = decoding.ctc_decoder_predictions_tensor( + beams_batch = decoding.ctc_decoder_predictions_tensor( packed_batch, decoder_lengths=probs_lens, return_hypotheses=True, @@ -335,9 +335,9 @@ def main(cfg: EvalBeamSearchNGramConfig): preds = np.argmax(probs, axis=1) preds_tensor = torch.tensor(preds, device='cpu').unsqueeze(0) if isinstance(asr_model, EncDecHybridRNNTCTCModel): - pred_text = asr_model.ctc_decoding.ctc_decoder_predictions_tensor(preds_tensor)[0][0] + pred_text = asr_model.ctc_decoding.ctc_decoder_predictions_tensor(preds_tensor)[0] else: - pred_text = asr_model._wer.decoding.ctc_decoder_predictions_tensor(preds_tensor)[0][0] + pred_text = asr_model._wer.decoding.ctc_decoder_predictions_tensor(preds_tensor)[0] if cfg.text_processing.do_lowercase: pred_text = punctuation_capitalization.do_lowercase([pred_text])[0] diff --git a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py index c61a402c0942..57bf9db6f3bd 100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py @@ -176,11 +176,12 @@ def decoding_step( packed_batch[prob_index, :, : probs_lens[prob_index]] = torch.tensor( probs_batch[prob_index].unsqueeze(0), device=packed_batch.device, dtype=packed_batch.dtype ) - best_hyp_batch, beams_batch = model.decoding.rnnt_decoder_predictions_tensor( + beams_batch = model.decoding.rnnt_decoder_predictions_tensor( packed_batch, probs_lens, return_hypotheses=True, ) + best_hyp_batch = [dec_hyp[0] for dec_hyp in beams_batch] if cfg.decoding_strategy == "greedy_batch": beams_batch = [[x] for x in best_hyp_batch] diff --git a/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py b/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py index 63ab24b0921e..0dd61b2d9afd 100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py @@ -325,9 +325,9 @@ def main(cfg: EvalWFSTNGramConfig): preds_tensor = preds.to(device='cpu').unsqueeze(0) preds_lens = torch.tensor([preds_tensor.shape[1]], device='cpu') if isinstance(asr_model, EncDecHybridRNNTCTCModel): - pred_text = asr_model.ctc_decoding.ctc_decoder_predictions_tensor(preds_tensor, preds_lens)[0][0] + pred_text = asr_model.ctc_decoding.ctc_decoder_predictions_tensor(preds_tensor, preds_lens)[0] else: - pred_text = asr_model.decoding.ctc_decoder_predictions_tensor(preds_tensor, preds_lens)[0][0] + pred_text = asr_model.decoding.ctc_decoder_predictions_tensor(preds_tensor, preds_lens)[0] if cfg.text_processing.do_lowercase: pred_text = punctuation_capitalization.do_lowercase([pred_text])[0] diff --git a/tests/collections/asr/decoding/test_batched_hyps_and_alignments.py b/tests/collections/asr/decoding/test_batched_hyps_and_alignments.py index 60fd88144230..c2c0d5969ae2 100644 --- 
a/tests/collections/asr/decoding/test_batched_hyps_and_alignments.py +++ b/tests/collections/asr/decoding/test_batched_hyps_and_alignments.py @@ -46,10 +46,10 @@ class TestBatchedHyps: @pytest.mark.parametrize("device", DEVICES) def test_instantiate(self, device: torch.device): hyps = BatchedHyps(batch_size=2, init_length=3, device=device) - assert torch.is_tensor(hyps.timesteps) + assert torch.is_tensor(hyps.timestamps) # device: for mps device we need to use `type`, not directly compare - assert hyps.timesteps.device.type == device.type - assert hyps.timesteps.shape == (2, 3) + assert hyps.timestamps.device.type == device.type + assert hyps.timestamps.shape == (2, 3) @pytest.mark.unit @pytest.mark.parametrize("batch_size", [-1, 0]) @@ -76,10 +76,10 @@ def test_add_results(self, device: torch.device): ) assert hyps.current_lengths.tolist() == [1, 0] assert hyps.transcript.tolist()[0][:1] == [5] - assert hyps.timesteps.tolist()[0][:1] == [1] + assert hyps.timestamps.tolist()[0][:1] == [1] assert hyps.scores.tolist() == pytest.approx([0.5, 0.0]) - assert hyps.last_timestep.tolist() == [1, -1] - assert hyps.last_timestep_lasts.tolist() == [1, 0] + assert hyps.last_timestamp.tolist() == [1, -1] + assert hyps.last_timestamp_lasts.tolist() == [1, 0] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -101,11 +101,11 @@ def test_add_multiple_results(self, device: torch.device): assert hyps.current_lengths.tolist() == [2, 1] assert hyps.transcript.tolist()[0][:2] == [5, 2] assert hyps.transcript.tolist()[1][:1] == [4] - assert hyps.timesteps.tolist()[0][:2] == [1, 1] - assert hyps.timesteps.tolist()[1][:1] == [2] + assert hyps.timestamps.tolist()[0][:2] == [1, 1] + assert hyps.timestamps.tolist()[1][:1] == [2] assert hyps.scores.tolist() == pytest.approx([1.5, 1.0]) - assert hyps.last_timestep.tolist() == [1, 2] - assert hyps.last_timestep_lasts.tolist() == [2, 1] + assert hyps.last_timestamp.tolist() == [1, 2] + assert hyps.last_timestamp_lasts.tolist() == [2, 1] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -117,14 +117,17 @@ def test_add_results_masked(self, device: torch.device): scores = torch.tensor([0.5, 10.0], device=device) labels = torch.tensor([5, 1], device=device) hyps.add_results_masked_( - active_mask=active_mask, labels=labels, time_indices=time_indices, scores=scores, + active_mask=active_mask, + labels=labels, + time_indices=time_indices, + scores=scores, ) assert hyps.current_lengths.tolist() == [1, 0] assert hyps.transcript.tolist()[0][:1] == [5] - assert hyps.timesteps.tolist()[0][:1] == [1] + assert hyps.timestamps.tolist()[0][:1] == [1] assert hyps.scores.tolist() == pytest.approx([0.5, 0.0]) # last score should be ignored! 
- assert hyps.last_timestep.tolist() == [1, -1] - assert hyps.last_timestep_lasts.tolist() == [1, 0] + assert hyps.last_timestamp.tolist() == [1, -1] + assert hyps.last_timestamp_lasts.tolist() == [1, 0] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -138,14 +141,17 @@ def test_add_results_masked_no_checks(self, device: torch.device): # check there are no blocking operations with avoid_sync_operations(device=device): hyps.add_results_masked_no_checks_( - active_mask=active_mask, labels=labels, time_indices=time_indices, scores=scores, + active_mask=active_mask, + labels=labels, + time_indices=time_indices, + scores=scores, ) assert hyps.current_lengths.tolist() == [1, 0] assert hyps.transcript.tolist()[0][:1] == [5] - assert hyps.timesteps.tolist()[0][:1] == [1] + assert hyps.timestamps.tolist()[0][:1] == [1] assert hyps.scores.tolist() == pytest.approx([0.5, 0.0]) # last score should be ignored! - assert hyps.last_timestep.tolist() == [1, -1] - assert hyps.last_timestep_lasts.tolist() == [1, 0] + assert hyps.last_timestamp.tolist() == [1, -1] + assert hyps.last_timestamp_lasts.tolist() == [1, 0] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -167,11 +173,11 @@ def test_add_multiple_results_masked(self, device: torch.device): assert hyps.current_lengths.tolist() == [2, 1] assert hyps.transcript.tolist()[0][:2] == [5, 2] assert hyps.transcript.tolist()[1][:1] == [4] - assert hyps.timesteps.tolist()[0][:2] == [1, 1] - assert hyps.timesteps.tolist()[1][:1] == [2] + assert hyps.timestamps.tolist()[0][:2] == [1, 1] + assert hyps.timestamps.tolist()[1][:1] == [2] assert hyps.scores.tolist() == pytest.approx([1.5, 1.0]) - assert hyps.last_timestep.tolist() == [1, 2] - assert hyps.last_timestep_lasts.tolist() == [2, 1] + assert hyps.last_timestamp.tolist() == [1, 2] + assert hyps.last_timestamp_lasts.tolist() == [2, 1] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -252,7 +258,7 @@ def test_add_results(self, device: torch.device): ) assert alignments.current_lengths.tolist() == [1, 1] assert torch.allclose(alignments.logits[:, 0], sample_logits[:, 0]) - assert alignments.timesteps[:, 0].tolist() == [0, 0] + assert alignments.timestamps[:, 0].tolist() == [0, 0] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -294,7 +300,7 @@ def test_add_results_masked(self, device: torch.device): ) assert alignments.current_lengths.tolist() == [1, 1] assert torch.allclose(alignments.logits[:, 0], sample_logits[:, 0]) - assert alignments.timesteps[:, 0].tolist() == [0, 0] + assert alignments.timestamps[:, 0].tolist() == [0, 0] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -313,7 +319,7 @@ def test_add_results_masked_no_checks(self, device: torch.device): ) assert alignments.current_lengths.tolist() == [1, 1] assert torch.allclose(alignments.logits[:, 0], sample_logits[:, 0]) - assert alignments.timesteps[:, 0].tolist() == [0, 0] + assert alignments.timestamps[:, 0].tolist() == [0, 0] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -382,8 +388,8 @@ def test_convert_to_hypotheses(self, device: torch.device): assert (hypotheses[1].y_sequence == torch.tensor([4], device=device)).all() assert hypotheses[0].score == pytest.approx(1.5) assert hypotheses[1].score == pytest.approx(1.0) - assert (hypotheses[0].timestep == torch.tensor([1, 1], device=device)).all() - assert (hypotheses[1].timestep == torch.tensor([2], device=device)).all() + assert (hypotheses[0].timestamp == torch.tensor([1, 1], device=device)).all() + 
assert (hypotheses[1].timestamp == torch.tensor([2], device=device)).all() @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -442,8 +448,8 @@ def test_convert_to_hypotheses_with_alignments(self, device: torch.device): assert (hypotheses[1].y_sequence == torch.tensor([4], device=device)).all() assert hypotheses[0].score == pytest.approx(1.5) assert hypotheses[1].score == pytest.approx(1.0) - assert (hypotheses[0].timestep == torch.tensor([0, 1], device=device)).all() - assert (hypotheses[1].timestep == torch.tensor([1], device=device)).all() + assert (hypotheses[0].timestamp == torch.tensor([0, 1], device=device)).all() + assert (hypotheses[1].timestamp == torch.tensor([1], device=device)).all() etalon = [ [ @@ -462,7 +468,7 @@ def test_convert_to_hypotheses_with_alignments(self, device: torch.device): ], ] for batch_i in range(batch_size): - for t, group_for_timestep in enumerate(etalon[batch_i]): - for step, (label, current_logits) in enumerate(group_for_timestep): + for t, group_for_timestamp in enumerate(etalon[batch_i]): + for step, (label, current_logits) in enumerate(group_for_timestamp): assert torch.allclose(hypotheses[batch_i].alignments[t][step][0], current_logits) assert hypotheses[batch_i].alignments[t][step][1] == label diff --git a/tests/collections/asr/decoding/test_ctc_decoding.py b/tests/collections/asr/decoding/test_ctc_decoding.py index 7a16db4324bc..e122dd5a3fdd 100644 --- a/tests/collections/asr/decoding/test_ctc_decoding.py +++ b/tests/collections/asr/decoding/test_ctc_decoding.py @@ -49,16 +49,16 @@ def register_artifact(self, _, vocab_path): def check_char_timestamps(hyp: Hypothesis, decoding: CTCDecoding): - assert hyp.timestep is not None - assert isinstance(hyp.timestep, dict) - assert 'timestep' in hyp.timestep - assert 'char' in hyp.timestep - assert 'word' in hyp.timestep - assert 'segment' in hyp.timestep + assert hyp.timestamp is not None + assert isinstance(hyp.timestamp, dict) + assert 'timestep' in hyp.timestamp + assert 'char' in hyp.timestamp + assert 'word' in hyp.timestamp + assert 'segment' in hyp.timestamp words = hyp.text.split(decoding.word_seperator) words = list(filter(lambda x: x != '', words)) - assert len(hyp.timestep['word']) == len(words) + assert len(hyp.timestamp['word']) == len(words) segments = [] segment = [] @@ -72,20 +72,20 @@ def check_char_timestamps(hyp: Hypothesis, decoding: CTCDecoding): if segment: segments.append(' '.join(segment)) - assert len(hyp.timestep['segment']) == len(segments) + assert len(hyp.timestamp['segment']) == len(segments) def check_subword_timestamps(hyp: Hypothesis, decoding: CTCBPEDecoding): - assert hyp.timestep is not None - assert isinstance(hyp.timestep, dict) - assert 'timestep' in hyp.timestep - assert 'char' in hyp.timestep - assert 'word' in hyp.timestep - assert 'segment' in hyp.timestep + assert hyp.timestamp is not None + assert isinstance(hyp.timestamp, dict) + assert 'timestep' in hyp.timestamp + assert 'char' in hyp.timestamp + assert 'word' in hyp.timestamp + assert 'segment' in hyp.timestamp chars = list(hyp.text) chars = list(filter(lambda x: x not in ['', ' ', '#'], chars)) - all_chars = [list(decoding.tokenizer.tokens_to_text(data['char'])) for data in hyp.timestep['char']] + all_chars = [list(decoding.tokenizer.tokens_to_text(data['char'])) for data in hyp.timestamp['char']] all_chars = [char for subword in all_chars for char in subword] all_chars = list(filter(lambda x: x not in ['', ' ', '#'], all_chars)) assert len(chars) == len(all_chars) @@ -94,7 +94,7 @@ def 
check_subword_timestamps(hyp: Hypothesis, decoding: CTCBPEDecoding): if not hyp.text or hyp.text[-1] not in decoding.segment_seperators: segments_count += 1 - assert len(hyp.timestep['segment']) == segments_count + assert len(hyp.timestamp['segment']) == segments_count class TestCTCDecoding: @@ -125,9 +125,10 @@ def test_char_decoding_greedy_forward( length = torch.randint(low=1, high=T, size=[B]) with torch.no_grad(): - texts, _ = decoding.ctc_decoder_predictions_tensor( + hypotheses = decoding.ctc_decoder_predictions_tensor( input_signal, length, fold_consecutive=True, return_hypotheses=False ) + texts = [hyp.text for hyp in hypotheses] for text in texts: assert isinstance(text, str) @@ -146,7 +147,7 @@ def test_char_decoding_greedy_forward_hypotheses(self, alignments, timestamps): length = torch.randint(low=1, high=T, size=[B]) with torch.no_grad(): - hyps, _ = decoding.ctc_decoder_predictions_tensor( + hyps = decoding.ctc_decoder_predictions_tensor( input_signal, length, fold_consecutive=True, return_hypotheses=True ) @@ -177,9 +178,10 @@ def test_subword_decoding_greedy_forward(self, tmp_tokenizer): length = torch.randint(low=1, high=T, size=[B]) with torch.no_grad(): - texts, _ = decoding.ctc_decoder_predictions_tensor( + hypotheses = decoding.ctc_decoder_predictions_tensor( input_signal, length, fold_consecutive=True, return_hypotheses=False ) + texts = [hyp.text for hyp in hypotheses] for text in texts: assert isinstance(text, str) @@ -197,7 +199,7 @@ def test_subword_decoding_greedy_forward_hypotheses(self, tmp_tokenizer, alignme length = torch.randint(low=1, high=T, size=[B]) with torch.no_grad(): - hyps, _ = decoding.ctc_decoder_predictions_tensor( + hyps = decoding.ctc_decoder_predictions_tensor( input_signal, length, fold_consecutive=True, return_hypotheses=True ) @@ -283,11 +285,11 @@ def test_batched_decoding_logprobs( length = torch.randint(low=1, high=T, size=[B], device=length_device) with torch.inference_mode(): - hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( + hyps = unbatched_decoding.ctc_decoder_predictions_tensor( input_signal, length, fold_consecutive=True, return_hypotheses=True ) - batched_hyps, _ = batched_decoding.ctc_decoder_predictions_tensor( + batched_hyps = batched_decoding.ctc_decoder_predictions_tensor( input_signal, length, fold_consecutive=True, return_hypotheses=True ) @@ -296,7 +298,7 @@ def test_batched_decoding_logprobs( assert torch.abs(hyp.score - batched_hyp.score) <= 1e-5 assert torch.all(hyp.y_sequence == batched_hyp.y_sequence) if timestamps: - assert hyp.timestep == batched_hyp.timestep + assert hyp.timestamp == batched_hyp.timestamp if alignments: assert torch.all(hyp.alignments[0] == batched_hyp.alignments[0]) assert torch.all(hyp.alignments[1] == batched_hyp.alignments[1]) @@ -350,11 +352,11 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none length = torch.randint(low=1, high=T, size=[B], device=length_device) with torch.inference_mode(): - hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( + hyps = unbatched_decoding.ctc_decoder_predictions_tensor( input_labels, length, fold_consecutive=True, return_hypotheses=True ) - batched_hyps, _ = batched_decoding.ctc_decoder_predictions_tensor( + batched_hyps = batched_decoding.ctc_decoder_predictions_tensor( input_labels, length, fold_consecutive=True, return_hypotheses=True ) @@ -363,4 +365,4 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none assert abs(hyp.score - batched_hyp.score) <= 1e-5 assert 
torch.all(hyp.y_sequence == batched_hyp.y_sequence) if timestamps: - assert hyp.timestep == batched_hyp.timestep + assert hyp.timestamp == batched_hyp.timestamp diff --git a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py index 4715f4826493..cb2ebc9d1202 100644 --- a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py +++ b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py @@ -80,14 +80,18 @@ def test_cuda_graph_rnnt_greedy_decoder(model_name, batch_size, enable_bfloat16, audio_filepaths = glob.glob("tests/.data/asr/test/an4/wav/*.wav") with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=enable_bfloat16): - actual_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + actual_hypotheses = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + + actual_transcripts = [hyp.text for hyp in actual_hypotheses] decoding_config["greedy"]["use_cuda_graph_decoder"] = True nemo_model.change_decoding_strategy(decoding_config) with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=enable_bfloat16): - fast_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + fast_hypotheses = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + + fast_transcripts = [hyp.text for hyp in fast_hypotheses] wer = jiwer.wer(actual_transcripts, fast_transcripts) @@ -136,7 +140,8 @@ def test_loop_labels_cuda_graph_rnnt_greedy_decoder_forced_mode( audio_filepaths = glob.glob("tests/.data/asr/test/an4/wav/*.wav") with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=enable_bfloat16): - actual_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + actual_hypotheses = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + actual_transcripts = [hyp.text for hyp in actual_hypotheses] # transcribe with use implementation with cuda graphs decoding_config["greedy"]["use_cuda_graph_decoder"] = True @@ -144,7 +149,8 @@ def test_loop_labels_cuda_graph_rnnt_greedy_decoder_forced_mode( nemo_model.decoding.decoding._decoding_computer.force_cuda_graphs_mode(mode=force_mode) with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=enable_bfloat16): - fast_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + fast_hypotheses = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + fast_transcripts = [hyp.text for hyp in fast_hypotheses] wer = jiwer.wer(actual_transcripts, fast_transcripts) @@ -185,7 +191,8 @@ def test_change_devices(loop_labels: bool, stt_en_fastconformer_transducer_xlarg nemo_model.to(first_device) audio_filepaths = glob.glob("tests/.data/asr/test/an4/wav/*.wav") with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=True): - second_device_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + second_device_hypotheses = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + second_device_transcripts = [hyp.text for hyp in second_device_hypotheses] # Test that the model can run successfully back on second_device # after having been first run on first_device. 
Because the @@ -195,7 +202,8 @@ def test_change_devices(loop_labels: bool, stt_en_fastconformer_transducer_xlarg nemo_model.to(second_device) with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=True): - first_device_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + first_device_hypotheses = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + first_device_transcripts = [hyp.text for hyp in first_device_hypotheses] # Sanity check: The device we run on should not change execution # output. assert first_device_transcripts == second_device_transcripts diff --git a/tests/collections/asr/decoding/test_rnnt_alignments.py b/tests/collections/asr/decoding/test_rnnt_alignments.py index 5c43af28b1d4..d94e834aba05 100644 --- a/tests/collections/asr/decoding/test_rnnt_alignments.py +++ b/tests/collections/asr/decoding/test_rnnt_alignments.py @@ -83,7 +83,7 @@ def get_rnnt_alignments( num_workers=cfg.num_workers, return_hypotheses=True, channel_selector=cfg.channel_selector, - )[0] + ) for transcription in transcriptions: for align_elem, frame_confidence in zip(transcription.alignments, transcription.frame_confidence): diff --git a/tests/collections/asr/decoding/test_rnnt_decoding.py b/tests/collections/asr/decoding/test_rnnt_decoding.py index b5250ad5f144..9da09cf0c4fa 100644 --- a/tests/collections/asr/decoding/test_rnnt_decoding.py +++ b/tests/collections/asr/decoding/test_rnnt_decoding.py @@ -119,16 +119,16 @@ def decode_text_from_nbest_hypotheses(hyps, decoding): def check_char_timestamps(hyp: rnnt_utils.Hypothesis, decoding: RNNTDecoding): - assert hyp.timestep is not None - assert isinstance(hyp.timestep, dict) - assert 'timestep' in hyp.timestep - assert 'char' in hyp.timestep - assert 'word' in hyp.timestep - assert 'segment' in hyp.timestep + assert hyp.timestamp is not None + assert isinstance(hyp.timestamp, dict) + assert 'timestep' in hyp.timestamp + assert 'char' in hyp.timestamp + assert 'word' in hyp.timestamp + assert 'segment' in hyp.timestamp words = hyp.text.split(decoding.word_seperator) words = list(filter(lambda x: x != '', words)) - assert len(hyp.timestep['word']) == len(words) + assert len(hyp.timestamp['word']) == len(words) segments = [] segment = [] @@ -142,20 +142,20 @@ def check_char_timestamps(hyp: rnnt_utils.Hypothesis, decoding: RNNTDecoding): if segment: segments.append(' '.join(segment)) - assert len(hyp.timestep['segment']) == len(segments) + assert len(hyp.timestamp['segment']) == len(segments) def check_subword_timestamps(hyp: rnnt_utils.Hypothesis, decoding: RNNTBPEDecoding): - assert hyp.timestep is not None - assert isinstance(hyp.timestep, dict) - assert 'timestep' in hyp.timestep - assert 'char' in hyp.timestep - assert 'word' in hyp.timestep - assert 'segment' in hyp.timestep + assert hyp.timestamp is not None + assert isinstance(hyp.timestamp, dict) + assert 'timestep' in hyp.timestamp + assert 'char' in hyp.timestamp + assert 'word' in hyp.timestamp + assert 'segment' in hyp.timestamp chars = list(hyp.text) chars = list(filter(lambda x: x not in ['', ' ', '#'], chars)) - all_chars = [list(decoding.tokenizer.tokens_to_text(data['char'])) for data in hyp.timestep['char']] + all_chars = [list(decoding.tokenizer.tokens_to_text(data['char'])) for data in hyp.timestamp['char']] all_chars = [char for subword in all_chars for char in subword] all_chars = list(filter(lambda x: x not in ['', ' ', '#'], all_chars)) assert len(chars) == len(all_chars) @@ -164,7 +164,7 @@ def 
check_subword_timestamps(hyp: rnnt_utils.Hypothesis, decoding: RNNTBPEDecodi if not hyp.text or hyp.text[-1] not in decoding.segment_seperators: segments_count += 1 - assert len(hyp.timestep['segment']) == segments_count + assert len(hyp.timestamp['segment']) == segments_count def check_beam_decoding(test_data_dir, beam_config): @@ -195,8 +195,8 @@ def check_beam_decoding(test_data_dir, beam_config): for idx, hyp_ in enumerate(all_hyps): print("Hyp index", idx + 1, "text :", hyp_.text) - assert len(hyp_.timestep) > 0 - print("Timesteps", hyp_.timestep) + assert len(hyp_.timestamp) > 0 + print("Timesteps", hyp_.timestamp) print() @@ -258,7 +258,7 @@ def test_greedy_decoding_preserve_alignments(self, test_data_dir): t_u.append(int(label)) - print(f"Tokens at timestep {t} = {t_u}") + print(f"Tokens at timestamp {t} = {t_u}") print() @pytest.mark.skipif( @@ -396,15 +396,15 @@ def test_rnnt_beam_decoding_preserve_alignments(self, test_data_dir, beam_config if len(t_u) > 1: assert t_u[-1] == blank_id - # No blank token should be present in the current timestep other than at the end + # No blank token should be present in the current timestamp other than at the end for token in t_u[:-1]: assert token != blank_id - print(f"Tokens at timestep {t} = {t_u}") + print(f"Tokens at timestamp {t} = {t_u}") print() - assert len(hyp_.timestep) > 0 - print("Timesteps", hyp_.timestep) + assert len(hyp_.timestamp) > 0 + print("Timesteps", hyp_.timestamp) print() @pytest.mark.skipif( @@ -438,9 +438,11 @@ def test_subword_decoding_compute_timestamps(self, test_data_dir, decoding_strat decoding_cfg=cfg, decoder=model.decoder, joint=model.joint, tokenizer=model.tokenizer ) - hyps, _ = decoding.rnnt_decoder_predictions_tensor(encoded, encoded_len, return_hypotheses=True) - - check_subword_timestamps(hyps[0], decoding) + hyps = decoding.rnnt_decoder_predictions_tensor(encoded, encoded_len, return_hypotheses=True) + if isinstance(hyps[0], list): + check_subword_timestamps(hyps[0][0], decoding) + else: + check_subword_timestamps(hyps[0], decoding) @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, @@ -473,9 +475,12 @@ def test_char_decoding_compute_timestamps(self, test_data_dir, decoding_strategy decoding = RNNTDecoding(decoding_cfg=cfg, decoder=model.decoder, joint=model.joint, vocabulary=vocab) - hyps, _ = decoding.rnnt_decoder_predictions_tensor(encoded, encoded_len, return_hypotheses=True) + hyps = decoding.rnnt_decoder_predictions_tensor(encoded, encoded_len, return_hypotheses=True) - check_char_timestamps(hyps[0], decoding) + if isinstance(hyps[0], list): + check_char_timestamps(hyps[0][0], decoding) + else: + check_char_timestamps(hyps[0], decoding) @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, diff --git a/tests/collections/asr/mixins/test_transcription.py b/tests/collections/asr/mixins/test_transcription.py index 6e2d5fe16c68..ad6613dd1d10 100644 --- a/tests/collections/asr/mixins/test_transcription.py +++ b/tests/collections/asr/mixins/test_transcription.py @@ -334,7 +334,7 @@ def test_transcribe_tensor(self, audio_files, fast_conformer_ctc_model): # Numpy array test outputs = fast_conformer_ctc_model.transcribe(audio, batch_size=1) assert len(outputs) == 1 - assert isinstance(outputs[0], str) + assert isinstance(outputs[0], Hypothesis) @pytest.mark.with_downloads() @pytest.mark.unit @@ -347,8 +347,8 @@ def test_transcribe_multiple_tensor(self, audio_files, fast_conformer_ctc_model) # Numpy array test outputs = fast_conformer_ctc_model.transcribe([audio, audio_2], batch_size=2) assert len(outputs) == 
2 - assert isinstance(outputs[0], str) - assert isinstance(outputs[1], str) + assert isinstance(outputs[0], Hypothesis) + assert isinstance(outputs[1], Hypothesis) @pytest.mark.with_downloads() @pytest.mark.unit @@ -363,8 +363,8 @@ def test_transcribe_dataloader(self, audio_files, fast_conformer_ctc_model): # DataLoader test outputs = fast_conformer_ctc_model.transcribe(dataloader, batch_size=1) assert len(outputs) == 2 - assert isinstance(outputs[0], str) - assert isinstance(outputs[1], str) + assert isinstance(outputs[0], Hypothesis) + assert isinstance(outputs[1], Hypothesis) @pytest.mark.with_downloads() @pytest.mark.unit @@ -383,8 +383,8 @@ def test_timestamps_with_transcribe(self, audio_files, fast_conformer_ctc_model) assert output[1].text == 'start' # check timestamp - assert output[0].timestep['segment'][0]['start'] == pytest.approx(0.4) - assert output[0].timestep['segment'][0]['end'] == pytest.approx(0.48) + assert output[0].timestamp['segment'][0]['start'] == pytest.approx(0.4) + assert output[0].timestamp['segment'][0]['end'] == pytest.approx(0.48) @pytest.mark.with_downloads() @pytest.mark.unit @@ -396,8 +396,6 @@ def test_timestamps_with_transcribe_hybrid(self, audio_files, fast_conformer_hyb # check len of output assert len(output) == 2 - output = output[1] # Transducer returns tuple - # check hypothesis object assert isinstance(output[0], Hypothesis) # check transcript @@ -405,5 +403,5 @@ def test_timestamps_with_transcribe_hybrid(self, audio_files, fast_conformer_hyb assert output[1].text == 'Start.' # check timestamp - assert output[0].timestep['segment'][0]['start'] == pytest.approx(0.48) - assert output[0].timestep['segment'][0]['end'] == pytest.approx(0.72) + assert output[0].timestamp['segment'][0]['start'] == pytest.approx(0.48) + assert output[0].timestamp['segment'][0]['end'] == pytest.approx(0.72) diff --git a/tests/collections/asr/test_asr_classification_model.py b/tests/collections/asr/test_asr_classification_model.py index 3888cb30204c..f41c36219142 100644 --- a/tests/collections/asr/test_asr_classification_model.py +++ b/tests/collections/asr/test_asr_classification_model.py @@ -52,7 +52,10 @@ def speech_classification_model(): decoder = { 'cls': 'nemo.collections.asr.modules.ConvASRDecoderClassification', - 'params': {'feat_in': 32, 'num_classes': 30,}, + 'params': { + 'feat_in': 32, + 'num_classes': 30, + }, } modelConfig = DictConfig( @@ -95,7 +98,10 @@ def frame_classification_model(): decoder = { 'cls': 'nemo.collections.common.parts.MultiLayerPerceptron', - 'params': {'hidden_size': 32, 'num_classes': 5,}, + 'params': { + 'hidden_size': 32, + 'num_classes': 5, + }, } modelConfig = DictConfig( diff --git a/tests/collections/asr/test_asr_context_biasing.py b/tests/collections/asr/test_asr_context_biasing.py index b23b12655a8d..78261b65c912 100644 --- a/tests/collections/asr/test_asr_context_biasing.py +++ b/tests/collections/asr/test_asr_context_biasing.py @@ -118,7 +118,7 @@ def test_merge_alignment_with_ws_hyps(self, conformer_ctc_bpe_model): preds = rnnt_utils.Hypothesis( y_sequence=torch.tensor([120, 29]), score=0.0, - timestep=torch.tensor([0, 1, 2, 3]), + timestamp=torch.tensor([0, 1, 2, 3]), ) pred_text, raw_text = context_biasing.merge_alignment_with_ws_hyps( preds, @@ -134,7 +134,7 @@ def test_merge_alignment_with_ws_hyps(self, conformer_ctc_bpe_model): preds = rnnt_utils.Hypothesis( y_sequence=[], score=0.0, - timestep=[], + timestamp=[], ) pred_text, raw_text = context_biasing.merge_alignment_with_ws_hyps( preds, diff --git 
a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py index 02442291a918..eac5041de2b3 100644 --- a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py +++ b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py @@ -29,6 +29,7 @@ from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.parts.submodules import ctc_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common import tokenizers from nemo.utils.config_utils import assert_dataclass_signature_match @@ -131,7 +132,7 @@ def test_predict_step(self, asr_model): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1], Hypothesis) @pytest.mark.with_downloads() @pytest.mark.unit diff --git a/tests/collections/asr/test_asr_ctcencdec_model.py b/tests/collections/asr/test_asr_ctcencdec_model.py index 55451758578f..ae131abd3d48 100644 --- a/tests/collections/asr/test_asr_ctcencdec_model.py +++ b/tests/collections/asr/test_asr_ctcencdec_model.py @@ -24,6 +24,7 @@ from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models import EncDecCTCModel, configs from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common.parts.preprocessing.parsers import make_parser from nemo.utils.config_utils import assert_dataclass_signature_match, update_model_config @@ -146,7 +147,7 @@ def test_predict_step(self, asr_model): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1], Hypothesis) @pytest.mark.unit def test_vocab_change(self, asr_model): diff --git a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py index d13c879e47f9..c75de6064e51 100644 --- a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py +++ b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py @@ -27,6 +27,7 @@ from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules import rnnt_greedy_decoding as greedy_decode from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common import tokenizers from nemo.core.utils import numba_utils from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ @@ -179,7 +180,7 @@ def test_predict_step(self, hybrid_asr_model): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1], Hypothesis) @pytest.mark.with_downloads() @pytest.mark.skipif( diff --git a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py index b5c34e197237..456d7450eeba 100644 --- a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py +++ b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py @@ -179,7 +179,7 @@ def 
test_predict_step(self, hybrid_asr_model): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1], rnnt_utils.Hypothesis) @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, @@ -563,10 +563,10 @@ def test_greedy_decoding_preserve_alignment(self, greedy_class, loop_labels: Opt assert torch.is_tensor(logp) assert torch.is_tensor(label) - @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, - reason='RNNTLoss has not been compiled with appropriate numba version.', - ) + # @pytest.mark.skipif( + # not NUMBA_RNNT_LOSS_AVAILABLE, + # reason='RNNTLoss has not been compiled with appropriate numba version.', + # ) @pytest.mark.unit @pytest.mark.parametrize( "beam_config", diff --git a/tests/collections/asr/test_asr_metrics.py b/tests/collections/asr/test_asr_metrics.py index efa11b254517..a87622d60a07 100644 --- a/tests/collections/asr/test_asr_metrics.py +++ b/tests/collections/asr/test_asr_metrics.py @@ -219,7 +219,7 @@ def test_wer_metric_return_hypothesis(self, batch_dim_index, test_wer_bpe): # pass batchsize 1 tensor, get back list of length 1 Hypothesis wer.decoding.preserve_alignments = True - hyp, _ = wer.decoding.ctc_decoder_predictions_tensor(tensor, return_hypotheses=True) + hyp = wer.decoding.ctc_decoder_predictions_tensor(tensor, return_hypotheses=True) hyp = hyp[0] assert isinstance(hyp, Hypothesis) @@ -233,7 +233,7 @@ def test_wer_metric_return_hypothesis(self, batch_dim_index, test_wer_bpe): length = torch.tensor([tensor.shape[1 - batch_dim_index]], dtype=torch.long) # pass batchsize 1 tensor, get back list of length 1 Hypothesis [add length info] - hyp, _ = wer.decoding.ctc_decoder_predictions_tensor(tensor, decoder_lengths=length, return_hypotheses=True) + hyp = wer.decoding.ctc_decoder_predictions_tensor(tensor, decoder_lengths=length, return_hypotheses=True) hyp = hyp[0] assert isinstance(hyp, Hypothesis) assert hyp.length == 3 @@ -251,7 +251,7 @@ def test_wer_metric_subword_return_hypothesis(self, batch_dim_index, test_wer_bp # pass batchsize 1 tensor, get back list of length 1 Hypothesis wer.decoding.preserve_alignments = True - hyp, _ = wer.decoding.ctc_decoder_predictions_tensor(tensor, return_hypotheses=True) + hyp = wer.decoding.ctc_decoder_predictions_tensor(tensor, return_hypotheses=True) hyp = hyp[0] assert isinstance(hyp, Hypothesis) @@ -265,13 +265,15 @@ def test_wer_metric_subword_return_hypothesis(self, batch_dim_index, test_wer_bp length = torch.tensor([tensor.shape[1 - batch_dim_index]], dtype=torch.long) # pass batchsize 1 tensor, get back list of length 1 Hypothesis [add length info] - hyp, _ = wer.decoding.ctc_decoder_predictions_tensor(tensor, decoder_lengths=length, return_hypotheses=True) + hyp = wer.decoding.ctc_decoder_predictions_tensor(tensor, decoder_lengths=length, return_hypotheses=True) hyp = hyp[0] assert isinstance(hyp, Hypothesis) assert hyp.length == 3 def get_wer_ctc(self, prediction: str, reference: str, test_wer_bpe: bool): - ctc_decoder_predictions_tensor_mock = Mock(return_value=([prediction], None)) + ctc_decoder_predictions_tensor_mock = Mock( + return_value=[Hypothesis(score=1.0, y_sequence=[], text=prediction)] + ) if test_wer_bpe: decoding = Mock( blank_id=self.char_tokenizer.tokenizer.vocab_size, @@ -307,7 +309,9 @@ def decode_token_to_str_with_vocabulary_mock(self, ids): return ''.join([self.vocabulary[id_] for id_ in ids]) def get_wer_rnnt(self, prediction: str, reference: str, batch_dim_index: int, test_wer_bpe: bool): 
- rnnt_decoder_predictions_tensor_mock = Mock(return_value=([prediction], None)) + rnnt_decoder_predictions_tensor_mock = Mock( + return_value=[Hypothesis(score=1.0, y_sequence=[], text=prediction)] + ) if test_wer_bpe: decoding = Mock( blank_id=self.char_tokenizer.tokenizer.vocab_size, @@ -385,24 +389,24 @@ def test_char_decoding_logprobs(self): decoding_cfg = CTCDecodingConfig() decoding = CTCDecoding(decoding_cfg, vocabulary=self.vocabulary) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 0 + assert len(hyp.timestamp) == 0 assert hyp.alignments is None # Preserve timestamps and alignments decoding_cfg = CTCDecodingConfig(preserve_alignments=True, compute_timestamps=True) decoding = CTCDecoding(decoding_cfg, vocabulary=self.vocabulary) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 4 + assert len(hyp.timestamp) == 4 assert hyp.alignments is not None @pytest.mark.unit @@ -416,24 +420,24 @@ def test_subword_decoding_logprobs(self): decoding_cfg = CTCBPEDecodingConfig() decoding = CTCBPEDecoding(decoding_cfg, tokenizer=self.char_tokenizer) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 0 + assert len(hyp.timestamp) == 0 assert hyp.alignments is None # Preserve timestamps and alignments decoding_cfg = CTCBPEDecodingConfig(preserve_alignments=True, compute_timestamps=True) decoding = CTCBPEDecoding(decoding_cfg, tokenizer=self.char_tokenizer) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 4 + assert len(hyp.timestamp) == 4 assert hyp.alignments is not None @pytest.mark.unit @@ -447,12 +451,12 @@ def test_char_decoding_labels(self): decoding_cfg = CTCDecodingConfig() decoding = CTCDecoding(decoding_cfg, vocabulary=self.vocabulary) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 0 + assert len(hyp.timestamp) == 0 assert hyp.alignments is None # Preserve timestamps and alignments 
@@ -461,18 +465,18 @@ def test_char_decoding_labels(self): # Cannot compute alignments from labels with pytest.raises(ValueError): - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) # Preserve timestamps decoding_cfg = CTCDecodingConfig(preserve_alignments=False, compute_timestamps=True) decoding = CTCDecoding(decoding_cfg, vocabulary=self.vocabulary) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 4 + assert len(hyp.timestamp) == 4 assert hyp.alignments is None @pytest.mark.unit @@ -486,24 +490,24 @@ def test_subword_decoding_logprobs(self): decoding_cfg = CTCBPEDecodingConfig() decoding = CTCBPEDecoding(decoding_cfg, tokenizer=self.char_tokenizer) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 0 + assert len(hyp.timestamp) == 0 assert hyp.alignments is None # Preserve timestamps and alignments decoding_cfg = CTCBPEDecodingConfig(preserve_alignments=True, compute_timestamps=True) decoding = CTCBPEDecoding(decoding_cfg, tokenizer=self.char_tokenizer) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 4 + assert len(hyp.timestamp) == 4 assert hyp.alignments is not None @pytest.mark.unit @@ -517,12 +521,12 @@ def test_subword_decoding_labels(self): decoding_cfg = CTCBPEDecodingConfig() decoding = CTCBPEDecoding(decoding_cfg, tokenizer=self.char_tokenizer) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 0 + assert len(hyp.timestamp) == 0 assert hyp.alignments is None # Preserve timestamps and alignments @@ -531,16 +535,16 @@ def test_subword_decoding_labels(self): # Cannot compute alignments from labels with pytest.raises(ValueError): - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) # Preserve timestamps decoding_cfg = CTCBPEDecodingConfig(preserve_alignments=False, compute_timestamps=True) decoding = CTCBPEDecoding(decoding_cfg, tokenizer=self.char_tokenizer) - hyp, _ = 
decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 4 + assert len(hyp.timestamp) == 4 assert hyp.alignments is None diff --git a/tests/collections/asr/test_asr_multitask_model_bpe.py b/tests/collections/asr/test_asr_multitask_model_bpe.py index 63185f687fea..5d96ec8de1fa 100644 --- a/tests/collections/asr/test_asr_multitask_model_bpe.py +++ b/tests/collections/asr/test_asr_multitask_model_bpe.py @@ -450,7 +450,7 @@ def test_transcribe_single_file(self, asr_model, test_data_dir): # Numpy array test outputs = asr_model.transcribe(audio_file, batch_size=1) assert len(outputs) == 1 - assert isinstance(outputs[0], str) + assert isinstance(outputs[0].text, str) @pytest.mark.unit def test_transcribe_single_file_translation(self, asr_model, test_data_dir): @@ -459,7 +459,7 @@ def test_transcribe_single_file_translation(self, asr_model, test_data_dir): # Numpy array test outputs = asr_model.transcribe(audio_file, batch_size=1, task="ast", source_lang='en', target_lang='de') assert len(outputs) == 1 - assert isinstance(outputs[0], str) + assert isinstance(outputs[0].text, str) @pytest.mark.unit def test_transcribe_return_hypothesis(self, asr_model, test_data_dir): @@ -486,7 +486,7 @@ def test_transcribe_tensor(self, asr_model, test_data_dir): # Numpy array test outputs = asr_model.transcribe(audio, batch_size=1) assert len(outputs) == 1 - assert isinstance(outputs[0], str) + assert isinstance(outputs[0].text, str) @pytest.mark.unit def test_build_tokenizer(self, asr_model, test_data_dir): @@ -527,7 +527,7 @@ def test_predict_step(self, asr_model, test_data_dir): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1].text, str) @pytest.mark.unit def test_FrameBatchMultiTaskAED(self, asr_model, test_data_dir): diff --git a/tests/collections/asr/test_asr_rnnt_encdec_model.py b/tests/collections/asr/test_asr_rnnt_encdec_model.py index 5e810243c919..07c6adf761ba 100644 --- a/tests/collections/asr/test_asr_rnnt_encdec_model.py +++ b/tests/collections/asr/test_asr_rnnt_encdec_model.py @@ -174,7 +174,7 @@ def joint_after_projection(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tens setup["decoder"] = DummyRNNTDecoder(vocab_size=2, blank_idx=2, blank_as_pad=True) setup["decoder_masked"] = DummyRNNTDecoder(vocab_size=2, blank_idx=2, blank_as_pad=False) setup["joint"] = DummyRNNTJoint(num_outputs=3) - # expected timesteps for max_symbols_per_step=5 are [[0, 0, 0, 0, 0, 1, 1], [1, 1, 1, 1, 1]], + # expected timestamps for max_symbols_per_step=5 are [[0, 0, 0, 0, 0, 1, 1], [1, 1, 1, 1, 1]], # so we have both looped and regular iteration on the second frame setup["encoder_output"] = torch.tensor( [[[1, 0, 0], [0, 1, 0], [0, 0, 1]], [[0, 0, 1], [2, 0, 0], [0, 0, 0]]], dtype=torch.float32 @@ -311,7 +311,7 @@ def test_predict_step(self, asr_model): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1], rnnt_utils.Hypothesis) @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, @@ -836,16 +836,16 @@ def test_greedy_decoding_preserve_alignment(self, greedy_class, 
loop_labels: Opt hyp = greedy(encoder_output=enc_out, encoded_lengths=enc_len)[0][0] # type: rnnt_utils.Hypothesis assert hyp.alignments is not None - timestep_count = { - u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestep), return_counts=True)) + timestamp_count = { + u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestamp), return_counts=True)) } for t in range(len(hyp.alignments)): - # check that the number of alignment elements is consistent with hyp.timestep + # check that the number of alignment elements is consistent with hyp.timestamp alignment_len = len(hyp.alignments[t]) assert alignment_len <= max_symbols_per_step - if t in timestep_count: # non-blank - assert alignment_len == timestep_count[t] + (1 if alignment_len < max_symbols_per_step else 0) + if t in timestamp_count: # non-blank + assert alignment_len == timestamp_count[t] + (1 if alignment_len < max_symbols_per_step else 0) else: # blank assert alignment_len == 1 @@ -908,20 +908,20 @@ def test_greedy_decoding_preserve_frame_confidence(self, greedy_class, loop_labe hyp = greedy(encoder_output=enc_out, encoded_lengths=enc_len)[0][0] # type: rnnt_utils.Hypothesis assert hyp.frame_confidence is not None - timestep_count = { - u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestep), return_counts=True)) + timestamp_count = { + u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestamp), return_counts=True)) } for t in range(len(hyp.frame_confidence)): - # check that the number of confidence elements is consistent with hyp.timestep + # check that the number of confidence elements is consistent with hyp.timestamp confidence_len = len(hyp.frame_confidence[t]) assert confidence_len <= max_symbols_per_step - if t in timestep_count: # non-blank - # if timestep_count[t] less than max_symbols_per_step, + if t in timestamp_count: # non-blank + # if timestamp_count[t] less than max_symbols_per_step, # blank emission and corresponding confidence expected - # if timestep_count[t] == max_symbols_per_step, "forced blank" is not added => no confidence - assert confidence_len == timestep_count[t] + ( - 1 if timestep_count[t] < max_symbols_per_step else 0 + # if timestamp_count[t] == max_symbols_per_step, "forced blank" is not added => no confidence + assert confidence_len == timestamp_count[t] + ( + 1 if timestamp_count[t] < max_symbols_per_step else 0 ) else: # blank assert confidence_len == 1 @@ -969,16 +969,16 @@ def test_greedy_decoding_max_symbols_alignment( hyp = greedy(encoder_output=encoder_output, encoded_lengths=encoded_lengths)[0][0] assert hyp.alignments is not None - timestep_count = { - u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestep), return_counts=True)) + timestamp_count = { + u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestamp), return_counts=True)) } for t in range(len(hyp.alignments)): - # check that the number of confidence elements is consistent with hyp.timestep + # check that the number of confidence elements is consistent with hyp.timestamp alignment_len = len(hyp.alignments[t]) assert alignment_len <= max_symbols_per_step - if t in timestep_count: # non-blank - assert alignment_len == timestep_count[t] + (1 if alignment_len < max_symbols_per_step else 0) + if t in timestamp_count: # non-blank + assert alignment_len == timestamp_count[t] + (1 if alignment_len < max_symbols_per_step else 0) else: # blank or max_symbols_per_step == 0 assert alignment_len <= 1 @@ -1056,16 +1056,16 @@ def 
test_greedy_decoding_max_symbols_confidence( hyp = greedy(encoder_output=encoder_output, encoded_lengths=encoded_lengths)[0][0] assert hyp.frame_confidence is not None - timestep_count = { - u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestep), return_counts=True)) + timestamp_count = { + u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestamp), return_counts=True)) } for t in range(len(hyp.frame_confidence)): - # check that the number of confidence elements is consistent with hyp.timestep + # check that the number of confidence elements is consistent with hyp.timestamp confidence_len = len(hyp.frame_confidence[t]) assert confidence_len <= max_symbols_per_step - if t in timestep_count: # non-blank - assert confidence_len == timestep_count[t] + ( + if t in timestamp_count: # non-blank + assert confidence_len == timestamp_count[t] + ( 1 if confidence_len < max_symbols_per_step else 0 ) else: # blank or max_symbols_per_step == 0 diff --git a/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py b/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py index aba364868e88..be86d5bffbb2 100644 --- a/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py +++ b/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py @@ -27,6 +27,7 @@ from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules import rnnt_greedy_decoding as greedy_decode +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common import tokenizers from nemo.core.utils import numba_utils from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ @@ -191,7 +192,7 @@ def test_predict_step(self, asr_model): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1], Hypothesis) @pytest.mark.with_downloads() @pytest.mark.skipif( diff --git a/tutorials/asr/ASR_Context_Biasing.ipynb b/tutorials/asr/ASR_Context_Biasing.ipynb index 7171510f4e0d..6c551e00b2bf 100644 --- a/tutorials/asr/ASR_Context_Biasing.ipynb +++ b/tutorials/asr/ASR_Context_Biasing.ipynb @@ -259,10 +259,6 @@ "execution_count": null, "id": "d34ee0ba", "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, "scrolled": true }, "outputs": [], @@ -322,7 +318,7 @@ "\n", "for idx, ref in enumerate(ref_text):\n", " ref = ref.split()\n", - " hyp = recog_results[idx].split()\n", + " hyp = recog_results[idx].text.split()\n", " texterrors_ali = texterrors.align_texts(ref, hyp, False)\n", " ali = []\n", " for i in range(len(texterrors_ali[0])):\n", @@ -898,7 +894,7 @@ " print(f\"[ref text]: {target_transcripts[idx]}\")\n", " else:\n", " # if no spotted words, use standard greedy predictions\n", - " pred_text = ctc_model.wer.decoding.ctc_decoder_predictions_tensor(greedy_predicts)[0][0]" + " pred_text = ctc_model.wer.decoding.ctc_decoder_predictions_tensor(greedy_predicts)[0].text" ] }, { diff --git a/tutorials/asr/ASR_with_NeMo.ipynb b/tutorials/asr/ASR_with_NeMo.ipynb index bb62e2f5eb9d..86625e2565c3 100644 --- a/tutorials/asr/ASR_with_NeMo.ipynb +++ b/tutorials/asr/ASR_with_NeMo.ipynb @@ -1,38 +1,12 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "ASR_with_NeMo.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { 
- "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, "cells": [ { "cell_type": "code", + "execution_count": null, "metadata": { "id": "lJz6FDU1lRzc" }, + "outputs": [], "source": [ "\"\"\"\n", "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", @@ -43,7 +17,9 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect\n", - "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", + "\n", + "\n", + "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", "\"\"\"\n", "# If you're using Google Colab and not running locally, run this cell.\n", "\n", @@ -63,9 +39,7 @@ "that you want to use the \"Run All Cells\" (or similar) option.\n", "\"\"\"\n", "# exit()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -158,9 +132,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "gAhsmi6HlRzh" }, + "outputs": [], "source": [ "import os\n", "# This is where the an4/ directory will be placed.\n", @@ -169,16 +145,16 @@ "\n", "if not os.path.exists(data_dir):\n", " os.makedirs(data_dir)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Yb4fuUvWlRzk", "scrolled": true }, + "outputs": [], "source": [ "import glob\n", "import os\n", @@ -208,9 +184,7 @@ " cmd = [\"sox\", sph_path, wav_path]\n", " subprocess.run(cmd)\n", "print(\"Finished conversion.\\n******\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -225,9 +199,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "_M_bSs3MjQlz" }, + "outputs": [], "source": [ "import librosa\n", "import IPython.display as ipd\n", @@ -237,9 +213,7 @@ "audio, sample_rate = librosa.load(example_file)\n", "\n", "ipd.Audio(example_file, rate=sample_rate)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -254,9 +228,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "MqIAKkqelRzm" }, + "outputs": [], "source": [ "%matplotlib inline\n", "import librosa.display\n", @@ -268,9 +244,7 @@ "plt.ylabel('Amplitude')\n", "\n", "_ = librosa.display.waveshow(audio, color='blue')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -292,9 +266,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "oCFneEs1lRzp" }, + "outputs": [], "source": [ "import numpy as np\n", "\n", @@ -306,9 +282,7 @@ "librosa.display.specshow(spec_db, y_axis='log', x_axis='time')\n", "plt.colorbar()\n", "plt.title('Audio Spectrogram');" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -325,9 +299,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "7yQXVn-TlRzt" }, + "outputs": [], "source": [ "# Plot the mel 
spectrogram of our sample\n", "mel_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate)\n", @@ -337,9 +313,7 @@ " mel_spec_db, x_axis='time', y_axis='mel')\n", "plt.colorbar()\n", "plt.title('Mel Spectrogram');" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -384,18 +358,18 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "4_W0lhaQlRzx" }, + "outputs": [], "source": [ "# NeMo's \"core\" package\n", "import nemo\n", "# NeMo's ASR collection - this collections contains complete ASR models and\n", "# building blocks (modules) for ASR\n", "import nemo.collections.asr as nemo_asr" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -410,15 +384,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "KFZZpYult96G" }, + "outputs": [], "source": [ "# This line will download pre-trained QuartzNet15x5 model from NVIDIA's NGC cloud and instantiate it for you\n", "quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=\"QuartzNet15x5Base-En\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -431,16 +405,16 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "3QCpR_93u1hp" }, + "outputs": [], "source": [ "files = [os.path.join(data_dir, 'an4/wav/an4_clstk/mgah/cen2-mgah-b.wav')]\n", "for fname, transcription in zip(files, quartznet.transcribe(audio=files)):\n", " print(f\"Audio in {fname} was recognized as: {transcription}\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -488,9 +462,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "lVB1sG1GlRzz" }, + "outputs": [], "source": [ "# --- Building Manifest Files --- #\n", "import json\n", @@ -537,9 +513,7 @@ " build_manifest(test_transcripts, test_manifest, 'an4/wav/an4test_clstk')\n", " print(\"Test manifest created.\")\n", "print(\"***Done***\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -575,9 +549,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "PXVKBniMlRz5" }, + "outputs": [], "source": [ "# --- Config Information ---#\n", "try:\n", @@ -596,9 +572,7 @@ "with open(config_path) as f:\n", " params = yaml.load(f)\n", "print(params)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -615,15 +589,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "GUfR6tAK0k2u" }, + "outputs": [], "source": [ "import lightning.pytorch as pl\n", "trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=50)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -637,17 +611,17 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Cbf0fsMK09lk" }, + "outputs": [], "source": [ "from omegaconf import DictConfig\n", "params['model']['train_ds']['manifest_filepath'] = train_manifest\n", "params['model']['validation_ds']['manifest_filepath'] = test_manifest\n", "first_asr_model = nemo_asr.models.EncDecCTCModel(cfg=DictConfig(params['model']), trainer=trainer)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -660,15 +634,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "inRJsnrz1psq" }, + "outputs": [], "source": [ "# Start training!!!\n", "trainer.fit(first_asr_model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", 
@@ -686,9 +660,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "n_0y3stSXDX_" }, + "outputs": [], "source": [ "try:\n", " from google import colab\n", @@ -702,9 +678,7 @@ " %tensorboard --logdir lightning_logs/\n", "else:\n", " print(\"To use tensorboard, please use this notebook in a Google Colab environment.\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -717,14 +691,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "7kdQbpohXnEd" }, + "outputs": [], "source": [ "print(params['model']['optim'])" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -737,18 +711,18 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "AbigFKUtYgvn" }, + "outputs": [], "source": [ "import copy\n", "new_opt = copy.deepcopy(params['model']['optim'])\n", "new_opt['lr'] = 0.001\n", "first_asr_model.setup_optimization(optim_config=DictConfig(new_opt))\n", "# And then you can invoke trainer.fit(first_asr_model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -765,9 +739,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "3FT0klSV268p" }, + "outputs": [], "source": [ "audio = [os.path.join(data_dir, 'an4/wav/an4_clstk/mgah/cen2-mgah-b.wav'),\n", " os.path.join(data_dir, 'an4/wav/an4_clstk/fmjd/cen7-fmjd-b.wav'),\n", @@ -775,9 +751,7 @@ " os.path.join(data_dir, 'an4/wav/an4_clstk/fkai/cen8-fkai-b.wav')]\n", "print(first_asr_model.transcribe(audio=audio,\n", " batch_size=4))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -790,9 +764,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "7mP4r1Gx_Ilt" }, + "outputs": [], "source": [ "# Bigger batch-size = bigger throughput\n", "params['model']['validation_ds']['batch_size'] = 16\n", @@ -831,9 +807,7 @@ "\n", "# We need to sum all numerators and denominators first. 
Then divide.\n", "print(f\"WER = {sum(wer_nums)/sum(wer_denoms)}\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -865,14 +839,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "9glGogaPlR0H" }, + "outputs": [], "source": [ "print(quartznet._cfg['spec_augment'])" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -900,9 +874,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "hl320dsydWX0" }, + "outputs": [], "source": [ "# Check what kind of vocabulary/alphabet the model has right now\n", "print(quartznet.decoder.vocabulary)\n", @@ -915,9 +891,7 @@ " 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', \"'\", \"!\"\n", " ]\n", ")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -930,9 +904,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "_PZJIso-eDl-" }, + "outputs": [], "source": [ "# Use the smaller learning rate we set before\n", "quartznet.setup_optimization(optim_config=DictConfig(new_opt))\n", @@ -946,9 +922,7 @@ "# And now we can create a PyTorch Lightning trainer and call `fit` again.\n", "trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=2)\n", "trainer.fit(quartznet)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -993,9 +967,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "I4WRcmakjQnj" }, + "outputs": [], "source": [ "!pip install --upgrade onnxruntime # for gpu, use onnxruntime-gpu\n", "#!mkdir -p ort\n", @@ -1007,9 +983,7 @@ "#!pip uninstall -y onnxruntime-gpu\n", "#!pip install --upgrade --force-reinstall ./build/Linux/Release/dist/onnxruntime*.whl\n", "#%cd .." 
- ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1022,9 +996,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "HZnyWxPyjQnm" }, + "outputs": [], "source": [ "import json\n", "import os\n", @@ -1098,12 +1074,10 @@ " logits = torch.from_numpy(alogits[0])\n", " greedy_predictions = logits.argmax(dim=-1, keepdim=False)\n", " wer = WER(decoding=quartznet.decoding, use_cer=False)\n", - " hypotheses, _ = wer.decoding.ctc_decoder_predictions_tensor(greedy_predictions)\n", + " hypotheses = wer.decoding.ctc_decoder_predictions_tensor(greedy_predictions)\n", " print(hypotheses)\n", " break\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1165,12 +1139,40 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "V3ERGX86lR0V" }, - "source": [], - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] } - ] + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "ASR_with_NeMo.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/tutorials/asr/ASR_with_Transducers.ipynb b/tutorials/asr/ASR_with_Transducers.ipynb index 95eecbfb8916..ddd0582f82a5 100644 --- a/tutorials/asr/ASR_with_Transducers.ipynb +++ b/tutorials/asr/ASR_with_Transducers.ipynb @@ -1206,7 +1206,7 @@ "outputs": [], "source": [ "# Get a batch of hypotheses, as well as a batch of all obtained hypotheses (if beam search is used)\n", - "hypotheses, all_hypotheses = rnnt_alignments(model, batch)" + "hypotheses = rnnt_alignments(model, batch)" ] }, { diff --git a/tutorials/asr/Buffered_Transducer_Inference.ipynb b/tutorials/asr/Buffered_Transducer_Inference.ipynb index c23398dca46a..f79fd4dff64e 100644 --- a/tutorials/asr/Buffered_Transducer_Inference.ipynb +++ b/tutorials/asr/Buffered_Transducer_Inference.ipynb @@ -17,7 +17,9 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect\n", - "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", + "\n", + "\n", + "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", "\"\"\"\n", "# If you're using Google Colab and not running locally, run this cell.\n", "\n", @@ -730,7 +732,7 @@ " new_prev_hypothesis.append(self.previous_hypotheses[old_pos])\n", " self.previous_hypotheses = new_prev_hypothesis\n", "\n", - " best_hyp, _ = self.asr_model.decoding.rnnt_decoder_predictions_tensor(\n", + " best_hyp = self.asr_model.decoding.rnnt_decoder_predictions_tensor(\n", " encoded, encoded_len, return_hypotheses=True, partial_hypotheses=self.previous_hypotheses\n", " )\n", "\n", @@ -925,7 +927,7 @@ "After this, we perform regular transducer decoding of the Prediction Network + Joint Network. 
Since it is being done on a subset of samples, it is much faster than padded decoding.\n", "\n", "```python\n", - "best_hyp, _ = self.asr_model.decoding.rnnt_decoder_predictions_tensor(\n", + "best_hyp = self.asr_model.decoding.rnnt_decoder_predictions_tensor(\n", " encoded, encoded_len, return_hypotheses=True, partial_hypotheses=self.previous_hypotheses\n", ")\n", "```" From d19682f750d7922ec8aaaa84c89f0797f3b00a0e Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 11 Feb 2025 20:42:55 -0800 Subject: [PATCH 07/14] fix: export weight name mapping if model is nemo model (#11497) * fix: export weight name mapping if model is nemo model Signed-off-by: Terry Kong * missing license headers Signed-off-by: Terry Kong * pytest mark unit and CPU Signed-off-by: Terry Kong --------- Signed-off-by: Terry Kong Co-authored-by: Dong Hyuk Chang --- nemo/export/tensorrt_llm.py | 7 ++- tests/export/test_tensorrt_llm.py | 63 +++++++++++++++++++ .../converter/test_model_to_trt_llm_ckpt.py | 43 +++++++++++++ 3 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 tests/export/test_tensorrt_llm.py create mode 100644 tests/export/trt_llm/converter/test_model_to_trt_llm_ckpt.py diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 74be0eba0491..6299134e833c 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -776,7 +776,8 @@ def get_input_dtype(self, storage_dtype): elif storage_dtype == torch.float16: return DataType.float16 - def get_nemo_to_trtllm_conversion_dict(self, model_state_dict): + @staticmethod + def get_nemo_to_trtllm_conversion_dict(model_state_dict): """MCore export supports some default conversion dictionaries All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models sometimes start with "model.decoder.layers.4.blahblah". so we append model prefix. to the keys """ @@ -786,8 +787,8 @@ def get_nemo_to_trtllm_conversion_dict(self, model_state_dict): nemo_model_conversion_dict = {} for key, value in DEFAULT_CONVERSION_DICT.items(): - if 'layers' in key and model_prefix: - nemo_model_conversion_dict[f'{model_prefix}.{key}'] = value + if model_prefix: + nemo_model_conversion_dict[f'{model_prefix}{key}'] = value else: nemo_model_conversion_dict[key] = value return nemo_model_conversion_dict diff --git a/tests/export/test_tensorrt_llm.py b/tests/export/test_tensorrt_llm.py new file mode 100644 index 000000000000..7361befcbaa9 --- /dev/null +++ b/tests/export/test_tensorrt_llm.py @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import pytest + + +@pytest.mark.run_only_on('GPU') +@pytest.mark.unit +def test_get_nemo_to_trtllm_conversion_dict_on_nemo_model(): + try: + from nemo.export.tensorrt_llm import TensorRTLLM + except ImportError: + pytest.skip("Could not import TRTLLM helpers. 
tensorrt_llm is likely not installed") + + dummy_state = object() + model_state_dict = { + 'model.embedding.word_embeddings.weight': dummy_state, + 'model.decoder.layers.0.self_attention.linear_proj.weight': dummy_state, + } + nemo_model_conversion_dict = TensorRTLLM.get_nemo_to_trtllm_conversion_dict(model_state_dict) + + # Check that every key starts with 'model.' and not 'model..' by using a regex + # This pattern ensures: + # - The key starts with 'model.' + # - Immediately after 'model.', there must be at least one character that is NOT a '.' + # (preventing the 'model..' scenario) + pattern = re.compile(r'^model\.[^.].*') + for key in nemo_model_conversion_dict.keys(): + assert pattern.match(key), f"Key '{key}' does not properly start with 'model.'" + + +@pytest.mark.run_only_on('GPU') +@pytest.mark.unit +def test_get_nemo_to_trtllm_conversion_dict_on_mcore_model(): + try: + from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import DEFAULT_CONVERSION_DICT + + from nemo.export.tensorrt_llm import TensorRTLLM + except ImportError: + pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed") + + dummy_state = object() + model_state_dict = { + 'embedding.word_embeddings.weight': dummy_state, + 'decoder.layers.0.self_attention.linear_proj.weight': dummy_state, + } + nemo_model_conversion_dict = TensorRTLLM.get_nemo_to_trtllm_conversion_dict(model_state_dict) + + # This is essentially a no-op + assert nemo_model_conversion_dict == DEFAULT_CONVERSION_DICT diff --git a/tests/export/trt_llm/converter/test_model_to_trt_llm_ckpt.py b/tests/export/trt_llm/converter/test_model_to_trt_llm_ckpt.py new file mode 100644 index 000000000000..2be809cc4406 --- /dev/null +++ b/tests/export/trt_llm/converter/test_model_to_trt_llm_ckpt.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + + +@pytest.mark.parametrize( + 'input_layer_names,expected_model_prefix', + [ + ( + [ + 'model.embedding.word_embeddings.weight', + 'model.decoder.layers.0.self_attention.linear_proj.weight', + 'model.decoder.layers.0.self_attention.linear_qkv.layer_norm_weight', + 'model.decoder.layers.0.self_attention.linear_qkv.weight', + 'model.decoder.layers.0.mlp.linear_fc1.layer_norm_weight', + 'model.decoder.layers.0.mlp.linear_fc1.weight', + 'model.decoder.layers.0.mlp.linear_fc2.weight', + ], + 'model.', + ) + ], +) +@pytest.mark.run_only_on('CPU') +@pytest.mark.unit +def test_get_layer_prefix_is_mcore(input_layer_names, expected_model_prefix): + try: + from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import get_layer_prefix + except ImportError: + pytest.skip("Could not import TRTLLM helpers. 
tensorrt_llm is likely not installed") + model_prefix, _ = get_layer_prefix(input_layer_names, is_mcore=True) + assert model_prefix == expected_model_prefix From 6dbcbac4250a09f3b6c840cdd1d2b9b05a0caf8e Mon Sep 17 00:00:00 2001 From: Yuanzhe Dong <5069709+yuanzhedong@users.noreply.github.com> Date: Wed, 12 Feb 2025 00:42:07 -0800 Subject: [PATCH 08/14] Add error message when downloading failed. (#12139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update vLLM to 0.7.2 (#12078) * initial commit Signed-off-by: Piotr Kaminski * vllm bump cleanup Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Flake8 Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * flake should not fail with tensorstore Signed-off-by: Piotr Kaminski * pylint also should not fail Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * local tokenizer load Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * add missing requirements Signed-off-by: Piotr Kaminski * absolute path for sentencepiece tokenizer Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * fix absolute path, add new vllm params Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * bump vllm, fix tokenizer Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * code review + docstrings Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * flake8 Signed-off-by: Piotr Kaminski * fix formatting Signed-off-by: Piotr Kaminski --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Co-authored-by: Laplasjan107 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Yuanzhe Dong * better error message Signed-off-by: Yuanzhe Dong * Apply isort and black reformatting Signed-off-by: yuanzhedong * pylint Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Signed-off-by: Yuanzhe Dong Signed-off-by: yuanzhedong Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Co-authored-by: Laplasjan107 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: yuanzhedong Co-authored-by: Alexandros Koumparoulis Co-authored-by: akoumpa --- .../modules/common/megatron/megatron_utils.py | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_utils.py b/nemo/collections/nlp/modules/common/megatron/megatron_utils.py index d610f5b61c24..7ca5154b7911 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_utils.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_utils.py @@ -55,14 +55,14 @@ }, "megatron-bert-345m-uncased": { "config": CONFIGS["345m"], - "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.0/files/release/mp_rank_00/model_optim_rng.pt", + "checkpoint": 
"https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.0/files/release/mp_rank_00/model_optim_rng.pt", # pylint: disable=line-too-long "vocab": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", "do_lower_case": True, "tokenizer_name": "bert-large-uncased", }, "megatron-bert-345m-cased": { "config": CONFIGS["345m"], - "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/files/release/mp_rank_00/model_optim_rng.pt", + "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/files/release/mp_rank_00/model_optim_rng.pt", # pylint: disable=line-too-long "vocab": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", "do_lower_case": False, "tokenizer_name": "bert-large-cased", @@ -83,14 +83,14 @@ }, "biomegatron-bert-345m-uncased": { "config": CONFIGS["345m"], - "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/biomegatron345muncased/versions/0/files/MegatronBERT.pt", + "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/biomegatron345muncased/versions/0/files/MegatronBERT.pt", # pylint: disable=line-too-long "vocab": "https://api.ngc.nvidia.com/v2/models/nvidia/biomegatron345muncased/versions/0/files/vocab.txt", "do_lower_case": True, "tokenizer_name": "bert-large-uncased", }, "biomegatron-bert-345m-cased": { "config": CONFIGS["345m"], - "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/biomegatron345mcased/versions/0/files/MegatronBERT.pt", + "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/biomegatron345mcased/versions/0/files/MegatronBERT.pt", # pylint: disable=line-too-long "vocab": "https://api.ngc.nvidia.com/v2/models/nvidia/biomegatron345mcased/versions/0/files/vocab.txt", "do_lower_case": False, "tokenizer_name": "bert-large-cased", @@ -98,11 +98,28 @@ } -def compute_model_parallel_rank(local_rank, model_parallel_size): +def compute_model_parallel_rank(local_rank: int, model_parallel_size: int) -> int: + """Calculates the model_parallel_rank from the local rank and the model parallel size + + Args: + local_rank (int): The local rank of the process. + model_parallel_size (int): The number of ranks in the model parallel group. + + Returns: + int: The model parallel rank corresponding to the given local rank. + """ return local_rank % model_parallel_size def get_megatron_pretrained_bert_models() -> List[str]: + """Retrieves the names of all available pretrained Megatron-BERT models. + + This function uses the NeMo MegatronBertModel class to list all available + pretrained model configurations, extracting each model's name. + + Returns: + List[str]: A list of pretrained Megatron-BERT model names. 
+ """ from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel all_pretrained_megatron_bert_models = [ @@ -207,6 +224,8 @@ def _download(path: str, url: str): os.makedirs(MEGATRON_CACHE, exist_ok=True) logging.info(f"Downloading from {url} to {path}") downloaded_path = wget.download(url) + if not os.path.exists(downloaded_path): + raise FileNotFoundError(f"Downloaded file not found: {downloaded_path}") shutil.move(downloaded_path, path) # wait until the master process downloads the file and writes it to the cache dir if torch.distributed.is_initialized(): @@ -230,12 +249,12 @@ def is_lower_cased_megatron(pretrained_model_name): def get_megatron_tokenizer(pretrained_model_name: str): """ - Takes a pretrained_model_name for megatron such as "megatron-bert-cased" and returns the according + Takes a pretrained_model_name for megatron such as "megatron-bert-cased" and returns the according tokenizer name for tokenizer instantiating. Args: pretrained_model_name: pretrained_model_name for megatron such as "megatron-bert-cased" - Returns: + Returns: tokenizer name for tokenizer instantiating """ _check_megatron_name(pretrained_model_name) From d2f7b8e50f47ec1c2ff3d7f25013aa89df61e4db Mon Sep 17 00:00:00 2001 From: Roman Korostik Date: Wed, 12 Feb 2025 15:56:52 +0400 Subject: [PATCH 09/14] AudioToAudioModel: fix model->dataloader sample_rate parameter injection (#12092) * AudioToAudioModel: fix model->dataloader sample_rate parameter injection Signed-off-by: Roman Korostik * AudioToAudioModel: import missing type (PretrainedModelInfo) Signed-off-by: Roman Korostik --------- Signed-off-by: Roman Korostik --- nemo/collections/audio/models/audio_to_audio.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/collections/audio/models/audio_to_audio.py b/nemo/collections/audio/models/audio_to_audio.py index 41125a81035b..28109f27b7f2 100644 --- a/nemo/collections/audio/models/audio_to_audio.py +++ b/nemo/collections/audio/models/audio_to_audio.py @@ -33,6 +33,7 @@ from nemo.collections.audio.metrics.audio import AudioMetricWrapper from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.core.classes import ModelPT +from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging, model_utils __all__ = ['AudioToAudioModel'] @@ -180,6 +181,9 @@ def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): return self.multi_evaluation_epoch_end(outputs, dataloader_idx, 'test') def _setup_dataloader_from_config(self, config: Optional[Dict]): + # TODO: Consider moving `inject` from `audio_to_text_dataset` to a utility module? + # Automatically inject args from model config to dataloader config + inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') if config.get("use_lhotse", False): return get_lhotse_dataloader_from_config( @@ -190,10 +194,6 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if is_concat: raise NotImplementedError('Concat not implemented') - # TODO: Consider moving `inject` from `audio_to_text_dataset` to a utility module? 
- # Automatically inject args from model config to dataloader config - inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') - # Instantiate tarred dataset loader or normal dataset loader if config.get('is_tarred', False): raise NotImplementedError('Tarred datasets not supported') From e44633777923b99888bdc8c4187385c3e809b0e1 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Wed, 12 Feb 2025 08:12:55 -0800 Subject: [PATCH 10/14] interface for asymmetric pipeline schedule (#12039) * interface for asymmetric pipeline schedule Signed-off-by: Sangkug Lym * Apply isort and black reformatting Signed-off-by: erhoo82 * linting fix Signed-off-by: Sangkug Lym * Apply isort and black reformatting Signed-off-by: erhoo82 --------- Signed-off-by: Sangkug Lym Signed-off-by: erhoo82 Co-authored-by: erhoo82 --- nemo/collections/llm/gpt/model/base.py | 5 +- nemo/collections/llm/recipes/llama31_405b.py | 6 +- .../language_modeling/megatron_base_model.py | 65 +++++++---- .../language_modeling/megatron_gpt_model.py | 104 ++++++++++-------- .../pytorch/strategies/megatron_strategy.py | 12 ++ 5 files changed, 122 insertions(+), 70 deletions(-) diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 4af9c6c1263b..b92ca669db49 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -193,7 +193,10 @@ def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "MC ) vp_size = self.virtual_pipeline_model_parallel_size - if vp_size: + is_pipeline_asymmetric = getattr(self, 'account_for_embedding_in_pipeline_split', False) or getattr( + self, 'account_for_loss_in_pipeline_split', False + ) + if vp_size and not is_pipeline_asymmetric: p_size = self.pipeline_model_parallel_size assert ( self.num_layers // p_size diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index d60bbf54f8f0..d3dd2185efc8 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -64,11 +64,13 @@ def model() -> run.Config[pl.LightningModule]: def trainer( tensor_parallelism: int = 8, - pipeline_parallelism: int = 9, + pipeline_parallelism: int = 8, pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 2, context_parallelism: int = 4, sequence_parallelism: bool = True, + account_for_embedding_in_pipeline_split: bool = True, + account_for_loss_in_pipeline_split: bool = True, num_nodes: int = 72, num_gpus_per_node: int = 8, max_steps: int = 1168251, @@ -113,6 +115,8 @@ def trainer( virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, context_parallel_size=context_parallelism, sequence_parallel=sequence_parallelism, + account_for_embedding_in_pipeline_split=account_for_embedding_in_pipeline_split, + account_for_loss_in_pipeline_split=account_for_loss_in_pipeline_split, gradient_as_bucket_view=True, ckpt_async_save=True, ckpt_parallel_load=True, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 122c86614311..3df28212c899 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -101,15 +101,17 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): if not HAVE_MEGATRON_CORE: raise ImportError( - "megatron-core was not found. 
Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." + "megatron-core was not found. Please see the NeMo README for installation instructions: " + "https://github.com/NVIDIA/NeMo#megatron-gpt." ) if trainer is None: - raise ValueError(f"Trainer cannot be None for Megatron-based models. Please provide a PTL trainer object.") + raise ValueError("Trainer cannot be None for Megatron-based models. Please provide a PTL trainer object.") if cfg.get('use_flash_attention', False) and not HAVE_FLASH_ATTENTION: raise ImportError( - "flash_attn was not found. Please see the installation instructions: https://github.com/HazyResearch/flash-attention." + "flash_attn was not found. Please see the installation instructions: " + "https://github.com/HazyResearch/flash-attention." "If you use flash_attn with triton. Please install triton==2.0.0.dev20221202." ) @@ -182,9 +184,13 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): if vp_size == 1: vp_size = None else: - assert ( - self.cfg.num_layers // self.cfg.pipeline_model_parallel_size - ) % vp_size == 0, 'Make sure the number of model chunks is the same across all pipeline stages.' + if not ( + self.cfg.get('account_for_embedding_in_pipeline_split', False) + and self.cfg.get('account_for_loss_in_pipeline_split', False) + ): + assert ( + self.cfg.num_layers // self.cfg.pipeline_model_parallel_size + ) % vp_size == 0, 'Make sure the number of model chunks is the same across all pipeline stages.' initialize_model_parallel_for_nemo( world_size=init_world_size, @@ -252,7 +258,7 @@ def setup_transformer_engine_tp_groups(self): """ for module in self.get_model_module_list(): """Set TP group - Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398 + Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398 # pylint: disable=line-too-long """ # Deep iterate but skip self to avoid infinite recursion. for index, child in enumerate(module.modules()): @@ -270,7 +276,7 @@ def setup_transformer_engine_cp_groups(self): for module in self.get_model_module_list(): """Set context parallel running - Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py + Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py # pylint: disable=line-too-long """ # Deep iterate but skip self to avoid infinite recursion. 
for index, child in enumerate(module.modules()): @@ -345,7 +351,8 @@ def _reconfigure_limit_batches(self, limit_batches, dataloader, mode): """ Reconfigure trainer.limit_val_batches for pretraining """ - # Override limit_batches in terms of num microbatches and so there are limit_batches//num_micro_batches num of global batches + # Override limit_batches in terms of num microbatches + # and so there are limit_batches//num_micro_batches num of global batches if isinstance(limit_batches, int): limit_batches *= get_num_microbatches() else: @@ -539,6 +546,9 @@ def build_transformer_config(self) -> TransformerConfig: tp_only_amax_red = self.cfg.get('tp_only_amax_red', False) + account_for_embedding_in_pipeline_split = self.cfg.get('account_for_embedding_in_pipeline_split', False) + account_for_loss_in_pipeline_split = self.cfg.get('account_for_loss_in_pipeline_split', False) + attention_backend = self.cfg.get('attention_backend', "auto") attention_backend = AttnBackend[attention_backend] @@ -566,6 +576,8 @@ def build_transformer_config(self) -> TransformerConfig: 'rotary_interleaved': rotary_interleaved, 'deallocate_pipeline_outputs': True, 'tp_only_amax_red': tp_only_amax_red, + 'account_for_embedding_in_pipeline_split': account_for_embedding_in_pipeline_split, + 'account_for_loss_in_pipeline_split': account_for_loss_in_pipeline_split, 'attention_backend': attention_backend, } @@ -609,7 +621,8 @@ def _vocab_size_with_padding(self, orig_vocab_size, make_vocab_size_divisible_by multiple = make_vocab_size_divisible_by * tensor_model_parallel_size after = ((after + multiple - 1) // multiple) * multiple logging.info( - f'Padded vocab_size: {after}, original vocab_size: {orig_vocab_size}, dummy tokens: {after - orig_vocab_size}.' + f"Padded vocab_size: {after}, original vocab_size: {orig_vocab_size}, " + f"dummy tokens: {after - orig_vocab_size}." ) return after @@ -664,7 +677,7 @@ def configure_gradient_clipping(self, *args, **kwargs): def allreduce_gradients(self): """Reduce gradients across data parallel ranks. - Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188 + Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188 # pylint: disable=line-too-long """ # Bucketize and all-reduce buckets = {} @@ -836,7 +849,8 @@ def configure_optimizers(self): # TODO: contiguous grad bucket for fp16 is also planned to be supported contiguous_grad_bucket = False raise ValueError( - "fp16 training is not yet supported with O2. Please set megatron_amp_O2 to False in the model config." + "fp16 training is not yet supported with O2." + "Please set megatron_amp_O2 to False in the model config." ) # if using tensor parallel only, we automatically use async grad all-reduce @@ -974,7 +988,8 @@ def _validate_and_override_config(self): if self.cfg.get('sequence_parallel', False) and self.cfg.get('tensor_model_parallel_size', 1) == 1: logging.info( - "Sequence parallel should only be used with tensor parallel size > 1. Setting sequence parallel to False" + "Sequence parallel should only be used with tensor parallel size > 1. 
" + "Setting sequence parallel to False" ) with open_dict(self.cfg): self.cfg.sequence_parallel = False @@ -993,7 +1008,8 @@ def _validate_and_override_config(self): if self.cfg.get('gradient_accumulation_fusion', False): if data_parallel_size > 1 and pipeline_model_parallel_size == 1 and not distributed_fused_adam: logging.info( - "When not using pipeline model parallel, gradient accumulation fusion can only be used with distributed_fused_adam." + "When not using pipeline model parallel, " + "gradient accumulation fusion can only be used with distributed_fused_adam." ) with open_dict(self.cfg): self.cfg.gradient_accumulation_fusion = False @@ -1015,9 +1031,13 @@ def _validate_and_override_config(self): if vp_size == 1: self.cfg['virtual_pipeline_model_parallel_size'] = None else: - assert ( - self.cfg.num_layers // self.cfg.pipeline_model_parallel_size - ) % vp_size == 0, 'Make sure the number of model chunks is the same across all pipeline stages.' + if not ( + self.cfg.get('account_for_embedding_in_pipeline_split', False) + and self.cfg.get('account_for_loss_in_pipeline_split', False) + ): + assert ( + self.cfg.num_layers // self.cfg.pipeline_model_parallel_size + ) % vp_size == 0, 'Make sure the number of model chunks is the same across all pipeline stages.' if self.cfg.get('ub_tp_comm_overlap', False): if not self.cfg.get('sequence_parallel', False): @@ -1110,7 +1130,8 @@ def _get_total_params_across_model_parallel_groups_enc_dec(self, model): parallel_state.get_pipeline_model_parallel_rank() == self.cfg.get('pipeline_model_parallel_split_rank', 0) or parallel_state.is_pipeline_last_stage() ): - # If the current rank is the in the decoder first stage (decoder emb) or last rank (output layer), subtract those weights since it is already accounted for in the encoder first stage. + # If the current rank is the in the decoder first stage (decoder emb) or last rank (output layer), + # subtract those weights since it is already accounted for in the encoder first stage. # TODO: If we support embedding untying with PP > 1, we will need to update this. num_word_embedding_parameters = sum([p.nelement() for p in model.word_embeddings_weight()]) num_parameters_on_device -= num_word_embedding_parameters @@ -1167,7 +1188,7 @@ def build_model_parallel_config(self) -> ModelParallelConfig: config_mapping = { "perform_initialization": True, # initailize weights when constructing the module "fp16": self.torch_dtype == torch.float16 - and megatron_amp_O2, # NeMo does not currently support fp16 training with megatron amp O2, eval and inference is supported + and megatron_amp_O2, # fp16 training with megatron amp O2 not supported, eval and inference is supported "bf16": self.torch_dtype == torch.bfloat16 and megatron_amp_O2, "params_dtype": self.params_dtype, "timers": self.megatron_timers, @@ -1216,7 +1237,8 @@ def build_model_parallel_config(self) -> ModelParallelConfig: setattr(model_parallel_config, 'hidden_size', self.cfg.hidden_size) except AttributeError: logging.warning( - f'hidden_size not found in {self.cfg}. Set this in model_parallel_config if using pipeline parallelism.' + f'hidden_size not found in {self.cfg}. ' + 'Set this in model_parallel_config if using pipeline parallelism.' 
) return model_parallel_config @@ -1299,7 +1321,8 @@ def find_frozen_submodules(model): logging.debug(f"Ignoring state {submodule} in FSDP.") self.trainer.strategy.kwargs['ignored_states'] = frozen_submodules # FSDP requires uniform status of require_grads - # Diffusion models like SD has frozen parts and needs to be added to 'ignored_states' from sharding for FSDP to work + # Diffusion models like SD has frozen parts and needs to be added to 'ignored_states' + # from sharding for FSDP to work self.model = self.trainer.strategy._setup_model(self.model) # Move the CPU-initialized model (with `use_cpu_initialization=True`) to GPU, which is to avoid # out-of-memory carash before sharding. In case of GPU-initialized model, this is no-op. diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 0ebf1ba17ddb..78171e4ed605 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -17,7 +17,6 @@ import queue import warnings from contextlib import nullcontext -from dataclasses import fields from functools import cache, partial from importlib.metadata import version from typing import Any, Dict, Iterator, List, Optional, Union @@ -25,9 +24,7 @@ import packaging import torch from lightning.pytorch.accelerators import CPUAccelerator -from lightning.pytorch.loops.fetchers import _DataFetcherWrapper from lightning.pytorch.trainer.trainer import Trainer -from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig from nemo.collections.common.parts.utils import apply_rope_scaling, extend_instance @@ -69,7 +66,7 @@ TextGeneration, ) from nemo.collections.nlp.parts import utils_funcs -from nemo.collections.nlp.parts.utils_funcs import activation_to_func, get_last_rank +from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.core.classes import Exportable from nemo.core.classes.common import PretrainedModelInfo from nemo.core.neural_types import ChannelType, NeuralType @@ -78,8 +75,7 @@ from nemo.utils.te_utils import is_float8tensor, te_version try: - import megatron.core as core - from megatron.core import InferenceParams, parallel_state, tensor_parallel + from megatron.core import InferenceParams, parallel_state from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset from megatron.core.datasets.utils import get_blend_from_list @@ -98,13 +94,7 @@ from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig - from megatron.core.utils import ( - drain_embedding_wgrad_compute, - get_model_config, - init_method_normal, - is_te_min_version, - scaled_init_method_normal, - ) + from megatron.core.utils import drain_embedding_wgrad_compute, get_model_config, is_te_min_version HAVE_MEGATRON_CORE = True @@ -143,14 +133,14 @@ def mcore_supports_moe() -> bool: if not HAVE_MEGATRON_CORE: return False try: - from megatron.core.transformer.moe.router import TopKRouter + from megatron.core.transformer.moe.router import TopKRouter # noqa: F401 return True except ImportError: return False -## TODO: This function will not work if TE is not installed +# TODO: This function will not work if TE 
is not installed def get_specs(spec_name, transformer_config=None, use_te=True, hyena_cfg: Dict = None, fp8=False): from nemo.collections.nlp.models.language_modeling.megatron.gemma2.gemma2_spec import get_gemma2_layer_spec @@ -331,7 +321,8 @@ class MegatronGPTModel(MegatronBaseModel, TextGeneration): def __init__(self, cfg: DictConfig, trainer: Trainer): if not HAVE_MEGATRON_CORE: logging.warning( - "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." + "megatron-core was not found. Please see the NeMo README for installation instructions:" + "https://github.com/NVIDIA/NeMo#megatron-gpt." ) # this prevents base constructor from initializing tokenizer self.tokenizer = None @@ -371,7 +362,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): if self.cfg.get('expert_model_parallel_size', 1) > 1 and self.with_distributed_adam: if not self.use_mcore_dist_optim: raise ValueError( - 'Expert parallelism is currently not supporting Apex distributed optimizer, use Mcore distributed optimizer instead' + 'Expert parallelism is currently not supporting Apex distributed optimizer,' + 'use Mcore distributed optimizer instead' ) if self.cfg.optim.get('overlap_param_gather_with_optimizer_step', False): @@ -424,7 +416,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): if self.megatron_amp_O2: if not self.with_distributed_adam and not self.cfg.get("use_cpu_initialization", False): - # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type + # Pre-allocate the model on GPU to have master parameters allocated + # on the same device with matching data type if isinstance(self.model, list): for module in self.model: module.cuda(torch.cuda.current_device()) @@ -471,7 +464,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.reset_lr_steps = self.cfg.get('reset_lr_steps', False) if self.reset_lr and (not self.with_distributed_adam or not self.megatron_amp_O2): raise ValueError( - 'Learning rate reset feature is only supported with the distributed optmizer and megatron_amp_O2 for now.' + 'Learning rate reset feature is only supported with the distributed optmizer' + 'and megatron_amp_O2 for now.' ) # default to false since this doesn't work with sequence parallelism currently @@ -805,7 +799,8 @@ def initialize_ub_func(self): ub_cfgs = self.cfg.get('ub_tp_comm_overlap_cfg', None) if ub_cfgs is None: warnings.warn( - "Couldn't find TP config. Please check the path correctness. Initializing TP comm overlap with the default config." + "Couldn't find TP config. Please check the path correctness." + "Initializing TP comm overlap with the default config." ) input_shape = [ @@ -1002,7 +997,7 @@ def training_step(self, dataloader_iter): batch_size=1, ) - ## logging + # logging if self.log_train_loss: # When using pipeline parallelism, loss is calculated only in the last pipeline stage and # it should be casted to other pipeline stages for logging. 
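The comment above notes that under pipeline parallelism the loss only exists on the last
pipeline stage and has to be replicated to the other stages before every rank can log it.
The snippet below is a minimal sketch of that replication step, not the code path NeMo
actually uses; it assumes an initialized torch.distributed environment, a CUDA device, and
megatron-core's parallel_state helpers, and broadcast_loss_for_logging is an illustrative
helper name.

import torch
from megatron.core import parallel_state

def broadcast_loss_for_logging(loss_mean=None):
    # Only the last pipeline stage computes a real loss; the other stages
    # allocate a placeholder and receive the value via broadcast, so that
    # all ranks end up logging the same scalar.
    if loss_mean is None:
        loss_mean = torch.tensor(0.0).cuda()
    torch.distributed.broadcast(
        loss_mean,
        src=parallel_state.get_pipeline_model_parallel_last_rank(),
        group=parallel_state.get_pipeline_model_parallel_group(),
    )
    return loss_mean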
@@ -1043,11 +1038,11 @@ def training_step(self, dataloader_iter): if self.rampup_batch_size: self.prev_global_batch_size = current_global_batch_size self.prev_consumed_samples = consumed_samples - num_microbatch_calculator.update( + num_microbatch_calculator.update( # noqa: F821 consumed_samples=consumed_samples, consistency_check=False, ) - current_global_batch_size = num_microbatch_calculator.current_global_batch_size + current_global_batch_size = num_microbatch_calculator.current_global_batch_size # noqa: F821 self.log('global_batch_size', current_global_batch_size, prog_bar=True, rank_zero_only=True, batch_size=1) self.if_first_step = 1 @@ -1120,7 +1115,7 @@ def allreduce_fsdp_sharding_omitted_gradients(self): def allreduce_first_last_embeddings(self): - # Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/training.py#L407 + # Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/training.py#L407 # pylint: disable=line-too-long # All-reduce word_embeddings' grad across first and last stages to ensure # that word_embeddings parameters stay in sync. # This should only run for models that support pipelined model parallelism @@ -1141,7 +1136,8 @@ def allreduce_first_last_embeddings(self): word_embeddings_weight = ( module.shared_embedding_or_output_weight() if self.mcore_gpt else module.word_embeddings_weight() ) - # (@adithyare) adapter training now extends MegatronGPTModel so we have to add this check here to ensure we do not perform all_reduce when grad is None. + # (@adithyare) adapter training now extends MegatronGPTModel so we have to add this + # check here to ensure we do not perform all_reduce when grad is None. # grad can be None when performing PeFT training. if word_embeddings_weight.requires_grad: if self.megatron_amp_O2: @@ -1351,7 +1347,8 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ import transformer_engine_torch as tex except ModuleNotFoundError as e: logging.error( - "Please update Transformer Engine to >= 1.10 to use Context Parallel with THD format data" + "Please update Transformer Engine to >= 1.10 " + "to use Context Parallel with THD format data" ) raise e cp_rank = parallel_state.get_context_parallel_rank() @@ -1402,7 +1399,8 @@ def loss_func(output_tensor): loss_for_ub = self.loss_func(batch['loss_mask'], batch['num_valid_tokens_in_ub'], output_tensor) cp_size = parallel_state.get_context_parallel_world_size() if isinstance(loss_for_ub, dict): - # TODO: need a better way to check if loss_func is returning more stuff than just loss... (@adithyare) + # TODO: need a better way to check if loss_func is returning + # more stuff than just loss... 
(@adithyare) if set(loss_for_ub.keys()) == set( ["loss", "query_hs", "pos_doc_hs", "pos_cs", "neg_cs", "diff_cs"] @@ -1459,7 +1457,8 @@ def loss_func(output_tensor): torch.tensor([num_valid_tokens_in_ub]).cuda().clone().detach(), ] ) - # Could potentially reduce num_valid_samples_in_microbatch and use that to aggregate instead of len(self._validation_ds) + # Could potentially reduce num_valid_samples_in_microbatch and use that to + # aggregate instead of len(self._validation_ds) torch.distributed.all_reduce( loss_sum_and_ub_size_all_gpu, group=parallel_state.get_data_parallel_group() ) @@ -1641,10 +1640,12 @@ def build_train_valid_test_datasets(self): test_iters * global_batch_size, ] - # The line below exploits a quirk in mcore dataset construction, to make number of epochs for validation and test equal to 1 - # The mcore dataset implementation uses the number N we provide via train_valid_test_num_samples to derive parameter E such that + # The line below exploits a quirk in mcore dataset construction, to make number of epochs + # for validation and test equal to 1. The mcore dataset implementation uses the number N we + # provide via train_valid_test_num_samples to derive parameter E such that # E = argmin_e e * N_d >= N, or equivalently E = ceildiv(N, N_d) - # Where N_d is the total number of samples in a dataset (files), and N is the requested number of samples (provided for every split in the list below). + # Where N_d is the total number of samples in a dataset (files), and N is the requested + # number of samples (provided for every split in the list below). # Setting N = 1 we force E to be 1 as well legacy_dataset = self.cfg.data.get("legacy_dataset", False) if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): @@ -1723,7 +1724,7 @@ def build_train_valid_test_datasets(self): logging.info(f'Length of val dataset: {len(self._validation_ds)}') if self._test_ds is not None: logging.info(f'Length of test dataset: {len(self._test_ds)}') - logging.info(f'Finished building GPT datasets.') + logging.info('Finished building GPT datasets.') return self._train_ds, self._validation_ds, self._test_ds @@ -1815,7 +1816,8 @@ def setup(self, stage=None): self.setup_test_data(self.cfg.data) # Override limit_train_batches in terms of num of microbatches self._reconfigure_limit_batches(self.trainer.limit_train_batches, self._train_dl, 'train') - # Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step + # Override limit_val_batches to be a multiple of num microbatches to prevent + # val_step from exiting in between a step self._reconfigure_limit_batches(self.trainer.limit_val_batches, self._validation_dl, 'val') # Data cache generation only @@ -1835,7 +1837,8 @@ def setup_training_data(self, cfg): if hasattr(self, '_train_ds'): consumed_samples = self.compute_consumed_samples(0) logging.info( - f'Setting up train dataloader with len(len(self._train_ds)): {len(self._train_ds)} and consumed samples: {consumed_samples}' + 'Setting up train dataloader with len(len(self._train_ds)): ' + f'{len(self._train_ds)} and consumed samples: {consumed_samples}' ) self._train_dl = self.build_pretraining_data_loader(self._train_ds, consumed_samples) @@ -1843,12 +1846,13 @@ def setup_validation_data(self, cfg): if hasattr(self, '_validation_ds'): consumed_samples = 0 logging.info( - f'Setting up validation dataloader with len(len(self._validation_ds)): {len(self._validation_ds)} and consumed samples: {consumed_samples}' 
+ 'Setting up validation dataloader with len(len(self._validation_ds)): ' + f'{len(self._validation_ds)} and consumed samples: {consumed_samples}' ) drop_last = True if not self.validation_drop_last: - logging.info(f'Drop last in validation dataset is set to False') + logging.info('Drop last in validation dataset is set to False') drop_last = False pad_samples_to_global_batch_size = False if self.cfg.data.get('pad_samples_to_global_batch_size', False): @@ -1864,7 +1868,8 @@ def setup_test_data(self, cfg): if self._test_ds is not None: consumed_samples = 0 logging.info( - f'Setting up test dataloader with len(len(self._test_ds)): {len(self._test_ds)} and consumed samples: {consumed_samples}' + 'Setting up test dataloader with len(len(self._test_ds)): ' + f'{len(self._test_ds)} and consumed samples: {consumed_samples}' ) self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) else: @@ -1934,7 +1939,7 @@ def list_available_models(self): return None def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any: - """PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device + """PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device # pylint: disable=line-too-long When using pipeline parallelism, we need the global batch to remain on the CPU, since the memory overhead will be too high when using a large number of microbatches. Microbatches are transferred from CPU to GPU inside the pipeline. @@ -1947,7 +1952,7 @@ def _validate_trainer(self): """ if self.trainer.accumulate_grad_batches > 1: raise ValueError( - f'Gradient accumulation is done within training_step. trainer.accumulate_grad_batches must equal 1' + 'Gradient accumulation is done within training_step. trainer.accumulate_grad_batches must equal 1' ) @classmethod @@ -1961,7 +1966,7 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: result.append( PretrainedModelInfo( pretrained_model_name="megatron_gpt_345m", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/megatron_gpt_345m/versions/1/files/megatron_gpt_345m.nemo", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/megatron_gpt_345m/versions/1/files/megatron_gpt_345m.nemo", # pylint: disable=line-too-long description="345M parameter GPT generative Megatron model.", ) ) @@ -2010,7 +2015,8 @@ def on_load_checkpoint(self, checkpoint) -> None: missing_keys, expected_keys = module.load_state_dict(checkpoint_state_dict, strict=False) if all(s.endswith('_extra_state') for s in missing_keys): logging.warning( - f'Loding checkpoint created with Transformer Engine version lower than 1.13. Missing layers {missing_keys} will be ignored.' + 'Loding checkpoint created with Transformer Engine version lower than 1.13.' + f'Missing layers {missing_keys} will be ignored.' ) else: raise e @@ -2199,11 +2205,15 @@ def build_transformer_config(self) -> TransformerConfig: For attributes in TransformerConfig that are not in the nemo model config, we add custom logic. 
""" - if self.cfg.num_layers % self.cfg.get('pipeline_model_parallel_size', 1) != 0: - raise ValueError( - f"num_layers ({self.cfg.num_layers}) should be divisible by " - f"pipeline_model_parallel_size ({self.cfg.get('pipeline_model_parallel_size', 1)})" - ) + if not ( + self.cfg.get('account_for_embedding_in_pipeline_split', False) + and self.cfg.get('account_for_loss_in_pipeline_split', False) + ): + if self.cfg.num_layers % self.cfg.get('pipeline_model_parallel_size', 1) != 0: + raise ValueError( + f"num_layers ({self.cfg.num_layers}) should be divisible by " + f"pipeline_model_parallel_size ({self.cfg.get('pipeline_model_parallel_size', 1)})" + ) normalization = self.cfg.get('normalization', 'layernorm').lower() layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' or self.cfg.get( @@ -2231,7 +2241,7 @@ def build_transformer_config(self) -> TransformerConfig: elif self.cfg.get('fp8_hybrid', False): fp8 = 'hybrid' else: - raise ValueError(f"fp8 enabled but fp8_format (fp8_e4m3 | fp8_hybrid) is not set.") + raise ValueError("fp8 enabled but fp8_format (fp8_e4m3 | fp8_hybrid) is not set.") if self.cfg.get('enable_cuda_graph', False): assert HAVE_TE, "Transformer Engine is required for cudagraphs." diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index 752b1d9853b3..9fde044cb14f 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -109,6 +109,8 @@ class ParallelismConfig: pipeline_dtype: torch.dtype encoder_tensor_model_parallel_size: int = 0 encoder_pipeline_model_parallel_size: int = 0 + account_for_embedding_in_pipeline_split: bool = False + account_for_loss_in_pipeline_split: bool = False use_te_rng_tracker: bool = False expert_tensor_parallel_size: int = None use_tp_pp_dp_mapping: bool = False @@ -139,6 +141,10 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): Defaults to 1. expert_tensor_parallel_size (Optional[int]): Sets MoE Experts tensor parallelism size. Defaults to None. moe_extended_tp (bool): Alternative parallelization strategy for expert parallelism. Defaults to False. + account_for_embedding_in_pipeline_split (bool): If set, *input* embedding layer will be treated as a standard + transformer layer in the context of partition and placement for pipeline parallelism. + account_for_loss_in_pipeline_split (bool): If set, loss layer will be treated as a standard transformer + layer in the context of partition and placement for pipeline parallelism. data_sampler (Optional['DataSampler']): Custom data sampler for distributed training. Defaults to None. parallel_devices (Optional[List[torch.device]]): List of devices to use for parallelism. Defaults to None. cluster_environment: Cluster environment for distributed training. Defaults to None. 
@@ -212,6 +218,8 @@ def __init__( expert_tensor_parallel_size: int = None, encoder_tensor_model_parallel_size: Optional[int] = 0, encoder_pipeline_model_parallel_size: Optional[int] = 0, + account_for_embedding_in_pipeline_split: bool = False, + account_for_loss_in_pipeline_split: bool = False, data_sampler: Optional["DataSampler"] = None, parallel_devices: Optional[List[torch.device]] = None, cluster_environment=None, # TODO: Add type-hint @@ -267,6 +275,8 @@ def __init__( self.sequence_parallel = sequence_parallel self.encoder_tensor_model_parallel_size = encoder_tensor_model_parallel_size self.encoder_pipeline_model_parallel_size = encoder_pipeline_model_parallel_size + self.account_for_embedding_in_pipeline_split = account_for_embedding_in_pipeline_split + self.account_for_loss_in_pipeline_split = account_for_loss_in_pipeline_split self.lazy_init = lazy_init self.ckpt_load_optimizer = ckpt_load_optimizer self.ckpt_save_optimizer = ckpt_save_optimizer @@ -941,6 +951,8 @@ def parallelism(self) -> ParallelismConfig: moe_extended_tp=self.moe_extended_tp, encoder_tensor_model_parallel_size=self.encoder_tensor_model_parallel_size, encoder_pipeline_model_parallel_size=self.encoder_pipeline_model_parallel_size, + account_for_embedding_in_pipeline_split=self.account_for_embedding_in_pipeline_split, + account_for_loss_in_pipeline_split=self.account_for_loss_in_pipeline_split, pipeline_dtype=self.pipeline_dtype, use_te_rng_tracker=self.use_te_rng_tracker, use_tp_pp_dp_mapping=self.use_tp_pp_dp_mapping, From 83d935fc2c4a6d645052f4aa566910334066f725 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Wed, 12 Feb 2025 11:53:31 -0500 Subject: [PATCH 11/14] skip initialization in hf export (#12136) Signed-off-by: Chen Cui --- nemo/lightning/io/connector.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 602551ae4479..8d1d957ec642 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -15,7 +15,7 @@ import logging import os import shutil -from pathlib import Path, PosixPath, PurePath, WindowsPath +from pathlib import Path, PosixPath, WindowsPath from typing import Generic, Optional, Tuple, TypeVar import lightning.pytorch as pl @@ -69,9 +69,11 @@ class Connector(BasePath, Generic[SourceT, TargetT]): LOCK_TIMEOUT = 1200 def init(self) -> TargetT: + """Should be implemented to initialize the target type from the source type.""" raise NotImplementedError() def apply(self, output_path: Path) -> Path: + """Should be implemented to apply the transformation and save the result at the output path.""" raise NotImplementedError() def __new__(cls, *args, **kwargs): @@ -118,6 +120,7 @@ def __call__(self, output_path: Optional[Path] = None, overwrite: bool = False) return _output_path def local_path(self, base_path: Optional[Path] = None) -> Path: + """Computes the local path for storage based on a base path or a default cache home.""" if base_path: _base = base_path else: @@ -128,6 +131,7 @@ def local_path(self, base_path: Optional[Path] = None) -> Path: return _base / str(self).replace("://", "/") def is_in_cache(self, base_path: Optional[Path] = None) -> bool: + """Checks if the transformed data is already cached at the specified base path.""" return self.local_path(base_path=base_path).exists() @@ -145,7 +149,8 @@ class ModelConnector(Connector, Generic[SourceT, TargetT]): Saves the model's state to the specified path using the trainer's current strategy. 
nemo_load(path: Path, trainer: Optional[pl.Trainer] = None, cpu: bool = True) -> Tuple[Any, pl.Trainer]: - Loads a model from the specified path, optionally using a CPU-focused strategy, and returns the model and trainer. + Loads a model from the specified path, optionally using a CPU-focused strategy, and returns the model and + trainer. """ def nemo_setup( @@ -170,6 +175,7 @@ def nemo_setup( ) # Note: set trainer to fitting state to avoid the following code path. Feel free to refactor if we no longer # need to avoid this: + # pylint: disable=C0301 # https://github.com/NVIDIA/NeMo/blob/e35a6592f53ee34b1ec2fc3f1e009dd1ebc79e65/nemo/lightning/pytorch/strategies/megatron_strategy.py#L346-L349 _trainer.state.fn = TrainerFn.FITTING # needed for proper save. @@ -227,6 +233,9 @@ def nemo_load( from nemo.lightning.io.api import load_context model = load_context(path, subpath="model") + # skip initialization since a checkpoint is loaded in this function + model.config.perform_initialization = False + is_peft_ckpt = model.model_transform is not None callbacks = [] if is_peft_ckpt: @@ -285,12 +294,14 @@ def local_path(self, base_path: Optional[Path] = None) -> Path: return _base / str(self).replace("://", "/") def on_import_ckpt(self, model: pl.LightningModule): + """Called after checkpoint is imported""" if hasattr(self, "tokenizer"): model.tokenizer = self.tokenizer if hasattr(model, "__io__") and hasattr(self.tokenizer, '__io__'): model.__io__.tokenizer = self.tokenizer.__io__ def save_hf_tokenizer_assets(self, tokenizer_name_or_path, save_path="/tmp/nemo_tokenizer"): + """Save HF tokenizer to the imported NeMo model""" from transformers import AutoTokenizer tok = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True) From d977f42ccafbbd04efcefc543cc05d6f7d965837 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Wed, 12 Feb 2025 10:51:12 -0800 Subject: [PATCH 12/14] update export io call (#12144) * update call Signed-off-by: Alexandros Koumparoulis * update tokenizer Signed-off-by: Alexandros Koumparoulis * docu Signed-off-by: Alexandros Koumparoulis * # noqa C0301 Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * pylint Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * fix pylint Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/collections/llm/gpt/model/mixtral.py | 31 ++++++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index 219d9b19f6eb..8e71de144e50 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -123,6 +123,8 @@ class MixtralConfig8x22B(MixtralConfig): class MixtralModel(GPTModel): + """Mcore-based MixtralModel""" + def __init__( self, config: Optional[Union[MixtralConfig8x7B, MixtralConfig8x22B]] = None, @@ -130,6 +132,7 @@ def __init__( tokenizer: Optional["TokenizerSpec"] = None, model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, ): + """Mcore-based MixtralModel ctor""" super().__init__( config or MixtralConfig8x7B(), optim=optim, tokenizer=tokenizer, model_transform=model_transform ) @@ -137,10 +140,14 @@ def __init__( @io.model_importer(MixtralModel, ext="hf") class HFMixtralImporter(io.ModelConnector["MixtralForCausalLM", MixtralModel]): 
+ """HF to NeMo importer""" + def init(self) -> MixtralModel: + """init""" return MixtralModel(self.config, tokenizer=self.tokenizer) def apply(self, output_path: Path) -> Path: + """Import model from HF""" from transformers import MixtralForCausalLM source = MixtralForCausalLM.from_pretrained(str(self), torch_dtype='auto', use_safetensors=True) @@ -155,12 +162,13 @@ def apply(self, output_path: Path) -> Path: return output_path def convert_state(self, source, target): + """State-dict converter""" mapping = { "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.pre_mlp_layernorm.weight", # MoE - "model.layers.*.block_sparse_moe.experts.*.w2.weight": "decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight", + "model.layers.*.block_sparse_moe.experts.*.w2.weight": "decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight", # pylint: disable=line-too-long "model.layers.*.block_sparse_moe.gate.weight": "decoder.layers.*.mlp.router.weight", # lm-head "model.norm.weight": "decoder.final_layernorm.weight", @@ -175,12 +183,14 @@ def convert_state(self, source, target): @property def tokenizer(self) -> "AutoTokenizer": + """Configures tokenizer""" from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> MixtralConfig8x7B | MixtralConfig8x22B: + """Returns Mcore config from HF""" from transformers import MixtralConfig as HfMixtralConfig config = HfMixtralConfig.from_pretrained(str(self)) @@ -226,6 +236,7 @@ def config(self) -> MixtralConfig8x7B | MixtralConfig8x22B: target_key="embedding.word_embeddings.weight", ) def _import_embedding(ctx: io.TransformCTX, embedding): + """_import_embedding""" embedding_weight = ctx.source.model.embed_tokens.weight vocab_size = embedding_weight.shape[0] ctx.target_state['embedding.word_embeddings.weight'][:vocab_size, :].copy_(embedding_weight) @@ -237,6 +248,7 @@ def _import_embedding(ctx: io.TransformCTX, embedding): target_key="output_layer.weight", ) def _import_lm_head(ctx: io.TransformCTX, embedding): + """import head""" lm_head_weight = ctx.source.lm_head.weight vocab_size = lm_head_weight.shape[0] ctx.target_state['output_layer.weight'][:vocab_size, :].copy_(lm_head_weight) @@ -252,6 +264,7 @@ def _import_lm_head(ctx: io.TransformCTX, embedding): target_key="decoder.layers.*.self_attention.linear_qkv.weight", ) def _import_qkv(ctx: io.TransformCTX, q, k, v): + """import qkv""" megatron_config = ctx.target.config head_num = megatron_config.num_attention_heads @@ -293,12 +306,16 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): target_key="decoder.layers.*.mlp.experts.local_experts.*.linear_fc1.weight", ) def _import_moe_w1_w3(gate_proj, up_proj): + """_import_moe_w1_w3""" return torch.cat((gate_proj, up_proj), axis=0) @io.model_exporter(MixtralModel, "hf") class HFMixtralExporter(io.ModelConnector[MixtralModel, "MixtralForCausalLM"]): + """NeMo to HF exporter""" + def init(self) -> "MixtralForCausalLM": + """HFMixtralExporter initialization""" from transformers import AutoModelForCausalLM from transformers.modeling_utils import no_init_weights @@ -306,6 +323,7 @@ def init(self) -> "MixtralForCausalLM": return AutoModelForCausalLM.from_config(self.config) def apply(self, output_path: Path) -> Path: + """export 
to hf format""" # TODO: Make it work with lazy init # with torch.device("meta"): # target = self.init() @@ -321,12 +339,13 @@ def apply(self, output_path: Path) -> Path: return output_path def convert_state(self, source, target): + """convert state""" mapping = { "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", "decoder.layers.*.pre_mlp_layernorm.weight": "model.layers.*.post_attention_layernorm.weight", # MoE - "decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight": "model.layers.*.block_sparse_moe.experts.*.w2.weight", + "decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight": "model.layers.*.block_sparse_moe.experts.*.w2.weight", # pylint: disable=line-too-long "decoder.layers.*.mlp.router.weight": "model.layers.*.block_sparse_moe.gate.weight", # lm-head "decoder.final_layernorm.weight": "model.norm.weight", @@ -341,12 +360,14 @@ def convert_state(self, source, target): @property def tokenizer(self): - return io.load_ckpt(str(self)).model.tokenizer.tokenizer + """return tokenizer""" + return io.load_context(str(self), subpath="model").tokenizer @property def config(self) -> "MixtralConfig": + """return hf-config from mcore""" # Either MixtralConfig8x7B or MixtralConfig8x22B - source: MixtralConfig8x7B = io.load_ckpt(str(self)).model.config + source: MixtralConfig8x7B = io.load_context(str(self), subpath="model.config") from transformers import MixtralConfig as HfMixtralConfig @@ -382,6 +403,7 @@ def config(self) -> "MixtralConfig": ), ) def _export_qkv(ctx: io.TransformCTX, linear_qkv): + """_export_qkv""" megatron_config = ctx.source.config head_num = megatron_config.num_attention_heads @@ -417,6 +439,7 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv): ), ) def _export_moe_w1_w3(linear_fc1): + """_export_moe_w1_w3""" gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) return gate_proj, up_proj From a682ea95a869fe422d59b27bb3e0315db4224947 Mon Sep 17 00:00:00 2001 From: Sam O Date: Wed, 12 Feb 2025 14:09:59 -0700 Subject: [PATCH 13/14] Minor Bug Fixes - LLaMa Embedding (#12146) * Minor Bug Fixes - LLaMa Embedding Signed-off-by: Sam Oluwalana * Apply isort and black reformatting Signed-off-by: artbataev * Add type checking Signed-off-by: Sam Oluwalana --------- Signed-off-by: Sam Oluwalana Signed-off-by: artbataev Co-authored-by: artbataev --- nemo/collections/llm/gpt/model/hf_llama_embedding.py | 2 +- nemo/collections/llm/gpt/model/llama_embedding.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/gpt/model/hf_llama_embedding.py b/nemo/collections/llm/gpt/model/hf_llama_embedding.py index ba89626ff45f..bbd27ce60507 100644 --- a/nemo/collections/llm/gpt/model/hf_llama_embedding.py +++ b/nemo/collections/llm/gpt/model/hf_llama_embedding.py @@ -156,7 +156,7 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) + labels = labels.to(pooled_logits.device) if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" diff --git a/nemo/collections/llm/gpt/model/llama_embedding.py b/nemo/collections/llm/gpt/model/llama_embedding.py index 3d8edcc5121a..96f311acd0b8 100644 --- a/nemo/collections/llm/gpt/model/llama_embedding.py +++ b/nemo/collections/llm/gpt/model/llama_embedding.py @@ -31,12 +31,15 @@ from nemo.collections.llm.utils import Config from nemo.lightning import OptimizerModule, io from 
nemo.lightning.pytorch.utils import dtype_from_hf +from nemo.utils import logging from nemo.utils.import_utils import safe_import if TYPE_CHECKING: from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + from nemo.collections.llm.gpt.model.hf_llama_embedding import LlamaBidirectionalModel + _, HAVE_TE = safe_import("transformer_engine") @@ -271,7 +274,7 @@ class LlamaEmbeddingExporter(io.ModelConnector[LlamaEmbeddingModel, "LlamaBidire Note that NV Embedding LLama uses customized LlamaBidirectionalConfig config. """ - def init(self, dtype=torch.bfloat16) -> "LlamaForCausalLM": + def init(self, dtype=torch.bfloat16) -> "LlamaBidirectionalModel": from transformers.modeling_utils import no_init_weights from nemo.collections.llm.gpt.model.hf_llama_embedding import LlamaBidirectionalModel From 4b19adecc007c9aa80a03b39695b6bc647855f3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 12 Feb 2025 22:28:19 +0100 Subject: [PATCH 14/14] build: Force re-install VCS dependencies (#12155) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * build: Re-install nvrx Signed-off-by: oliver könig * f Signed-off-by: oliver könig --------- Signed-off-by: oliver könig --- reinstall.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reinstall.sh b/reinstall.sh index eaa6e657deaa..ced7019a6645 100755 --- a/reinstall.sh +++ b/reinstall.sh @@ -140,7 +140,7 @@ nemo() { ) echo 'Installing dependencies of nemo' - ${PIP} install --no-cache-dir --extra-index-url https://pypi.nvidia.com "${DEPS[@]}" + ${PIP} install --force-re-install --no-cache-dir --extra-index-url https://pypi.nvidia.com "${DEPS[@]}" echo 'Installing nemo itself' pip install --no-cache-dir --no-build-isolation $NEMO_DIR/.[all]
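A pattern that shows up repeatedly in the export-path changes above: when converted weights
are about to be copied into a freshly constructed Hugging Face module, running the module's
random weight initialization is wasted work, so it is skipped (perform_initialization = False
on the NeMo side, no_init_weights on the Hugging Face side). The sketch below shows only the
Hugging Face half of that idea; it assumes a transformers version where no_init_weights can
be used as a plain context manager, and build_empty_hf_model is an illustrative helper rather
than a NeMo API.

from transformers import AutoModelForCausalLM
from transformers.modeling_utils import no_init_weights

def build_empty_hf_model(hf_config):
    # Build the module skeleton without initializing its weights; the converted
    # NeMo state dict is copied in right afterwards, so any random initialization
    # done here would simply be overwritten.
    with no_init_weights():
        return AutoModelForCausalLM.from_config(hf_config)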