From 387144d063df75e45ed0f2cf4560ec4d70912992 Mon Sep 17 00:00:00 2001 From: guyueh1 <140554423+guyueh1@users.noreply.github.com> Date: Mon, 10 Feb 2025 18:45:44 -0800 Subject: [PATCH 01/14] Bug fix with generation of expert_tensor_parallel_rank (#12125) * Bug fix with generation of expert_tensor_parallel_rank Signed-off-by: Guyue Huang * Fix pylint Signed-off-by: Guyue Huang --------- Signed-off-by: Guyue Huang --- nemo/lightning/megatron_init.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/lightning/megatron_init.py b/nemo/lightning/megatron_init.py index 5f1d744e5b77..fab6d17da3cb 100644 --- a/nemo/lightning/megatron_init.py +++ b/nemo/lightning/megatron_init.py @@ -108,7 +108,7 @@ def initialize_model_parallel_for_nemo( use_tp_pp_dp_mapping=False, use_te_rng_tracker=False, ): - + """Initialize model parallel groups in NeMo.""" if virtual_pipeline_model_parallel_size is not None and not HAVE_INTERLEAVED: raise ValueError("set_virtual_pipeline_model_parallel_world_size is needed in megatron-core for interleaved.") @@ -498,7 +498,7 @@ def generator_wrapper(group_type, is_expert=False, **kwargs): # ETP expert_tensor_parallel_rank = 0 if expert_tensor_parallel_size_ is not None and expert_tensor_parallel_size_ > 1: - for ranks in generator_wrapper('tp-ep', is_expert=True): + for ranks in generator_wrapper('tp', is_expert=True): if rank in ranks: expert_tensor_parallel_rank = list(ranks).index(rank) From 26e2bf9baff1fdcf71fc0650b52a71949cfa5a09 Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 10 Feb 2025 23:07:56 -0700 Subject: [PATCH 02/14] Rename neva datamodule (#12121) * Rename dataset Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Update Signed-off-by: yaoyu-33 * pylink Signed-off-by: yaoyu-33 * fix f string Signed-off-by: yaoyu-33 * fix intern vit default factory Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 --- nemo/collections/vlm/__init__.py | 8 ++++---- nemo/collections/vlm/mllama/data/__init__.py | 4 ++-- .../vlm/mllama/data/{lazy.py => preloaded.py} | 10 ++++++---- nemo/collections/vlm/neva/data/__init__.py | 4 ++-- nemo/collections/vlm/neva/data/api.py | 10 ++++++---- .../vlm/neva/data/{lazy.py => preloaded.py} | 2 +- nemo/collections/vlm/vision/intern_vit.py | 14 ++++++++------ scripts/vlm/mllama_finetune.py | 4 ++-- scripts/vlm/neva_finetune.py | 2 +- 9 files changed, 32 insertions(+), 26 deletions(-) rename nemo/collections/vlm/mllama/data/{lazy.py => preloaded.py} (96%) rename nemo/collections/vlm/neva/data/{lazy.py => preloaded.py} (99%) diff --git a/nemo/collections/vlm/__init__.py b/nemo/collections/vlm/__init__.py index 97b154085f4b..71a605a7da61 100644 --- a/nemo/collections/vlm/__init__.py +++ b/nemo/collections/vlm/__init__.py @@ -26,7 +26,7 @@ from nemo.collections.vlm.llava_next.model.llava_next import LlavaNextConfig7B, LlavaNextConfig13B, LlavaNextModel # MLLAMA -from nemo.collections.vlm.mllama.data import MLlamaLazyDataModule, MLlamaMockDataModule +from nemo.collections.vlm.mllama.data import MLlamaMockDataModule, MLlamaPreloadedDataModule from nemo.collections.vlm.mllama.model.base import ( CrossAttentionTextConfig, CrossAttentionVisionConfig, @@ -46,8 +46,8 @@ ImageDataConfig, ImageToken, MultiModalToken, - NevaLazyDataModule, NevaMockDataModule, + NevaPreloadedDataModule, VideoDataConfig, VideoToken, ) @@ -77,9 +77,9 @@ "HFDatasetDataModule", "HFAutoModelForImageTextToText", 
"NevaMockDataModule", - "NevaLazyDataModule", + "NevaPreloadedDataModule", "MLlamaMockDataModule", - "MLlamaLazyDataModule", + "MLlamaPreloadedDataModule", "DataConfig", "ImageDataConfig", "VideoDataConfig", diff --git a/nemo/collections/vlm/mllama/data/__init__.py b/nemo/collections/vlm/mllama/data/__init__.py index 0e89762a4c9a..5c6b53ec666d 100644 --- a/nemo/collections/vlm/mllama/data/__init__.py +++ b/nemo/collections/vlm/mllama/data/__init__.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo.collections.vlm.mllama.data.lazy import MLlamaLazyDataModule from nemo.collections.vlm.mllama.data.mock import MockDataModule as MLlamaMockDataModule +from nemo.collections.vlm.mllama.data.preloaded import MLlamaPreloadedDataModule __all__ = [ "MLlamaMockDataModule", - "MLlamaLazyDataModule", + "MLlamaPreloadedDataModule", ] diff --git a/nemo/collections/vlm/mllama/data/lazy.py b/nemo/collections/vlm/mllama/data/preloaded.py similarity index 96% rename from nemo/collections/vlm/mllama/data/lazy.py rename to nemo/collections/vlm/mllama/data/preloaded.py index eac29d081a34..2b727d595fa1 100644 --- a/nemo/collections/vlm/mllama/data/lazy.py +++ b/nemo/collections/vlm/mllama/data/preloaded.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# pylint: disable=C0115,C0116 import json import logging @@ -28,7 +29,7 @@ from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids from nemo.collections.vlm.mllama.model.utils import create_vision_mask_tensor from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig -from nemo.collections.vlm.neva.data.lazy import IGNORE_INDEX, LazySupervisedDataset +from nemo.collections.vlm.neva.data.preloaded import IGNORE_INDEX, LazySupervisedDataset from nemo.lightning.pytorch.plugins import MegatronDataSampler @@ -170,7 +171,7 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: return batch -class MLlamaLazyDataModule(pl.LightningDataModule): +class MLlamaPreloadedDataModule(pl.LightningDataModule): def __init__( self, paths: str | List[str], @@ -223,7 +224,7 @@ def __init__( if tokenizer is None or image_processor is None: logging.warning( - f"Processor and tokenizer are not provided! Fall back to `meta-llama/Llama-3.2-11B-Vision-Instruct`." + "Processor and tokenizer are not provided! Fall back to `meta-llama/Llama-3.2-11B-Vision-Instruct`." ) from transformers import AutoProcessor @@ -246,7 +247,8 @@ def setup(self, stage: str = "") -> None: else: # TODO: # rng = torch.Generator().manual_seed(self.seed) - # train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size], generator=rng) + # train_dataset, val_dataset, test_dataset = + # random_split(dataset, [train_size, val_size, test_size], generator=rng) self._train_ds = MLlamaDataset( self.paths[0], self.data_config, self.tokenizer, self.image_processor, self.seq_length ) diff --git a/nemo/collections/vlm/neva/data/__init__.py b/nemo/collections/vlm/neva/data/__init__.py index f210d01a06fd..94fe741b8831 100644 --- a/nemo/collections/vlm/neva/data/__init__.py +++ b/nemo/collections/vlm/neva/data/__init__.py @@ -13,12 +13,12 @@ # limitations under the License. 
from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig, VideoDataConfig -from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule from nemo.collections.vlm.neva.data.mock import MockDataModule as NevaMockDataModule from nemo.collections.vlm.neva.data.multimodal_tokens import ImageToken, MultiModalToken, VideoToken +from nemo.collections.vlm.neva.data.preloaded import NevaPreloadedDataModule __all__ = [ - "NevaLazyDataModule", + "NevaPreloadedDataModule", "NevaMockDataModule", "DataConfig", "ImageDataConfig", diff --git a/nemo/collections/vlm/neva/data/api.py b/nemo/collections/vlm/neva/data/api.py index 15ba45c82fd9..a50c0bdf513d 100644 --- a/nemo/collections/vlm/neva/data/api.py +++ b/nemo/collections/vlm/neva/data/api.py @@ -14,16 +14,18 @@ import lightning.pytorch as pl -from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule from nemo.collections.vlm.neva.data.mock import MockDataModule +from nemo.collections.vlm.neva.data.preloaded import NevaPreloadedDataModule def mock() -> pl.LightningDataModule: + """Mock Neva Data Module""" return MockDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) -def lazy() -> pl.LightningDataModule: - return NevaLazyDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) +def preloaded() -> pl.LightningDataModule: + """Preloaded Llava-like Data Module""" + return NevaPreloadedDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) -__all__ = ["mock", "lazy"] +__all__ = ["mock", "preloaded"] diff --git a/nemo/collections/vlm/neva/data/lazy.py b/nemo/collections/vlm/neva/data/preloaded.py similarity index 99% rename from nemo/collections/vlm/neva/data/lazy.py rename to nemo/collections/vlm/neva/data/preloaded.py index 0076d3439270..40320a3c3799 100644 --- a/nemo/collections/vlm/neva/data/lazy.py +++ b/nemo/collections/vlm/neva/data/preloaded.py @@ -489,7 +489,7 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: return batch -class NevaLazyDataModule(pl.LightningDataModule): +class NevaPreloadedDataModule(pl.LightningDataModule): def __init__( self, paths: str | List[str], diff --git a/nemo/collections/vlm/vision/intern_vit.py b/nemo/collections/vlm/vision/intern_vit.py index 6f718f7258d0..086467a39cb2 100644 --- a/nemo/collections/vlm/vision/intern_vit.py +++ b/nemo/collections/vlm/vision/intern_vit.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from dataclasses import dataclass +from dataclasses import dataclass, field from functools import partial from pathlib import Path from typing import Callable @@ -337,7 +337,7 @@ class InternViTConfig(CLIPViTConfig): normalization: str = 'RMSNorm' layernorm_epsilon: float = 1e-6 apply_rope_fusion: bool = False - transformer_layer_spec: ModuleSpec = get_internvit_layer_spec(use_te=True) + transformer_layer_spec: ModuleSpec = field(default_factory=lambda: get_internvit_layer_spec(use_te=True)) @dataclass @@ -363,10 +363,12 @@ class InternViT_300M_448px_Config(InternViTConfig): attention_dropout: float = 0.0 ffn_hidden_size: int = 4096 normalization: str = 'LayerNorm' - transformer_layer_spec: ModuleSpec = get_internvit_layer_spec( - use_te=True, - add_qk_norm=False, - norm_type='LayerNorm', + transformer_layer_spec: ModuleSpec = field( + default_factory=lambda: get_internvit_layer_spec( + use_te=True, + add_qk_norm=False, + norm_type='LayerNorm', + ) ) diff --git a/scripts/vlm/mllama_finetune.py b/scripts/vlm/mllama_finetune.py index 9e37d9c3fc0c..6191145c2afd 100644 --- a/scripts/vlm/mllama_finetune.py +++ b/scripts/vlm/mllama_finetune.py @@ -22,7 +22,7 @@ from nemo import lightning as nl from nemo.collections import llm, vlm from nemo.collections.vlm import ImageDataConfig -from nemo.collections.vlm.mllama.data.lazy import MLlamaLazyDataModule +from nemo.collections.vlm.mllama.data.preloaded import MLlamaPreloadedDataModule from nemo.lightning.pytorch.optim import CosineAnnealingScheduler from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule from nemo.utils.exp_manager import TimingCallback @@ -71,7 +71,7 @@ def main(args): ) # Data module setup - data = MLlamaLazyDataModule( + data = MLlamaPreloadedDataModule( paths=args.data_path, data_config=data_config, seq_length=seq_length, diff --git a/scripts/vlm/neva_finetune.py b/scripts/vlm/neva_finetune.py index 3d733711a514..e62b2208b6c4 100644 --- a/scripts/vlm/neva_finetune.py +++ b/scripts/vlm/neva_finetune.py @@ -94,7 +94,7 @@ def main(args): ) # Data module setup - data = vlm.NevaLazyDataModule( + data = vlm.NevaPreloadedDataModule( paths=args.data_path, data_config=data_config, seq_length=decoder_seq_length, From 7bb74fa059b3d9b3f843f0ebac60ca7a71a036f8 Mon Sep 17 00:00:00 2001 From: Taejin Park Date: Tue, 11 Feb 2025 02:04:18 -0800 Subject: [PATCH 03/14] fix the issue during batched inference of Sortformer diarizer (#12047) * Added changes that fix the issue during batched inference Signed-off-by: Taejin Park * Adding changes to prevent ghost output Signed-off-by: Taejin Park --------- Signed-off-by: Taejin Park --- .../asr/data/audio_to_diar_label_lhotse.py | 2 +- .../asr/models/sortformer_diar_models.py | 15 ++++++++----- .../asr/modules/sortformer_modules.py | 22 +++++++------------ 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_diar_label_lhotse.py b/nemo/collections/asr/data/audio_to_diar_label_lhotse.py index 927e3887de78..6b9a687013a2 100644 --- a/nemo/collections/asr/data/audio_to_diar_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_diar_label_lhotse.py @@ -76,7 +76,7 @@ def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]: target_fr_len = get_hidden_length_from_sample_length( audio_len, self.num_sample_per_mel_frame, self.num_mel_frame_per_target_frame ) - target_lens_list.append([target_fr_len]) + target_lens_list.append(target_fr_len) target_lens = torch.tensor(target_lens_list) return audio, audio_lens, targets, target_lens diff --git 
a/nemo/collections/asr/models/sortformer_diar_models.py b/nemo/collections/asr/models/sortformer_diar_models.py index e2ac0b09c81b..bf773f1e0006 100644 --- a/nemo/collections/asr/models/sortformer_diar_models.py +++ b/nemo/collections/asr/models/sortformer_diar_models.py @@ -256,21 +256,24 @@ def frontend_encoder(self, processed_signal, processed_signal_length): emb_seq = self.sortformer_modules.encoder_proj(emb_seq) return emb_seq, emb_seq_length - def forward_infer(self, emb_seq): + def forward_infer(self, emb_seq, emb_seq_length): """ The main forward pass for diarization for offline diarization inference. Args: emb_seq (torch.Tensor): tensor containing FastConformer encoder states (embedding vectors). Dimension: (batch_size, diar_frame_count, emb_dim) + emb_seq_length (torch.Tensor): tensor containing lengths of FastConformer encoder states. + Dimension: (batch_size,) Returns: preds (torch.Tensor): Sorted tensor containing Sigmoid values for predicted speaker labels. Dimension: (batch_size, diar_frame_count, num_speakers) """ - encoder_mask = self.sortformer_modules.length_to_mask(emb_seq) + encoder_mask = self.sortformer_modules.length_to_mask(emb_seq_length, emb_seq.shape[1]) trans_emb_seq = self.transformer_encoder(encoder_states=emb_seq, encoder_mask=encoder_mask) - preds = self.sortformer_modules.forward_speaker_sigmoids(trans_emb_seq) + _preds = self.sortformer_modules.forward_speaker_sigmoids(trans_emb_seq) + preds = _preds * encoder_mask.unsqueeze(-1) return preds def _diarize_forward(self, batch: Any): @@ -407,6 +410,8 @@ def process_signal(self, audio_signal, audio_signal_length): processed_signal, processed_signal_length = self.preprocessor( input_signal=audio_signal, length=audio_signal_length ) + if not self.training: + torch.cuda.empty_cache() return processed_signal, processed_signal_length def forward( @@ -434,10 +439,10 @@ def forward( if self._cfg.get("streaming_mode", False): raise NotImplementedError("Streaming mode is not implemented yet.") else: - emb_seq, _ = self.frontend_encoder( + emb_seq, emb_seq_length = self.frontend_encoder( processed_signal=processed_signal, processed_signal_length=processed_signal_length ) - preds = self.forward_infer(emb_seq) + preds = self.forward_infer(emb_seq, emb_seq_length) return preds def _get_aux_train_evaluations(self, preds, targets, target_lens) -> dict: diff --git a/nemo/collections/asr/modules/sortformer_modules.py b/nemo/collections/asr/modules/sortformer_modules.py index d99bf3b93e38..c158b22fe473 100644 --- a/nemo/collections/asr/modules/sortformer_modules.py +++ b/nemo/collections/asr/modules/sortformer_modules.py @@ -67,28 +67,22 @@ def __init__( self.dropout = nn.Dropout(dropout_rate) self.encoder_proj = nn.Linear(self.fc_d_model, self.tf_d_model) - def length_to_mask(self, context_embs): + def length_to_mask(self, lengths, max_length): """ - Convert length values to encoder mask input tensor. 
+ Convert length values to encoder mask input tensor Args: - lengths (torch.Tensor): tensor containing lengths of sequences - max_len (int): maximum sequence length + lengths (torch.Tensor): tensor containing lengths (frame counts) of sequences + max_length (int): maximum length (frame count) of the sequences in the batch Returns: mask (torch.Tensor): tensor of shape (batch_size, max_len) containing 0's in the padded region and 1's elsewhere """ - lengths = torch.tensor([context_embs.shape[1]] * context_embs.shape[0]) - batch_size = context_embs.shape[0] - max_len = context_embs.shape[1] - # create a tensor with the shape (batch_size, 1) filled with ones - row_vector = torch.arange(max_len).unsqueeze(0).expand(batch_size, -1).to(lengths.device) - # create a tensor with the shape (batch_size, max_len) filled with lengths - length_matrix = lengths.unsqueeze(1).expand(-1, max_len).to(lengths.device) - # create a mask by comparing the row vector and length matrix - mask = row_vector < length_matrix - return mask.float().to(context_embs.device) + batch_size = lengths.shape[0] + arange = torch.arange(max_length, device=lengths.device) + mask = arange.expand(batch_size, max_length) < lengths.unsqueeze(1) + return mask def forward_speaker_sigmoids(self, hidden_out): """ From 1e5214246fb50077991ba0838c51cfc92574620a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Kami=C5=84ski?= <67481570+Laplasjan107@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:06:27 +0100 Subject: [PATCH 04/14] Update vLLM to 0.7.2 (#12078) * initial commit Signed-off-by: Piotr Kaminski * vllm bump cleanup Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Flake8 Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * flake should not fail with tensorstore Signed-off-by: Piotr Kaminski * pylint also should not fail Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * local tokenizer load Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * add missing requirements Signed-off-by: Piotr Kaminski * absolute path for sentencepiece tokenizer Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * fix absolute path, add new vllm params Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * bump vllm, fix tokenizer Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * code review + docstrings Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * flake8 Signed-off-by: Piotr Kaminski * fix formatting Signed-off-by: Piotr Kaminski --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Co-authored-by: Laplasjan107 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/export/vllm/model_config.py | 64 +++++++++++++++++++++++--- nemo/export/vllm/model_loader.py | 21 ++++----- nemo/export/vllm/tokenizer_group.py | 19 +++++++- nemo/export/vllm_exporter.py | 71 +++++++++++++++++------------ requirements/requirements_vllm.txt | 7 ++- 5 files changed, 134 insertions(+), 48 deletions(-) diff --git a/nemo/export/vllm/model_config.py b/nemo/export/vllm/model_config.py index 39f6397663b3..989f6c5300ee 100644 --- a/nemo/export/vllm/model_config.py 
+++ b/nemo/export/vllm/model_config.py @@ -17,8 +17,10 @@ import torch import yaml +from hydra.utils import instantiate +from omegaconf import OmegaConf from transformers import AutoConfig -from vllm.config import ModelConfig, _get_and_verify_dtype, _get_and_verify_max_len +from vllm.config import ModelConfig, ModelImpl, PoolerConfig, _get_and_verify_dtype, _get_and_verify_max_len from vllm.transformers_utils.config import get_hf_text_config from nemo.export.tarutils import TarPath @@ -54,6 +56,11 @@ def __init__( max_logprobs: int = 5, disable_sliding_window: bool = False, use_async_output_proc: bool = False, + disable_mm_preprocessor_cache: bool = False, + logits_processor_pattern: Optional[str] = None, + override_pooler_config: Optional[PoolerConfig] = None, + enable_sleep_mode: bool = False, + model_impl: Union[str, ModelImpl] = ModelImpl.AUTO, ) -> None: # Don't call ModelConfig.__init__ because we don't want it to call # transformers.AutoConfig.from_pretrained(...) @@ -75,6 +82,7 @@ def __init__( self.rope_scaling = rope_scaling self.rope_theta = rope_theta self.tokenizer_revision = tokenizer_revision + self.model_impl = model_impl self.quantization = quantization self.quantization_param_path = quantization_param_path self.enforce_eager = enforce_eager @@ -85,21 +93,39 @@ def __init__( self.multimodal_config = None self.mm_processor_kwargs = {} self.use_async_output_proc = use_async_output_proc + self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache + self.logits_processor_pattern = logits_processor_pattern + self.generation_config = None + self.task = "generate" # Only the generate task is supported + self.is_hybrid = False # No hybrid models are supported + + self.encoder_config = self._get_encoder_config() + self.pooler_config = self._init_pooler_config(override_pooler_config) + self.enable_sleep_mode = enable_sleep_mode + + from vllm.platforms import current_platform # vLLM uses local import for current_platform + + if self.enable_sleep_mode and not current_platform.is_cuda(): + raise ValueError("Sleep mode is only supported on CUDA devices.") self.model_converter = get_model_converter(model_type) if self.model_converter is None: raise RuntimeError(f'Unknown model type "{model_type}"') if is_nemo2_checkpoint(nemo_checkpoint): - from nemo.lightning.io import load_context - nemo_checkpoint: Path = Path(nemo_checkpoint) + tokenizer_config = OmegaConf.load(nemo_checkpoint / "context/model.yaml").tokenizer + if ('additional_special_tokens' in tokenizer_config) and len( + tokenizer_config['additional_special_tokens'] + ) == 0: + del tokenizer_config['additional_special_tokens'] + + tokenizer_config = self._change_paths_to_absolute_paths(tokenizer_config, nemo_checkpoint) + tokenizer = instantiate(tokenizer_config) with (nemo_checkpoint / "context/model.yaml").open('r') as config_file: self.nemo_model_config: dict = yaml.load(config_file, Loader=yaml.SafeLoader) - hf_args = self._load_hf_arguments(self.nemo_model_config['config']) - tokenizer = load_context((nemo_checkpoint / "context"), subpath="model.tokenizer") if hasattr(tokenizer, 'bos_id'): tokenizer.tokenizer.bos_token_id = tokenizer.bos_id @@ -134,10 +160,36 @@ def __init__( self.has_inner_state = self._init_has_inner_state() self._verify_tokenizer_mode() - self._verify_embedding_mode() self._verify_quantization() self._verify_cuda_graph() + @staticmethod + def _change_paths_to_absolute_paths(tokenizer_config: Dict[Any, Any], nemo_checkpoint: Path) -> Dict[Any, Any]: + """ + Creates absolute path to the local 
tokenizers. Used for NeMo 2.0. + + Args: + tokenizer_config (dict): Parameters for instantiating the tokenizer. + nemo_checkpoint (path): Path to the NeMo2 checkpoint. + Returns: + dict: Updated tokenizer config. + """ + context_path = nemo_checkpoint / 'context' + + # 'pretrained_model_name' -- huggingface tokenizer case + # 'model_path' -- sentencepiece tokenizer + path_keys = ['pretrained_model_name', 'model_path'] + + for path_key in path_keys: + if path := tokenizer_config.get(path_key, None): + tokenizer_path = context_path / path + if not tokenizer_path.exists(): + continue + + tokenizer_config[path_key] = str(tokenizer_path.resolve()) + + return tokenizer_config + def _load_hf_arguments(self, nemo_config: Dict[str, Any]) -> Dict[str, Any]: """ Maps argument names used in NeMo to their corresponding names in HF. diff --git a/nemo/export/vllm/model_loader.py b/nemo/export/vllm/model_loader.py index 8c867f1bb994..45c86b8e0389 100644 --- a/nemo/export/vllm/model_loader.py +++ b/nemo/export/vllm/model_loader.py @@ -17,14 +17,16 @@ import logging import os.path from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict import numpy import safetensors.torch -import tensorstore # needed to register 'bfloat16' dtype with numpy for zarr compatibility + +# needed to register 'bfloat16' dtype with numpy for zarr compatibility +import tensorstore # noqa: F401 pylint: disable=unused-import import torch import zarr -from vllm.config import CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig +from vllm.config import ModelConfig from vllm.model_executor.model_loader.loader import BaseModelLoader, _initialize_model from vllm.model_executor.model_loader.utils import set_default_torch_dtype @@ -81,29 +83,26 @@ def _load_nemo_checkpoint_state(nemo_file: str): return sharded_state_dict - def download_model(self, model_config: ModelConfig) -> None: + def download_model(self, model_config: ModelConfig) -> None: # pylint: disable=missing-function-docstring raise NotImplementedError def load_model( self, *, - model_config: NemoModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig, + vllm_config: NemoModelConfig, ) -> torch.nn.Module: """ Overrides the load_model function from BaseModelLoader to convert Nemo weights at load time. 
""" + model_config = vllm_config.model_config + device_config = vllm_config.device_config assert isinstance(model_config, NemoModelConfig) state_dict = NemoModelLoader._load_nemo_checkpoint_state(model_config.nemo_checkpoint) with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): - model = _initialize_model(model_config, self.load_config, lora_config, cache_config) + model = _initialize_model(vllm_config) config = model_config.nemo_model_config if 'config' in config: diff --git a/nemo/export/vllm/tokenizer_group.py b/nemo/export/vllm/tokenizer_group.py index 592b784be04b..34d35af352c2 100644 --- a/nemo/export/vllm/tokenizer_group.py +++ b/nemo/export/vllm/tokenizer_group.py @@ -32,29 +32,44 @@ def __init__(self, tokenizer: SentencePieceTokenizer, add_bos_token: bool = Fals @classmethod def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig] = None, **init_kwargs): + """Create a tokenizer group from a config.""" raise NotImplementedError def ping(self) -> bool: + """Check if the tokenizer group is alive.""" return True def get_max_input_len(self, lora_request: Optional[LoRARequest] = None) -> Optional[int]: + """Get the maximum input length for the LoRA request.""" return None def encode( - self, prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None + self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None, ) -> List[int]: + """Tokenizes the prompt.""" ids = self.tokenizer.encode(prompt) if self.add_bos_token: ids = [self.tokenizer.bos_token_id] + ids return ids async def encode_async( - self, prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None + self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None, ) -> List[int]: + """Encode a prompt using the tokenizer group.""" return self.tokenizer.encode(prompt) # TODO: not sure how this is supposed to work def get_lora_tokenizer(self, lora_request: Optional[LoRARequest] = None) -> SentencePieceTokenizer: + """Get a tokenizer for a LoRA request.""" return self.tokenizer async def get_lora_tokenizer_async(self, lora_request: Optional[LoRARequest] = None) -> SentencePieceTokenizer: + """Get a tokenizer for a LoRA request.""" return self.tokenizer diff --git a/nemo/export/vllm_exporter.py b/nemo/export/vllm_exporter.py index 0177adbd3587..1b2f69cc5d95 100644 --- a/nemo/export/vllm_exporter.py +++ b/nemo/export/vllm_exporter.py @@ -20,7 +20,16 @@ import numpy import wrapt from vllm import RequestOutput, SamplingParams -from vllm.config import CacheConfig, DeviceConfig, LoadConfig, LoadFormat, LoRAConfig, ParallelConfig, SchedulerConfig +from vllm.config import ( + CacheConfig, + DeviceConfig, + LoadConfig, + LoadFormat, + LoRAConfig, + ParallelConfig, + SchedulerConfig, + VllmConfig, +) from vllm.executor.ray_utils import initialize_ray_cluster from vllm.lora.request import LoRARequest @@ -36,12 +45,15 @@ @wrapt.decorator def noop_decorator(func): + """Used as batch if pytriton is not supported""" + def wrapper(*args, **kwargs): return func(*args, **kwargs) return wrapper +batch = noop_decorator use_pytriton = True try: from pytriton.decorators import batch @@ -239,42 +251,39 @@ def export( ) # Initialize the cluster and specify the executor class. 
- if device_config.device_type == "neuron": - from vllm.executor.neuron_executor import NeuronExecutor - - executor_class = NeuronExecutor - elif device_config.device_type == "cpu": - from vllm.executor.cpu_executor import CPUExecutor - - executor_class = CPUExecutor - elif parallel_config.distributed_executor_backend == "ray": + if parallel_config.distributed_executor_backend == "ray": initialize_ray_cluster(parallel_config) - from vllm.executor.ray_gpu_executor import RayGPUExecutor + from vllm.executor.ray_distributed_executor import RayDistributedExecutor + + executor_class = RayDistributedExecutor - executor_class = RayGPUExecutor elif parallel_config.distributed_executor_backend == "mp": - from vllm.executor.multiproc_gpu_executor import MultiprocessingGPUExecutor + from vllm.executor.mp_distributed_executor import MultiprocessingDistributedExecutor + + executor_class = MultiprocessingDistributedExecutor - executor_class = MultiprocessingGPUExecutor else: - assert parallel_config.world_size == 1, "Ray is required if parallel_config.world_size > 1." - from vllm.executor.gpu_executor import GPUExecutor + assert parallel_config.distributed_executor_backend == "uni" or parallel_config.world_size == 1 - executor_class = GPUExecutor + from vllm.executor.uniproc_executor import UniProcExecutor + + executor_class = UniProcExecutor # Initialize the engine self.engine = NemoLLMEngine( - model_config=model_config, - cache_config=cache_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - load_config=load_config, - lora_config=lora_config, - speculative_config=None, - decoding_config=None, - observability_config=None, - prompt_adapter_config=None, + vllm_config=VllmConfig( + model_config=model_config, + cache_config=cache_config, + parallel_config=parallel_config, + scheduler_config=scheduler_config, + device_config=device_config, + load_config=load_config, + lora_config=lora_config, + speculative_config=None, + decoding_config=None, + observability_config=None, + prompt_adapter_config=None, + ), executor_class=executor_class, log_stats=log_stats, ) @@ -414,6 +423,9 @@ def get_triton_output(self): @batch def triton_infer_fn(self, **inputs: numpy.ndarray): + """ + This function is used to perform inference on a batch of prompts. + """ request_ids = [] num_requests = len(inputs["prompts"]) for index in range(num_requests): @@ -428,6 +440,9 @@ def triton_infer_fn(self, **inputs: numpy.ndarray): @batch def triton_infer_fn_streaming(self, **inputs: numpy.ndarray): + """ + This function is used to perform streaming inference. + """ request_ids = [] num_requests = len(inputs["prompts"]) for index in range(num_requests): diff --git a/requirements/requirements_vllm.txt b/requirements/requirements_vllm.txt index a72926bee267..8d376785fd9a 100644 --- a/requirements/requirements_vllm.txt +++ b/requirements/requirements_vllm.txt @@ -1,5 +1,8 @@ # Minimal set of NeMo requirements to run vLLM export & deployment in /opt/venv in a NeMo container braceexpand +# datasets and pandas import are triggered by hydra.utils.instantiate in nemo/export/vllm/model_config.py. +# TODO: remove those dependencies by switching to local nemo.export tokenizers. 
+datasets faiss-cpu fiddle h5py @@ -11,10 +14,12 @@ matplotlib>=3.3.2 omegaconf<=2.3 onnx>=1.7.0 OpenCC +pandas pangu rouge_score sacrebleu scikit-learn -vllm==0.6.3 +vllm==0.7.2 webdataset>=0.2.86 wget +zarr>=2.18.2,<3.0.0 From 6b59ab8a7eba7164b753f384f4496c367b387dba Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Tue, 11 Feb 2025 16:23:04 -0500 Subject: [PATCH 05/14] Prevent downloading dataset every time in ci test (#12095) * prevent downloading dataset everytime in ci test Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * newline Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- .github/workflows/cicd-main.yml | 5 ++--- Dockerfile.ci | 1 + tests/collections/llm/gpt_finetuning.py | 26 ++++++++++++++++++------- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 56f72233ac55..b1282ee63d30 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4859,7 +4859,7 @@ jobs: --tp_size 1 \ --pp_size 1 \ --mbs 1 \ - --chat_dataset_path /home/TestData/nemo2_data/chat + --dataset chat python tests/collections/llm/gpt_finetuning.py \ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \ @@ -4870,7 +4870,7 @@ jobs: --tp_size 1 \ --pp_size 1 \ --mbs 1 \ - --chat_dataset_path /home/TestData/nemo2_data/chat + --dataset chat L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2_exclude: needs: [pre-flight, cicd-test-container-build] @@ -4947,7 +4947,6 @@ jobs: --model mistral \ --dist-opt - L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1_exclude: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml diff --git a/Dockerfile.ci b/Dockerfile.ci index 2bff4e0c0821..f035a1207ae5 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -94,3 +94,4 @@ RUN \ EOF ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" +ENV NEMO_HOME="/home/TestData/nemo_home" diff --git a/tests/collections/llm/gpt_finetuning.py b/tests/collections/llm/gpt_finetuning.py index 668109d46b70..e59741ca6f53 100644 --- a/tests/collections/llm/gpt_finetuning.py +++ b/tests/collections/llm/gpt_finetuning.py @@ -20,12 +20,14 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.llm.gpt.data.core import get_dataset_root from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer from tests.collections.llm.common import Llama3ConfigCI - ## NOTE: This script is present for github-actions testing only. +## CI tests that call this script should set max_steps=3 for initial training +## and max_steps=6 for resume testing def get_args(): @@ -39,9 +41,7 @@ def get_args(): parser.add_argument('--tp_size', type=int, default=1, help="tensor parallel size") parser.add_argument('--pp_size', type=int, default=1, help="pipeline parallel size") parser.add_argument('--packed', action='store_true', help="use packed sequence dataset") - parser.add_argument( - '--chat_dataset_path', type=str, default="", help="path to chat dataset. Uses dolly if this is empty." 
- ) + parser.add_argument('--dataset', type=str, default="dolly", choices=['dolly', 'chat'], help="Dataset to use") return parser.parse_args() @@ -54,6 +54,7 @@ def get_args(): pipeline_model_parallel_size=args.pp_size, # Pipeline dtype is coupled with the bf16 mixed precision plugin pipeline_dtype=torch.bfloat16, + ckpt_load_strictness="log_all", # Only for CI tests to use older versions of checkpoint ) trainer = nl.Trainer( @@ -101,10 +102,11 @@ def get_args(): packed_sequence_specs = ( PackedSequenceSpecs(packed_sequence_size=2048, tokenizer_model_name="dummy_tokenizer") if args.packed else None ) - if args.chat_dataset_path: + + if args.dataset == 'chat': assert not args.packed data = llm.ChatDataModule( - dataset_root=args.chat_dataset_path, + dataset_root=get_dataset_root("chat"), seq_length=2048, micro_batch_size=args.mbs, global_batch_size=8, @@ -120,6 +122,9 @@ def get_args(): packed_sequence_specs=packed_sequence_specs, ) + # ensure using cached dir + assert str(data.dataset_root).startswith(os.environ.get("NEMO_HOME")) + tokenizer = get_nmt_tokenizer(tokenizer_model=os.path.join(args.restore_path, "dummy_tokenizer.model")) llama3_8b = llm.LlamaModel(Llama3ConfigCI(), tokenizer=tokenizer) @@ -138,6 +143,13 @@ def get_args(): resume=resume, ) + if args.max_steps == 3: + print("Initial Training Succeeded") if args.max_steps == 6: # assert a resume has happened for CI tests - assert 'reduced_train_loss=' in str(trainer.ckpt_path), "Resume did not happen in this resume test." + msg = ( + "Resume did not happen in this resume test.\n" + "Hint: Scroll up and see whether 'Initial Training Succeeded' is printed out.\n" + "If not, then the issue is not with ckpt resume." + ) + assert 'reduced_train_loss=' in str(trainer.ckpt_path), msg From ee543c24d84e7e84f6b2a868d7f15c71b4c57a03 Mon Sep 17 00:00:00 2001 From: Ssofja <78349198+Ssofja@users.noreply.github.com> Date: Wed, 12 Feb 2025 04:29:31 +0400 Subject: [PATCH 06/14] changed asr models outputs to be consistent (#11818) * changed asr models outputs to be consistent Signed-off-by: Ssofja * Apply isort and black reformatting Signed-off-by: Ssofja Signed-off-by: Ssofja * Apply isort and black reformatting Signed-off-by: Ssofja * adding needed changes Signed-off-by: Ssofja * Apply isort and black reformatting Signed-off-by: Ssofja * Small fixes * Returned previous names of return_hypotheses Signed-off-by: Ssofja * Apply isort and black reformatting Signed-off-by: Ssofja --------- Signed-off-by: Ssofja Signed-off-by: Ssofja Co-authored-by: Ssofja --- nemo/collections/asr/metrics/bleu.py | 8 +- nemo/collections/asr/metrics/wer.py | 10 +- .../asr/models/aed_multitask_models.py | 21 +- nemo/collections/asr/models/ctc_models.py | 32 +-- .../asr/models/hybrid_rnnt_ctc_models.py | 37 +-- nemo/collections/asr/models/rnnt_models.py | 32 +-- .../context_biasing/context_biasing_utils.py | 18 +- nemo/collections/asr/parts/mixins/mixins.py | 7 +- .../asr/parts/submodules/ctc_beam_decoding.py | 24 +- .../asr/parts/submodules/ctc_decoding.py | 38 ++- .../parts/submodules/ctc_greedy_decoding.py | 16 +- .../cuda_graph_rnnt_greedy_decoding.py | 4 +- .../submodules/multitask_beam_decoding.py | 4 +- .../parts/submodules/multitask_decoding.py | 31 +-- .../submodules/multitask_greedy_decoding.py | 4 +- .../parts/submodules/rnnt_beam_decoding.py | 34 +-- .../asr/parts/submodules/rnnt_decoding.py | 52 ++--- .../parts/submodules/rnnt_greedy_decoding.py | 37 ++- .../submodules/rnnt_loop_labels_computer.py | 8 +- .../asr/parts/submodules/tdt_beam_decoding.py | 14 +- 
.../submodules/tdt_loop_labels_computer.py | 8 +- .../collections/asr/parts/utils/rnnt_utils.py | 113 ++++----- .../asr/parts/utils/streaming_utils.py | 4 +- .../asr/parts/utils/transcribe_utils.py | 44 ++-- .../speech_cv/models/visual_ctc_models.py | 24 +- .../models/visual_hybrid_rnnt_ctc_models.py | 30 ++- .../speech_cv/models/visual_rnnt_models.py | 27 +-- nemo/collections/tts/g2p/models/ctc.py | 13 +- .../ngram_lm/eval_beamsearch_ngram_ctc.py | 6 +- .../eval_beamsearch_ngram_transducer.py | 3 +- .../ngram_lm/eval_wfst_decoding_ctc.py | 4 +- .../test_batched_hyps_and_alignments.py | 68 +++--- .../asr/decoding/test_ctc_decoding.py | 54 ++--- .../test_cuda_graph_rnnt_greedy_decoding.py | 20 +- .../asr/decoding/test_rnnt_alignments.py | 2 +- .../asr/decoding/test_rnnt_decoding.py | 61 ++--- .../asr/mixins/test_transcription.py | 20 +- .../asr/test_asr_classification_model.py | 10 +- .../asr/test_asr_context_biasing.py | 4 +- .../asr/test_asr_ctc_encoder_model_bpe.py | 3 +- .../asr/test_asr_ctcencdec_model.py | 3 +- .../asr/test_asr_hybrid_rnnt_ctc_model_bpe.py | 3 +- .../test_asr_hybrid_rnnt_ctc_model_char.py | 10 +- tests/collections/asr/test_asr_metrics.py | 60 ++--- .../asr/test_asr_multitask_model_bpe.py | 8 +- .../asr/test_asr_rnnt_encdec_model.py | 50 ++-- .../asr/test_asr_rnnt_encoder_model_bpe.py | 3 +- tutorials/asr/ASR_Context_Biasing.ipynb | 8 +- tutorials/asr/ASR_with_NeMo.ipynb | 220 +++++++++--------- tutorials/asr/ASR_with_Transducers.ipynb | 2 +- .../asr/Buffered_Transducer_Inference.ipynb | 8 +- 51 files changed, 647 insertions(+), 677 deletions(-) diff --git a/nemo/collections/asr/metrics/bleu.py b/nemo/collections/asr/metrics/bleu.py index 32bd25d952d4..f422f3665561 100644 --- a/nemo/collections/asr/metrics/bleu.py +++ b/nemo/collections/asr/metrics/bleu.py @@ -161,14 +161,16 @@ def update( target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() reference = self.decoding.decode_tokens_to_str(target) references.append(reference) - hypotheses, _ = self.decode(predictions, predictions_lengths, predictions_mask, input_ids, targets) + hypotheses = self.decode(predictions, predictions_lengths, predictions_mask, input_ids, targets) if self.log_prediction: - logging.info(f"\n") + logging.info("\n") logging.info(f"reference:{references[0]}") logging.info(f"predicted:{hypotheses[0]}") - super().update(hypotheses, [references]) # Note: [references] since BLEU allows multiple references. + super().update( + [h.text for h in hypotheses], [references] + ) # Note: [references] since BLEU allows multiple references. 
def compute(self, return_all_metrics=True, prefix="", suffix=""): """ diff --git a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index 7bda3a77b278..07ddb928e966 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -323,19 +323,19 @@ def update( target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() reference = self.decoding.decode_tokens_to_str(target) references.append(reference) - hypotheses, _ = self.decode(predictions, predictions_lengths, predictions_mask, input_ids, targets) + hypotheses = self.decode(predictions, predictions_lengths, predictions_mask, input_ids, targets) if self.log_prediction: - logging.info(f"\n") + logging.info("\n") logging.info(f"reference:{references[0]}") - logging.info(f"predicted:{hypotheses[0]}") + logging.info(f"predicted:{hypotheses[0].text}") for h, r in zip(hypotheses, references): if self.use_cer: - h_list = list(h) + h_list = list(h.text) r_list = list(r) else: - h_list = h.split() + h_list = h.text.split() r_list = r.split() words += len(r_list) # Compute Levenstein's distance diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index a609eeaccf9e..18570e306317 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -43,7 +43,6 @@ from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common import tokenizers from nemo.collections.common.data.lhotse.dataloader import get_lhotse_dataloader_from_config -from nemo.collections.common.data.prompt_fn import get_prompt_format_fn from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init from nemo.collections.common.parts.preprocessing.manifest import get_full_path @@ -60,7 +59,6 @@ SpectrogramType, ) from nemo.utils import logging, model_utils -from nemo.utils.decorators import deprecated __all__ = ['EncDecMultiTaskModel'] @@ -310,7 +308,7 @@ def change_vocabulary( ) if new_tokenizer_type.lower() not in ('bpe', 'wpe'): - raise ValueError(f'New tokenizer type must be either `bpe` or `wpe`') + raise ValueError('New tokenizer type must be either `bpe` or `wpe`') tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type}) @@ -821,7 +819,7 @@ def _transcribe_on_begin(self, audio, trcfg: MultiTaskTranscriptionConfig): if isinstance(audio, list): logging.debug(f"Found 'audio' to be a list of {len(audio)} items.") - logging.debug(f"Assuming each item in 'audio' is a path to audio file.") + logging.debug("Assuming each item in 'audio' is a path to audio file.") if isinstance(self.tokenizer, tokenizers.AggregateTokenizer): if hasattr(trcfg, '_internal') and hasattr(trcfg._internal, 'primary_language'): @@ -929,10 +927,6 @@ def _transcribe_forward( decoder_input_ids=decoder_input_ids, ) - @deprecated( - explanation='The return type of args will be updated in the upcoming release to ensure a consistent \ - output format across all decoder types, such that a Hypothesis object is always returned.' - ) def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionConfig) -> GenericTranscriptionType: """ Internal function to process the model's outputs to return the results to the user. 
This function is called by @@ -944,7 +938,7 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo Returns: The output can be a list of - objects, list of list of objects, tuple of objects, tuple of list of objects, or a dict of list of objects. + objects, list of list of objects. Its type is defined in `TranscriptionReturnType`. """ log_probs = outputs.pop('log_probs') @@ -955,7 +949,7 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo del log_probs, encoded_len - best_hypotheses, all_hypotheses = self.decoding.decode_predictions_tensor( + hypotheses = self.decoding.decode_predictions_tensor( encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, decoder_input_ids=decoder_input_ids, @@ -963,9 +957,8 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo ) del enc_states, enc_mask, decoder_input_ids - if all_hypotheses is None: - return best_hypotheses - return best_hypotheses, all_hypotheses + + return hypotheses def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': """ @@ -1092,7 +1085,7 @@ def predict_step( encoder_input_mask=enc_mask, decoder_input_ids=batch.prompt, return_hypotheses=False, - )[0] + ) if batch.cuts: return list(zip(batch.cuts, text)) else: diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index ae8c35220931..f65a28e85560 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -34,6 +34,7 @@ from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.asr.parts.utils.transcribe_utils import process_timestamp_outputs from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.parts.preprocessing.parsers import make_parser @@ -41,7 +42,6 @@ from nemo.core.classes.mixins import AccessMixin from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType from nemo.utils import logging -from nemo.utils.decorators import deprecated __all__ = ['EncDecCTCModel'] @@ -612,7 +612,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): else: log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len) - transcribed_texts, _ = self.wer.decoding.ctc_decoder_predictions_tensor( + transcribed_texts = self.wer.decoding.ctc_decoder_predictions_tensor( decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, @@ -703,15 +703,11 @@ def _transcribe_forward(self, batch: Any, trcfg: TranscribeConfig): del greedy_predictions return output - @deprecated( - explanation='The return type of args will be updated in the upcoming release to ensure a consistent output \ - format across all decoder types, such that a Hypothesis object is always returned.' 
- ) def _transcribe_output_processing(self, outputs, trcfg: TranscribeConfig) -> GenericTranscriptionType: logits = outputs.pop('logits') logits_len = outputs.pop('logits_len') - current_hypotheses, all_hyp = self.decoding.ctc_decoder_predictions_tensor( + hypotheses = self.decoding.ctc_decoder_predictions_tensor( logits, decoder_lengths=logits_len, return_hypotheses=trcfg.return_hypotheses, @@ -732,30 +728,24 @@ def _transcribe_output_processing(self, outputs, trcfg: TranscribeConfig) -> Gen # cudaMallocHost()-allocated tensor to be floating # around. Were that to be the case, then the pinned # memory cache would always miss. - current_hypotheses[idx].y_sequence = logits_cpu[idx, : logits_len[idx]].clone() - if current_hypotheses[idx].alignments is None: - current_hypotheses[idx].alignments = current_hypotheses[idx].y_sequence + hypotheses[idx].y_sequence = logits_cpu[idx, : logits_len[idx]].clone() + if hypotheses[idx].alignments is None: + hypotheses[idx].alignments = hypotheses[idx].y_sequence del logits_cpu # cleanup memory del logits, logits_len if trcfg.timestamps: - current_hypotheses = process_timestamp_outputs( - current_hypotheses, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] + hypotheses = process_timestamp_outputs( + hypotheses, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] ) - all_hyp = process_timestamp_outputs( - all_hyp, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] - ) - - hypotheses = [] - if all_hyp is None: - hypotheses += current_hypotheses - else: - hypotheses += all_hyp return hypotheses + def get_best_hyptheses(self, all_hypothesis: list[list[Hypothesis]]): + return [hyp[0] for hyp in all_hypothesis] + def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': """ Setup function for a temporary data loader which wraps the provided audio file. diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py index be795b6e4bc4..9b6ef4356559 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py @@ -13,15 +13,11 @@ # limitations under the License. 
import copy -import json -import os -import tempfile -from typing import Any, List, Optional, Tuple +from typing import Any, List, Optional, Union import torch from lightning.pytorch import Trainer from omegaconf import DictConfig, OmegaConf, open_dict -from tqdm.auto import tqdm from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs from nemo.collections.asr.losses.ctc import CTCLoss @@ -31,6 +27,7 @@ from nemo.collections.asr.parts.mixins.transcription import TranscriptionReturnType from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.asr.parts.utils.transcribe_utils import process_timestamp_outputs from nemo.core.classes.common import PretrainedModelInfo from nemo.core.classes.mixins import AccessMixin @@ -200,7 +197,7 @@ def _transcribe_forward(self, batch: Any, trcfg: TranscribeConfig): def _transcribe_output_processing( self, outputs, trcfg: TranscribeConfig - ) -> Tuple[List['Hypothesis'], List['Hypothesis']]: + ) -> Union[List['Hypothesis'], List[List['Hypothesis']]]: if self.cur_decoder == "rnnt": return super()._transcribe_output_processing(outputs, trcfg) @@ -208,7 +205,7 @@ def _transcribe_output_processing( logits = outputs.pop('logits') encoded_len = outputs.pop('encoded_len') - best_hyp, all_hyp = self.ctc_decoding.ctc_decoder_predictions_tensor( + hypotheses = self.ctc_decoding.ctc_decoder_predictions_tensor( logits, encoded_len, return_hypotheses=trcfg.return_hypotheses, @@ -218,9 +215,9 @@ def _transcribe_output_processing( if trcfg.return_hypotheses: # dump log probs per file for idx in range(logits.shape[0]): - best_hyp[idx].y_sequence = logits[idx][: encoded_len[idx]] - if best_hyp[idx].alignments is None: - best_hyp[idx].alignments = best_hyp[idx].y_sequence + hypotheses[idx].y_sequence = logits[idx][: encoded_len[idx]] + if hypotheses[idx].alignments is None: + hypotheses[idx].alignments = hypotheses[idx].y_sequence # DEPRECATED? 
# if logprobs: @@ -228,25 +225,13 @@ def _transcribe_output_processing( # logits_list.append(logit[:elen]) if trcfg.timestamps: - best_hyp = process_timestamp_outputs( - best_hyp, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] - ) - all_hyp = process_timestamp_outputs( - all_hyp, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] + hypotheses = process_timestamp_outputs( + hypotheses, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] ) del logits, encoded_len - hypotheses = [] - all_hypotheses = [] - - hypotheses += best_hyp - if all_hyp is not None: - all_hypotheses += all_hyp - else: - all_hypotheses += best_hyp - - return (hypotheses, all_hypotheses) + return hypotheses def change_vocabulary( self, @@ -515,7 +500,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) del signal - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + best_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False ) if isinstance(sample_id, torch.Tensor): diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index 78038d404107..b26337b26cba 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -15,7 +15,7 @@ import copy import os from math import ceil -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Union import numpy as np import torch @@ -40,6 +40,7 @@ from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecoding, RNNTDecodingConfig from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.asr.parts.utils.transcribe_utils import process_timestamp_outputs from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.parts.preprocessing.parsers import make_parser @@ -47,7 +48,6 @@ from nemo.core.classes.mixins import AccessMixin from nemo.core.neural_types import AcousticEncodedRepresentation, AudioSignal, LengthsType, NeuralType, SpectrogramType from nemo.utils import logging -from nemo.utils.decorators import deprecated class EncDecRNNTModel(ASRModel, ASRModuleMixin, ExportableEncDecModel, ASRTranscriptionMixin): @@ -814,7 +814,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) del signal - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + best_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False ) @@ -936,17 +936,13 @@ def _transcribe_forward(self, batch: Any, trcfg: TranscribeConfig): output = dict(encoded=encoded, encoded_len=encoded_len) return output - @deprecated( - explanation='The return type of args will be updated in the upcoming release to ensure a consistent \ - output format across all decoder types, such that a "Hypothesis" object is always returned.' 
- ) def _transcribe_output_processing( self, outputs, trcfg: TranscribeConfig - ) -> Tuple[List['Hypothesis'], List['Hypothesis']]: + ) -> Union[List['Hypothesis'], List[List['Hypothesis']]]: encoded = outputs.pop('encoded') encoded_len = outputs.pop('encoded_len') - best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor( + hyp = self.decoding.rnnt_decoder_predictions_tensor( encoded, encoded_len, return_hypotheses=trcfg.return_hypotheses, @@ -956,23 +952,11 @@ def _transcribe_output_processing( del encoded, encoded_len if trcfg.timestamps: - best_hyp = process_timestamp_outputs( - best_hyp, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] + hyp = process_timestamp_outputs( + hyp, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] ) - all_hyp = process_timestamp_outputs( - all_hyp, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride'] - ) - - hypotheses = [] - all_hypotheses = [] - - hypotheses += best_hyp - if all_hyp is not None: - all_hypotheses += all_hyp - else: - all_hypotheses += best_hyp - return (hypotheses, all_hypotheses) + return hyp def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': """ diff --git a/nemo/collections/asr/parts/context_biasing/context_biasing_utils.py b/nemo/collections/asr/parts/context_biasing/context_biasing_utils.py index 6b3626920a2f..f168d92bdbc7 100644 --- a/nemo/collections/asr/parts/context_biasing/context_biasing_utils.py +++ b/nemo/collections/asr/parts/context_biasing/context_biasing_utils.py @@ -75,7 +75,7 @@ def merge_alignment_with_ws_hyps( if idx + 1 < len(tokens) and not tokens[idx + 1].startswith(bow): tokens[idx + 1] = bow + tokens[idx + 1] continue - alignment_tokens.append([candidate.timestep[idx].item(), token]) + alignment_tokens.append([candidate.timestamp[idx].item(), token]) else: raise ValueError(f"decoder_type {decoder_type} is not supported") @@ -86,20 +86,26 @@ def merge_alignment_with_ws_hyps( # step 2: get word-level alignment [word, start_frame, end_frame] word_alignment = [] word = "" - l, r, = None, None + ( + L, + r, + ) = ( + None, + None, + ) for item in alignment_tokens: if not word: word = item[1][1:] - l = r = item[0] + L = r = item[0] else: if item[1].startswith(bow): - word_alignment.append((word, l, r)) + word_alignment.append((word, L, r)) word = item[1][1:] - l = r = item[0] + L = r = item[0] else: word += item[1] r = item[0] - word_alignment.append((word, l, r)) + word_alignment.append((word, L, r)) initial_text_transcript = " ".join([item[0] for item in word_alignment]) if print_stats: logging.info(f"Word alignment: {word_alignment}") diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index 2de8ec775104..577b6393248c 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -699,10 +699,10 @@ def conformer_stream_step( decoder_lengths=encoded_len[preds_idx : preds_idx + 1], return_hypotheses=False, ) - all_hyp_or_transcribed_texts.append(decoded_out[0][0]) + all_hyp_or_transcribed_texts.append(decoded_out[0]) best_hyp = None else: - best_hyp, all_hyp_or_transcribed_texts = self.decoding.rnnt_decoder_predictions_tensor( + best_hyp = self.decoding.rnnt_decoder_predictions_tensor( encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=True, @@ -710,8 +710,7 @@ def conformer_stream_step( ) greedy_predictions = [hyp.y_sequence for hyp in best_hyp] - if all_hyp_or_transcribed_texts is 
None: - all_hyp_or_transcribed_texts = best_hyp + all_hyp_or_transcribed_texts = best_hyp result = [ greedy_predictions, diff --git a/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py b/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py index 0beab5f54cb1..5328af1b7785 100644 --- a/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import math import os from dataclasses import dataclass, field @@ -20,7 +22,7 @@ import torch from nemo.collections.asr.parts.k2.classes import GraphIntersectDenseConfig -from nemo.collections.asr.parts.submodules.wfst_decoder import RivaDecoderConfig +from nemo.collections.asr.parts.submodules.wfst_decoder import RivaDecoderConfig, WfstNbestHypothesis from nemo.collections.asr.parts.utils import rnnt_utils from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.core.classes import Typing, typecheck @@ -72,7 +74,7 @@ def pack_wfst_hypotheses( y_sequence=[], score=cand.score, text=" ".join(cand.words), - timestep=list(cand.timesteps), + timestamp=list(cand.timesteps), alignments=list(cand.alignment), ) cand_hyp.y_sequence = y_sequence @@ -240,7 +242,7 @@ def __init__( self.compute_timestamps = compute_timestamps if self.compute_timestamps: - raise ValueError(f"Currently this flag is not supported for beam search algorithms.") + raise ValueError("Currently this flag is not supported for beam search algorithms.") self.vocab = None # This must be set by specific method by user before calling forward() ! @@ -387,7 +389,7 @@ def default_beam_search( hypotheses = [] for candidate_idx, candidate in enumerate(beams): hypothesis = rnnt_utils.Hypothesis( - score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None + score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None ) # For subword encoding, NeMo will double encode the subword (multiple tokens) into a @@ -444,8 +446,8 @@ def _pyctcdecode_beam_search( import pyctcdecode except (ImportError, ModuleNotFoundError): raise ImportError( - f"Could not load `pyctcdecode` library. Please install it from pip using :\n" - f"pip install --upgrade pyctcdecode" + "Could not load `pyctcdecode` library. Please install it from pip using :\n" + "pip install --upgrade pyctcdecode" ) if self.pyctcdecode_beam_scorer is None: @@ -477,7 +479,7 @@ def _pyctcdecode_beam_search( for candidate_idx, candidate in enumerate(beams): # Candidate = (text, last_lm_state, text_frames, logit_score, lm_score) hypothesis = rnnt_utils.Hypothesis( - score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None + score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None ) # TODO: Requires token ids to be returned rather than text. @@ -498,7 +500,7 @@ def _pyctcdecode_beam_search( hypothesis.score = candidate[4] # score # Inject word level timestamps - hypothesis.timestep = candidate[2] # text_frames + hypothesis.timestamp = candidate[2] # text_frames if self.preserve_alignments: hypothesis.alignments = torch.from_numpy(x[beams_idx][: out_len[beams_idx]]) @@ -535,7 +537,7 @@ def flashlight_beam_search( if self.kenlm_path is None or not os.path.exists(self.kenlm_path): raise FileNotFoundError( f"KenLM binary file not found at : {self.kenlm_path}. " - f"Please set a valid path in the decoding config." 
+ "Please set a valid path in the decoding config." ) # perform token offset for subword models @@ -575,7 +577,7 @@ def flashlight_beam_search( hypotheses = [] for candidate_idx, candidate in enumerate(beams): hypothesis = rnnt_utils.Hypothesis( - score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None + score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None ) # We preserve the token ids and the score for this hypothesis @@ -730,7 +732,7 @@ def forward( return (packed_result,) - def _prepare_decoding_lm_wfst(self) -> Union[str, 'kaldifst.StdFst', 'k2.Fsa']: + def _prepare_decoding_lm_wfst(self) -> Union[str, 'kaldifst.StdFst', 'k2.Fsa']: # noqa: F821 """TBD""" arpa_lm_path_exists = self.arpa_lm_path is not None and os.path.exists(self.arpa_lm_path) wfst_lm_path_exists = self.wfst_lm_path is not None and os.path.exists(self.wfst_lm_path) diff --git a/nemo/collections/asr/parts/submodules/ctc_decoding.py b/nemo/collections/asr/parts/submodules/ctc_decoding.py index 0603f5f77206..13591a8b113f 100644 --- a/nemo/collections/asr/parts/submodules/ctc_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_decoding.py @@ -16,7 +16,7 @@ import unicodedata from abc import abstractmethod from dataclasses import dataclass, field, is_dataclass -from typing import Callable, Dict, List, Optional, Set, Tuple, Union +from typing import Callable, Dict, List, Optional, Set, Union import numpy as np import torch @@ -360,7 +360,7 @@ def ctc_decoder_predictions_tensor( decoder_lengths: torch.Tensor = None, fold_consecutive: bool = True, return_hypotheses: bool = False, - ) -> Tuple[List[str], Optional[List[List[str]]], Optional[Union[Hypothesis, NBestHypotheses]]]: + ) -> Union[List[Hypothesis], List[List[Hypothesis]]]: """ Decodes a sequence of labels to words @@ -379,8 +379,7 @@ def ctc_decoder_predictions_tensor( transcribe()) Returns: - Either a list of str which represent the CTC decoded strings per sample, - or a list of Hypothesis objects containing additional information. + A list of Hypothesis objects containing additional information. """ if isinstance(decoder_outputs, torch.Tensor): @@ -410,9 +409,7 @@ def ctc_decoder_predictions_tensor( if isinstance(hypotheses_list[0], NBestHypotheses): if self.cfg.strategy == 'wfst': all_hypotheses = [hyp.n_best_hypotheses for hyp in hypotheses_list] - hypotheses = [hyp[0] for hyp in all_hypotheses] else: - hypotheses = [] all_hypotheses = [] for nbest_hyp in hypotheses_list: # type: NBestHypotheses @@ -427,16 +424,14 @@ def ctc_decoder_predictions_tensor( for hyp_idx in range(len(decoded_hyps)): decoded_hyps[hyp_idx] = self.compute_ctc_timestamps(decoded_hyps[hyp_idx], timestamp_type) - hypotheses.append(decoded_hyps[0]) # best hypothesis all_hypotheses.append(decoded_hyps) if return_hypotheses: - return hypotheses, all_hypotheses + return all_hypotheses # type: list[list[Hypothesis]] - best_hyp_text = [h.text for h in hypotheses] # alaptev: The line below might contain a bug. Do we really want all_hyp_text to be flat? 
- all_hyp_text = [h.text for hh in all_hypotheses for h in hh] - return best_hyp_text, all_hyp_text + all_hyp = [[Hypothesis(h.score, h.y_sequence, h.text) for h in hh] for hh in all_hypotheses] + return all_hyp else: if self.cfg.strategy == 'wfst': @@ -460,10 +455,9 @@ def ctc_decoder_predictions_tensor( hypotheses[hyp_idx] = self.compute_ctc_timestamps(hypotheses[hyp_idx], timestamp_type) if return_hypotheses: - return hypotheses, None + return hypotheses - best_hyp_text = [h.text for h in hypotheses] - return best_hyp_text, None + return [Hypothesis(h.score, h.y_sequence, h.text) for h in hypotheses] def decode_hypothesis( self, hypotheses_list: List[Hypothesis], fold_consecutive: bool @@ -686,25 +680,25 @@ def compute_ctc_timestamps(self, hypothesis: Hypothesis, timestamp_type: str = " ) # attach results - if len(hypothesis.timestep) > 0: - timestep_info = hypothesis.timestep + if len(hypothesis.timestamp) > 0: + timestep_info = hypothesis.timestamp else: timestep_info = [] # Setup defaults - hypothesis.timestep = {"timestep": timestep_info} + hypothesis.timestamp = {"timestep": timestep_info} # Add char / subword time stamps if char_offsets is not None and timestamp_type in ['char', 'all']: - hypothesis.timestep['char'] = char_offsets + hypothesis.timestamp['char'] = char_offsets # Add word time stamps if word_offsets is not None and timestamp_type in ['word', 'all']: - hypothesis.timestep['word'] = word_offsets + hypothesis.timestamp['word'] = word_offsets # Add segment time stamps if segment_offsets is not None and timestamp_type in ['segment', 'all']: - hypothesis.timestep['segment'] = segment_offsets + hypothesis.timestamp['segment'] = segment_offsets # Convert the token indices to text hypothesis.text = self.decode_tokens_to_str(hypothesis.text) @@ -731,8 +725,8 @@ def _compute_offsets( # If the exact timestep information is available, utilize the 1st non-ctc blank token timestep # as the start index. 
- if hypothesis.timestep is not None and len(hypothesis.timestep) > 0: - start_index = max(0, hypothesis.timestep[0] - 1) + if hypothesis.timestamp is not None and len(hypothesis.timestamp) > 0: + start_index = max(0, hypothesis.timestamp[0] - 1) # Construct the start and end indices brackets end_indices = np.asarray(token_lengths).cumsum() diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index 74204cf73d8e..bdcb71e9d721 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -224,7 +224,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: Optional[torch.Tenso # out_len: [seq_len] # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) prediction = x.cpu() if out_len is not None: @@ -241,7 +241,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: Optional[torch.Tenso hypothesis.alignments = (prediction.clone(), prediction_labels.clone()) if self.compute_timestamps: - hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist() + hypothesis.timestamp = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist() if self.preserve_frame_confidence: hypothesis.frame_confidence = self._get_confidence(prediction) @@ -254,7 +254,7 @@ def _greedy_decode_labels(self, x: torch.Tensor, out_len: Optional[torch.Tensor] # out_len: [seq_len] # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) prediction_labels = x.cpu() if out_len is not None: @@ -268,7 +268,7 @@ def _greedy_decode_labels(self, x: torch.Tensor, out_len: Optional[torch.Tensor] raise ValueError("Requested for alignments, but predictions provided were labels, not log probabilities.") if self.compute_timestamps: - hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist() + hypothesis.timestamp = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist() if self.preserve_frame_confidence: raise ValueError( @@ -447,7 +447,7 @@ def _greedy_decode_logprobs_batched(self, x: torch.Tensor, out_len: torch.Tensor # This mimics the for loop in GreedyCTCInfer::forward. for i in range(batch_size): - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) hypothesis.score = scores[i] prediction_labels_no_padding = predictions_labels[i, : out_len[i]].tolist() @@ -464,7 +464,7 @@ def _greedy_decode_logprobs_batched(self, x: torch.Tensor, out_len: torch.Tensor # TOOD: Could do this in a vectorized manner... Would # prefer to have nonzero_static, though, for sanity. 
# Or do a prefix sum on out_len - hypothesis.timestep = torch.nonzero(non_blank_ids_mask[i], as_tuple=False)[:, 0].cpu().tolist() + hypothesis.timestamp = torch.nonzero(non_blank_ids_mask[i], as_tuple=False)[:, 0].cpu().tolist() if self.preserve_frame_confidence: hypothesis.frame_confidence = self._get_confidence(predictions[i, : out_len[i], :]) @@ -493,7 +493,7 @@ def _greedy_decode_labels_batched(self, x: torch.Tensor, out_len: torch.Tensor): hypotheses = [] for i in range(batch_size): - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) hypothesis.y_sequence = predictions_labels[i, : out_len[i]].tolist() hypothesis.score = -1.0 @@ -505,7 +505,7 @@ def _greedy_decode_labels_batched(self, x: torch.Tensor, out_len: torch.Tensor): # TOOD: Could do this in a vectorized manner... Would # prefer to have nonzero_static, though, for sanity. # Or do a prefix sum on out_len - hypothesis.timestep = torch.nonzero(non_blank_ids_mask[i], as_tuple=False)[:, 0].cpu().tolist() + hypothesis.timestamp = torch.nonzero(non_blank_ids_mask[i], as_tuple=False)[:, 0].cpu().tolist() if self.preserve_frame_confidence: raise ValueError( "Requested for per-frame confidence, but predictions provided were labels, not log probabilities." diff --git a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py index fc501b3d00de..a0be0e1f4a04 100644 --- a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py @@ -359,13 +359,13 @@ def __call__( labels_packed = self.labels_cpu[valid_labels_mask] hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batch_size) + rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestamp=[], dec_state=None) for _ in range(batch_size) ] timestep_start = 0 labels_start = 0 for i in range(batch_size): - hypotheses[i].timestep = timesteps_packed[timestep_start : timestep_start + timestep_segments[i]].tolist() + hypotheses[i].timestamp = timesteps_packed[timestep_start : timestep_start + timestep_segments[i]].tolist() timestep_start += timestep_segments[i] hypotheses[i].score = float(total_scores[i]) hypotheses[i].y_sequence = labels_packed[labels_start : labels_start + labels_segments[i]].tolist() diff --git a/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py b/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py index e181772b7f18..d49c6e69215f 100644 --- a/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py @@ -195,7 +195,7 @@ def forward( beam_scores = [x.detach().cpu() for x in beam_scores] # each item is [beam,] packed_result = [] for i in range(len(topk_hypotheses)): - hypotheses = [Hypothesis(score=0.0, y_sequence=[], timestep=[]) for _ in range(self.beam_size)] + hypotheses = [Hypothesis(score=0.0, y_sequence=[], timestamp=[]) for _ in range(self.beam_size)] # Pack results into Hypotheses hypotheses = pack_hypotheses(hypotheses, topk_hypotheses[i], beam_scores[i]) self.format_hypotheses(hypotheses, decoder_input_ids) @@ -204,7 +204,7 @@ def forward( beam_scores = [None for _ in range(len(best_hypo))] best_hypo = best_hypo.detach().cpu() hypotheses = [ - Hypothesis(score=0.0, 
y_sequence=[], timestep=[]) for _ in range(encoder_hidden_states.shape[0]) + Hypothesis(score=0.0, y_sequence=[], timestamp=[]) for _ in range(encoder_hidden_states.shape[0]) ] # Pack results into Hypotheses packed_result = pack_hypotheses(hypotheses, best_hypo, beam_scores) diff --git a/nemo/collections/asr/parts/submodules/multitask_decoding.py b/nemo/collections/asr/parts/submodules/multitask_decoding.py index 790c95afbbfb..99010bdc14b8 100644 --- a/nemo/collections/asr/parts/submodules/multitask_decoding.py +++ b/nemo/collections/asr/parts/submodules/multitask_decoding.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re from abc import abstractmethod from dataclasses import dataclass, field, is_dataclass -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Union import torch from omegaconf import OmegaConf @@ -216,7 +215,7 @@ def decode_predictions_tensor( decoder_input_ids: Optional[torch.Tensor] = None, return_hypotheses: bool = False, partial_hypotheses: Optional[List[Hypothesis]] = None, - ) -> Tuple[List[str], Optional[List[List[str]]], Optional[Union[Hypothesis, NBestHypotheses]]]: + ) -> Union[List[Hypothesis], List[List[Hypothesis]]]: """ Decode an encoder output by autoregressive decoding of the Decoder+Joint networks. @@ -226,18 +225,14 @@ def decode_predictions_tensor( return_hypotheses: bool. If set to True it will return list of Hypothesis or NBestHypotheses Returns: - If `return_best_hypothesis` is set: - A tuple (hypotheses, None): - hypotheses - list of Hypothesis (best hypothesis per sample). + If `return_all_hypothesis` is set: + A list[list[Hypothesis]]. Look at rnnt_utils.Hypothesis for more information. - If `return_best_hypothesis` is not set: - A tuple(hypotheses, all_hypotheses) - hypotheses - list of Hypothesis (best hypothesis per sample). + If `return_all_hypothesis` is not set: + A list[Hypothesis]. + List of best hypotheses Look at rnnt_utils.Hypothesis for more information. - all_hypotheses - list of NBestHypotheses. Each NBestHypotheses further contains a sorted - list of all the hypotheses of the model per sample. - Look at rnnt_utils.NBestHypotheses for more information. 
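A minimal caller-side sketch of the new return convention introduced by these hunks, assuming a loaded NeMo RNNT model bound to `model` and already-computed `encoded` / `encoded_len` tensors (the variable names are illustrative and not part of the patch). The same pattern applies to `decode_predictions_tensor` and `ctc_decoder_predictions_tensor`, which previously returned `(best_hyp_text, all_hyp_text)` tuples:

# Previously: best_hyps, all_hyps = model.decoding.rnnt_decoder_predictions_tensor(...)
# Now a single list is returned: list[Hypothesis] with the best hypothesis per utterance,
# or list[list[Hypothesis]] when the strategy is configured to return all hypotheses.
hyps = model.decoding.rnnt_decoder_predictions_tensor(
    encoder_output=encoded,
    encoded_lengths=encoded_len,
    return_hypotheses=True,
)
if hyps and not isinstance(hyps[0], list):
    texts = [h.text for h in hyps]            # flat list: best hypothesis per utterance
else:
    texts = [nbest[0].text for nbest in hyps]  # nested list: N-best per utterance, best first
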
""" # Compute hypotheses with torch.inference_mode(): @@ -265,11 +260,10 @@ def decode_predictions_tensor( all_hypotheses.append(decoded_hyps) if return_hypotheses: - return hypotheses, all_hypotheses + return all_hypotheses - best_hyp_text = [h.text for h in hypotheses] - all_hyp_text = [h.text for hh in all_hypotheses for h in hh] - return best_hyp_text, all_hyp_text + all_hyp = [[Hypothesis(h.score, h.y_sequence, h.text) for h in hh] for hh in all_hypotheses] + return all_hyp else: hypotheses = self.decode_hypothesis(prediction_list) @@ -280,10 +274,9 @@ def decode_predictions_tensor( self.preserve_word_confidence or self.preserve_token_confidence ): hypotheses = self.compute_confidence(hypotheses) - return hypotheses, None + return hypotheses - best_hyp_text = [h.text for h in hypotheses] - return best_hyp_text, None + return [Hypothesis(h.score, h.y_sequence, h.text) for h in hypotheses] def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hypothesis, NBestHypotheses]]: """ diff --git a/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py b/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py index f67cdd9f7944..eeae38ecef30 100644 --- a/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py @@ -201,7 +201,7 @@ def forward( packed_result = [] for i in range(len(topk_hypotheses)): # Pack results into Hypotheses - hypotheses = [Hypothesis(score=0.0, y_sequence=[], timestep=[]) for _ in range(self.n_samples)] + hypotheses = [Hypothesis(score=0.0, y_sequence=[], timestamp=[]) for _ in range(self.n_samples)] self.format_hypotheses(hypotheses, decoder_input_ids) packed_result.append( NBestHypotheses( @@ -212,7 +212,7 @@ def forward( beam_scores = [None for _ in range(len(best_hypo))] best_hypo = best_hypo.cpu() hypotheses = [ - Hypothesis(score=0.0, y_sequence=[], timestep=[]) for _ in range(encoder_hidden_states.shape[0]) + Hypothesis(score=0.0, y_sequence=[], timestamp=[]) for _ in range(encoder_hidden_states.shape[0]) ] # Pack results into Hypotheses packed_result = pack_hypotheses(hypotheses, best_hypo, beam_scores, step_confidence) diff --git a/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py index e0bd47bb8ce0..b34a962d280d 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py @@ -76,8 +76,8 @@ def pack_hypotheses(hypotheses: List[Hypothesis]) -> List[Hypothesis]: hyp.dec_state = _states_to_device(hyp.dec_state) # Remove -1 from timestep - if hyp.timestep is not None and len(hyp.timestep) > 0 and hyp.timestep[0] == -1: - hyp.timestep = hyp.timestep[1:] + if hyp.timestamp is not None and len(hyp.timestamp) > 0 and hyp.timestamp[0] == -1: + hyp.timestamp = hyp.timestamp[1:] return hypotheses @@ -485,7 +485,7 @@ def greedy_search( # Construct initial hypothesis hyp = Hypothesis( - score=0.0, y_sequence=[self.blank], dec_state=dec_state, timestep=[-1], length=encoded_lengths + score=0.0, y_sequence=[self.blank], dec_state=dec_state, timestamp=[-1], length=encoded_lengths ) if partial_hypotheses is not None: @@ -532,7 +532,7 @@ def greedy_search( hyp.y_sequence.append(int(pred)) hyp.score += float(logp) hyp.dec_state = state - hyp.timestep.append(i) + hyp.timestamp.append(i) # Compute next state and token y, state, _ = self.decoder.score_hypothesis(hyp, cache) @@ -582,7 +582,7 @@ def 
default_beam_search( dec_state = self.decoder.initialize_state(h) # Initialize first hypothesis for the beam (blank) - kept_hyps = [Hypothesis(score=0.0, y_sequence=[self.blank], dec_state=dec_state, timestep=[-1], length=0)] + kept_hyps = [Hypothesis(score=0.0, y_sequence=[self.blank], dec_state=dec_state, timestamp=[-1], length=0)] cache = {} if partial_hypotheses is not None: @@ -631,7 +631,7 @@ def default_beam_search( y_sequence=max_hyp.y_sequence[:], dec_state=max_hyp.dec_state, lm_state=max_hyp.lm_state, - timestep=max_hyp.timestep[:], + timestamp=max_hyp.timestamp[:], length=encoded_lengths, ) @@ -645,7 +645,7 @@ def default_beam_search( # if non-blank token was predicted, update state and sequence and then search more hypothesis new_hyp.dec_state = state new_hyp.y_sequence.append(int(k)) - new_hyp.timestep.append(i) + new_hyp.timestamp.append(i) hyps.append(new_hyp) @@ -729,7 +729,7 @@ def time_sync_decoding( y_sequence=[self.blank], score=0.0, dec_state=self.decoder.batch_select_state(beam_state, 0), - timestep=[-1], + timestamp=[-1], length=0, ) ] @@ -775,7 +775,7 @@ def time_sync_decoding( y_sequence=hyp.y_sequence[:], dec_state=hyp.dec_state, lm_state=hyp.lm_state, - timestep=hyp.timestep[:], + timestamp=hyp.timestamp[:], length=encoded_lengths, ) @@ -807,7 +807,7 @@ def time_sync_decoding( y_sequence=(hyp.y_sequence + [int(k)]), dec_state=beam_state[j], lm_state=hyp.lm_state, - timestep=hyp.timestep[:] + [i], + timestamp=hyp.timestamp[:] + [i], length=encoded_lengths, ) @@ -903,7 +903,7 @@ def align_length_sync_decoding( y_sequence=[self.blank], score=0.0, dec_state=beam_state[0], - timestep=[-1], + timestamp=[-1], length=0, ) ] @@ -999,7 +999,7 @@ def align_length_sync_decoding( y_sequence=hyp.y_sequence[:], dec_state=hyp.dec_state, lm_state=hyp.lm_state, - timestep=hyp.timestep[:], + timestamp=hyp.timestamp[:], length=i, ) @@ -1036,7 +1036,7 @@ def align_length_sync_decoding( y_sequence=(hyp.y_sequence[:] + [int(k)]), dec_state=beam_state[h_states_idx], lm_state=hyp.lm_state, - timestep=hyp.timestep[:] + [i], + timestamp=hyp.timestamp[:] + [i], length=i, ) @@ -1116,7 +1116,7 @@ def modified_adaptive_expansion_search( y_sequence=[self.blank], score=0.0, dec_state=self.decoder.batch_select_state(beam_state, 0), - timestep=[-1], + timestamp=[-1], length=0, ) ] @@ -1160,7 +1160,7 @@ def modified_adaptive_expansion_search( dec_out=[beam_dec_out[0]], lm_state=lm_state, lm_scores=lm_scores, - timestep=[-1], + timestamp=[-1], length=0, ) ] @@ -1218,7 +1218,7 @@ def modified_adaptive_expansion_search( dec_state=hyp.dec_state, lm_state=hyp.lm_state, lm_scores=hyp.lm_scores, - timestep=hyp.timestep[:], + timestamp=hyp.timestamp[:], length=t, ) if self.ngram_lm: @@ -1232,7 +1232,7 @@ def modified_adaptive_expansion_search( # new_hyp.y_sequence.append(int(k)) if (new_hyp.y_sequence + [int(k)]) not in duplication_check: new_hyp.y_sequence.append(int(k)) - new_hyp.timestep.append(t) + new_hyp.timestamp.append(t) # Setup ngram LM: if self.ngram_lm: diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index 18fcc57e5184..1a50f10d3ed4 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -17,7 +17,7 @@ import unicodedata from abc import abstractmethod from dataclasses import dataclass, field, is_dataclass -from typing import Callable, Dict, List, Optional, Set, Tuple, Union +from typing import Callable, Dict, List, Optional, Set, Union import 
numpy as np import torch @@ -494,7 +494,7 @@ def rnnt_decoder_predictions_tensor( encoded_lengths: torch.Tensor, return_hypotheses: bool = False, partial_hypotheses: Optional[List[Hypothesis]] = None, - ) -> Tuple[List[str], Optional[List[List[str]]], Optional[Union[Hypothesis, NBestHypotheses]]]: + ) -> Union[List[Hypothesis], List[List[Hypothesis]]]: """ Decode an encoder output by autoregressive decoding of the Decoder+Joint networks. @@ -504,18 +504,14 @@ def rnnt_decoder_predictions_tensor( return_hypotheses: bool. If set to True it will return list of Hypothesis or NBestHypotheses Returns: - If `return_best_hypothesis` is set: - A tuple (hypotheses, None): - hypotheses - list of Hypothesis (best hypothesis per sample). + If `return_all_hypothesis` is set: + A list[list[Hypothesis]]. Look at rnnt_utils.Hypothesis for more information. - If `return_best_hypothesis` is not set: - A tuple(hypotheses, all_hypotheses) - hypotheses - list of Hypothesis (best hypothesis per sample). + If `return_all_hypothesis` is not set: + A list[Hypothesis]. + List of best hypotheses Look at rnnt_utils.Hypothesis for more information. - all_hypotheses - list of NBestHypotheses. Each NBestHypotheses further contains a sorted - list of all the hypotheses of the model per sample. - Look at rnnt_utils.NBestHypotheses for more information. """ # Compute hypotheses with torch.inference_mode(): @@ -546,11 +542,10 @@ def rnnt_decoder_predictions_tensor( all_hypotheses.append(decoded_hyps) if return_hypotheses: - return hypotheses, all_hypotheses + return all_hypotheses # type: list[list[Hypothesis]] - best_hyp_text = [h.text for h in hypotheses] - all_hyp_text = [h.text for hh in all_hypotheses for h in hh] - return best_hyp_text, all_hyp_text + all_hyp = [[Hypothesis(h.score, h.y_sequence, h.text) for h in hh] for hh in all_hypotheses] + return all_hyp else: hypotheses = self.decode_hypothesis(prediction_list) # type: List[str] @@ -567,10 +562,9 @@ def rnnt_decoder_predictions_tensor( self.preserve_word_confidence or self.preserve_token_confidence ): hypotheses = self.compute_confidence(hypotheses) - return hypotheses, None + return hypotheses - best_hyp_text = [h.text for h in hypotheses] - return best_hyp_text, None + return [Hypothesis(h.score, h.y_sequence, h.text) for h in hypotheses] def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hypothesis, NBestHypotheses]]: """ @@ -681,7 +675,7 @@ def compute_confidence(self, hypotheses_list: List[Hypothesis]) -> List[Hypothes hyp.token_confidence = hyp.non_blank_frame_confidence else: for hyp in hypotheses_list: - timestep = hyp.timestep.tolist() if isinstance(hyp.timestep, torch.Tensor) else hyp.timestep + timestep = hyp.timestamp.tolist() if isinstance(hyp.timestamp, torch.Tensor) else hyp.timestamp offset = 0 token_confidence = [] if len(timestep) > 0: @@ -894,25 +888,25 @@ def compute_rnnt_timestamps(self, hypothesis: Hypothesis, timestamp_type: str = ) # attach results - if len(hypothesis.timestep) > 0: - timestep_info = hypothesis.timestep + if len(hypothesis.timestamp) > 0: + timestep_info = hypothesis.timestamp else: timestep_info = [] # Setup defaults - hypothesis.timestep = {"timestep": timestep_info} + hypothesis.timestamp = {"timestep": timestep_info} # Add char / subword time stamps if char_offsets is not None and timestamp_type in ['char', 'all']: - hypothesis.timestep['char'] = char_offsets + hypothesis.timestamp['char'] = char_offsets # Add word time stamps if word_offsets is not None and timestamp_type in ['word', 'all']: 
- hypothesis.timestep['word'] = word_offsets + hypothesis.timestamp['word'] = word_offsets # Add segment time stamps if segment_offsets is not None and timestamp_type in ['segment', 'all']: - hypothesis.timestep['segment'] = segment_offsets + hypothesis.timestamp['segment'] = segment_offsets # Convert the flattened token indices to text hypothesis.text = self.decode_tokens_to_str(hypothesis.text) @@ -939,8 +933,8 @@ def _compute_offsets( # If the exact timestep information is available, utilize the 1st non-rnnt blank token timestep # as the start index. - if hypothesis.timestep is not None and len(hypothesis.timestep) > 0: - first_timestep = hypothesis.timestep[0] + if hypothesis.timestamp is not None and len(hypothesis.timestamp) > 0: + first_timestep = hypothesis.timestamp[0] first_timestep = first_timestep if isinstance(first_timestep, int) else first_timestep.item() start_index = max(0, first_timestep - 1) @@ -980,7 +974,7 @@ def _compute_offsets_tdt(hypothesis: Hypothesis, *args) -> List[Dict[str, Union[ # Merge the results per token into a list of dictionaries offsets = [ {"char": [t, -1], "start_offset": int(s), "end_offset": int(s + d)} - for t, s, d in zip(hypothesis.text[0], hypothesis.timestep, hypothesis.token_duration) + for t, s, d in zip(hypothesis.text[0], hypothesis.timestamp, hypothesis.token_duration) ] return offsets @@ -991,7 +985,7 @@ def _refine_timestamps( supported_punctuation: Optional[Set] = None, ) -> List[Dict[str, Union[str, int]]]: - ## no refinement for rnnt + # no refinement for rnnt return encoded_char_offsets, char_offsets diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index bd169d0d224e..9200e3b0c2da 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -420,7 +420,7 @@ def _greedy_decode( # out_len: [seq_len] # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) if partial_hypotheses is not None: hypothesis.last_token = partial_hypotheses.last_token @@ -492,7 +492,7 @@ def _greedy_decode( # Append token to label set, update RNN state. 
hypothesis.y_sequence.append(k) hypothesis.score += float(v) - hypothesis.timestep.append(time_idx) + hypothesis.timestamp.append(time_idx) hypothesis.dec_state = hidden_prime hypothesis.last_token = k @@ -787,7 +787,7 @@ def _greedy_decode_blank_as_pad_loop_frames( # Initialize list of Hypothesis batchsize = x.shape[0] hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], token_duration=[], dec_state=None) + rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestamp=[], token_duration=[], dec_state=None) for _ in range(batchsize) ] @@ -924,7 +924,7 @@ def _greedy_decode_blank_as_pad_loop_frames( for kidx, ki in enumerate(k): if blank_mask[kidx] == 0: hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) + hypotheses[kidx].timestamp.append(time_idx) hypotheses[kidx].score += float(v[kidx]) symbols_added += 1 @@ -986,7 +986,7 @@ def _greedy_decode_masked( # Initialize state batchsize = x.shape[0] hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) + rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestamp=[], dec_state=None) for _ in range(batchsize) ] # Initialize Hidden state matrix (shared by entire batch) @@ -997,8 +997,6 @@ def _greedy_decode_masked( # alignments is a 3-dimensional dangling list representing B x T x U for hyp in hypotheses: hyp.alignments = [[]] - else: - alignments = None # If confidence scores need to be preserved, register a danling list to hold the values if self.preserve_frame_confidence: @@ -1135,7 +1133,7 @@ def _greedy_decode_masked( for kidx, ki in enumerate(k): if blank_mask[kidx] == 0: hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) + hypotheses[kidx].timestamp.append(time_idx) hypotheses[kidx].score += float(v[kidx]) symbols_added += 1 @@ -1403,7 +1401,7 @@ def __init__(self, encoder_model: str, decoder_joint_model: str, max_symbols_per import onnx import onnxruntime except (ModuleNotFoundError, ImportError): - raise ImportError(f"`onnx` or `onnxruntime` could not be imported, please install the libraries.\n") + raise ImportError("`onnx` or `onnxruntime` could not be imported, please install the libraries.\n") if torch.cuda.is_available(): # Try to use onnxruntime-gpu @@ -1731,7 +1729,7 @@ def _greedy_decode( # out_len: [seq_len] # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) if partial_hypotheses is not None: hypothesis.last_token = partial_hypotheses.last_token @@ -1816,7 +1814,7 @@ def _greedy_decode( # Append token to label set, update RNN state. 
hypothesis.y_sequence.append(k) hypothesis.score += float(v) - hypothesis.timestep.append(time_idx) + hypothesis.timestamp.append(time_idx) hypothesis.dec_state = hidden_prime hypothesis.last_token = k @@ -1952,7 +1950,7 @@ def _greedy_decode_blank_as_pad( # Initialize list of Hypothesis batchsize = x.shape[0] hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) + rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestamp=[], dec_state=None) for _ in range(batchsize) ] # Initialize Hidden state matrix (shared by entire batch) @@ -2112,7 +2110,7 @@ def _greedy_decode_blank_as_pad( for kidx, ki in enumerate(k): if blank_mask[kidx] == 0: hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) + hypotheses[kidx].timestamp.append(time_idx) hypotheses[kidx].score += float(v[kidx]) symbols_added += 1 @@ -2188,7 +2186,7 @@ def _greedy_decode_masked( # Initialize state batchsize = x.shape[0] hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) + rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestamp=[], dec_state=None) for _ in range(batchsize) ] # Initialize Hidden state matrix (shared by entire batch) @@ -2330,7 +2328,7 @@ def _greedy_decode_masked( for kidx, ki in enumerate(k): if blank_mask[kidx] == 0: hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) + hypotheses[kidx].timestamp.append(time_idx) hypotheses[kidx].score += float(v[kidx]) symbols_added += 1 @@ -2564,7 +2562,7 @@ def _greedy_decode( # Initialize blank state and empty label set in Hypothesis hypothesis = rnnt_utils.Hypothesis( - score=0.0, y_sequence=[], dec_state=None, timestep=[], token_duration=[], last_token=None + score=0.0, y_sequence=[], dec_state=None, timestamp=[], token_duration=[], last_token=None ) if partial_hypotheses is not None: @@ -2592,7 +2590,6 @@ def _greedy_decode( f = x.narrow(dim=0, start=time_idx, length=1) # Setup exit flags and counter - not_blank = True symbols_added = 0 need_loop = True @@ -2644,13 +2641,11 @@ def _greedy_decode( del logp # If blank token is predicted, exit inner loop, move onto next timestep t - if k == self._blank_index: - not_blank = False - else: + if k != self._blank_index: # Append token to label set, update RNN state. 
hypothesis.y_sequence.append(k) hypothesis.score += float(v) - hypothesis.timestep.append(time_idx) + hypothesis.timestamp.append(time_idx) hypothesis.dec_state = hidden_prime hypothesis.last_token = k if self.include_duration: diff --git a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py index 13bb0b471ed2..a3b7ac9d3c34 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py @@ -474,9 +474,9 @@ def loop_labels_torch( torch.logical_and( torch.logical_and( labels != self._blank_index, - batched_hyps.last_timestep_lasts >= self.max_symbols, + batched_hyps.last_timestamp_lasts >= self.max_symbols, ), - batched_hyps.last_timestep == time_indices, + batched_hyps.last_timestamp == time_indices, ), ) time_indices += force_blank_mask # emit blank => advance time indices @@ -878,9 +878,9 @@ def _after_inner_loop(self): torch.logical_and( torch.logical_and( self.state.labels != self._blank_index, - self.state.batched_hyps.last_timestep_lasts >= self.max_symbols, + self.state.batched_hyps.last_timestamp_lasts >= self.max_symbols, ), - self.state.batched_hyps.last_timestep == self.state.time_indices, + self.state.batched_hyps.last_timestamp == self.state.time_indices, ), ) self.state.time_indices.add_(force_blank_mask) # emit blank => advance time indices diff --git a/nemo/collections/asr/parts/submodules/tdt_beam_decoding.py b/nemo/collections/asr/parts/submodules/tdt_beam_decoding.py index 908fc1c13d19..7aeb3417b8b2 100644 --- a/nemo/collections/asr/parts/submodules/tdt_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/tdt_beam_decoding.py @@ -349,7 +349,7 @@ def default_beam_search( # Initialize hypothesis array with blank hypothesis. 
start_hyp = Hypothesis( - score=0.0, y_sequence=[self.blank], dec_state=decoder_state, timestep=[-1], length=0, last_frame=0 + score=0.0, y_sequence=[self.blank], dec_state=decoder_state, timestamp=[-1], length=0, last_frame=0 ) kept_hyps = [start_hyp] @@ -394,7 +394,7 @@ def default_beam_search( score=float(max_hyp.score + total_logp_topk), # update score y_sequence=max_hyp.y_sequence + [token_idx], # update hypothesis sequence dec_state=decoder_state, # update decoder state - timestep=max_hyp.timestep + [time_idx + duration], # update timesteps + timestamp=max_hyp.timestamp + [time_idx + duration], # update timesteps length=encoded_lengths, last_frame=max_hyp.last_frame + duration, ) # update frame idx where last token appeared @@ -421,7 +421,7 @@ def default_beam_search( score=float(max_hyp.score + logp[self.blank] + durations_logp[duration_idx]), # update score y_sequence=max_hyp.y_sequence[:], # no need to update sequence dec_state=max_hyp.dec_state, # no need to update decoder state - timestep=max_hyp.timestep[:], # no need to update timesteps + timestamp=max_hyp.timestamp[:], # no need to update timesteps length=encoded_lengths, last_frame=max_hyp.last_frame + duration, ) # update frame idx where last token appeared @@ -482,7 +482,7 @@ def modified_adaptive_expansion_search( y_sequence=[self.blank], score=0.0, dec_state=self.decoder.batch_select_state(beam_state, 0), - timestep=[-1], + timestamp=[-1], length=0, last_frame=0, ) @@ -501,7 +501,7 @@ def modified_adaptive_expansion_search( score=0.0, dec_state=state, dec_out=[beam_decoder_output[0]], - timestep=[-1], + timestamp=[-1], length=0, last_frame=0, ) @@ -580,7 +580,7 @@ def modified_adaptive_expansion_search( y_sequence=hyp.y_sequence[:], dec_out=hyp.dec_out[:], dec_state=hyp.dec_state, - timestep=hyp.timestep[:], + timestamp=hyp.timestamp[:], length=time_idx, last_frame=hyp.last_frame + duration, ) @@ -593,7 +593,7 @@ def modified_adaptive_expansion_search( list_b.append(new_hyp) else: new_hyp.y_sequence.append(k) - new_hyp.timestep.append(time_idx + duration) + new_hyp.timestamp.append(time_idx + duration) if self.ngram_lm: lm_score, new_hyp.ngram_lm_state = self.compute_ngram_score(hyp.ngram_lm_state, int(k)) diff --git a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py index c0fbe5361761..a830bc304691 100644 --- a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py @@ -541,9 +541,9 @@ def loop_labels_torch( torch.logical_and( torch.logical_and( labels != self._blank_index, - batched_hyps.last_timestep_lasts >= self.max_symbols, + batched_hyps.last_timestamp_lasts >= self.max_symbols, ), - batched_hyps.last_timestep == time_indices, + batched_hyps.last_timestamp == time_indices, ), ) time_indices += force_blank_mask # emit blank => advance time indices @@ -996,9 +996,9 @@ def _after_inner_loop(self): torch.logical_and( torch.logical_and( self.state.labels != self._blank_index, - self.state.batched_hyps.last_timestep_lasts >= self.max_symbols, + self.state.batched_hyps.last_timestamp_lasts >= self.max_symbols, ), - self.state.batched_hyps.last_timestep == self.state.time_indices, + self.state.batched_hyps.last_timestamp == self.state.time_indices, ), ) self.state.time_indices.add_(force_blank_mask) # emit blank => advance time indices diff --git a/nemo/collections/asr/parts/utils/rnnt_utils.py b/nemo/collections/asr/parts/utils/rnnt_utils.py index 
8d2755fcc0ae..3e3146ad3901 100644 --- a/nemo/collections/asr/parts/utils/rnnt_utils.py +++ b/nemo/collections/asr/parts/utils/rnnt_utils.py @@ -47,7 +47,7 @@ class Hypothesis: `blank` tokens, and optionally merging word-pieces). Should be used as decoded string for Word Error Rate calculation. - timestep: (Optional) A list of integer indices representing at which index in the decoding + timestamp: (Optional) A list of integer indices representing at which index in the decoding process did the token appear. Should be of same length as the number of non-blank tokens. alignments: (Optional) Represents the CTC / RNNT token alignments as integer tokens along an axis of @@ -94,7 +94,7 @@ class Hypothesis: text: Optional[str] = None dec_out: Optional[List[torch.Tensor]] = None dec_state: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor]]] = None - timestep: Union[List[int], torch.Tensor] = field(default_factory=list) + timestamp: Union[List[int], torch.Tensor] = field(default_factory=list) alignments: Optional[Union[List[int], List[List[int]]]] = None frame_confidence: Optional[Union[List[float], List[List[float]]]] = None token_confidence: Optional[List[float]] = None @@ -111,19 +111,19 @@ class Hypothesis: @property def non_blank_frame_confidence(self) -> List[float]: - """Get per-frame confidence for non-blank tokens according to self.timestep + """Get per-frame confidence for non-blank tokens according to self.timestamp Returns: - List with confidence scores. The length of the list is the same as `timestep`. + List with confidence scores. The length of the list is the same as `timestamp`. """ non_blank_frame_confidence = [] - # self.timestep can be a dict for RNNT - timestep = self.timestep['timestep'] if isinstance(self.timestep, dict) else self.timestep - if len(timestep) != 0 and self.frame_confidence is not None: + # self.timestamp can be a dict for RNNT + timestamp = self.timestamp['timestep'] if isinstance(self.timestamp, dict) else self.timestamp + if len(timestamp) != 0 and self.frame_confidence is not None: if any(isinstance(i, list) for i in self.frame_confidence): # rnnt t_prev = -1 offset = 0 - for t in timestep: + for t in timestamp: if t != t_prev: t_prev = t offset = 0 @@ -131,7 +131,7 @@ def non_blank_frame_confidence(self) -> List[float]: offset += 1 non_blank_frame_confidence.append(self.frame_confidence[t][offset]) else: # ctc - non_blank_frame_confidence = [self.frame_confidence[t] for t in timestep] + non_blank_frame_confidence = [self.frame_confidence[t] for t in timestamp] return non_blank_frame_confidence @property @@ -258,22 +258,22 @@ def __init__( raise ValueError(f"batch_size must be > 0, got {batch_size}") self._max_length = init_length - # batch of current lengths of hypotheses and correspoinding timesteps + # batch of current lengths of hypotheses and correspoinding timestamps self.current_lengths = torch.zeros(batch_size, device=device, dtype=torch.long) # tensor for storing transcripts self.transcript = torch.zeros((batch_size, self._max_length), device=device, dtype=torch.long) - # tensor for storing timesteps corresponding to transcripts - self.timesteps = torch.zeros((batch_size, self._max_length), device=device, dtype=torch.long) + # tensor for storing timestamps corresponding to transcripts + self.timestamps = torch.zeros((batch_size, self._max_length), device=device, dtype=torch.long) # tensor for storing durations corresponding to transcripts tokens self.token_durations = torch.zeros((batch_size, self._max_length), device=device, 
dtype=torch.long) # accumulated scores for hypotheses self.scores = torch.zeros(batch_size, device=device, dtype=float_dtype) - # tracking last timestep of each hyp to avoid infinite looping (when max symbols per frame is restricted) - # last observed timestep (with label) for each hypothesis - self.last_timestep = torch.full((batch_size,), -1, device=device, dtype=torch.long) - # number of labels for the last timestep - self.last_timestep_lasts = torch.zeros(batch_size, device=device, dtype=torch.long) + # tracking last timestamp of each hyp to avoid infinite looping (when max symbols per frame is restricted) + # last observed timestamp (with label) for each hypothesis + self.last_timestamp = torch.full((batch_size,), -1, device=device, dtype=torch.long) + # number of labels for the last timestamp + self.last_timestamp_lasts = torch.zeros(batch_size, device=device, dtype=torch.long) self._batch_indices = torch.arange(batch_size, device=device) self._ones_batch = torch.ones_like(self._batch_indices) @@ -283,11 +283,11 @@ def clear_(self): """ self.current_lengths.fill_(0) self.transcript.fill_(0) - self.timesteps.fill_(0) + self.timestamps.fill_(0) self.token_durations.fill_(0) self.scores.fill_(0.0) - self.last_timestep.fill_(-1) - self.last_timestep_lasts.fill_(0) + self.last_timestamp.fill_(-1) + self.last_timestamp_lasts.fill_(0) def _allocate_more(self): """ @@ -295,7 +295,7 @@ def _allocate_more(self): to maintain O(1) insertion time complexity """ self.transcript = torch.cat((self.transcript, torch.zeros_like(self.transcript)), dim=-1) - self.timesteps = torch.cat((self.timesteps, torch.zeros_like(self.timesteps)), dim=-1) + self.timestamps = torch.cat((self.timestamps, torch.zeros_like(self.timestamps)), dim=-1) self.token_durations = torch.cat((self.token_durations, torch.zeros_like(self.token_durations)), dim=-1) self._max_length *= 2 @@ -353,17 +353,17 @@ def add_results_no_checks_( # accumulate scores self.scores[active_indices] += scores - # store transcript and timesteps + # store transcript and timestamps active_lengths = self.current_lengths[active_indices] self.transcript[active_indices, active_lengths] = labels - self.timesteps[active_indices, active_lengths] = time_indices + self.timestamps[active_indices, active_lengths] = time_indices if token_durations is not None: self.token_durations[active_indices, active_lengths] = token_durations - # store last observed timestep + number of observation for the current timestep - self.last_timestep_lasts[active_indices] = torch.where( - self.last_timestep[active_indices] == time_indices, self.last_timestep_lasts[active_indices] + 1, 1 + # store last observed timestamp + number of observation for the current timestamp + self.last_timestamp_lasts[active_indices] = torch.where( + self.last_timestamp[active_indices] == time_indices, self.last_timestamp_lasts[active_indices] + 1, 1 ) - self.last_timestep[active_indices] = time_indices + self.last_timestamp[active_indices] = time_indices # increase lengths self.current_lengths[active_indices] += 1 @@ -417,27 +417,27 @@ def add_results_masked_no_checks_( # same as self.scores[active_mask] += scores[active_mask], but non-blocking torch.where(active_mask, self.scores + scores, self.scores, out=self.scores) - # store transcript and timesteps + # store transcript and timestamps self.transcript[self._batch_indices, self.current_lengths] = labels - self.timesteps[self._batch_indices, self.current_lengths] = time_indices + self.timestamps[self._batch_indices, self.current_lengths] = 
time_indices if token_durations is not None: self.token_durations[self._batch_indices, self.current_lengths] = token_durations - # store last observed timestep + number of observation for the current timestep - # if last_timestep == time_indices, increase; else set to 1 + # store last observed timestamp + number of observation for the current timestamp + # if last_timestamp == time_indices, increase; else set to 1 torch.where( - torch.logical_and(active_mask, self.last_timestep == time_indices), - self.last_timestep_lasts + 1, - self.last_timestep_lasts, - out=self.last_timestep_lasts, + torch.logical_and(active_mask, self.last_timestamp == time_indices), + self.last_timestamp_lasts + 1, + self.last_timestamp_lasts, + out=self.last_timestamp_lasts, ) torch.where( - torch.logical_and(active_mask, self.last_timestep != time_indices), + torch.logical_and(active_mask, self.last_timestamp != time_indices), self._ones_batch, - self.last_timestep_lasts, - out=self.last_timestep_lasts, + self.last_timestamp_lasts, + out=self.last_timestamp_lasts, ) - # same as: self.last_timestep[active_mask] = time_indices[active_mask], but non-blocking - torch.where(active_mask, time_indices, self.last_timestep, out=self.last_timestep) + # same as: self.last_timestamp[active_mask] = time_indices[active_mask], but non-blocking + torch.where(active_mask, time_indices, self.last_timestamp, out=self.last_timestamp) # increase lengths self.current_lengths += active_mask @@ -479,8 +479,8 @@ def __init__( self.with_alignments = store_alignments self._max_length = init_length - # tensor to store observed timesteps (for alignments / confidence scores) - self.timesteps = torch.zeros((batch_size, self._max_length), device=device, dtype=torch.long) + # tensor to store observed timestamps (for alignments / confidence scores) + self.timestamps = torch.zeros((batch_size, self._max_length), device=device, dtype=torch.long) # current lengths of the utterances (alignments) self.current_lengths = torch.zeros(batch_size, device=device, dtype=torch.long) @@ -508,7 +508,7 @@ def clear_(self): Clears batched hypotheses state. 
""" self.current_lengths.fill_(0) - self.timesteps.fill_(0) + self.timestamps.fill_(0) self.logits.fill_(0.0) self.labels.fill_(0) self.frame_confidence.fill_(0) @@ -518,7 +518,7 @@ def _allocate_more(self): Allocate 2x space for tensors, similar to common C++ std::vector implementations to maintain O(1) insertion time complexity """ - self.timesteps = torch.cat((self.timesteps, torch.zeros_like(self.timesteps)), dim=-1) + self.timestamps = torch.cat((self.timestamps, torch.zeros_like(self.timestamps)), dim=-1) if self.with_alignments: self.logits = torch.cat((self.logits, torch.zeros_like(self.logits)), dim=1) self.labels = torch.cat((self.labels, torch.zeros_like(self.labels)), dim=-1) @@ -553,8 +553,8 @@ def add_results_( self._allocate_more() active_lengths = self.current_lengths[active_indices] - # store timesteps - same for alignments / confidence - self.timesteps[active_indices, active_lengths] = time_indices + # store timestamps - same for alignments / confidence + self.timestamps[active_indices, active_lengths] = time_indices if self.with_alignments and logits is not None and labels is not None: self.logits[active_indices, active_lengths] = logits @@ -609,11 +609,11 @@ def add_results_masked_no_checks_( labels: tensor with decoded labels (can contain blank) confidence: optional tensor with confidence for each item in batch """ - # store timesteps - same for alignments / confidence - self.timesteps[self._batch_indices, self.current_lengths] = time_indices + # store timestamps - same for alignments / confidence + self.timestamps[self._batch_indices, self.current_lengths] = time_indices if self.with_alignments and logits is not None and labels is not None: - self.timesteps[self._batch_indices, self.current_lengths] = time_indices + self.timestamps[self._batch_indices, self.current_lengths] = time_indices self.logits[self._batch_indices, self.current_lengths] = logits self.labels[self._batch_indices, self.current_lengths] = labels @@ -645,7 +645,7 @@ def batched_hyps_to_hypotheses( Hypothesis( score=batched_hyps.scores[i].item(), y_sequence=batched_hyps.transcript[i, : batched_hyps.current_lengths[i]], - timestep=batched_hyps.timesteps[i, : batched_hyps.current_lengths[i]], + timestamp=batched_hyps.timestamps[i, : batched_hyps.current_lengths[i]], token_duration=( durations if not torch.all( @@ -673,17 +673,20 @@ def batched_hyps_to_hypotheses( if alignments.with_frame_confidence: hypotheses[i].frame_confidence = [] _, grouped_counts = torch.unique_consecutive( - alignments.timesteps[i, : alignment_lengths[i]], return_counts=True + alignments.timestamps[i, : alignment_lengths[i]], return_counts=True ) start = 0 - for timestep_cnt in grouped_counts.tolist(): + for timestamp_cnt in grouped_counts.tolist(): if alignments.with_alignments: hypotheses[i].alignments.append( - [(alignment_logits[i, start + j], alignment_labels[i, start + j]) for j in range(timestep_cnt)] + [ + (alignment_logits[i, start + j], alignment_labels[i, start + j]) + for j in range(timestamp_cnt) + ] ) if alignments.with_frame_confidence: hypotheses[i].frame_confidence.append( - [frame_confidence[i, start + j] for j in range(timestep_cnt)] + [frame_confidence[i, start + j] for j in range(timestamp_cnt)] ) - start += timestep_cnt + start += timestamp_cnt return hypotheses diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index cb272e3d0462..6497b4594184 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ 
b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -107,7 +107,7 @@ def longest_common_subsequence_merge(X, Y, filepath=None): # value initially in each cell m = len(X) n = len(Y) - LCSuff = [[0 for k in range(n + 1)] for l in range(m + 1)] + LCSuff = [[0 for _ in range(n + 1)] for _ in range(m + 1)] # To store the length of # longest common substring @@ -1672,7 +1672,7 @@ def transcribe( """ self.infer_logits(keep_logits) - hypothesis = " ".join(self.all_preds) + hypothesis = " ".join([h.text for h in self.all_preds]) if not keep_logits: return hypothesis diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 539d820e0814..f3a258e341fd 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -60,11 +60,11 @@ def get_buffered_pred_feat_rnnt( filepaths = [] with open(manifest, "r", encoding='utf_8') as mfst_f: print("Parsing manifest files...") - for l in mfst_f: - l = l.strip() - if not l: + for L in mfst_f: + L = L.strip() + if not L: continue - row = json.loads(l) + row = json.loads(L) audio_file = get_full_path(audio_file=row['audio_filepath'], manifest_file=manifest) filepaths.append(audio_file) if 'text' in row: @@ -145,19 +145,19 @@ def get_buffered_pred_feat( raise ValueError("Either filepaths or manifest shoud not be None") if filepaths: - for l in tqdm(filepaths, desc="Sample:"): + for L in tqdm(filepaths, desc="Sample:"): asr.reset() - asr.read_audio_file(l, delay, model_stride_in_secs) + asr.read_audio_file(L, delay, model_stride_in_secs) hyp = asr.transcribe(tokens_per_chunk, delay) hyps.append(hyp) else: with open(manifest, "r", encoding='utf_8') as mfst_f: - for l in tqdm(mfst_f, desc="Sample:"): + for L in tqdm(mfst_f, desc="Sample:"): asr.reset() - l = l.strip() - if not l: + L = L.strip() + if not L: continue - row = json.loads(l) + row = json.loads(L) if 'text' in row: refs.append(row['text']) audio_file = get_full_path(audio_file=row['audio_filepath'], manifest_file=manifest) @@ -452,7 +452,7 @@ def write_transcription( item = {'audio_filepath': filepaths[idx], pred_text_attr_name: transcription.text} if timestamps: - timestamps = transcription.timestep + timestamps = transcription.timestamp if timestamps is not None and isinstance(timestamps, dict): timestamps.pop( 'timestep', None @@ -480,7 +480,7 @@ def write_transcription( item[pred_text_attr_name] = best_hyps[idx].text if timestamps: - timestamps = best_hyps[idx].timestep + timestamps = best_hyps[idx].timestamp if timestamps is not None and isinstance(timestamps, dict): timestamps.pop( 'timestep', None @@ -631,19 +631,19 @@ def process_timestamp(timestamp, subsampling_factor, window_stride): return timestamp for idx, hyp in enumerate(outputs): - if not hasattr(hyp, 'timestep'): + if not hasattr(hyp, 'timestamp'): raise ValueError( - f"Expected Hypothesis object to have 'timestep' attribute, when compute_timestamps is \ + f"Expected Hypothesis object to have 'timestamp' attribute, when compute_timestamps is \ enabled but got {hyp}" ) - timestep = hyp.timestep - if 'word' in timestep: - outputs[idx].timestep['word'] = process_timestamp(timestep['word'], subsampling_factor, window_stride) - if 'char' in timestep: - outputs[idx].timestep['char'] = process_timestamp(timestep['char'], subsampling_factor, window_stride) - if 'segment' in timestep: - outputs[idx].timestep['segment'] = process_timestamp( - timestep['segment'], subsampling_factor, window_stride + timestamp = 
hyp.timestamp + if 'word' in timestamp: + outputs[idx].timestamp['word'] = process_timestamp(timestamp['word'], subsampling_factor, window_stride) + if 'char' in timestamp: + outputs[idx].timestamp['char'] = process_timestamp(timestamp['char'], subsampling_factor, window_stride) + if 'segment' in timestamp: + outputs[idx].timestamp['segment'] = process_timestamp( + timestamp['segment'], subsampling_factor, window_stride ) return outputs diff --git a/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py b/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py index 53ae4a2dfb65..54443f36a24d 100644 --- a/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py +++ b/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py @@ -143,10 +143,13 @@ def transcribe( return_hypotheses: (bool) Either return hypotheses or text With hypotheses can do some postprocessing like getting timestamp or rescoring num_workers: (int) number of workers for DataLoader - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. + channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels + from multi-channel audio. If set to `'average'`, it performs averaging across channels. + Disabled if set to `None`. Defaults to `None`. augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. Returns: - A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2video_files + A list of transcriptions (or raw log probabilities if logprobs is True) + in the same order as paths2video_files """ if paths2video_files is None or len(paths2video_files) == 0: return {} @@ -162,7 +165,6 @@ def transcribe( # We will store transcriptions here hypotheses = [] - all_hypotheses = [] # Model's mode and device mode = self.training @@ -206,7 +208,7 @@ def transcribe( lg = logits[idx][: logits_len[idx]] hypotheses.append(lg.cpu().numpy()) else: - current_hypotheses, all_hyp = self.decoding.ctc_decoder_predictions_tensor( + current_hypotheses = self.decoding.ctc_decoder_predictions_tensor( logits, decoder_lengths=logits_len, return_hypotheses=return_hypotheses, @@ -219,10 +221,7 @@ def transcribe( if current_hypotheses[idx].alignments is None: current_hypotheses[idx].alignments = current_hypotheses[idx].y_sequence - if all_hyp is None: - hypotheses += current_hypotheses - else: - hypotheses += all_hyp + hypotheses += current_hypotheses del greedy_predictions del logits @@ -240,9 +239,12 @@ def transcribe( def change_vocabulary(self, new_vocabulary: List[str], decoding_cfg: Optional[DictConfig] = None): """ - Changes vocabulary used during CTC decoding process. Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need + Changes vocabulary used during CTC decoding process. Use this method when + fine-tuning on from pre-trained model. + This method changes only decoder and leaves encoder and pre-processing + modules unchanged. 
For example, you would + use it if you want to use pretrained encoder when fine-tuning on a data in another language, + or when you'd need model to learn capitalization, punctuation and/or special characters. If new_vocabulary == self.decoder.vocabulary then nothing will be changed. diff --git a/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py b/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py index 158bfaddcc96..eb4e4fa52271 100644 --- a/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py +++ b/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py @@ -16,7 +16,7 @@ import json import os import tempfile -from typing import List, Optional +from typing import List, Optional, Union import torch from lightning.pytorch import Trainer @@ -28,6 +28,7 @@ from nemo.collections.asr.parts.mixins import ASRBPEMixin, InterCTCMixin from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.multimodal.speech_cv.models.visual_rnnt_models import VisualEncDecRNNTModel from nemo.core.classes.common import PretrainedModelInfo from nemo.core.classes.mixins import AccessMixin @@ -100,7 +101,7 @@ def transcribe( partial_hypothesis: Optional[List['Hypothesis']] = None, num_workers: int = 0, channel_selector: Optional[ChannelSelectorType] = None, - ) -> (List[str], Optional[List['Hypothesis']]): + ) -> Union[List['Hypothesis'], Optional[List['Hypothesis']]]: """ Uses greedy decoding to transcribe video files. Use this method for debugging and prototyping. @@ -112,12 +113,12 @@ def transcribe( return_hypotheses: (bool) Either return hypotheses or text With hypotheses can do some postprocessing like getting timestamp or rescoring num_workers: (int) number of workers for DataLoader - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. + channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels + from multi-channel audio. If set to `'average'`, it performs averaging across channels. + Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. Returns: - Returns a tuple of 2 items - - * A list of greedy transcript texts / Hypothesis - * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. 
+ Returns a list of greedy transcript Hypothesis or list of all Hypothesis """ if self.use_rnnt_decoder: return super().transcribe( @@ -133,7 +134,6 @@ def transcribe( return {} # We will store transcriptions here hypotheses = [] - all_hypotheses = [] # Model's mode and device mode = self.training device = next(self.parameters()).device @@ -177,7 +177,7 @@ def transcribe( ) logits = self.ctc_decoder(encoder_output=encoded) - best_hyp, all_hyp = self.ctc_decoding.ctc_decoder_predictions_tensor( + best_hyp = self.ctc_decoding.ctc_decoder_predictions_tensor( logits, encoded_len, return_hypotheses=return_hypotheses, @@ -191,10 +191,6 @@ def transcribe( del logits hypotheses += best_hyp - if all_hyp is not None: - all_hypotheses += all_hyp - else: - all_hypotheses += best_hyp del encoded del test_batch @@ -210,7 +206,7 @@ def transcribe( self.joint.unfreeze() if hasattr(self, 'ctc_decoder'): self.ctc_decoder.unfreeze() - return hypotheses, all_hypotheses + return hypotheses def change_vocabulary( self, @@ -220,9 +216,9 @@ def change_vocabulary( ): """ Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning a pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. + This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you + would use it if you want to use pretrained encoder when fine-tuning on data in another language, or when + you'd need model to learn capitalization, punctuation and/or special characters. Args: new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically, \ @@ -473,7 +469,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) del signal - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + best_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False ) diff --git a/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py b/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py index 75202238d2d0..abe9660a7f4e 100644 --- a/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py +++ b/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py @@ -32,6 +32,7 @@ from nemo.collections.asr.parts.mixins import ASRModuleMixin from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecoding, RNNTDecodingConfig +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.multimodal.speech_cv.data import video_to_text_dataset from nemo.core.classes import Exportable from nemo.core.classes.common import PretrainedModelInfo, typecheck @@ -236,18 +237,18 @@ def transcribe( return_hypotheses: (bool) Either return hypotheses or text With hypotheses can do some postprocessing like getting timestamp or rescoring num_workers: (int) number of workers for DataLoader - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. 
Defaults to `None`. Uses zero-based indexing. + channel_selector (int | Iterable[int] | str): select a single channel or a subset + of channels from multi-channel audio. If set to `'average'`, + it performs averaging across channels. + Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. Returns: - Returns a tuple of 2 items - - * A list of greedy transcript texts / Hypothesis - * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. + Returns a list of greedy transcript Hypothesis or list of all Hypothesis """ if paths2video_files is None or len(paths2video_files) == 0: return {} # We will store transcriptions here hypotheses = [] - all_hypotheses = [] # Model's mode and device mode = self.training device = next(self.parameters()).device @@ -289,7 +290,7 @@ def transcribe( encoded, encoded_len = self.forward( input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) ) - best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor( + best_hyp = self.decoding.rnnt_decoder_predictions_tensor( encoded, encoded_len, return_hypotheses=return_hypotheses, @@ -297,10 +298,6 @@ def transcribe( ) hypotheses += best_hyp - if all_hyp is not None: - all_hypotheses += all_hyp - else: - all_hypotheses += best_hyp del encoded del test_batch @@ -314,14 +311,14 @@ def transcribe( self.encoder.unfreeze() self.decoder.unfreeze() self.joint.unfreeze() - return hypotheses, all_hypotheses + return hypotheses def change_vocabulary(self, new_vocabulary: List[str], decoding_cfg: Optional[DictConfig] = None): """ Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning a pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. + This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, + you would use it if you want to use pretrained encoder when fine-tuning on data in another language, + or when you'd need model to learn capitalization, punctuation and/or special characters. Args: new_vocabulary: list with new vocabulary. Must contain at least 2 elements. 
Typically, \ @@ -731,7 +728,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) del signal - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + best_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False ) diff --git a/nemo/collections/tts/g2p/models/ctc.py b/nemo/collections/tts/g2p/models/ctc.py index 1859b09594ff..3248774de571 100644 --- a/nemo/collections/tts/g2p/models/ctc.py +++ b/nemo/collections/tts/g2p/models/ctc.py @@ -27,7 +27,7 @@ from nemo.collections.tts.models.base import G2PModel from nemo.core.classes.common import PretrainedModelInfo from nemo.core.classes.exportable import Exportable -from nemo.core.neural_types import LengthsType, NeuralType, TokenIndex +from nemo.core.neural_types import LengthsType, LossType, NeuralType, TokenIndex from nemo.utils import logging try: @@ -38,7 +38,7 @@ from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig ASR_AVAILABLE = True -except (ModuleNotFoundError, ImportError) as e: +except (ModuleNotFoundError, ImportError): ASR_AVAILABLE = False @@ -356,9 +356,10 @@ def _infer( input_len=input_len.to(device), ) - preds_str, _ = self.decoding.ctc_decoder_predictions_tensor( + preds_hyps = self.decoding.ctc_decoder_predictions_tensor( log_probs, decoder_lengths=encoded_len, return_hypotheses=False ) + preds_str = [hyp.text for hyp in preds_hyps] all_preds.extend(preds_str) del greedy_predictions @@ -396,7 +397,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, name: str): def setup_training_data(self, cfg: DictConfig): if not cfg or cfg.manifest_filepath is None: logging.info( - f"Dataloader config or file_path for the train is missing, so no data loader for train is created!" + "Dataloader config or file_path for the train is missing, so no data loader for train is created!" ) self._train_dl = None return @@ -417,7 +418,7 @@ def setup_multiple_test_data(self, test_data_config: Union[DictConfig, Dict] = N def setup_validation_data(self, cfg: Optional[DictConfig]): if not cfg or cfg.manifest_filepath is None: logging.info( - f"Dataloader config or file_path for the validation is missing, so no data loader for validation is created!" + "Dataloader config or file_path for the validation is missing, so no data loader for validation is created!" ) self._validation_dl = None return @@ -426,7 +427,7 @@ def setup_validation_data(self, cfg: Optional[DictConfig]): def setup_test_data(self, cfg: Optional[DictConfig]): if not cfg or cfg.manifest_filepath is None: logging.info( - f"Dataloader config or file_path for the test is missing, so no data loader for test is created!" + "Dataloader config or file_path for the test is missing, so no data loader for test is created!" 
) self._test_dl = None return diff --git a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py index 3bb4fa4f4846..9735180d2659 100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py @@ -193,7 +193,7 @@ def beam_search_eval( probs_batch[prob_index], device=packed_batch.device, dtype=packed_batch.dtype ) - _, beams_batch = decoding.ctc_decoder_predictions_tensor( + beams_batch = decoding.ctc_decoder_predictions_tensor( packed_batch, decoder_lengths=probs_lens, return_hypotheses=True, @@ -335,9 +335,9 @@ def main(cfg: EvalBeamSearchNGramConfig): preds = np.argmax(probs, axis=1) preds_tensor = torch.tensor(preds, device='cpu').unsqueeze(0) if isinstance(asr_model, EncDecHybridRNNTCTCModel): - pred_text = asr_model.ctc_decoding.ctc_decoder_predictions_tensor(preds_tensor)[0][0] + pred_text = asr_model.ctc_decoding.ctc_decoder_predictions_tensor(preds_tensor)[0] else: - pred_text = asr_model._wer.decoding.ctc_decoder_predictions_tensor(preds_tensor)[0][0] + pred_text = asr_model._wer.decoding.ctc_decoder_predictions_tensor(preds_tensor)[0] if cfg.text_processing.do_lowercase: pred_text = punctuation_capitalization.do_lowercase([pred_text])[0] diff --git a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py index c61a402c0942..57bf9db6f3bd 100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py @@ -176,11 +176,12 @@ def decoding_step( packed_batch[prob_index, :, : probs_lens[prob_index]] = torch.tensor( probs_batch[prob_index].unsqueeze(0), device=packed_batch.device, dtype=packed_batch.dtype ) - best_hyp_batch, beams_batch = model.decoding.rnnt_decoder_predictions_tensor( + beams_batch = model.decoding.rnnt_decoder_predictions_tensor( packed_batch, probs_lens, return_hypotheses=True, ) + best_hyp_batch = [dec_hyp[0] for dec_hyp in beams_batch] if cfg.decoding_strategy == "greedy_batch": beams_batch = [[x] for x in best_hyp_batch] diff --git a/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py b/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py index 63ab24b0921e..0dd61b2d9afd 100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py @@ -325,9 +325,9 @@ def main(cfg: EvalWFSTNGramConfig): preds_tensor = preds.to(device='cpu').unsqueeze(0) preds_lens = torch.tensor([preds_tensor.shape[1]], device='cpu') if isinstance(asr_model, EncDecHybridRNNTCTCModel): - pred_text = asr_model.ctc_decoding.ctc_decoder_predictions_tensor(preds_tensor, preds_lens)[0][0] + pred_text = asr_model.ctc_decoding.ctc_decoder_predictions_tensor(preds_tensor, preds_lens)[0] else: - pred_text = asr_model.decoding.ctc_decoder_predictions_tensor(preds_tensor, preds_lens)[0][0] + pred_text = asr_model.decoding.ctc_decoder_predictions_tensor(preds_tensor, preds_lens)[0] if cfg.text_processing.do_lowercase: pred_text = punctuation_capitalization.do_lowercase([pred_text])[0] diff --git a/tests/collections/asr/decoding/test_batched_hyps_and_alignments.py b/tests/collections/asr/decoding/test_batched_hyps_and_alignments.py index 60fd88144230..c2c0d5969ae2 100644 --- 
a/tests/collections/asr/decoding/test_batched_hyps_and_alignments.py +++ b/tests/collections/asr/decoding/test_batched_hyps_and_alignments.py @@ -46,10 +46,10 @@ class TestBatchedHyps: @pytest.mark.parametrize("device", DEVICES) def test_instantiate(self, device: torch.device): hyps = BatchedHyps(batch_size=2, init_length=3, device=device) - assert torch.is_tensor(hyps.timesteps) + assert torch.is_tensor(hyps.timestamps) # device: for mps device we need to use `type`, not directly compare - assert hyps.timesteps.device.type == device.type - assert hyps.timesteps.shape == (2, 3) + assert hyps.timestamps.device.type == device.type + assert hyps.timestamps.shape == (2, 3) @pytest.mark.unit @pytest.mark.parametrize("batch_size", [-1, 0]) @@ -76,10 +76,10 @@ def test_add_results(self, device: torch.device): ) assert hyps.current_lengths.tolist() == [1, 0] assert hyps.transcript.tolist()[0][:1] == [5] - assert hyps.timesteps.tolist()[0][:1] == [1] + assert hyps.timestamps.tolist()[0][:1] == [1] assert hyps.scores.tolist() == pytest.approx([0.5, 0.0]) - assert hyps.last_timestep.tolist() == [1, -1] - assert hyps.last_timestep_lasts.tolist() == [1, 0] + assert hyps.last_timestamp.tolist() == [1, -1] + assert hyps.last_timestamp_lasts.tolist() == [1, 0] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -101,11 +101,11 @@ def test_add_multiple_results(self, device: torch.device): assert hyps.current_lengths.tolist() == [2, 1] assert hyps.transcript.tolist()[0][:2] == [5, 2] assert hyps.transcript.tolist()[1][:1] == [4] - assert hyps.timesteps.tolist()[0][:2] == [1, 1] - assert hyps.timesteps.tolist()[1][:1] == [2] + assert hyps.timestamps.tolist()[0][:2] == [1, 1] + assert hyps.timestamps.tolist()[1][:1] == [2] assert hyps.scores.tolist() == pytest.approx([1.5, 1.0]) - assert hyps.last_timestep.tolist() == [1, 2] - assert hyps.last_timestep_lasts.tolist() == [2, 1] + assert hyps.last_timestamp.tolist() == [1, 2] + assert hyps.last_timestamp_lasts.tolist() == [2, 1] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -117,14 +117,17 @@ def test_add_results_masked(self, device: torch.device): scores = torch.tensor([0.5, 10.0], device=device) labels = torch.tensor([5, 1], device=device) hyps.add_results_masked_( - active_mask=active_mask, labels=labels, time_indices=time_indices, scores=scores, + active_mask=active_mask, + labels=labels, + time_indices=time_indices, + scores=scores, ) assert hyps.current_lengths.tolist() == [1, 0] assert hyps.transcript.tolist()[0][:1] == [5] - assert hyps.timesteps.tolist()[0][:1] == [1] + assert hyps.timestamps.tolist()[0][:1] == [1] assert hyps.scores.tolist() == pytest.approx([0.5, 0.0]) # last score should be ignored! 
- assert hyps.last_timestep.tolist() == [1, -1] - assert hyps.last_timestep_lasts.tolist() == [1, 0] + assert hyps.last_timestamp.tolist() == [1, -1] + assert hyps.last_timestamp_lasts.tolist() == [1, 0] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -138,14 +141,17 @@ def test_add_results_masked_no_checks(self, device: torch.device): # check there are no blocking operations with avoid_sync_operations(device=device): hyps.add_results_masked_no_checks_( - active_mask=active_mask, labels=labels, time_indices=time_indices, scores=scores, + active_mask=active_mask, + labels=labels, + time_indices=time_indices, + scores=scores, ) assert hyps.current_lengths.tolist() == [1, 0] assert hyps.transcript.tolist()[0][:1] == [5] - assert hyps.timesteps.tolist()[0][:1] == [1] + assert hyps.timestamps.tolist()[0][:1] == [1] assert hyps.scores.tolist() == pytest.approx([0.5, 0.0]) # last score should be ignored! - assert hyps.last_timestep.tolist() == [1, -1] - assert hyps.last_timestep_lasts.tolist() == [1, 0] + assert hyps.last_timestamp.tolist() == [1, -1] + assert hyps.last_timestamp_lasts.tolist() == [1, 0] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -167,11 +173,11 @@ def test_add_multiple_results_masked(self, device: torch.device): assert hyps.current_lengths.tolist() == [2, 1] assert hyps.transcript.tolist()[0][:2] == [5, 2] assert hyps.transcript.tolist()[1][:1] == [4] - assert hyps.timesteps.tolist()[0][:2] == [1, 1] - assert hyps.timesteps.tolist()[1][:1] == [2] + assert hyps.timestamps.tolist()[0][:2] == [1, 1] + assert hyps.timestamps.tolist()[1][:1] == [2] assert hyps.scores.tolist() == pytest.approx([1.5, 1.0]) - assert hyps.last_timestep.tolist() == [1, 2] - assert hyps.last_timestep_lasts.tolist() == [2, 1] + assert hyps.last_timestamp.tolist() == [1, 2] + assert hyps.last_timestamp_lasts.tolist() == [2, 1] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -252,7 +258,7 @@ def test_add_results(self, device: torch.device): ) assert alignments.current_lengths.tolist() == [1, 1] assert torch.allclose(alignments.logits[:, 0], sample_logits[:, 0]) - assert alignments.timesteps[:, 0].tolist() == [0, 0] + assert alignments.timestamps[:, 0].tolist() == [0, 0] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -294,7 +300,7 @@ def test_add_results_masked(self, device: torch.device): ) assert alignments.current_lengths.tolist() == [1, 1] assert torch.allclose(alignments.logits[:, 0], sample_logits[:, 0]) - assert alignments.timesteps[:, 0].tolist() == [0, 0] + assert alignments.timestamps[:, 0].tolist() == [0, 0] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -313,7 +319,7 @@ def test_add_results_masked_no_checks(self, device: torch.device): ) assert alignments.current_lengths.tolist() == [1, 1] assert torch.allclose(alignments.logits[:, 0], sample_logits[:, 0]) - assert alignments.timesteps[:, 0].tolist() == [0, 0] + assert alignments.timestamps[:, 0].tolist() == [0, 0] @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -382,8 +388,8 @@ def test_convert_to_hypotheses(self, device: torch.device): assert (hypotheses[1].y_sequence == torch.tensor([4], device=device)).all() assert hypotheses[0].score == pytest.approx(1.5) assert hypotheses[1].score == pytest.approx(1.0) - assert (hypotheses[0].timestep == torch.tensor([1, 1], device=device)).all() - assert (hypotheses[1].timestep == torch.tensor([2], device=device)).all() + assert (hypotheses[0].timestamp == torch.tensor([1, 1], device=device)).all() + 
assert (hypotheses[1].timestamp == torch.tensor([2], device=device)).all() @pytest.mark.unit @pytest.mark.parametrize("device", DEVICES) @@ -442,8 +448,8 @@ def test_convert_to_hypotheses_with_alignments(self, device: torch.device): assert (hypotheses[1].y_sequence == torch.tensor([4], device=device)).all() assert hypotheses[0].score == pytest.approx(1.5) assert hypotheses[1].score == pytest.approx(1.0) - assert (hypotheses[0].timestep == torch.tensor([0, 1], device=device)).all() - assert (hypotheses[1].timestep == torch.tensor([1], device=device)).all() + assert (hypotheses[0].timestamp == torch.tensor([0, 1], device=device)).all() + assert (hypotheses[1].timestamp == torch.tensor([1], device=device)).all() etalon = [ [ @@ -462,7 +468,7 @@ def test_convert_to_hypotheses_with_alignments(self, device: torch.device): ], ] for batch_i in range(batch_size): - for t, group_for_timestep in enumerate(etalon[batch_i]): - for step, (label, current_logits) in enumerate(group_for_timestep): + for t, group_for_timestamp in enumerate(etalon[batch_i]): + for step, (label, current_logits) in enumerate(group_for_timestamp): assert torch.allclose(hypotheses[batch_i].alignments[t][step][0], current_logits) assert hypotheses[batch_i].alignments[t][step][1] == label diff --git a/tests/collections/asr/decoding/test_ctc_decoding.py b/tests/collections/asr/decoding/test_ctc_decoding.py index 7a16db4324bc..e122dd5a3fdd 100644 --- a/tests/collections/asr/decoding/test_ctc_decoding.py +++ b/tests/collections/asr/decoding/test_ctc_decoding.py @@ -49,16 +49,16 @@ def register_artifact(self, _, vocab_path): def check_char_timestamps(hyp: Hypothesis, decoding: CTCDecoding): - assert hyp.timestep is not None - assert isinstance(hyp.timestep, dict) - assert 'timestep' in hyp.timestep - assert 'char' in hyp.timestep - assert 'word' in hyp.timestep - assert 'segment' in hyp.timestep + assert hyp.timestamp is not None + assert isinstance(hyp.timestamp, dict) + assert 'timestep' in hyp.timestamp + assert 'char' in hyp.timestamp + assert 'word' in hyp.timestamp + assert 'segment' in hyp.timestamp words = hyp.text.split(decoding.word_seperator) words = list(filter(lambda x: x != '', words)) - assert len(hyp.timestep['word']) == len(words) + assert len(hyp.timestamp['word']) == len(words) segments = [] segment = [] @@ -72,20 +72,20 @@ def check_char_timestamps(hyp: Hypothesis, decoding: CTCDecoding): if segment: segments.append(' '.join(segment)) - assert len(hyp.timestep['segment']) == len(segments) + assert len(hyp.timestamp['segment']) == len(segments) def check_subword_timestamps(hyp: Hypothesis, decoding: CTCBPEDecoding): - assert hyp.timestep is not None - assert isinstance(hyp.timestep, dict) - assert 'timestep' in hyp.timestep - assert 'char' in hyp.timestep - assert 'word' in hyp.timestep - assert 'segment' in hyp.timestep + assert hyp.timestamp is not None + assert isinstance(hyp.timestamp, dict) + assert 'timestep' in hyp.timestamp + assert 'char' in hyp.timestamp + assert 'word' in hyp.timestamp + assert 'segment' in hyp.timestamp chars = list(hyp.text) chars = list(filter(lambda x: x not in ['', ' ', '#'], chars)) - all_chars = [list(decoding.tokenizer.tokens_to_text(data['char'])) for data in hyp.timestep['char']] + all_chars = [list(decoding.tokenizer.tokens_to_text(data['char'])) for data in hyp.timestamp['char']] all_chars = [char for subword in all_chars for char in subword] all_chars = list(filter(lambda x: x not in ['', ' ', '#'], all_chars)) assert len(chars) == len(all_chars) @@ -94,7 +94,7 @@ def 
check_subword_timestamps(hyp: Hypothesis, decoding: CTCBPEDecoding): if not hyp.text or hyp.text[-1] not in decoding.segment_seperators: segments_count += 1 - assert len(hyp.timestep['segment']) == segments_count + assert len(hyp.timestamp['segment']) == segments_count class TestCTCDecoding: @@ -125,9 +125,10 @@ def test_char_decoding_greedy_forward( length = torch.randint(low=1, high=T, size=[B]) with torch.no_grad(): - texts, _ = decoding.ctc_decoder_predictions_tensor( + hypotheses = decoding.ctc_decoder_predictions_tensor( input_signal, length, fold_consecutive=True, return_hypotheses=False ) + texts = [hyp.text for hyp in hypotheses] for text in texts: assert isinstance(text, str) @@ -146,7 +147,7 @@ def test_char_decoding_greedy_forward_hypotheses(self, alignments, timestamps): length = torch.randint(low=1, high=T, size=[B]) with torch.no_grad(): - hyps, _ = decoding.ctc_decoder_predictions_tensor( + hyps = decoding.ctc_decoder_predictions_tensor( input_signal, length, fold_consecutive=True, return_hypotheses=True ) @@ -177,9 +178,10 @@ def test_subword_decoding_greedy_forward(self, tmp_tokenizer): length = torch.randint(low=1, high=T, size=[B]) with torch.no_grad(): - texts, _ = decoding.ctc_decoder_predictions_tensor( + hypotheses = decoding.ctc_decoder_predictions_tensor( input_signal, length, fold_consecutive=True, return_hypotheses=False ) + texts = [hyp.text for hyp in hypotheses] for text in texts: assert isinstance(text, str) @@ -197,7 +199,7 @@ def test_subword_decoding_greedy_forward_hypotheses(self, tmp_tokenizer, alignme length = torch.randint(low=1, high=T, size=[B]) with torch.no_grad(): - hyps, _ = decoding.ctc_decoder_predictions_tensor( + hyps = decoding.ctc_decoder_predictions_tensor( input_signal, length, fold_consecutive=True, return_hypotheses=True ) @@ -283,11 +285,11 @@ def test_batched_decoding_logprobs( length = torch.randint(low=1, high=T, size=[B], device=length_device) with torch.inference_mode(): - hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( + hyps = unbatched_decoding.ctc_decoder_predictions_tensor( input_signal, length, fold_consecutive=True, return_hypotheses=True ) - batched_hyps, _ = batched_decoding.ctc_decoder_predictions_tensor( + batched_hyps = batched_decoding.ctc_decoder_predictions_tensor( input_signal, length, fold_consecutive=True, return_hypotheses=True ) @@ -296,7 +298,7 @@ def test_batched_decoding_logprobs( assert torch.abs(hyp.score - batched_hyp.score) <= 1e-5 assert torch.all(hyp.y_sequence == batched_hyp.y_sequence) if timestamps: - assert hyp.timestep == batched_hyp.timestep + assert hyp.timestamp == batched_hyp.timestamp if alignments: assert torch.all(hyp.alignments[0] == batched_hyp.alignments[0]) assert torch.all(hyp.alignments[1] == batched_hyp.alignments[1]) @@ -350,11 +352,11 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none length = torch.randint(low=1, high=T, size=[B], device=length_device) with torch.inference_mode(): - hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( + hyps = unbatched_decoding.ctc_decoder_predictions_tensor( input_labels, length, fold_consecutive=True, return_hypotheses=True ) - batched_hyps, _ = batched_decoding.ctc_decoder_predictions_tensor( + batched_hyps = batched_decoding.ctc_decoder_predictions_tensor( input_labels, length, fold_consecutive=True, return_hypotheses=True ) @@ -363,4 +365,4 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none assert abs(hyp.score - batched_hyp.score) <= 1e-5 assert 
torch.all(hyp.y_sequence == batched_hyp.y_sequence) if timestamps: - assert hyp.timestep == batched_hyp.timestep + assert hyp.timestamp == batched_hyp.timestamp diff --git a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py index 4715f4826493..cb2ebc9d1202 100644 --- a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py +++ b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py @@ -80,14 +80,18 @@ def test_cuda_graph_rnnt_greedy_decoder(model_name, batch_size, enable_bfloat16, audio_filepaths = glob.glob("tests/.data/asr/test/an4/wav/*.wav") with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=enable_bfloat16): - actual_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + actual_hypotheses = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + + actual_transcripts = [hyp.text for hyp in actual_hypotheses] decoding_config["greedy"]["use_cuda_graph_decoder"] = True nemo_model.change_decoding_strategy(decoding_config) with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=enable_bfloat16): - fast_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + fast_hypotheses = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + + fast_transcripts = [hyp.text for hyp in fast_hypotheses] wer = jiwer.wer(actual_transcripts, fast_transcripts) @@ -136,7 +140,8 @@ def test_loop_labels_cuda_graph_rnnt_greedy_decoder_forced_mode( audio_filepaths = glob.glob("tests/.data/asr/test/an4/wav/*.wav") with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=enable_bfloat16): - actual_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + actual_hypotheses = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + actual_transcripts = [hyp.text for hyp in actual_hypotheses] # transcribe with use implementation with cuda graphs decoding_config["greedy"]["use_cuda_graph_decoder"] = True @@ -144,7 +149,8 @@ def test_loop_labels_cuda_graph_rnnt_greedy_decoder_forced_mode( nemo_model.decoding.decoding._decoding_computer.force_cuda_graphs_mode(mode=force_mode) with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=enable_bfloat16): - fast_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + fast_hypotheses = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + fast_transcripts = [hyp.text for hyp in fast_hypotheses] wer = jiwer.wer(actual_transcripts, fast_transcripts) @@ -185,7 +191,8 @@ def test_change_devices(loop_labels: bool, stt_en_fastconformer_transducer_xlarg nemo_model.to(first_device) audio_filepaths = glob.glob("tests/.data/asr/test/an4/wav/*.wav") with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=True): - second_device_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + second_device_hypotheses = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + second_device_transcripts = [hyp.text for hyp in second_device_hypotheses] # Test that the model can run successfully back on second_device # after having been first run on first_device. 
Because the @@ -195,7 +202,8 @@ def test_change_devices(loop_labels: bool, stt_en_fastconformer_transducer_xlarg nemo_model.to(second_device) with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=True): - first_device_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + first_device_hypotheses = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + first_device_transcripts = [hyp.text for hyp in first_device_hypotheses] # Sanity check: The device we run on should not change execution # output. assert first_device_transcripts == second_device_transcripts diff --git a/tests/collections/asr/decoding/test_rnnt_alignments.py b/tests/collections/asr/decoding/test_rnnt_alignments.py index 5c43af28b1d4..d94e834aba05 100644 --- a/tests/collections/asr/decoding/test_rnnt_alignments.py +++ b/tests/collections/asr/decoding/test_rnnt_alignments.py @@ -83,7 +83,7 @@ def get_rnnt_alignments( num_workers=cfg.num_workers, return_hypotheses=True, channel_selector=cfg.channel_selector, - )[0] + ) for transcription in transcriptions: for align_elem, frame_confidence in zip(transcription.alignments, transcription.frame_confidence): diff --git a/tests/collections/asr/decoding/test_rnnt_decoding.py b/tests/collections/asr/decoding/test_rnnt_decoding.py index b5250ad5f144..9da09cf0c4fa 100644 --- a/tests/collections/asr/decoding/test_rnnt_decoding.py +++ b/tests/collections/asr/decoding/test_rnnt_decoding.py @@ -119,16 +119,16 @@ def decode_text_from_nbest_hypotheses(hyps, decoding): def check_char_timestamps(hyp: rnnt_utils.Hypothesis, decoding: RNNTDecoding): - assert hyp.timestep is not None - assert isinstance(hyp.timestep, dict) - assert 'timestep' in hyp.timestep - assert 'char' in hyp.timestep - assert 'word' in hyp.timestep - assert 'segment' in hyp.timestep + assert hyp.timestamp is not None + assert isinstance(hyp.timestamp, dict) + assert 'timestep' in hyp.timestamp + assert 'char' in hyp.timestamp + assert 'word' in hyp.timestamp + assert 'segment' in hyp.timestamp words = hyp.text.split(decoding.word_seperator) words = list(filter(lambda x: x != '', words)) - assert len(hyp.timestep['word']) == len(words) + assert len(hyp.timestamp['word']) == len(words) segments = [] segment = [] @@ -142,20 +142,20 @@ def check_char_timestamps(hyp: rnnt_utils.Hypothesis, decoding: RNNTDecoding): if segment: segments.append(' '.join(segment)) - assert len(hyp.timestep['segment']) == len(segments) + assert len(hyp.timestamp['segment']) == len(segments) def check_subword_timestamps(hyp: rnnt_utils.Hypothesis, decoding: RNNTBPEDecoding): - assert hyp.timestep is not None - assert isinstance(hyp.timestep, dict) - assert 'timestep' in hyp.timestep - assert 'char' in hyp.timestep - assert 'word' in hyp.timestep - assert 'segment' in hyp.timestep + assert hyp.timestamp is not None + assert isinstance(hyp.timestamp, dict) + assert 'timestep' in hyp.timestamp + assert 'char' in hyp.timestamp + assert 'word' in hyp.timestamp + assert 'segment' in hyp.timestamp chars = list(hyp.text) chars = list(filter(lambda x: x not in ['', ' ', '#'], chars)) - all_chars = [list(decoding.tokenizer.tokens_to_text(data['char'])) for data in hyp.timestep['char']] + all_chars = [list(decoding.tokenizer.tokens_to_text(data['char'])) for data in hyp.timestamp['char']] all_chars = [char for subword in all_chars for char in subword] all_chars = list(filter(lambda x: x not in ['', ' ', '#'], all_chars)) assert len(chars) == len(all_chars) @@ -164,7 +164,7 @@ def 
check_subword_timestamps(hyp: rnnt_utils.Hypothesis, decoding: RNNTBPEDecodi if not hyp.text or hyp.text[-1] not in decoding.segment_seperators: segments_count += 1 - assert len(hyp.timestep['segment']) == segments_count + assert len(hyp.timestamp['segment']) == segments_count def check_beam_decoding(test_data_dir, beam_config): @@ -195,8 +195,8 @@ def check_beam_decoding(test_data_dir, beam_config): for idx, hyp_ in enumerate(all_hyps): print("Hyp index", idx + 1, "text :", hyp_.text) - assert len(hyp_.timestep) > 0 - print("Timesteps", hyp_.timestep) + assert len(hyp_.timestamp) > 0 + print("Timesteps", hyp_.timestamp) print() @@ -258,7 +258,7 @@ def test_greedy_decoding_preserve_alignments(self, test_data_dir): t_u.append(int(label)) - print(f"Tokens at timestep {t} = {t_u}") + print(f"Tokens at timestamp {t} = {t_u}") print() @pytest.mark.skipif( @@ -396,15 +396,15 @@ def test_rnnt_beam_decoding_preserve_alignments(self, test_data_dir, beam_config if len(t_u) > 1: assert t_u[-1] == blank_id - # No blank token should be present in the current timestep other than at the end + # No blank token should be present in the current timestamp other than at the end for token in t_u[:-1]: assert token != blank_id - print(f"Tokens at timestep {t} = {t_u}") + print(f"Tokens at timestamp {t} = {t_u}") print() - assert len(hyp_.timestep) > 0 - print("Timesteps", hyp_.timestep) + assert len(hyp_.timestamp) > 0 + print("Timesteps", hyp_.timestamp) print() @pytest.mark.skipif( @@ -438,9 +438,11 @@ def test_subword_decoding_compute_timestamps(self, test_data_dir, decoding_strat decoding_cfg=cfg, decoder=model.decoder, joint=model.joint, tokenizer=model.tokenizer ) - hyps, _ = decoding.rnnt_decoder_predictions_tensor(encoded, encoded_len, return_hypotheses=True) - - check_subword_timestamps(hyps[0], decoding) + hyps = decoding.rnnt_decoder_predictions_tensor(encoded, encoded_len, return_hypotheses=True) + if isinstance(hyps[0], list): + check_subword_timestamps(hyps[0][0], decoding) + else: + check_subword_timestamps(hyps[0], decoding) @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, @@ -473,9 +475,12 @@ def test_char_decoding_compute_timestamps(self, test_data_dir, decoding_strategy decoding = RNNTDecoding(decoding_cfg=cfg, decoder=model.decoder, joint=model.joint, vocabulary=vocab) - hyps, _ = decoding.rnnt_decoder_predictions_tensor(encoded, encoded_len, return_hypotheses=True) + hyps = decoding.rnnt_decoder_predictions_tensor(encoded, encoded_len, return_hypotheses=True) - check_char_timestamps(hyps[0], decoding) + if isinstance(hyps[0], list): + check_char_timestamps(hyps[0][0], decoding) + else: + check_char_timestamps(hyps[0], decoding) @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, diff --git a/tests/collections/asr/mixins/test_transcription.py b/tests/collections/asr/mixins/test_transcription.py index 6e2d5fe16c68..ad6613dd1d10 100644 --- a/tests/collections/asr/mixins/test_transcription.py +++ b/tests/collections/asr/mixins/test_transcription.py @@ -334,7 +334,7 @@ def test_transcribe_tensor(self, audio_files, fast_conformer_ctc_model): # Numpy array test outputs = fast_conformer_ctc_model.transcribe(audio, batch_size=1) assert len(outputs) == 1 - assert isinstance(outputs[0], str) + assert isinstance(outputs[0], Hypothesis) @pytest.mark.with_downloads() @pytest.mark.unit @@ -347,8 +347,8 @@ def test_transcribe_multiple_tensor(self, audio_files, fast_conformer_ctc_model) # Numpy array test outputs = fast_conformer_ctc_model.transcribe([audio, audio_2], batch_size=2) assert len(outputs) == 
2 - assert isinstance(outputs[0], str) - assert isinstance(outputs[1], str) + assert isinstance(outputs[0], Hypothesis) + assert isinstance(outputs[1], Hypothesis) @pytest.mark.with_downloads() @pytest.mark.unit @@ -363,8 +363,8 @@ def test_transcribe_dataloader(self, audio_files, fast_conformer_ctc_model): # DataLoader test outputs = fast_conformer_ctc_model.transcribe(dataloader, batch_size=1) assert len(outputs) == 2 - assert isinstance(outputs[0], str) - assert isinstance(outputs[1], str) + assert isinstance(outputs[0], Hypothesis) + assert isinstance(outputs[1], Hypothesis) @pytest.mark.with_downloads() @pytest.mark.unit @@ -383,8 +383,8 @@ def test_timestamps_with_transcribe(self, audio_files, fast_conformer_ctc_model) assert output[1].text == 'start' # check timestamp - assert output[0].timestep['segment'][0]['start'] == pytest.approx(0.4) - assert output[0].timestep['segment'][0]['end'] == pytest.approx(0.48) + assert output[0].timestamp['segment'][0]['start'] == pytest.approx(0.4) + assert output[0].timestamp['segment'][0]['end'] == pytest.approx(0.48) @pytest.mark.with_downloads() @pytest.mark.unit @@ -396,8 +396,6 @@ def test_timestamps_with_transcribe_hybrid(self, audio_files, fast_conformer_hyb # check len of output assert len(output) == 2 - output = output[1] # Transducer returns tuple - # check hypothesis object assert isinstance(output[0], Hypothesis) # check transcript @@ -405,5 +403,5 @@ def test_timestamps_with_transcribe_hybrid(self, audio_files, fast_conformer_hyb assert output[1].text == 'Start.' # check timestamp - assert output[0].timestep['segment'][0]['start'] == pytest.approx(0.48) - assert output[0].timestep['segment'][0]['end'] == pytest.approx(0.72) + assert output[0].timestamp['segment'][0]['start'] == pytest.approx(0.48) + assert output[0].timestamp['segment'][0]['end'] == pytest.approx(0.72) diff --git a/tests/collections/asr/test_asr_classification_model.py b/tests/collections/asr/test_asr_classification_model.py index 3888cb30204c..f41c36219142 100644 --- a/tests/collections/asr/test_asr_classification_model.py +++ b/tests/collections/asr/test_asr_classification_model.py @@ -52,7 +52,10 @@ def speech_classification_model(): decoder = { 'cls': 'nemo.collections.asr.modules.ConvASRDecoderClassification', - 'params': {'feat_in': 32, 'num_classes': 30,}, + 'params': { + 'feat_in': 32, + 'num_classes': 30, + }, } modelConfig = DictConfig( @@ -95,7 +98,10 @@ def frame_classification_model(): decoder = { 'cls': 'nemo.collections.common.parts.MultiLayerPerceptron', - 'params': {'hidden_size': 32, 'num_classes': 5,}, + 'params': { + 'hidden_size': 32, + 'num_classes': 5, + }, } modelConfig = DictConfig( diff --git a/tests/collections/asr/test_asr_context_biasing.py b/tests/collections/asr/test_asr_context_biasing.py index b23b12655a8d..78261b65c912 100644 --- a/tests/collections/asr/test_asr_context_biasing.py +++ b/tests/collections/asr/test_asr_context_biasing.py @@ -118,7 +118,7 @@ def test_merge_alignment_with_ws_hyps(self, conformer_ctc_bpe_model): preds = rnnt_utils.Hypothesis( y_sequence=torch.tensor([120, 29]), score=0.0, - timestep=torch.tensor([0, 1, 2, 3]), + timestamp=torch.tensor([0, 1, 2, 3]), ) pred_text, raw_text = context_biasing.merge_alignment_with_ws_hyps( preds, @@ -134,7 +134,7 @@ def test_merge_alignment_with_ws_hyps(self, conformer_ctc_bpe_model): preds = rnnt_utils.Hypothesis( y_sequence=[], score=0.0, - timestep=[], + timestamp=[], ) pred_text, raw_text = context_biasing.merge_alignment_with_ws_hyps( preds, diff --git 
a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py index 02442291a918..eac5041de2b3 100644 --- a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py +++ b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py @@ -29,6 +29,7 @@ from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.parts.submodules import ctc_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common import tokenizers from nemo.utils.config_utils import assert_dataclass_signature_match @@ -131,7 +132,7 @@ def test_predict_step(self, asr_model): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1], Hypothesis) @pytest.mark.with_downloads() @pytest.mark.unit diff --git a/tests/collections/asr/test_asr_ctcencdec_model.py b/tests/collections/asr/test_asr_ctcencdec_model.py index 55451758578f..ae131abd3d48 100644 --- a/tests/collections/asr/test_asr_ctcencdec_model.py +++ b/tests/collections/asr/test_asr_ctcencdec_model.py @@ -24,6 +24,7 @@ from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models import EncDecCTCModel, configs from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common.parts.preprocessing.parsers import make_parser from nemo.utils.config_utils import assert_dataclass_signature_match, update_model_config @@ -146,7 +147,7 @@ def test_predict_step(self, asr_model): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1], Hypothesis) @pytest.mark.unit def test_vocab_change(self, asr_model): diff --git a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py index d13c879e47f9..c75de6064e51 100644 --- a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py +++ b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py @@ -27,6 +27,7 @@ from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules import rnnt_greedy_decoding as greedy_decode from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common import tokenizers from nemo.core.utils import numba_utils from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ @@ -179,7 +180,7 @@ def test_predict_step(self, hybrid_asr_model): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1], Hypothesis) @pytest.mark.with_downloads() @pytest.mark.skipif( diff --git a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py index b5c34e197237..456d7450eeba 100644 --- a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py +++ b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py @@ -179,7 +179,7 @@ def 
test_predict_step(self, hybrid_asr_model): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1], rnnt_utils.Hypothesis) @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, @@ -563,10 +563,10 @@ def test_greedy_decoding_preserve_alignment(self, greedy_class, loop_labels: Opt assert torch.is_tensor(logp) assert torch.is_tensor(label) - @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, - reason='RNNTLoss has not been compiled with appropriate numba version.', - ) + # @pytest.mark.skipif( + # not NUMBA_RNNT_LOSS_AVAILABLE, + # reason='RNNTLoss has not been compiled with appropriate numba version.', + # ) @pytest.mark.unit @pytest.mark.parametrize( "beam_config", diff --git a/tests/collections/asr/test_asr_metrics.py b/tests/collections/asr/test_asr_metrics.py index efa11b254517..a87622d60a07 100644 --- a/tests/collections/asr/test_asr_metrics.py +++ b/tests/collections/asr/test_asr_metrics.py @@ -219,7 +219,7 @@ def test_wer_metric_return_hypothesis(self, batch_dim_index, test_wer_bpe): # pass batchsize 1 tensor, get back list of length 1 Hypothesis wer.decoding.preserve_alignments = True - hyp, _ = wer.decoding.ctc_decoder_predictions_tensor(tensor, return_hypotheses=True) + hyp = wer.decoding.ctc_decoder_predictions_tensor(tensor, return_hypotheses=True) hyp = hyp[0] assert isinstance(hyp, Hypothesis) @@ -233,7 +233,7 @@ def test_wer_metric_return_hypothesis(self, batch_dim_index, test_wer_bpe): length = torch.tensor([tensor.shape[1 - batch_dim_index]], dtype=torch.long) # pass batchsize 1 tensor, get back list of length 1 Hypothesis [add length info] - hyp, _ = wer.decoding.ctc_decoder_predictions_tensor(tensor, decoder_lengths=length, return_hypotheses=True) + hyp = wer.decoding.ctc_decoder_predictions_tensor(tensor, decoder_lengths=length, return_hypotheses=True) hyp = hyp[0] assert isinstance(hyp, Hypothesis) assert hyp.length == 3 @@ -251,7 +251,7 @@ def test_wer_metric_subword_return_hypothesis(self, batch_dim_index, test_wer_bp # pass batchsize 1 tensor, get back list of length 1 Hypothesis wer.decoding.preserve_alignments = True - hyp, _ = wer.decoding.ctc_decoder_predictions_tensor(tensor, return_hypotheses=True) + hyp = wer.decoding.ctc_decoder_predictions_tensor(tensor, return_hypotheses=True) hyp = hyp[0] assert isinstance(hyp, Hypothesis) @@ -265,13 +265,15 @@ def test_wer_metric_subword_return_hypothesis(self, batch_dim_index, test_wer_bp length = torch.tensor([tensor.shape[1 - batch_dim_index]], dtype=torch.long) # pass batchsize 1 tensor, get back list of length 1 Hypothesis [add length info] - hyp, _ = wer.decoding.ctc_decoder_predictions_tensor(tensor, decoder_lengths=length, return_hypotheses=True) + hyp = wer.decoding.ctc_decoder_predictions_tensor(tensor, decoder_lengths=length, return_hypotheses=True) hyp = hyp[0] assert isinstance(hyp, Hypothesis) assert hyp.length == 3 def get_wer_ctc(self, prediction: str, reference: str, test_wer_bpe: bool): - ctc_decoder_predictions_tensor_mock = Mock(return_value=([prediction], None)) + ctc_decoder_predictions_tensor_mock = Mock( + return_value=[Hypothesis(score=1.0, y_sequence=[], text=prediction)] + ) if test_wer_bpe: decoding = Mock( blank_id=self.char_tokenizer.tokenizer.vocab_size, @@ -307,7 +309,9 @@ def decode_token_to_str_with_vocabulary_mock(self, ids): return ''.join([self.vocabulary[id_] for id_ in ids]) def get_wer_rnnt(self, prediction: str, reference: str, batch_dim_index: int, test_wer_bpe: bool): 
- rnnt_decoder_predictions_tensor_mock = Mock(return_value=([prediction], None)) + rnnt_decoder_predictions_tensor_mock = Mock( + return_value=[Hypothesis(score=1.0, y_sequence=[], text=prediction)] + ) if test_wer_bpe: decoding = Mock( blank_id=self.char_tokenizer.tokenizer.vocab_size, @@ -385,24 +389,24 @@ def test_char_decoding_logprobs(self): decoding_cfg = CTCDecodingConfig() decoding = CTCDecoding(decoding_cfg, vocabulary=self.vocabulary) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 0 + assert len(hyp.timestamp) == 0 assert hyp.alignments is None # Preserve timestamps and alignments decoding_cfg = CTCDecodingConfig(preserve_alignments=True, compute_timestamps=True) decoding = CTCDecoding(decoding_cfg, vocabulary=self.vocabulary) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 4 + assert len(hyp.timestamp) == 4 assert hyp.alignments is not None @pytest.mark.unit @@ -416,24 +420,24 @@ def test_subword_decoding_logprobs(self): decoding_cfg = CTCBPEDecodingConfig() decoding = CTCBPEDecoding(decoding_cfg, tokenizer=self.char_tokenizer) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 0 + assert len(hyp.timestamp) == 0 assert hyp.alignments is None # Preserve timestamps and alignments decoding_cfg = CTCBPEDecodingConfig(preserve_alignments=True, compute_timestamps=True) decoding = CTCBPEDecoding(decoding_cfg, tokenizer=self.char_tokenizer) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 4 + assert len(hyp.timestamp) == 4 assert hyp.alignments is not None @pytest.mark.unit @@ -447,12 +451,12 @@ def test_char_decoding_labels(self): decoding_cfg = CTCDecodingConfig() decoding = CTCDecoding(decoding_cfg, vocabulary=self.vocabulary) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 0 + assert len(hyp.timestamp) == 0 assert hyp.alignments is None # Preserve timestamps and alignments 
@@ -461,18 +465,18 @@ def test_char_decoding_labels(self): # Cannot compute alignments from labels with pytest.raises(ValueError): - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) # Preserve timestamps decoding_cfg = CTCDecodingConfig(preserve_alignments=False, compute_timestamps=True) decoding = CTCDecoding(decoding_cfg, vocabulary=self.vocabulary) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 4 + assert len(hyp.timestamp) == 4 assert hyp.alignments is None @pytest.mark.unit @@ -486,24 +490,24 @@ def test_subword_decoding_logprobs(self): decoding_cfg = CTCBPEDecodingConfig() decoding = CTCBPEDecoding(decoding_cfg, tokenizer=self.char_tokenizer) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 0 + assert len(hyp.timestamp) == 0 assert hyp.alignments is None # Preserve timestamps and alignments decoding_cfg = CTCBPEDecodingConfig(preserve_alignments=True, compute_timestamps=True) decoding = CTCBPEDecoding(decoding_cfg, tokenizer=self.char_tokenizer) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 4 + assert len(hyp.timestamp) == 4 assert hyp.alignments is not None @pytest.mark.unit @@ -517,12 +521,12 @@ def test_subword_decoding_labels(self): decoding_cfg = CTCBPEDecodingConfig() decoding = CTCBPEDecoding(decoding_cfg, tokenizer=self.char_tokenizer) - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 0 + assert len(hyp.timestamp) == 0 assert hyp.alignments is None # Preserve timestamps and alignments @@ -531,16 +535,16 @@ def test_subword_decoding_labels(self): # Cannot compute alignments from labels with pytest.raises(ValueError): - hyp, _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + _ = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) # Preserve timestamps decoding_cfg = CTCBPEDecodingConfig(preserve_alignments=False, compute_timestamps=True) decoding = CTCBPEDecoding(decoding_cfg, tokenizer=self.char_tokenizer) - hyp, _ = 
decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) + hyp = decoding.ctc_decoder_predictions_tensor(decoder_outputs, decoder_lens, return_hypotheses=True) hyp = hyp[0] # type: Hypothesis assert isinstance(hyp.y_sequence, torch.Tensor) assert hyp.length == torch.tensor(T, dtype=torch.int32) assert hyp.text != '' - assert len(hyp.timestep) == 4 + assert len(hyp.timestamp) == 4 assert hyp.alignments is None diff --git a/tests/collections/asr/test_asr_multitask_model_bpe.py b/tests/collections/asr/test_asr_multitask_model_bpe.py index 63185f687fea..5d96ec8de1fa 100644 --- a/tests/collections/asr/test_asr_multitask_model_bpe.py +++ b/tests/collections/asr/test_asr_multitask_model_bpe.py @@ -450,7 +450,7 @@ def test_transcribe_single_file(self, asr_model, test_data_dir): # Numpy array test outputs = asr_model.transcribe(audio_file, batch_size=1) assert len(outputs) == 1 - assert isinstance(outputs[0], str) + assert isinstance(outputs[0].text, str) @pytest.mark.unit def test_transcribe_single_file_translation(self, asr_model, test_data_dir): @@ -459,7 +459,7 @@ def test_transcribe_single_file_translation(self, asr_model, test_data_dir): # Numpy array test outputs = asr_model.transcribe(audio_file, batch_size=1, task="ast", source_lang='en', target_lang='de') assert len(outputs) == 1 - assert isinstance(outputs[0], str) + assert isinstance(outputs[0].text, str) @pytest.mark.unit def test_transcribe_return_hypothesis(self, asr_model, test_data_dir): @@ -486,7 +486,7 @@ def test_transcribe_tensor(self, asr_model, test_data_dir): # Numpy array test outputs = asr_model.transcribe(audio, batch_size=1) assert len(outputs) == 1 - assert isinstance(outputs[0], str) + assert isinstance(outputs[0].text, str) @pytest.mark.unit def test_build_tokenizer(self, asr_model, test_data_dir): @@ -527,7 +527,7 @@ def test_predict_step(self, asr_model, test_data_dir): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1].text, str) @pytest.mark.unit def test_FrameBatchMultiTaskAED(self, asr_model, test_data_dir): diff --git a/tests/collections/asr/test_asr_rnnt_encdec_model.py b/tests/collections/asr/test_asr_rnnt_encdec_model.py index 5e810243c919..07c6adf761ba 100644 --- a/tests/collections/asr/test_asr_rnnt_encdec_model.py +++ b/tests/collections/asr/test_asr_rnnt_encdec_model.py @@ -174,7 +174,7 @@ def joint_after_projection(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tens setup["decoder"] = DummyRNNTDecoder(vocab_size=2, blank_idx=2, blank_as_pad=True) setup["decoder_masked"] = DummyRNNTDecoder(vocab_size=2, blank_idx=2, blank_as_pad=False) setup["joint"] = DummyRNNTJoint(num_outputs=3) - # expected timesteps for max_symbols_per_step=5 are [[0, 0, 0, 0, 0, 1, 1], [1, 1, 1, 1, 1]], + # expected timestamps for max_symbols_per_step=5 are [[0, 0, 0, 0, 0, 1, 1], [1, 1, 1, 1, 1]], # so we have both looped and regular iteration on the second frame setup["encoder_output"] = torch.tensor( [[[1, 0, 0], [0, 1, 0], [0, 0, 1]], [[0, 0, 1], [2, 0, 0], [0, 0, 0]]], dtype=torch.float32 @@ -311,7 +311,7 @@ def test_predict_step(self, asr_model): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1], rnnt_utils.Hypothesis) @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, @@ -836,16 +836,16 @@ def test_greedy_decoding_preserve_alignment(self, greedy_class, 
loop_labels: Opt hyp = greedy(encoder_output=enc_out, encoded_lengths=enc_len)[0][0] # type: rnnt_utils.Hypothesis assert hyp.alignments is not None - timestep_count = { - u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestep), return_counts=True)) + timestamp_count = { + u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestamp), return_counts=True)) } for t in range(len(hyp.alignments)): - # check that the number of alignment elements is consistent with hyp.timestep + # check that the number of alignment elements is consistent with hyp.timestamp alignment_len = len(hyp.alignments[t]) assert alignment_len <= max_symbols_per_step - if t in timestep_count: # non-blank - assert alignment_len == timestep_count[t] + (1 if alignment_len < max_symbols_per_step else 0) + if t in timestamp_count: # non-blank + assert alignment_len == timestamp_count[t] + (1 if alignment_len < max_symbols_per_step else 0) else: # blank assert alignment_len == 1 @@ -908,20 +908,20 @@ def test_greedy_decoding_preserve_frame_confidence(self, greedy_class, loop_labe hyp = greedy(encoder_output=enc_out, encoded_lengths=enc_len)[0][0] # type: rnnt_utils.Hypothesis assert hyp.frame_confidence is not None - timestep_count = { - u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestep), return_counts=True)) + timestamp_count = { + u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestamp), return_counts=True)) } for t in range(len(hyp.frame_confidence)): - # check that the number of confidence elements is consistent with hyp.timestep + # check that the number of confidence elements is consistent with hyp.timestamp confidence_len = len(hyp.frame_confidence[t]) assert confidence_len <= max_symbols_per_step - if t in timestep_count: # non-blank - # if timestep_count[t] less than max_symbols_per_step, + if t in timestamp_count: # non-blank + # if timestamp_count[t] less than max_symbols_per_step, # blank emission and corresponding confidence expected - # if timestep_count[t] == max_symbols_per_step, "forced blank" is not added => no confidence - assert confidence_len == timestep_count[t] + ( - 1 if timestep_count[t] < max_symbols_per_step else 0 + # if timestamp_count[t] == max_symbols_per_step, "forced blank" is not added => no confidence + assert confidence_len == timestamp_count[t] + ( + 1 if timestamp_count[t] < max_symbols_per_step else 0 ) else: # blank assert confidence_len == 1 @@ -969,16 +969,16 @@ def test_greedy_decoding_max_symbols_alignment( hyp = greedy(encoder_output=encoder_output, encoded_lengths=encoded_lengths)[0][0] assert hyp.alignments is not None - timestep_count = { - u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestep), return_counts=True)) + timestamp_count = { + u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestamp), return_counts=True)) } for t in range(len(hyp.alignments)): - # check that the number of confidence elements is consistent with hyp.timestep + # check that the number of confidence elements is consistent with hyp.timestamp alignment_len = len(hyp.alignments[t]) assert alignment_len <= max_symbols_per_step - if t in timestep_count: # non-blank - assert alignment_len == timestep_count[t] + (1 if alignment_len < max_symbols_per_step else 0) + if t in timestamp_count: # non-blank + assert alignment_len == timestamp_count[t] + (1 if alignment_len < max_symbols_per_step else 0) else: # blank or max_symbols_per_step == 0 assert alignment_len <= 1 @@ -1056,16 +1056,16 @@ def 
test_greedy_decoding_max_symbols_confidence( hyp = greedy(encoder_output=encoder_output, encoded_lengths=encoded_lengths)[0][0] assert hyp.frame_confidence is not None - timestep_count = { - u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestep), return_counts=True)) + timestamp_count = { + u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestamp), return_counts=True)) } for t in range(len(hyp.frame_confidence)): - # check that the number of confidence elements is consistent with hyp.timestep + # check that the number of confidence elements is consistent with hyp.timestamp confidence_len = len(hyp.frame_confidence[t]) assert confidence_len <= max_symbols_per_step - if t in timestep_count: # non-blank - assert confidence_len == timestep_count[t] + ( + if t in timestamp_count: # non-blank + assert confidence_len == timestamp_count[t] + ( 1 if confidence_len < max_symbols_per_step else 0 ) else: # blank or max_symbols_per_step == 0 diff --git a/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py b/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py index aba364868e88..be86d5bffbb2 100644 --- a/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py +++ b/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py @@ -27,6 +27,7 @@ from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules import rnnt_greedy_decoding as greedy_decode +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common import tokenizers from nemo.core.utils import numba_utils from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ @@ -191,7 +192,7 @@ def test_predict_step(self, asr_model): assert len(outputs) == 1 assert len(outputs[0]) == 2 assert isinstance(outputs[0][0], MonoCut) - assert isinstance(outputs[0][1], str) + assert isinstance(outputs[0][1], Hypothesis) @pytest.mark.with_downloads() @pytest.mark.skipif( diff --git a/tutorials/asr/ASR_Context_Biasing.ipynb b/tutorials/asr/ASR_Context_Biasing.ipynb index 7171510f4e0d..6c551e00b2bf 100644 --- a/tutorials/asr/ASR_Context_Biasing.ipynb +++ b/tutorials/asr/ASR_Context_Biasing.ipynb @@ -259,10 +259,6 @@ "execution_count": null, "id": "d34ee0ba", "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, "scrolled": true }, "outputs": [], @@ -322,7 +318,7 @@ "\n", "for idx, ref in enumerate(ref_text):\n", " ref = ref.split()\n", - " hyp = recog_results[idx].split()\n", + " hyp = recog_results[idx].text.split()\n", " texterrors_ali = texterrors.align_texts(ref, hyp, False)\n", " ali = []\n", " for i in range(len(texterrors_ali[0])):\n", @@ -898,7 +894,7 @@ " print(f\"[ref text]: {target_transcripts[idx]}\")\n", " else:\n", " # if no spotted words, use standard greedy predictions\n", - " pred_text = ctc_model.wer.decoding.ctc_decoder_predictions_tensor(greedy_predicts)[0][0]" + " pred_text = ctc_model.wer.decoding.ctc_decoder_predictions_tensor(greedy_predicts)[0].text" ] }, { diff --git a/tutorials/asr/ASR_with_NeMo.ipynb b/tutorials/asr/ASR_with_NeMo.ipynb index bb62e2f5eb9d..86625e2565c3 100644 --- a/tutorials/asr/ASR_with_NeMo.ipynb +++ b/tutorials/asr/ASR_with_NeMo.ipynb @@ -1,38 +1,12 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "ASR_with_NeMo.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { 
- "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, "cells": [ { "cell_type": "code", + "execution_count": null, "metadata": { "id": "lJz6FDU1lRzc" }, + "outputs": [], "source": [ "\"\"\"\n", "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", @@ -43,7 +17,9 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect\n", - "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", + "\n", + "\n", + "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", "\"\"\"\n", "# If you're using Google Colab and not running locally, run this cell.\n", "\n", @@ -63,9 +39,7 @@ "that you want to use the \"Run All Cells\" (or similar) option.\n", "\"\"\"\n", "# exit()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -158,9 +132,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "gAhsmi6HlRzh" }, + "outputs": [], "source": [ "import os\n", "# This is where the an4/ directory will be placed.\n", @@ -169,16 +145,16 @@ "\n", "if not os.path.exists(data_dir):\n", " os.makedirs(data_dir)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Yb4fuUvWlRzk", "scrolled": true }, + "outputs": [], "source": [ "import glob\n", "import os\n", @@ -208,9 +184,7 @@ " cmd = [\"sox\", sph_path, wav_path]\n", " subprocess.run(cmd)\n", "print(\"Finished conversion.\\n******\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -225,9 +199,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "_M_bSs3MjQlz" }, + "outputs": [], "source": [ "import librosa\n", "import IPython.display as ipd\n", @@ -237,9 +213,7 @@ "audio, sample_rate = librosa.load(example_file)\n", "\n", "ipd.Audio(example_file, rate=sample_rate)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -254,9 +228,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "MqIAKkqelRzm" }, + "outputs": [], "source": [ "%matplotlib inline\n", "import librosa.display\n", @@ -268,9 +244,7 @@ "plt.ylabel('Amplitude')\n", "\n", "_ = librosa.display.waveshow(audio, color='blue')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -292,9 +266,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "oCFneEs1lRzp" }, + "outputs": [], "source": [ "import numpy as np\n", "\n", @@ -306,9 +282,7 @@ "librosa.display.specshow(spec_db, y_axis='log', x_axis='time')\n", "plt.colorbar()\n", "plt.title('Audio Spectrogram');" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -325,9 +299,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "7yQXVn-TlRzt" }, + "outputs": [], "source": [ "# Plot the mel 
spectrogram of our sample\n", "mel_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate)\n", @@ -337,9 +313,7 @@ " mel_spec_db, x_axis='time', y_axis='mel')\n", "plt.colorbar()\n", "plt.title('Mel Spectrogram');" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -384,18 +358,18 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "4_W0lhaQlRzx" }, + "outputs": [], "source": [ "# NeMo's \"core\" package\n", "import nemo\n", "# NeMo's ASR collection - this collections contains complete ASR models and\n", "# building blocks (modules) for ASR\n", "import nemo.collections.asr as nemo_asr" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -410,15 +384,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "KFZZpYult96G" }, + "outputs": [], "source": [ "# This line will download pre-trained QuartzNet15x5 model from NVIDIA's NGC cloud and instantiate it for you\n", "quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=\"QuartzNet15x5Base-En\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -431,16 +405,16 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "3QCpR_93u1hp" }, + "outputs": [], "source": [ "files = [os.path.join(data_dir, 'an4/wav/an4_clstk/mgah/cen2-mgah-b.wav')]\n", "for fname, transcription in zip(files, quartznet.transcribe(audio=files)):\n", " print(f\"Audio in {fname} was recognized as: {transcription}\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -488,9 +462,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "lVB1sG1GlRzz" }, + "outputs": [], "source": [ "# --- Building Manifest Files --- #\n", "import json\n", @@ -537,9 +513,7 @@ " build_manifest(test_transcripts, test_manifest, 'an4/wav/an4test_clstk')\n", " print(\"Test manifest created.\")\n", "print(\"***Done***\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -575,9 +549,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "PXVKBniMlRz5" }, + "outputs": [], "source": [ "# --- Config Information ---#\n", "try:\n", @@ -596,9 +572,7 @@ "with open(config_path) as f:\n", " params = yaml.load(f)\n", "print(params)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -615,15 +589,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "GUfR6tAK0k2u" }, + "outputs": [], "source": [ "import lightning.pytorch as pl\n", "trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=50)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -637,17 +611,17 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Cbf0fsMK09lk" }, + "outputs": [], "source": [ "from omegaconf import DictConfig\n", "params['model']['train_ds']['manifest_filepath'] = train_manifest\n", "params['model']['validation_ds']['manifest_filepath'] = test_manifest\n", "first_asr_model = nemo_asr.models.EncDecCTCModel(cfg=DictConfig(params['model']), trainer=trainer)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -660,15 +634,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "inRJsnrz1psq" }, + "outputs": [], "source": [ "# Start training!!!\n", "trainer.fit(first_asr_model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", 
@@ -686,9 +660,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "n_0y3stSXDX_" }, + "outputs": [], "source": [ "try:\n", " from google import colab\n", @@ -702,9 +678,7 @@ " %tensorboard --logdir lightning_logs/\n", "else:\n", " print(\"To use tensorboard, please use this notebook in a Google Colab environment.\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -717,14 +691,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "7kdQbpohXnEd" }, + "outputs": [], "source": [ "print(params['model']['optim'])" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -737,18 +711,18 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "AbigFKUtYgvn" }, + "outputs": [], "source": [ "import copy\n", "new_opt = copy.deepcopy(params['model']['optim'])\n", "new_opt['lr'] = 0.001\n", "first_asr_model.setup_optimization(optim_config=DictConfig(new_opt))\n", "# And then you can invoke trainer.fit(first_asr_model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -765,9 +739,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "3FT0klSV268p" }, + "outputs": [], "source": [ "audio = [os.path.join(data_dir, 'an4/wav/an4_clstk/mgah/cen2-mgah-b.wav'),\n", " os.path.join(data_dir, 'an4/wav/an4_clstk/fmjd/cen7-fmjd-b.wav'),\n", @@ -775,9 +751,7 @@ " os.path.join(data_dir, 'an4/wav/an4_clstk/fkai/cen8-fkai-b.wav')]\n", "print(first_asr_model.transcribe(audio=audio,\n", " batch_size=4))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -790,9 +764,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "7mP4r1Gx_Ilt" }, + "outputs": [], "source": [ "# Bigger batch-size = bigger throughput\n", "params['model']['validation_ds']['batch_size'] = 16\n", @@ -831,9 +807,7 @@ "\n", "# We need to sum all numerators and denominators first. 
Then divide.\n", "print(f\"WER = {sum(wer_nums)/sum(wer_denoms)}\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -865,14 +839,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "9glGogaPlR0H" }, + "outputs": [], "source": [ "print(quartznet._cfg['spec_augment'])" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -900,9 +874,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "hl320dsydWX0" }, + "outputs": [], "source": [ "# Check what kind of vocabulary/alphabet the model has right now\n", "print(quartznet.decoder.vocabulary)\n", @@ -915,9 +891,7 @@ " 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', \"'\", \"!\"\n", " ]\n", ")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -930,9 +904,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "_PZJIso-eDl-" }, + "outputs": [], "source": [ "# Use the smaller learning rate we set before\n", "quartznet.setup_optimization(optim_config=DictConfig(new_opt))\n", @@ -946,9 +922,7 @@ "# And now we can create a PyTorch Lightning trainer and call `fit` again.\n", "trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=2)\n", "trainer.fit(quartznet)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -993,9 +967,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "I4WRcmakjQnj" }, + "outputs": [], "source": [ "!pip install --upgrade onnxruntime # for gpu, use onnxruntime-gpu\n", "#!mkdir -p ort\n", @@ -1007,9 +983,7 @@ "#!pip uninstall -y onnxruntime-gpu\n", "#!pip install --upgrade --force-reinstall ./build/Linux/Release/dist/onnxruntime*.whl\n", "#%cd .." 
- ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1022,9 +996,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "HZnyWxPyjQnm" }, + "outputs": [], "source": [ "import json\n", "import os\n", @@ -1098,12 +1074,10 @@ " logits = torch.from_numpy(alogits[0])\n", " greedy_predictions = logits.argmax(dim=-1, keepdim=False)\n", " wer = WER(decoding=quartznet.decoding, use_cer=False)\n", - " hypotheses, _ = wer.decoding.ctc_decoder_predictions_tensor(greedy_predictions)\n", + " hypotheses = wer.decoding.ctc_decoder_predictions_tensor(greedy_predictions)\n", " print(hypotheses)\n", " break\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1165,12 +1139,40 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "V3ERGX86lR0V" }, - "source": [], - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] } - ] + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "ASR_with_NeMo.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/tutorials/asr/ASR_with_Transducers.ipynb b/tutorials/asr/ASR_with_Transducers.ipynb index 95eecbfb8916..ddd0582f82a5 100644 --- a/tutorials/asr/ASR_with_Transducers.ipynb +++ b/tutorials/asr/ASR_with_Transducers.ipynb @@ -1206,7 +1206,7 @@ "outputs": [], "source": [ "# Get a batch of hypotheses, as well as a batch of all obtained hypotheses (if beam search is used)\n", - "hypotheses, all_hypotheses = rnnt_alignments(model, batch)" + "hypotheses = rnnt_alignments(model, batch)" ] }, { diff --git a/tutorials/asr/Buffered_Transducer_Inference.ipynb b/tutorials/asr/Buffered_Transducer_Inference.ipynb index c23398dca46a..f79fd4dff64e 100644 --- a/tutorials/asr/Buffered_Transducer_Inference.ipynb +++ b/tutorials/asr/Buffered_Transducer_Inference.ipynb @@ -17,7 +17,9 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect\n", - "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", + "\n", + "\n", + "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", "\"\"\"\n", "# If you're using Google Colab and not running locally, run this cell.\n", "\n", @@ -730,7 +732,7 @@ " new_prev_hypothesis.append(self.previous_hypotheses[old_pos])\n", " self.previous_hypotheses = new_prev_hypothesis\n", "\n", - " best_hyp, _ = self.asr_model.decoding.rnnt_decoder_predictions_tensor(\n", + " best_hyp = self.asr_model.decoding.rnnt_decoder_predictions_tensor(\n", " encoded, encoded_len, return_hypotheses=True, partial_hypotheses=self.previous_hypotheses\n", " )\n", "\n", @@ -925,7 +927,7 @@ "After this, we perform regular transducer decoding of the Prediction Network + Joint Network. 
Since it is being done on a subset of samples, it is much faster than padded decoding.\n", "\n", "```python\n", - "best_hyp, _ = self.asr_model.decoding.rnnt_decoder_predictions_tensor(\n", + "best_hyp = self.asr_model.decoding.rnnt_decoder_predictions_tensor(\n", " encoded, encoded_len, return_hypotheses=True, partial_hypotheses=self.previous_hypotheses\n", ")\n", "```" From d19682f750d7922ec8aaaa84c89f0797f3b00a0e Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 11 Feb 2025 20:42:55 -0800 Subject: [PATCH 07/14] fix: export weight name mapping if model is nemo model (#11497) * fix: export weight name mapping if model is nemo model Signed-off-by: Terry Kong * missing license headers Signed-off-by: Terry Kong * pytest mark unit and CPU Signed-off-by: Terry Kong --------- Signed-off-by: Terry Kong Co-authored-by: Dong Hyuk Chang --- nemo/export/tensorrt_llm.py | 7 ++- tests/export/test_tensorrt_llm.py | 63 +++++++++++++++++++ .../converter/test_model_to_trt_llm_ckpt.py | 43 +++++++++++++ 3 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 tests/export/test_tensorrt_llm.py create mode 100644 tests/export/trt_llm/converter/test_model_to_trt_llm_ckpt.py diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 74be0eba0491..6299134e833c 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -776,7 +776,8 @@ def get_input_dtype(self, storage_dtype): elif storage_dtype == torch.float16: return DataType.float16 - def get_nemo_to_trtllm_conversion_dict(self, model_state_dict): + @staticmethod + def get_nemo_to_trtllm_conversion_dict(model_state_dict): """MCore export supports some default conversion dictionaries All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models sometimes start with "model.decoder.layers.4.blahblah". so we append model prefix. to the keys """ @@ -786,8 +787,8 @@ def get_nemo_to_trtllm_conversion_dict(self, model_state_dict): nemo_model_conversion_dict = {} for key, value in DEFAULT_CONVERSION_DICT.items(): - if 'layers' in key and model_prefix: - nemo_model_conversion_dict[f'{model_prefix}.{key}'] = value + if model_prefix: + nemo_model_conversion_dict[f'{model_prefix}{key}'] = value else: nemo_model_conversion_dict[key] = value return nemo_model_conversion_dict diff --git a/tests/export/test_tensorrt_llm.py b/tests/export/test_tensorrt_llm.py new file mode 100644 index 000000000000..7361befcbaa9 --- /dev/null +++ b/tests/export/test_tensorrt_llm.py @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import pytest + + +@pytest.mark.run_only_on('GPU') +@pytest.mark.unit +def test_get_nemo_to_trtllm_conversion_dict_on_nemo_model(): + try: + from nemo.export.tensorrt_llm import TensorRTLLM + except ImportError: + pytest.skip("Could not import TRTLLM helpers. 
tensorrt_llm is likely not installed") + + dummy_state = object() + model_state_dict = { + 'model.embedding.word_embeddings.weight': dummy_state, + 'model.decoder.layers.0.self_attention.linear_proj.weight': dummy_state, + } + nemo_model_conversion_dict = TensorRTLLM.get_nemo_to_trtllm_conversion_dict(model_state_dict) + + # Check that every key starts with 'model.' and not 'model..' by using a regex + # This pattern ensures: + # - The key starts with 'model.' + # - Immediately after 'model.', there must be at least one character that is NOT a '.' + # (preventing the 'model..' scenario) + pattern = re.compile(r'^model\.[^.].*') + for key in nemo_model_conversion_dict.keys(): + assert pattern.match(key), f"Key '{key}' does not properly start with 'model.'" + + +@pytest.mark.run_only_on('GPU') +@pytest.mark.unit +def test_get_nemo_to_trtllm_conversion_dict_on_mcore_model(): + try: + from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import DEFAULT_CONVERSION_DICT + + from nemo.export.tensorrt_llm import TensorRTLLM + except ImportError: + pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed") + + dummy_state = object() + model_state_dict = { + 'embedding.word_embeddings.weight': dummy_state, + 'decoder.layers.0.self_attention.linear_proj.weight': dummy_state, + } + nemo_model_conversion_dict = TensorRTLLM.get_nemo_to_trtllm_conversion_dict(model_state_dict) + + # This is essentially a no-op + assert nemo_model_conversion_dict == DEFAULT_CONVERSION_DICT diff --git a/tests/export/trt_llm/converter/test_model_to_trt_llm_ckpt.py b/tests/export/trt_llm/converter/test_model_to_trt_llm_ckpt.py new file mode 100644 index 000000000000..2be809cc4406 --- /dev/null +++ b/tests/export/trt_llm/converter/test_model_to_trt_llm_ckpt.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + + +@pytest.mark.parametrize( + 'input_layer_names,expected_model_prefix', + [ + ( + [ + 'model.embedding.word_embeddings.weight', + 'model.decoder.layers.0.self_attention.linear_proj.weight', + 'model.decoder.layers.0.self_attention.linear_qkv.layer_norm_weight', + 'model.decoder.layers.0.self_attention.linear_qkv.weight', + 'model.decoder.layers.0.mlp.linear_fc1.layer_norm_weight', + 'model.decoder.layers.0.mlp.linear_fc1.weight', + 'model.decoder.layers.0.mlp.linear_fc2.weight', + ], + 'model.', + ) + ], +) +@pytest.mark.run_only_on('CPU') +@pytest.mark.unit +def test_get_layer_prefix_is_mcore(input_layer_names, expected_model_prefix): + try: + from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import get_layer_prefix + except ImportError: + pytest.skip("Could not import TRTLLM helpers. 
tensorrt_llm is likely not installed") + model_prefix, _ = get_layer_prefix(input_layer_names, is_mcore=True) + assert model_prefix == expected_model_prefix From 6dbcbac4250a09f3b6c840cdd1d2b9b05a0caf8e Mon Sep 17 00:00:00 2001 From: Yuanzhe Dong <5069709+yuanzhedong@users.noreply.github.com> Date: Wed, 12 Feb 2025 00:42:07 -0800 Subject: [PATCH 08/14] Add error message when downloading failed. (#12139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update vLLM to 0.7.2 (#12078) * initial commit Signed-off-by: Piotr Kaminski * vllm bump cleanup Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Flake8 Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * flake should not fail with tensorstore Signed-off-by: Piotr Kaminski * pylint also should not fail Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * local tokenizer load Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * add missing requirements Signed-off-by: Piotr Kaminski * absolute path for sentencepiece tokenizer Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * fix absolute path, add new vllm params Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * bump vllm, fix tokenizer Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * code review + docstrings Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * flake8 Signed-off-by: Piotr Kaminski * fix formatting Signed-off-by: Piotr Kaminski --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Co-authored-by: Laplasjan107 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Yuanzhe Dong * better error message Signed-off-by: Yuanzhe Dong * Apply isort and black reformatting Signed-off-by: yuanzhedong * pylint Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Signed-off-by: Yuanzhe Dong Signed-off-by: yuanzhedong Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Co-authored-by: Laplasjan107 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: yuanzhedong Co-authored-by: Alexandros Koumparoulis Co-authored-by: akoumpa --- .../modules/common/megatron/megatron_utils.py | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_utils.py b/nemo/collections/nlp/modules/common/megatron/megatron_utils.py index d610f5b61c24..7ca5154b7911 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_utils.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_utils.py @@ -55,14 +55,14 @@ }, "megatron-bert-345m-uncased": { "config": CONFIGS["345m"], - "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.0/files/release/mp_rank_00/model_optim_rng.pt", + "checkpoint": 
"https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.0/files/release/mp_rank_00/model_optim_rng.pt", # pylint: disable=line-too-long "vocab": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", "do_lower_case": True, "tokenizer_name": "bert-large-uncased", }, "megatron-bert-345m-cased": { "config": CONFIGS["345m"], - "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/files/release/mp_rank_00/model_optim_rng.pt", + "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/files/release/mp_rank_00/model_optim_rng.pt", # pylint: disable=line-too-long "vocab": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", "do_lower_case": False, "tokenizer_name": "bert-large-cased", @@ -83,14 +83,14 @@ }, "biomegatron-bert-345m-uncased": { "config": CONFIGS["345m"], - "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/biomegatron345muncased/versions/0/files/MegatronBERT.pt", + "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/biomegatron345muncased/versions/0/files/MegatronBERT.pt", # pylint: disable=line-too-long "vocab": "https://api.ngc.nvidia.com/v2/models/nvidia/biomegatron345muncased/versions/0/files/vocab.txt", "do_lower_case": True, "tokenizer_name": "bert-large-uncased", }, "biomegatron-bert-345m-cased": { "config": CONFIGS["345m"], - "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/biomegatron345mcased/versions/0/files/MegatronBERT.pt", + "checkpoint": "https://api.ngc.nvidia.com/v2/models/nvidia/biomegatron345mcased/versions/0/files/MegatronBERT.pt", # pylint: disable=line-too-long "vocab": "https://api.ngc.nvidia.com/v2/models/nvidia/biomegatron345mcased/versions/0/files/vocab.txt", "do_lower_case": False, "tokenizer_name": "bert-large-cased", @@ -98,11 +98,28 @@ } -def compute_model_parallel_rank(local_rank, model_parallel_size): +def compute_model_parallel_rank(local_rank: int, model_parallel_size: int) -> int: + """Calculates the model_parallel_rank from the local rank and the model parallel size + + Args: + local_rank (int): The local rank of the process. + model_parallel_size (int): The number of ranks in the model parallel group. + + Returns: + int: The model parallel rank corresponding to the given local rank. + """ return local_rank % model_parallel_size def get_megatron_pretrained_bert_models() -> List[str]: + """Retrieves the names of all available pretrained Megatron-BERT models. + + This function uses the NeMo MegatronBertModel class to list all available + pretrained model configurations, extracting each model's name. + + Returns: + List[str]: A list of pretrained Megatron-BERT model names. 
+ """ from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel all_pretrained_megatron_bert_models = [ @@ -207,6 +224,8 @@ def _download(path: str, url: str): os.makedirs(MEGATRON_CACHE, exist_ok=True) logging.info(f"Downloading from {url} to {path}") downloaded_path = wget.download(url) + if not os.path.exists(downloaded_path): + raise FileNotFoundError(f"Downloaded file not found: {downloaded_path}") shutil.move(downloaded_path, path) # wait until the master process downloads the file and writes it to the cache dir if torch.distributed.is_initialized(): @@ -230,12 +249,12 @@ def is_lower_cased_megatron(pretrained_model_name): def get_megatron_tokenizer(pretrained_model_name: str): """ - Takes a pretrained_model_name for megatron such as "megatron-bert-cased" and returns the according + Takes a pretrained_model_name for megatron such as "megatron-bert-cased" and returns the according tokenizer name for tokenizer instantiating. Args: pretrained_model_name: pretrained_model_name for megatron such as "megatron-bert-cased" - Returns: + Returns: tokenizer name for tokenizer instantiating """ _check_megatron_name(pretrained_model_name) From d2f7b8e50f47ec1c2ff3d7f25013aa89df61e4db Mon Sep 17 00:00:00 2001 From: Roman Korostik Date: Wed, 12 Feb 2025 15:56:52 +0400 Subject: [PATCH 09/14] AudioToAudioModel: fix model->dataloader sample_rate parameter injection (#12092) * AudioToAudioModel: fix model->dataloader sample_rate parameter injection Signed-off-by: Roman Korostik * AudioToAudioModel: import missing type (PretrainedModelInfo) Signed-off-by: Roman Korostik --------- Signed-off-by: Roman Korostik --- nemo/collections/audio/models/audio_to_audio.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/collections/audio/models/audio_to_audio.py b/nemo/collections/audio/models/audio_to_audio.py index 41125a81035b..28109f27b7f2 100644 --- a/nemo/collections/audio/models/audio_to_audio.py +++ b/nemo/collections/audio/models/audio_to_audio.py @@ -33,6 +33,7 @@ from nemo.collections.audio.metrics.audio import AudioMetricWrapper from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.core.classes import ModelPT +from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging, model_utils __all__ = ['AudioToAudioModel'] @@ -180,6 +181,9 @@ def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): return self.multi_evaluation_epoch_end(outputs, dataloader_idx, 'test') def _setup_dataloader_from_config(self, config: Optional[Dict]): + # TODO: Consider moving `inject` from `audio_to_text_dataset` to a utility module? + # Automatically inject args from model config to dataloader config + inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') if config.get("use_lhotse", False): return get_lhotse_dataloader_from_config( @@ -190,10 +194,6 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if is_concat: raise NotImplementedError('Concat not implemented') - # TODO: Consider moving `inject` from `audio_to_text_dataset` to a utility module? 
- # Automatically inject args from model config to dataloader config - inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') - # Instantiate tarred dataset loader or normal dataset loader if config.get('is_tarred', False): raise NotImplementedError('Tarred datasets not supported') From e44633777923b99888bdc8c4187385c3e809b0e1 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Wed, 12 Feb 2025 08:12:55 -0800 Subject: [PATCH 10/14] interface for asymmetric pipeline schedule (#12039) * interface for asymmetric pipeline schedule Signed-off-by: Sangkug Lym * Apply isort and black reformatting Signed-off-by: erhoo82 * linting fix Signed-off-by: Sangkug Lym * Apply isort and black reformatting Signed-off-by: erhoo82 --------- Signed-off-by: Sangkug Lym Signed-off-by: erhoo82 Co-authored-by: erhoo82 --- nemo/collections/llm/gpt/model/base.py | 5 +- nemo/collections/llm/recipes/llama31_405b.py | 6 +- .../language_modeling/megatron_base_model.py | 65 +++++++---- .../language_modeling/megatron_gpt_model.py | 104 ++++++++++-------- .../pytorch/strategies/megatron_strategy.py | 12 ++ 5 files changed, 122 insertions(+), 70 deletions(-) diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 4af9c6c1263b..b92ca669db49 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -193,7 +193,10 @@ def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "MC ) vp_size = self.virtual_pipeline_model_parallel_size - if vp_size: + is_pipeline_asymmetric = getattr(self, 'account_for_embedding_in_pipeline_split', False) or getattr( + self, 'account_for_loss_in_pipeline_split', False + ) + if vp_size and not is_pipeline_asymmetric: p_size = self.pipeline_model_parallel_size assert ( self.num_layers // p_size diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index d60bbf54f8f0..d3dd2185efc8 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -64,11 +64,13 @@ def model() -> run.Config[pl.LightningModule]: def trainer( tensor_parallelism: int = 8, - pipeline_parallelism: int = 9, + pipeline_parallelism: int = 8, pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 2, context_parallelism: int = 4, sequence_parallelism: bool = True, + account_for_embedding_in_pipeline_split: bool = True, + account_for_loss_in_pipeline_split: bool = True, num_nodes: int = 72, num_gpus_per_node: int = 8, max_steps: int = 1168251, @@ -113,6 +115,8 @@ def trainer( virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, context_parallel_size=context_parallelism, sequence_parallel=sequence_parallelism, + account_for_embedding_in_pipeline_split=account_for_embedding_in_pipeline_split, + account_for_loss_in_pipeline_split=account_for_loss_in_pipeline_split, gradient_as_bucket_view=True, ckpt_async_save=True, ckpt_parallel_load=True, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 122c86614311..3df28212c899 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -101,15 +101,17 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): if not HAVE_MEGATRON_CORE: raise ImportError( - "megatron-core was not found. 
Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." + "megatron-core was not found. Please see the NeMo README for installation instructions: " + "https://github.com/NVIDIA/NeMo#megatron-gpt." ) if trainer is None: - raise ValueError(f"Trainer cannot be None for Megatron-based models. Please provide a PTL trainer object.") + raise ValueError("Trainer cannot be None for Megatron-based models. Please provide a PTL trainer object.") if cfg.get('use_flash_attention', False) and not HAVE_FLASH_ATTENTION: raise ImportError( - "flash_attn was not found. Please see the installation instructions: https://github.com/HazyResearch/flash-attention." + "flash_attn was not found. Please see the installation instructions: " + "https://github.com/HazyResearch/flash-attention." "If you use flash_attn with triton. Please install triton==2.0.0.dev20221202." ) @@ -182,9 +184,13 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): if vp_size == 1: vp_size = None else: - assert ( - self.cfg.num_layers // self.cfg.pipeline_model_parallel_size - ) % vp_size == 0, 'Make sure the number of model chunks is the same across all pipeline stages.' + if not ( + self.cfg.get('account_for_embedding_in_pipeline_split', False) + and self.cfg.get('account_for_loss_in_pipeline_split', False) + ): + assert ( + self.cfg.num_layers // self.cfg.pipeline_model_parallel_size + ) % vp_size == 0, 'Make sure the number of model chunks is the same across all pipeline stages.' initialize_model_parallel_for_nemo( world_size=init_world_size, @@ -252,7 +258,7 @@ def setup_transformer_engine_tp_groups(self): """ for module in self.get_model_module_list(): """Set TP group - Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398 + Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398 # pylint: disable=line-too-long """ # Deep iterate but skip self to avoid infinite recursion. for index, child in enumerate(module.modules()): @@ -270,7 +276,7 @@ def setup_transformer_engine_cp_groups(self): for module in self.get_model_module_list(): """Set context parallel running - Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py + Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py # pylint: disable=line-too-long """ # Deep iterate but skip self to avoid infinite recursion. 
for index, child in enumerate(module.modules()): @@ -345,7 +351,8 @@ def _reconfigure_limit_batches(self, limit_batches, dataloader, mode): """ Reconfigure trainer.limit_val_batches for pretraining """ - # Override limit_batches in terms of num microbatches and so there are limit_batches//num_micro_batches num of global batches + # Override limit_batches in terms of num microbatches + # and so there are limit_batches//num_micro_batches num of global batches if isinstance(limit_batches, int): limit_batches *= get_num_microbatches() else: @@ -539,6 +546,9 @@ def build_transformer_config(self) -> TransformerConfig: tp_only_amax_red = self.cfg.get('tp_only_amax_red', False) + account_for_embedding_in_pipeline_split = self.cfg.get('account_for_embedding_in_pipeline_split', False) + account_for_loss_in_pipeline_split = self.cfg.get('account_for_loss_in_pipeline_split', False) + attention_backend = self.cfg.get('attention_backend', "auto") attention_backend = AttnBackend[attention_backend] @@ -566,6 +576,8 @@ def build_transformer_config(self) -> TransformerConfig: 'rotary_interleaved': rotary_interleaved, 'deallocate_pipeline_outputs': True, 'tp_only_amax_red': tp_only_amax_red, + 'account_for_embedding_in_pipeline_split': account_for_embedding_in_pipeline_split, + 'account_for_loss_in_pipeline_split': account_for_loss_in_pipeline_split, 'attention_backend': attention_backend, } @@ -609,7 +621,8 @@ def _vocab_size_with_padding(self, orig_vocab_size, make_vocab_size_divisible_by multiple = make_vocab_size_divisible_by * tensor_model_parallel_size after = ((after + multiple - 1) // multiple) * multiple logging.info( - f'Padded vocab_size: {after}, original vocab_size: {orig_vocab_size}, dummy tokens: {after - orig_vocab_size}.' + f"Padded vocab_size: {after}, original vocab_size: {orig_vocab_size}, " + f"dummy tokens: {after - orig_vocab_size}." ) return after @@ -664,7 +677,7 @@ def configure_gradient_clipping(self, *args, **kwargs): def allreduce_gradients(self): """Reduce gradients across data parallel ranks. - Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188 + Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188 # pylint: disable=line-too-long """ # Bucketize and all-reduce buckets = {} @@ -836,7 +849,8 @@ def configure_optimizers(self): # TODO: contiguous grad bucket for fp16 is also planned to be supported contiguous_grad_bucket = False raise ValueError( - "fp16 training is not yet supported with O2. Please set megatron_amp_O2 to False in the model config." + "fp16 training is not yet supported with O2." + "Please set megatron_amp_O2 to False in the model config." ) # if using tensor parallel only, we automatically use async grad all-reduce @@ -974,7 +988,8 @@ def _validate_and_override_config(self): if self.cfg.get('sequence_parallel', False) and self.cfg.get('tensor_model_parallel_size', 1) == 1: logging.info( - "Sequence parallel should only be used with tensor parallel size > 1. Setting sequence parallel to False" + "Sequence parallel should only be used with tensor parallel size > 1. 
" + "Setting sequence parallel to False" ) with open_dict(self.cfg): self.cfg.sequence_parallel = False @@ -993,7 +1008,8 @@ def _validate_and_override_config(self): if self.cfg.get('gradient_accumulation_fusion', False): if data_parallel_size > 1 and pipeline_model_parallel_size == 1 and not distributed_fused_adam: logging.info( - "When not using pipeline model parallel, gradient accumulation fusion can only be used with distributed_fused_adam." + "When not using pipeline model parallel, " + "gradient accumulation fusion can only be used with distributed_fused_adam." ) with open_dict(self.cfg): self.cfg.gradient_accumulation_fusion = False @@ -1015,9 +1031,13 @@ def _validate_and_override_config(self): if vp_size == 1: self.cfg['virtual_pipeline_model_parallel_size'] = None else: - assert ( - self.cfg.num_layers // self.cfg.pipeline_model_parallel_size - ) % vp_size == 0, 'Make sure the number of model chunks is the same across all pipeline stages.' + if not ( + self.cfg.get('account_for_embedding_in_pipeline_split', False) + and self.cfg.get('account_for_loss_in_pipeline_split', False) + ): + assert ( + self.cfg.num_layers // self.cfg.pipeline_model_parallel_size + ) % vp_size == 0, 'Make sure the number of model chunks is the same across all pipeline stages.' if self.cfg.get('ub_tp_comm_overlap', False): if not self.cfg.get('sequence_parallel', False): @@ -1110,7 +1130,8 @@ def _get_total_params_across_model_parallel_groups_enc_dec(self, model): parallel_state.get_pipeline_model_parallel_rank() == self.cfg.get('pipeline_model_parallel_split_rank', 0) or parallel_state.is_pipeline_last_stage() ): - # If the current rank is the in the decoder first stage (decoder emb) or last rank (output layer), subtract those weights since it is already accounted for in the encoder first stage. + # If the current rank is the in the decoder first stage (decoder emb) or last rank (output layer), + # subtract those weights since it is already accounted for in the encoder first stage. # TODO: If we support embedding untying with PP > 1, we will need to update this. num_word_embedding_parameters = sum([p.nelement() for p in model.word_embeddings_weight()]) num_parameters_on_device -= num_word_embedding_parameters @@ -1167,7 +1188,7 @@ def build_model_parallel_config(self) -> ModelParallelConfig: config_mapping = { "perform_initialization": True, # initailize weights when constructing the module "fp16": self.torch_dtype == torch.float16 - and megatron_amp_O2, # NeMo does not currently support fp16 training with megatron amp O2, eval and inference is supported + and megatron_amp_O2, # fp16 training with megatron amp O2 not supported, eval and inference is supported "bf16": self.torch_dtype == torch.bfloat16 and megatron_amp_O2, "params_dtype": self.params_dtype, "timers": self.megatron_timers, @@ -1216,7 +1237,8 @@ def build_model_parallel_config(self) -> ModelParallelConfig: setattr(model_parallel_config, 'hidden_size', self.cfg.hidden_size) except AttributeError: logging.warning( - f'hidden_size not found in {self.cfg}. Set this in model_parallel_config if using pipeline parallelism.' + f'hidden_size not found in {self.cfg}. ' + 'Set this in model_parallel_config if using pipeline parallelism.' 
) return model_parallel_config @@ -1299,7 +1321,8 @@ def find_frozen_submodules(model): logging.debug(f"Ignoring state {submodule} in FSDP.") self.trainer.strategy.kwargs['ignored_states'] = frozen_submodules # FSDP requires uniform status of require_grads - # Diffusion models like SD has frozen parts and needs to be added to 'ignored_states' from sharding for FSDP to work + # Diffusion models like SD has frozen parts and needs to be added to 'ignored_states' + # from sharding for FSDP to work self.model = self.trainer.strategy._setup_model(self.model) # Move the CPU-initialized model (with `use_cpu_initialization=True`) to GPU, which is to avoid # out-of-memory carash before sharding. In case of GPU-initialized model, this is no-op. diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 0ebf1ba17ddb..78171e4ed605 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -17,7 +17,6 @@ import queue import warnings from contextlib import nullcontext -from dataclasses import fields from functools import cache, partial from importlib.metadata import version from typing import Any, Dict, Iterator, List, Optional, Union @@ -25,9 +24,7 @@ import packaging import torch from lightning.pytorch.accelerators import CPUAccelerator -from lightning.pytorch.loops.fetchers import _DataFetcherWrapper from lightning.pytorch.trainer.trainer import Trainer -from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig from nemo.collections.common.parts.utils import apply_rope_scaling, extend_instance @@ -69,7 +66,7 @@ TextGeneration, ) from nemo.collections.nlp.parts import utils_funcs -from nemo.collections.nlp.parts.utils_funcs import activation_to_func, get_last_rank +from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.core.classes import Exportable from nemo.core.classes.common import PretrainedModelInfo from nemo.core.neural_types import ChannelType, NeuralType @@ -78,8 +75,7 @@ from nemo.utils.te_utils import is_float8tensor, te_version try: - import megatron.core as core - from megatron.core import InferenceParams, parallel_state, tensor_parallel + from megatron.core import InferenceParams, parallel_state from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset from megatron.core.datasets.utils import get_blend_from_list @@ -98,13 +94,7 @@ from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig - from megatron.core.utils import ( - drain_embedding_wgrad_compute, - get_model_config, - init_method_normal, - is_te_min_version, - scaled_init_method_normal, - ) + from megatron.core.utils import drain_embedding_wgrad_compute, get_model_config, is_te_min_version HAVE_MEGATRON_CORE = True @@ -143,14 +133,14 @@ def mcore_supports_moe() -> bool: if not HAVE_MEGATRON_CORE: return False try: - from megatron.core.transformer.moe.router import TopKRouter + from megatron.core.transformer.moe.router import TopKRouter # noqa: F401 return True except ImportError: return False -## TODO: This function will not work if TE is not installed +# TODO: This function will not work if TE 
is not installed def get_specs(spec_name, transformer_config=None, use_te=True, hyena_cfg: Dict = None, fp8=False): from nemo.collections.nlp.models.language_modeling.megatron.gemma2.gemma2_spec import get_gemma2_layer_spec @@ -331,7 +321,8 @@ class MegatronGPTModel(MegatronBaseModel, TextGeneration): def __init__(self, cfg: DictConfig, trainer: Trainer): if not HAVE_MEGATRON_CORE: logging.warning( - "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." + "megatron-core was not found. Please see the NeMo README for installation instructions:" + "https://github.com/NVIDIA/NeMo#megatron-gpt." ) # this prevents base constructor from initializing tokenizer self.tokenizer = None @@ -371,7 +362,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): if self.cfg.get('expert_model_parallel_size', 1) > 1 and self.with_distributed_adam: if not self.use_mcore_dist_optim: raise ValueError( - 'Expert parallelism is currently not supporting Apex distributed optimizer, use Mcore distributed optimizer instead' + 'Expert parallelism is currently not supporting Apex distributed optimizer,' + 'use Mcore distributed optimizer instead' ) if self.cfg.optim.get('overlap_param_gather_with_optimizer_step', False): @@ -424,7 +416,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): if self.megatron_amp_O2: if not self.with_distributed_adam and not self.cfg.get("use_cpu_initialization", False): - # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type + # Pre-allocate the model on GPU to have master parameters allocated + # on the same device with matching data type if isinstance(self.model, list): for module in self.model: module.cuda(torch.cuda.current_device()) @@ -471,7 +464,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.reset_lr_steps = self.cfg.get('reset_lr_steps', False) if self.reset_lr and (not self.with_distributed_adam or not self.megatron_amp_O2): raise ValueError( - 'Learning rate reset feature is only supported with the distributed optmizer and megatron_amp_O2 for now.' + 'Learning rate reset feature is only supported with the distributed optmizer' + 'and megatron_amp_O2 for now.' ) # default to false since this doesn't work with sequence parallelism currently @@ -805,7 +799,8 @@ def initialize_ub_func(self): ub_cfgs = self.cfg.get('ub_tp_comm_overlap_cfg', None) if ub_cfgs is None: warnings.warn( - "Couldn't find TP config. Please check the path correctness. Initializing TP comm overlap with the default config." + "Couldn't find TP config. Please check the path correctness." + "Initializing TP comm overlap with the default config." ) input_shape = [ @@ -1002,7 +997,7 @@ def training_step(self, dataloader_iter): batch_size=1, ) - ## logging + # logging if self.log_train_loss: # When using pipeline parallelism, loss is calculated only in the last pipeline stage and # it should be casted to other pipeline stages for logging. 
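The comment above notes that under pipeline parallelism the loss only exists on the last
pipeline stage and has to be replicated to the other stages before every rank can log it.
The snippet below is a minimal sketch of that replication step, not the code path NeMo
actually uses; it assumes an initialized torch.distributed environment, a CUDA device, and
megatron-core's parallel_state helpers, and broadcast_loss_for_logging is an illustrative
helper name.

import torch
from megatron.core import parallel_state

def broadcast_loss_for_logging(loss_mean=None):
    # Only the last pipeline stage computes a real loss; the other stages
    # allocate a placeholder and receive the value via broadcast, so that
    # all ranks end up logging the same scalar.
    if loss_mean is None:
        loss_mean = torch.tensor(0.0).cuda()
    torch.distributed.broadcast(
        loss_mean,
        src=parallel_state.get_pipeline_model_parallel_last_rank(),
        group=parallel_state.get_pipeline_model_parallel_group(),
    )
    return loss_mean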
@@ -1043,11 +1038,11 @@ def training_step(self, dataloader_iter): if self.rampup_batch_size: self.prev_global_batch_size = current_global_batch_size self.prev_consumed_samples = consumed_samples - num_microbatch_calculator.update( + num_microbatch_calculator.update( # noqa: F821 consumed_samples=consumed_samples, consistency_check=False, ) - current_global_batch_size = num_microbatch_calculator.current_global_batch_size + current_global_batch_size = num_microbatch_calculator.current_global_batch_size # noqa: F821 self.log('global_batch_size', current_global_batch_size, prog_bar=True, rank_zero_only=True, batch_size=1) self.if_first_step = 1 @@ -1120,7 +1115,7 @@ def allreduce_fsdp_sharding_omitted_gradients(self): def allreduce_first_last_embeddings(self): - # Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/training.py#L407 + # Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/training.py#L407 # pylint: disable=line-too-long # All-reduce word_embeddings' grad across first and last stages to ensure # that word_embeddings parameters stay in sync. # This should only run for models that support pipelined model parallelism @@ -1141,7 +1136,8 @@ def allreduce_first_last_embeddings(self): word_embeddings_weight = ( module.shared_embedding_or_output_weight() if self.mcore_gpt else module.word_embeddings_weight() ) - # (@adithyare) adapter training now extends MegatronGPTModel so we have to add this check here to ensure we do not perform all_reduce when grad is None. + # (@adithyare) adapter training now extends MegatronGPTModel so we have to add this + # check here to ensure we do not perform all_reduce when grad is None. # grad can be None when performing PeFT training. if word_embeddings_weight.requires_grad: if self.megatron_amp_O2: @@ -1351,7 +1347,8 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ import transformer_engine_torch as tex except ModuleNotFoundError as e: logging.error( - "Please update Transformer Engine to >= 1.10 to use Context Parallel with THD format data" + "Please update Transformer Engine to >= 1.10 " + "to use Context Parallel with THD format data" ) raise e cp_rank = parallel_state.get_context_parallel_rank() @@ -1402,7 +1399,8 @@ def loss_func(output_tensor): loss_for_ub = self.loss_func(batch['loss_mask'], batch['num_valid_tokens_in_ub'], output_tensor) cp_size = parallel_state.get_context_parallel_world_size() if isinstance(loss_for_ub, dict): - # TODO: need a better way to check if loss_func is returning more stuff than just loss... (@adithyare) + # TODO: need a better way to check if loss_func is returning + # more stuff than just loss... 
(@adithyare) if set(loss_for_ub.keys()) == set( ["loss", "query_hs", "pos_doc_hs", "pos_cs", "neg_cs", "diff_cs"] @@ -1459,7 +1457,8 @@ def loss_func(output_tensor): torch.tensor([num_valid_tokens_in_ub]).cuda().clone().detach(), ] ) - # Could potentially reduce num_valid_samples_in_microbatch and use that to aggregate instead of len(self._validation_ds) + # Could potentially reduce num_valid_samples_in_microbatch and use that to + # aggregate instead of len(self._validation_ds) torch.distributed.all_reduce( loss_sum_and_ub_size_all_gpu, group=parallel_state.get_data_parallel_group() ) @@ -1641,10 +1640,12 @@ def build_train_valid_test_datasets(self): test_iters * global_batch_size, ] - # The line below exploits a quirk in mcore dataset construction, to make number of epochs for validation and test equal to 1 - # The mcore dataset implementation uses the number N we provide via train_valid_test_num_samples to derive parameter E such that + # The line below exploits a quirk in mcore dataset construction, to make number of epochs + # for validation and test equal to 1. The mcore dataset implementation uses the number N we + # provide via train_valid_test_num_samples to derive parameter E such that # E = argmin_e e * N_d >= N, or equivalently E = ceildiv(N, N_d) - # Where N_d is the total number of samples in a dataset (files), and N is the requested number of samples (provided for every split in the list below). + # Where N_d is the total number of samples in a dataset (files), and N is the requested + # number of samples (provided for every split in the list below). # Setting N = 1 we force E to be 1 as well legacy_dataset = self.cfg.data.get("legacy_dataset", False) if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): @@ -1723,7 +1724,7 @@ def build_train_valid_test_datasets(self): logging.info(f'Length of val dataset: {len(self._validation_ds)}') if self._test_ds is not None: logging.info(f'Length of test dataset: {len(self._test_ds)}') - logging.info(f'Finished building GPT datasets.') + logging.info('Finished building GPT datasets.') return self._train_ds, self._validation_ds, self._test_ds @@ -1815,7 +1816,8 @@ def setup(self, stage=None): self.setup_test_data(self.cfg.data) # Override limit_train_batches in terms of num of microbatches self._reconfigure_limit_batches(self.trainer.limit_train_batches, self._train_dl, 'train') - # Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step + # Override limit_val_batches to be a multiple of num microbatches to prevent + # val_step from exiting in between a step self._reconfigure_limit_batches(self.trainer.limit_val_batches, self._validation_dl, 'val') # Data cache generation only @@ -1835,7 +1837,8 @@ def setup_training_data(self, cfg): if hasattr(self, '_train_ds'): consumed_samples = self.compute_consumed_samples(0) logging.info( - f'Setting up train dataloader with len(len(self._train_ds)): {len(self._train_ds)} and consumed samples: {consumed_samples}' + 'Setting up train dataloader with len(len(self._train_ds)): ' + f'{len(self._train_ds)} and consumed samples: {consumed_samples}' ) self._train_dl = self.build_pretraining_data_loader(self._train_ds, consumed_samples) @@ -1843,12 +1846,13 @@ def setup_validation_data(self, cfg): if hasattr(self, '_validation_ds'): consumed_samples = 0 logging.info( - f'Setting up validation dataloader with len(len(self._validation_ds)): {len(self._validation_ds)} and consumed samples: {consumed_samples}' 
+ 'Setting up validation dataloader with len(len(self._validation_ds)): ' + f'{len(self._validation_ds)} and consumed samples: {consumed_samples}' ) drop_last = True if not self.validation_drop_last: - logging.info(f'Drop last in validation dataset is set to False') + logging.info('Drop last in validation dataset is set to False') drop_last = False pad_samples_to_global_batch_size = False if self.cfg.data.get('pad_samples_to_global_batch_size', False): @@ -1864,7 +1868,8 @@ def setup_test_data(self, cfg): if self._test_ds is not None: consumed_samples = 0 logging.info( - f'Setting up test dataloader with len(len(self._test_ds)): {len(self._test_ds)} and consumed samples: {consumed_samples}' + 'Setting up test dataloader with len(len(self._test_ds)): ' + f'{len(self._test_ds)} and consumed samples: {consumed_samples}' ) self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) else: @@ -1934,7 +1939,7 @@ def list_available_models(self): return None def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any: - """PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device + """PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device # pylint: disable=line-too-long When using pipeline parallelism, we need the global batch to remain on the CPU, since the memory overhead will be too high when using a large number of microbatches. Microbatches are transferred from CPU to GPU inside the pipeline. @@ -1947,7 +1952,7 @@ def _validate_trainer(self): """ if self.trainer.accumulate_grad_batches > 1: raise ValueError( - f'Gradient accumulation is done within training_step. trainer.accumulate_grad_batches must equal 1' + 'Gradient accumulation is done within training_step. trainer.accumulate_grad_batches must equal 1' ) @classmethod @@ -1961,7 +1966,7 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: result.append( PretrainedModelInfo( pretrained_model_name="megatron_gpt_345m", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/megatron_gpt_345m/versions/1/files/megatron_gpt_345m.nemo", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/megatron_gpt_345m/versions/1/files/megatron_gpt_345m.nemo", # pylint: disable=line-too-long description="345M parameter GPT generative Megatron model.", ) ) @@ -2010,7 +2015,8 @@ def on_load_checkpoint(self, checkpoint) -> None: missing_keys, expected_keys = module.load_state_dict(checkpoint_state_dict, strict=False) if all(s.endswith('_extra_state') for s in missing_keys): logging.warning( - f'Loding checkpoint created with Transformer Engine version lower than 1.13. Missing layers {missing_keys} will be ignored.' + 'Loding checkpoint created with Transformer Engine version lower than 1.13.' + f'Missing layers {missing_keys} will be ignored.' ) else: raise e @@ -2199,11 +2205,15 @@ def build_transformer_config(self) -> TransformerConfig: For attributes in TransformerConfig that are not in the nemo model config, we add custom logic. 
""" - if self.cfg.num_layers % self.cfg.get('pipeline_model_parallel_size', 1) != 0: - raise ValueError( - f"num_layers ({self.cfg.num_layers}) should be divisible by " - f"pipeline_model_parallel_size ({self.cfg.get('pipeline_model_parallel_size', 1)})" - ) + if not ( + self.cfg.get('account_for_embedding_in_pipeline_split', False) + and self.cfg.get('account_for_loss_in_pipeline_split', False) + ): + if self.cfg.num_layers % self.cfg.get('pipeline_model_parallel_size', 1) != 0: + raise ValueError( + f"num_layers ({self.cfg.num_layers}) should be divisible by " + f"pipeline_model_parallel_size ({self.cfg.get('pipeline_model_parallel_size', 1)})" + ) normalization = self.cfg.get('normalization', 'layernorm').lower() layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' or self.cfg.get( @@ -2231,7 +2241,7 @@ def build_transformer_config(self) -> TransformerConfig: elif self.cfg.get('fp8_hybrid', False): fp8 = 'hybrid' else: - raise ValueError(f"fp8 enabled but fp8_format (fp8_e4m3 | fp8_hybrid) is not set.") + raise ValueError("fp8 enabled but fp8_format (fp8_e4m3 | fp8_hybrid) is not set.") if self.cfg.get('enable_cuda_graph', False): assert HAVE_TE, "Transformer Engine is required for cudagraphs." diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index 752b1d9853b3..9fde044cb14f 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -109,6 +109,8 @@ class ParallelismConfig: pipeline_dtype: torch.dtype encoder_tensor_model_parallel_size: int = 0 encoder_pipeline_model_parallel_size: int = 0 + account_for_embedding_in_pipeline_split: bool = False + account_for_loss_in_pipeline_split: bool = False use_te_rng_tracker: bool = False expert_tensor_parallel_size: int = None use_tp_pp_dp_mapping: bool = False @@ -139,6 +141,10 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): Defaults to 1. expert_tensor_parallel_size (Optional[int]): Sets MoE Experts tensor parallelism size. Defaults to None. moe_extended_tp (bool): Alternative parallelization strategy for expert parallelism. Defaults to False. + account_for_embedding_in_pipeline_split (bool): If set, *input* embedding layer will be treated as a standard + transformer layer in the context of partition and placement for pipeline parallelism. + account_for_loss_in_pipeline_split (bool): If set, loss layer will be treated as a standard transformer + layer in the context of partition and placement for pipeline parallelism. data_sampler (Optional['DataSampler']): Custom data sampler for distributed training. Defaults to None. parallel_devices (Optional[List[torch.device]]): List of devices to use for parallelism. Defaults to None. cluster_environment: Cluster environment for distributed training. Defaults to None. 
@@ -212,6 +218,8 @@ def __init__( expert_tensor_parallel_size: int = None, encoder_tensor_model_parallel_size: Optional[int] = 0, encoder_pipeline_model_parallel_size: Optional[int] = 0, + account_for_embedding_in_pipeline_split: bool = False, + account_for_loss_in_pipeline_split: bool = False, data_sampler: Optional["DataSampler"] = None, parallel_devices: Optional[List[torch.device]] = None, cluster_environment=None, # TODO: Add type-hint @@ -267,6 +275,8 @@ def __init__( self.sequence_parallel = sequence_parallel self.encoder_tensor_model_parallel_size = encoder_tensor_model_parallel_size self.encoder_pipeline_model_parallel_size = encoder_pipeline_model_parallel_size + self.account_for_embedding_in_pipeline_split = account_for_embedding_in_pipeline_split + self.account_for_loss_in_pipeline_split = account_for_loss_in_pipeline_split self.lazy_init = lazy_init self.ckpt_load_optimizer = ckpt_load_optimizer self.ckpt_save_optimizer = ckpt_save_optimizer @@ -941,6 +951,8 @@ def parallelism(self) -> ParallelismConfig: moe_extended_tp=self.moe_extended_tp, encoder_tensor_model_parallel_size=self.encoder_tensor_model_parallel_size, encoder_pipeline_model_parallel_size=self.encoder_pipeline_model_parallel_size, + account_for_embedding_in_pipeline_split=self.account_for_embedding_in_pipeline_split, + account_for_loss_in_pipeline_split=self.account_for_loss_in_pipeline_split, pipeline_dtype=self.pipeline_dtype, use_te_rng_tracker=self.use_te_rng_tracker, use_tp_pp_dp_mapping=self.use_tp_pp_dp_mapping, From 83d935fc2c4a6d645052f4aa566910334066f725 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Wed, 12 Feb 2025 11:53:31 -0500 Subject: [PATCH 11/14] skip initialization in hf export (#12136) Signed-off-by: Chen Cui --- nemo/lightning/io/connector.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 602551ae4479..8d1d957ec642 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -15,7 +15,7 @@ import logging import os import shutil -from pathlib import Path, PosixPath, PurePath, WindowsPath +from pathlib import Path, PosixPath, WindowsPath from typing import Generic, Optional, Tuple, TypeVar import lightning.pytorch as pl @@ -69,9 +69,11 @@ class Connector(BasePath, Generic[SourceT, TargetT]): LOCK_TIMEOUT = 1200 def init(self) -> TargetT: + """Should be implemented to initialize the target type from the source type.""" raise NotImplementedError() def apply(self, output_path: Path) -> Path: + """Should be implemented to apply the transformation and save the result at the output path.""" raise NotImplementedError() def __new__(cls, *args, **kwargs): @@ -118,6 +120,7 @@ def __call__(self, output_path: Optional[Path] = None, overwrite: bool = False) return _output_path def local_path(self, base_path: Optional[Path] = None) -> Path: + """Computes the local path for storage based on a base path or a default cache home.""" if base_path: _base = base_path else: @@ -128,6 +131,7 @@ def local_path(self, base_path: Optional[Path] = None) -> Path: return _base / str(self).replace("://", "/") def is_in_cache(self, base_path: Optional[Path] = None) -> bool: + """Checks if the transformed data is already cached at the specified base path.""" return self.local_path(base_path=base_path).exists() @@ -145,7 +149,8 @@ class ModelConnector(Connector, Generic[SourceT, TargetT]): Saves the model's state to the specified path using the trainer's current strategy. 
nemo_load(path: Path, trainer: Optional[pl.Trainer] = None, cpu: bool = True) -> Tuple[Any, pl.Trainer]: - Loads a model from the specified path, optionally using a CPU-focused strategy, and returns the model and trainer. + Loads a model from the specified path, optionally using a CPU-focused strategy, and returns the model and + trainer. """ def nemo_setup( @@ -170,6 +175,7 @@ def nemo_setup( ) # Note: set trainer to fitting state to avoid the following code path. Feel free to refactor if we no longer # need to avoid this: + # pylint: disable=C0301 # https://github.com/NVIDIA/NeMo/blob/e35a6592f53ee34b1ec2fc3f1e009dd1ebc79e65/nemo/lightning/pytorch/strategies/megatron_strategy.py#L346-L349 _trainer.state.fn = TrainerFn.FITTING # needed for proper save. @@ -227,6 +233,9 @@ def nemo_load( from nemo.lightning.io.api import load_context model = load_context(path, subpath="model") + # skip initialization since a checkpoint is loaded in this function + model.config.perform_initialization = False + is_peft_ckpt = model.model_transform is not None callbacks = [] if is_peft_ckpt: @@ -285,12 +294,14 @@ def local_path(self, base_path: Optional[Path] = None) -> Path: return _base / str(self).replace("://", "/") def on_import_ckpt(self, model: pl.LightningModule): + """Called after checkpoint is imported""" if hasattr(self, "tokenizer"): model.tokenizer = self.tokenizer if hasattr(model, "__io__") and hasattr(self.tokenizer, '__io__'): model.__io__.tokenizer = self.tokenizer.__io__ def save_hf_tokenizer_assets(self, tokenizer_name_or_path, save_path="/tmp/nemo_tokenizer"): + """Save HF tokenizer to the imported NeMo model""" from transformers import AutoTokenizer tok = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True) From d977f42ccafbbd04efcefc543cc05d6f7d965837 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Wed, 12 Feb 2025 10:51:12 -0800 Subject: [PATCH 12/14] update export io call (#12144) * update call Signed-off-by: Alexandros Koumparoulis * update tokenizer Signed-off-by: Alexandros Koumparoulis * docu Signed-off-by: Alexandros Koumparoulis * # noqa C0301 Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * pylint Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * fix pylint Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/collections/llm/gpt/model/mixtral.py | 31 ++++++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index 219d9b19f6eb..8e71de144e50 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -123,6 +123,8 @@ class MixtralConfig8x22B(MixtralConfig): class MixtralModel(GPTModel): + """Mcore-based MixtralModel""" + def __init__( self, config: Optional[Union[MixtralConfig8x7B, MixtralConfig8x22B]] = None, @@ -130,6 +132,7 @@ def __init__( tokenizer: Optional["TokenizerSpec"] = None, model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, ): + """Mcore-based MixtralModel ctor""" super().__init__( config or MixtralConfig8x7B(), optim=optim, tokenizer=tokenizer, model_transform=model_transform ) @@ -137,10 +140,14 @@ def __init__( @io.model_importer(MixtralModel, ext="hf") class HFMixtralImporter(io.ModelConnector["MixtralForCausalLM", MixtralModel]): 
+ """HF to NeMo importer""" + def init(self) -> MixtralModel: + """init""" return MixtralModel(self.config, tokenizer=self.tokenizer) def apply(self, output_path: Path) -> Path: + """Import model from HF""" from transformers import MixtralForCausalLM source = MixtralForCausalLM.from_pretrained(str(self), torch_dtype='auto', use_safetensors=True) @@ -155,12 +162,13 @@ def apply(self, output_path: Path) -> Path: return output_path def convert_state(self, source, target): + """State-dict converter""" mapping = { "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.pre_mlp_layernorm.weight", # MoE - "model.layers.*.block_sparse_moe.experts.*.w2.weight": "decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight", + "model.layers.*.block_sparse_moe.experts.*.w2.weight": "decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight", # pylint: disable=line-too-long "model.layers.*.block_sparse_moe.gate.weight": "decoder.layers.*.mlp.router.weight", # lm-head "model.norm.weight": "decoder.final_layernorm.weight", @@ -175,12 +183,14 @@ def convert_state(self, source, target): @property def tokenizer(self) -> "AutoTokenizer": + """Configures tokenizer""" from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> MixtralConfig8x7B | MixtralConfig8x22B: + """Returns Mcore config from HF""" from transformers import MixtralConfig as HfMixtralConfig config = HfMixtralConfig.from_pretrained(str(self)) @@ -226,6 +236,7 @@ def config(self) -> MixtralConfig8x7B | MixtralConfig8x22B: target_key="embedding.word_embeddings.weight", ) def _import_embedding(ctx: io.TransformCTX, embedding): + """_import_embedding""" embedding_weight = ctx.source.model.embed_tokens.weight vocab_size = embedding_weight.shape[0] ctx.target_state['embedding.word_embeddings.weight'][:vocab_size, :].copy_(embedding_weight) @@ -237,6 +248,7 @@ def _import_embedding(ctx: io.TransformCTX, embedding): target_key="output_layer.weight", ) def _import_lm_head(ctx: io.TransformCTX, embedding): + """import head""" lm_head_weight = ctx.source.lm_head.weight vocab_size = lm_head_weight.shape[0] ctx.target_state['output_layer.weight'][:vocab_size, :].copy_(lm_head_weight) @@ -252,6 +264,7 @@ def _import_lm_head(ctx: io.TransformCTX, embedding): target_key="decoder.layers.*.self_attention.linear_qkv.weight", ) def _import_qkv(ctx: io.TransformCTX, q, k, v): + """import qkv""" megatron_config = ctx.target.config head_num = megatron_config.num_attention_heads @@ -293,12 +306,16 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): target_key="decoder.layers.*.mlp.experts.local_experts.*.linear_fc1.weight", ) def _import_moe_w1_w3(gate_proj, up_proj): + """_import_moe_w1_w3""" return torch.cat((gate_proj, up_proj), axis=0) @io.model_exporter(MixtralModel, "hf") class HFMixtralExporter(io.ModelConnector[MixtralModel, "MixtralForCausalLM"]): + """NeMo to HF exporter""" + def init(self) -> "MixtralForCausalLM": + """HFMixtralExporter initialization""" from transformers import AutoModelForCausalLM from transformers.modeling_utils import no_init_weights @@ -306,6 +323,7 @@ def init(self) -> "MixtralForCausalLM": return AutoModelForCausalLM.from_config(self.config) def apply(self, output_path: Path) -> Path: + """export 
to hf format""" # TODO: Make it work with lazy init # with torch.device("meta"): # target = self.init() @@ -321,12 +339,13 @@ def apply(self, output_path: Path) -> Path: return output_path def convert_state(self, source, target): + """convert state""" mapping = { "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", "decoder.layers.*.pre_mlp_layernorm.weight": "model.layers.*.post_attention_layernorm.weight", # MoE - "decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight": "model.layers.*.block_sparse_moe.experts.*.w2.weight", + "decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight": "model.layers.*.block_sparse_moe.experts.*.w2.weight", # pylint: disable=line-too-long "decoder.layers.*.mlp.router.weight": "model.layers.*.block_sparse_moe.gate.weight", # lm-head "decoder.final_layernorm.weight": "model.norm.weight", @@ -341,12 +360,14 @@ def convert_state(self, source, target): @property def tokenizer(self): - return io.load_ckpt(str(self)).model.tokenizer.tokenizer + """return tokenizer""" + return io.load_context(str(self), subpath="model").tokenizer @property def config(self) -> "MixtralConfig": + """return hf-config from mcore""" # Either MixtralConfig8x7B or MixtralConfig8x22B - source: MixtralConfig8x7B = io.load_ckpt(str(self)).model.config + source: MixtralConfig8x7B = io.load_context(str(self), subpath="model.config") from transformers import MixtralConfig as HfMixtralConfig @@ -382,6 +403,7 @@ def config(self) -> "MixtralConfig": ), ) def _export_qkv(ctx: io.TransformCTX, linear_qkv): + """_export_qkv""" megatron_config = ctx.source.config head_num = megatron_config.num_attention_heads @@ -417,6 +439,7 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv): ), ) def _export_moe_w1_w3(linear_fc1): + """_export_moe_w1_w3""" gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) return gate_proj, up_proj From a682ea95a869fe422d59b27bb3e0315db4224947 Mon Sep 17 00:00:00 2001 From: Sam O Date: Wed, 12 Feb 2025 14:09:59 -0700 Subject: [PATCH 13/14] Minor Bug Fixes - LLaMa Embedding (#12146) * Minor Bug Fixes - LLaMa Embedding Signed-off-by: Sam Oluwalana * Apply isort and black reformatting Signed-off-by: artbataev * Add type checking Signed-off-by: Sam Oluwalana --------- Signed-off-by: Sam Oluwalana Signed-off-by: artbataev Co-authored-by: artbataev --- nemo/collections/llm/gpt/model/hf_llama_embedding.py | 2 +- nemo/collections/llm/gpt/model/llama_embedding.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/gpt/model/hf_llama_embedding.py b/nemo/collections/llm/gpt/model/hf_llama_embedding.py index ba89626ff45f..bbd27ce60507 100644 --- a/nemo/collections/llm/gpt/model/hf_llama_embedding.py +++ b/nemo/collections/llm/gpt/model/hf_llama_embedding.py @@ -156,7 +156,7 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) + labels = labels.to(pooled_logits.device) if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" diff --git a/nemo/collections/llm/gpt/model/llama_embedding.py b/nemo/collections/llm/gpt/model/llama_embedding.py index 3d8edcc5121a..96f311acd0b8 100644 --- a/nemo/collections/llm/gpt/model/llama_embedding.py +++ b/nemo/collections/llm/gpt/model/llama_embedding.py @@ -31,12 +31,15 @@ from nemo.collections.llm.utils import Config from nemo.lightning import OptimizerModule, io from 
nemo.lightning.pytorch.utils import dtype_from_hf +from nemo.utils import logging from nemo.utils.import_utils import safe_import if TYPE_CHECKING: from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + from nemo.collections.llm.gpt.model.hf_llama_embedding import LlamaBidirectionalModel + _, HAVE_TE = safe_import("transformer_engine") @@ -271,7 +274,7 @@ class LlamaEmbeddingExporter(io.ModelConnector[LlamaEmbeddingModel, "LlamaBidire Note that NV Embedding LLama uses customized LlamaBidirectionalConfig config. """ - def init(self, dtype=torch.bfloat16) -> "LlamaForCausalLM": + def init(self, dtype=torch.bfloat16) -> "LlamaBidirectionalModel": from transformers.modeling_utils import no_init_weights from nemo.collections.llm.gpt.model.hf_llama_embedding import LlamaBidirectionalModel From 4b19adecc007c9aa80a03b39695b6bc647855f3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 12 Feb 2025 22:28:19 +0100 Subject: [PATCH 14/14] build: Force re-install VCS dependencies (#12155) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * build: Re-install nvrx Signed-off-by: oliver könig * f Signed-off-by: oliver könig --------- Signed-off-by: oliver könig --- reinstall.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reinstall.sh b/reinstall.sh index eaa6e657deaa..ced7019a6645 100755 --- a/reinstall.sh +++ b/reinstall.sh @@ -140,7 +140,7 @@ nemo() { ) echo 'Installing dependencies of nemo' - ${PIP} install --no-cache-dir --extra-index-url https://pypi.nvidia.com "${DEPS[@]}" + ${PIP} install --force-re-install --no-cache-dir --extra-index-url https://pypi.nvidia.com "${DEPS[@]}" echo 'Installing nemo itself' pip install --no-cache-dir --no-build-isolation $NEMO_DIR/.[all]
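A pattern that shows up repeatedly in the export-path changes above: when converted weights
are about to be copied into a freshly constructed Hugging Face module, running the module's
random weight initialization is wasted work, so it is skipped (perform_initialization = False
on the NeMo side, no_init_weights on the Hugging Face side). The sketch below shows only the
Hugging Face half of that idea; it assumes a transformers version where no_init_weights can
be used as a plain context manager, and build_empty_hf_model is an illustrative helper rather
than a NeMo API.

from transformers import AutoModelForCausalLM
from transformers.modeling_utils import no_init_weights

def build_empty_hf_model(hf_config):
    # Build the module skeleton without initializing its weights; the converted
    # NeMo state dict is copied in right afterwards, so any random initialization
    # done here would simply be overwritten.
    with no_init_weights():
        return AutoModelForCausalLM.from_config(hf_config)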