From d16aa1f8845e21deb139ca7fb1ced14f209407f8 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Fri, 16 Aug 2024 12:02:45 -0400 Subject: [PATCH 001/664] Nemotron Conversion: kv_channels rename to head_dim (#10068) * kv_channels rename to head_dim * version update, tokenizer with only input_ids * add attention_mask in tokenizer * force to have bos/eos in special token mappings * Apply isort and black reformatting Signed-off-by: suiyoubi --------- Signed-off-by: suiyoubi Co-authored-by: suiyoubi Co-authored-by: Pablo Garay --- .../convert_nemotron_nemo_to_hf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py index 072de9e5d5f4..7a58573278af 100644 --- a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py @@ -121,12 +121,12 @@ def convert_hf_config(nemo_config, tokenizer, vocab_size, dtype, hf_output_path, "partial_rotary_factor": nemo_config.get("rotary_percentage", 1.0), "tie_word_embeddings": False, "torch_dtype": DTYPE2HF[dtype], - "transformers_version": "4.32.0.dev0", # TODO + "transformers_version": "4.44.0", "use_cache": True, "vocab_size": vocab_size, } - if nemo_config.kv_channels is not None: - hf_config["kv_channels"] = nemo_config.kv_channels + if nemo_config.get("kv_channels", None) is not None: + hf_config["head_dim"] = nemo_config.kv_channels json.dump(hf_config, open(f"{hf_output_path}/config.json", "w"), indent=2) @@ -315,7 +315,10 @@ def extract_nemotron_tokenizer(nemo_file, model_config, output_hf_path, nemo_tok tokenizer = LlamaTokenizer.from_pretrained(output_hf_path, legacy=False) # Convert the LlamaTokenizer to a PreTrainedTokenizerFast instance tokenizer = PreTrainedTokenizerFast( - tokenizer_object=LlamaConverter(tokenizer).converted(), model_input_names=["input_ids", "token_type_ids"] + tokenizer_object=LlamaConverter(tokenizer).converted(), + model_input_names=["input_ids", "attention_mask"], + bos_token="", + eos_token="", ) tokenizer.save_pretrained(output_hf_path) logging.info(f"Setencepiece tokenizer has been saved to {output_tokenizer}") From 291197f356f14aa179f2350a720a127efa14355a Mon Sep 17 00:00:00 2001 From: Wil Kong Date: Sat, 17 Aug 2024 01:27:27 +0800 Subject: [PATCH 002/664] Drop update_metrics since it doesn't invoke sync anymore. (#10183) --- nemo/utils/callbacks/cuda_graph.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/nemo/utils/callbacks/cuda_graph.py b/nemo/utils/callbacks/cuda_graph.py index e18a2a3d3b6c..c78196934108 100644 --- a/nemo/utils/callbacks/cuda_graph.py +++ b/nemo/utils/callbacks/cuda_graph.py @@ -139,15 +139,6 @@ def to_tensor(self, value, name): return value -def update_metrics(self, key, value, batch_size): - # PyTorch Lightning always move all metrics to GPU, but moving the metric to - # its input device is prefered. 
- result_metric = self[key] - device = value.device if isinstance(value, torch.Tensor) else self.device - result_metric.forward(value.to(device), batch_size) - result_metric.has_reset = False - - def get_optimizer_step(state): def optimizer_step( self, @@ -364,8 +355,6 @@ def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") # Use smart metrics to avoid syncs LightningModule.__orig_to_tensor__ = LightningModule._LightningModule__to_tensor LightningModule._LightningModule__to_tensor = to_tensor - _ResultCollection.__orig_update_metrics__ = _ResultCollection.update_metrics - _ResultCollection.update_metrics = update_metrics # Save model outputs to static buffer for PL states reconstruct pl_module.__orig_training_step__ = pl_module.training_step @@ -397,8 +386,6 @@ def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - LightningModule._LightningModule__to_tensor = LightningModule.__orig_to_tensor__ del LightningModule.__orig_to_tensor__ - _ResultCollection.update_metrics = _ResultCollection.__orig_update_metrics__ - del _ResultCollection.__orig_update_metrics__ pl_module.training_step = pl_module.__orig_training_step__ del pl_module.__orig_training_step__ From 13ad2da8105800b5d7efce2305c3acf7e0a28405 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:01:03 -0700 Subject: [PATCH 003/664] [NeMo-UX] proposal for updated 2.0 progress logger (#9997) * proposal for updated 2.0 progress logger Signed-off-by: ashors1 * make logging consistent Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * make log_interval configurable Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * add init Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * add TQDM progress bar back, default to disabling TQDM progress bar and enabling megatron printing Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * fix typo Signed-off-by: ashors1 * handle string metrics Signed-off-by: ashors1 * move ProgressPrinter init from trainer to strategy Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * remove unused imports Signed-off-by: ashors1 * minor improvement Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * refresh progress bar once per step Signed-off-by: ashors1 * fix validation logging when num_microbatches > 1 Signed-off-by: ashors1 * skip logging of v_num Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * Change default to 1 Signed-off-by: Hemil Desai * use zero-indexing for iteration for consistency Signed-off-by: ashors1 * get current step from trainer Signed-off-by: ashors1 * change default progress_interval to 1 Signed-off-by: ashors1 --------- Signed-off-by: ashors1 Signed-off-by: ashors1 Signed-off-by: Hemil Desai Co-authored-by: ashors1 Co-authored-by: Hemil Desai --- nemo/lightning/pytorch/callbacks/__init__.py | 13 +- .../{progress.py => progress_bar.py} | 4 +- .../pytorch/callbacks/progress_printer.py | 189 ++++++++++++++++++ nemo/lightning/pytorch/strategies.py | 22 +- 4 files changed, 221 insertions(+), 7 deletions(-) rename nemo/lightning/pytorch/callbacks/{progress.py => progress_bar.py} (92%) create mode 100644 nemo/lightning/pytorch/callbacks/progress_printer.py diff --git a/nemo/lightning/pytorch/callbacks/__init__.py b/nemo/lightning/pytorch/callbacks/__init__.py index 
ee0e777d739e..00637c9d57d4 100644 --- a/nemo/lightning/pytorch/callbacks/__init__.py +++ b/nemo/lightning/pytorch/callbacks/__init__.py @@ -3,7 +3,16 @@ from nemo.lightning.pytorch.callbacks.nsys import NsysCallback from nemo.lightning.pytorch.callbacks.peft import PEFT from nemo.lightning.pytorch.callbacks.preemption import PreemptionCallback -from nemo.lightning.pytorch.callbacks.progress import MegatronProgressBar +from nemo.lightning.pytorch.callbacks.progress_bar import MegatronProgressBar +from nemo.lightning.pytorch.callbacks.progress_printer import ProgressPrinter -__all__ = ["ModelCheckpoint", "ModelTransform", "PEFT", "NsysCallback", "MegatronProgressBar", "PreemptionCallback"] +__all__ = [ + "ModelCheckpoint", + "ModelTransform", + "PEFT", + "NsysCallback", + "MegatronProgressBar", + "ProgressPrinter", + "PreemptionCallback", +] diff --git a/nemo/lightning/pytorch/callbacks/progress.py b/nemo/lightning/pytorch/callbacks/progress_bar.py similarity index 92% rename from nemo/lightning/pytorch/callbacks/progress.py rename to nemo/lightning/pytorch/callbacks/progress_bar.py index 1b1276953a61..d6acb02ae377 100644 --- a/nemo/lightning/pytorch/callbacks/progress.py +++ b/nemo/lightning/pytorch/callbacks/progress_bar.py @@ -13,7 +13,7 @@ def init_train_tqdm(self): Override bar_format to not have 's/it'. """ self.bar = super().init_train_tqdm() - self.bar.bar_format = "{desc} {n_fmt}/{total_fmt}{postfix}" + self.bar.bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}{postfix}]" return self.bar def on_train_epoch_start(self, trainer, *_): @@ -35,7 +35,7 @@ def on_train_batch_end(self, trainer, pl_module, *_, **__): n = trainer.strategy.current_epoch_step if self._should_update(n, self.train_progress_bar.total): _update_n(self.train_progress_bar, n) - self.train_progress_bar.set_postfix(self.get_metrics(trainer, pl_module)) + self.train_progress_bar.set_postfix(self.get_metrics(trainer, pl_module), refresh=False) def calculate_data_parallel_groups() -> int: diff --git a/nemo/lightning/pytorch/callbacks/progress_printer.py b/nemo/lightning/pytorch/callbacks/progress_printer.py new file mode 100644 index 000000000000..8ddc97a6ddd6 --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/progress_printer.py @@ -0,0 +1,189 @@ +from collections import defaultdict +from typing import Any + +from megatron.core.num_microbatches_calculator import get_num_microbatches +from pytorch_lightning.callbacks.progress import ProgressBar +from pytorch_lightning.utilities.types import STEP_OUTPUT +from typing_extensions import override + + +class ProgressPrinter(ProgressBar): + """ + Callback for logging progress in Megatron. Prints status in terms of global batches rather than microbatches. + Recommended over MegatronProgressBar for non-interactive settings + + Args: + log_interval (int): determines how frequently (in steps) to print the progress. + skip_accumulate_metrics (list[str]): for all metrics in this list, value logged will + simply reflect the latest value rather than averaging over the log interval. + exclude_metrics (list[str]): any metrics to exclude from logging. 
+ """ + + def __init__( + self, + log_interval: int = 1, + skip_accumulate_metrics: list[str] = ["global_step"], + exclude_metrics: list[str] = ["v_num"], + ): + self._train_description = "Training" + self._validation_description = "Validation" + self._test_description = "Testing" + self._log_interval = int(log_interval) + # most recent "global_step" will be logged + # rather than averaging over last log_interval steps + self.skip_accumulate_metrics = skip_accumulate_metrics + self.exclude_metrics = exclude_metrics + self.total_metrics_dict = defaultdict(lambda: 0.0) + self._is_disabled = log_interval <= 0 + + super().__init__() + + def format_string(self, prefix, metrics): + log_string = prefix + for metric, val in metrics.items(): + if isinstance(val, (float)) and val.is_integer(): + val = int(val) + log_string += f' | {metric}: {val}' + else: + log_string += f' | {metric}: {val:.4}' + return log_string + + def disable(self): + self._is_disabled = True + + def enable(self): + self._is_disabled = False + + @property + def is_disabled(self) -> bool: + return self._is_disabled + + @property + def average_metrics_dict(self): + average_dict = {} + for key in self.total_metrics_dict: + if key in self.skip_accumulate_metrics or not isinstance(self.total_metrics_dict[key], (int, float)): + average_dict[key] = self.total_metrics_dict[key] + else: + average_dict[key] = self.total_metrics_dict[key] / self.log_interval + return average_dict + + @property + def train_description(self): + return self._train_description + + @property + def validation_description(self): + return self._validation_description + + @property + def test_description(self): + return self._test_description + + @property + def log_interval(self): + return self._log_interval + + @log_interval.setter + def log_interval(self, val): + self._log_interval = val + + @override + def on_sanity_check_start(self, *_: Any) -> None: + self._validation_description = "Sanity checking " + self.validation_description + + @override + def on_sanity_check_end(self, *_: Any) -> None: + self._validation_description = "Validation" + + @override + def on_train_epoch_start(self, trainer, *_): + if trainer.max_steps > 0: + # while resuming from a ckpt use trainer.max_steps as the total for progress bar as trainer.num_training_batches + # is truncated to max_steps - step being resumed at + self.total = trainer.max_steps + else: + self.total = trainer.num_training_batches + + ## TODO(ashors): handle nan losses + @override + def on_train_batch_end(self, trainer, pl_module, *_, **__): + if self.is_disabled: + return + n = trainer.strategy.current_epoch_step + metrics = self.get_metrics(trainer, pl_module) + for key in metrics: + if key in self.exclude_metrics: + continue + if key in self.skip_accumulate_metrics or not isinstance(metrics[key], (int, float)): + self.total_metrics_dict[key] = metrics[key] + else: + self.total_metrics_dict[key] += metrics[key] + + if self.should_log(n): + prefix = self.train_description + f" epoch {trainer.current_epoch}, iteration {n-1}/{self.total-1}" + log_string = self.format_string(prefix, self.average_metrics_dict) + print(log_string) + + self.total_metrics_dict = defaultdict(lambda: 0.0) + + @override + def on_validation_batch_start( + self, + trainer: "pl.Trainer", + pl_module: "pl.LightningModule", + batch: Any, + batch_idx: int, + dataloader_idx: int = 0, + ) -> None: + if not self.has_dataloader_changed(dataloader_idx): + return + self.total_validation_steps = int(self.total_val_batches_current_dataloader / 
get_num_microbatches()) + + @override + def on_validation_batch_end( + self, + trainer: "pl.Trainer", + pl_module: "pl.LightningModule", + outputs: STEP_OUTPUT, + batch: Any, + batch_idx: int, + dataloader_idx: int = 0, + ) -> None: + if self.is_disabled: + return + n = (batch_idx + 1) / get_num_microbatches() + if self.should_log(n): + print(self.validation_description + f": iteration {int(n)}/{self.total_validation_steps}") + + @override + def on_test_batch_start( + self, + trainer: "pl.Trainer", + pl_module: "pl.LightningModule", + batch: Any, + batch_idx: int, + dataloader_idx: int = 0, + ) -> None: + if not self.has_dataloader_changed(dataloader_idx): + return + self.total_test_steps = int(self.total_test_batches_current_dataloader / get_num_microbatches()) + + @override + def on_test_batch_end( + self, + trainer: "pl.Trainer", + pl_module: "pl.LightningModule", + outputs: STEP_OUTPUT, + batch: Any, + batch_idx: int, + dataloader_idx: int = 0, + ) -> None: + if self.is_disabled: + return + n = int((batch_idx + 1) / get_num_microbatches()) + if self.should_log(n): + print(self.test_description + f": iteration {n}/{self.total_validation_steps}") + + def should_log(self, n): + return n % self.log_interval == 0 diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 48cedf7b2625..bf35a2aeed9f 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -34,7 +34,7 @@ from nemo.lightning import _strategy_lib, io from nemo.lightning.io.pl import MegatronCheckpointIO from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction -from nemo.lightning.pytorch.callbacks import MegatronProgressBar, ModelTransform +from nemo.lightning.pytorch.callbacks import MegatronProgressBar, ModelTransform, ProgressPrinter from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO, AsyncFinalizerCallback if TYPE_CHECKING: @@ -113,6 +113,10 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): necessary conversions of optimizer parameters and move optimizer parameters to the correct device. Defaults to True. init_model_parallel (bool): Whether to initialize the model parallel groups. Defaults to True. + replace_progress_bar (bool): Whether to replace the TQDM progress bar with a megatron-style logger + that prints the metrics to stdout. Suitable for non-interactive settings. + progress_interval (int): How frequently to print progress to stdout. Only used when + replace_progress_bar is True. **kwargs: Additional keyword arguments. 
Note: @@ -151,6 +155,8 @@ def __init__( ckpt_load_directly_on_device: bool = True, setup_optimizers: bool = True, init_model_parallel: bool = True, + replace_progress_bar: bool = True, + progress_interval: int = 1, **kwargs, ) -> None: super().__init__( @@ -188,6 +194,9 @@ def __init__( self.parallel_save_optim = ckpt_parallel_save_optim self.load_directly_on_device = ckpt_load_directly_on_device + self.replace_progress_bar = replace_progress_bar + self.progress_interval = progress_interval + self._ddp = ddp if ddp == "megatron": self.ddp_config = DistributedDataParallelConfig(check_for_nan_in_grad=True) @@ -546,9 +555,16 @@ def _fix_progress_bar(self, trainer: pl.Trainer) -> None: if callback.__class__ == TQDMProgressBar: contains_progress = True if not contains_megatron_progress and contains_progress: - for callback in callbacks: + for i, callback in enumerate(callbacks): if isinstance(callback, TQDMProgressBar): - callback.__class__ = MegatronProgressBar + if self.replace_progress_bar: + printer = ProgressPrinter(log_interval=self.progress_interval) + printer._trainer = trainer + if not trainer.is_global_zero: + printer.disable() + callbacks[i] = printer + else: + callback.__class__ = MegatronProgressBar break def optimizer_sharded_state_dict(self, is_loading=False): From d2f3f49aea7b282816063dad5ff2f2c3eedfc6cf Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:31:39 -0700 Subject: [PATCH 004/664] Use parallelism config in fabric strategy (#10153) Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/fabric/strategies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/lightning/fabric/strategies.py b/nemo/lightning/fabric/strategies.py index 5c2b634ea282..a183c434dc52 100644 --- a/nemo/lightning/fabric/strategies.py +++ b/nemo/lightning/fabric/strategies.py @@ -326,9 +326,9 @@ def checkpoint_io(self) -> CheckpointIO: @property def parallelism(self): - from megatron.core.model_parallel_config import ModelParallelConfig + from nemo.lightning.pytorch.strategies import ParallelismConfig - return ModelParallelConfig( + return ParallelismConfig( tensor_model_parallel_size=self.tensor_model_parallel_size, pipeline_model_parallel_size=self.pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=self.virtual_pipeline_model_parallel_size, From 9b05c3a05874934778f3787281205bb5e3d881dc Mon Sep 17 00:00:00 2001 From: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:39:47 -0500 Subject: [PATCH 005/664] Fix metric logging (#10032) * remove unnecessary flag Signed-off-by: Maanu Grover * try refactor for pp loss cast Signed-off-by: Maanu Grover * p2p when logging loss Signed-off-by: Maanu Grover * type annotation Signed-off-by: Maanu Grover * prevent hang Signed-off-by: Maanu Grover * pp last rank broadcast approach Signed-off-by: Maanu Grover * let loss be 0 in val Signed-off-by: Maanu Grover * revert log loss on all ranks Signed-off-by: Maanu Grover * ensure tensor exists and generalize logic Signed-off-by: Maanu Grover * add docs Signed-off-by: Maanu Grover * add val loss to progress bar Signed-off-by: Maanu Grover * change broadcast timing Signed-off-by: Maanu Grover * always broadcast val loss in case of early stop Signed-off-by: Maanu Grover * typo Signed-off-by: Maanu Grover * move val broadcast and loss to validation_end Signed-off-by: Maanu Grover * use simpler average Signed-off-by: Maanu Grover * Revert validation_end attempt Signed-off-by: 
Maanu Grover * update val_loss logging Signed-off-by: Maanu Grover * typo Signed-off-by: Maanu Grover * update comment Signed-off-by: Maanu Grover --------- Signed-off-by: Maanu Grover Co-authored-by: Eric Harper --- nemo/lightning/_strategy_lib.py | 30 ++++++++++++++ nemo/lightning/megatron_parallel.py | 8 +--- .../pytorch/callbacks/model_checkpoint.py | 15 +++++++ nemo/lightning/pytorch/optim/base.py | 2 +- .../lightning/pytorch/plugins/data_sampler.py | 2 - nemo/lightning/pytorch/strategies.py | 41 +++++++++++-------- 6 files changed, 71 insertions(+), 27 deletions(-) diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index a5a395a51108..9dd59939fa8a 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -523,3 +523,33 @@ def load_model_state_dict(megatron_parallel, checkpoint: Mapping[str, Any], stri _state_dict[key] = value module.load_state_dict(_state_dict, strict=strict) + + +def _sync_from_last_pipeline_stage(value: torch.Tensor, broadcast: bool = False): + """ + When pipeline parallelism is enabled, casts a tensor defined on the last pipeline stage to other ranks. + + Args: + value (torch.Tensor): A tensor to be casted from the final pipeline stage of a pipeline parallelism group (e.g. loss). + Note that this tensor should already be defined on the target rank(s) to fill with received data. + broadcast (bool): When True, broadcasts value from the final pipeline stage rank to all ranks in its group. + When False, only rank zero receives value from the final pipeline stage rank in its group. + This mode exists to avoid slow one-to-many communication when not necessary. Defaults to False. + """ + from megatron.core import parallel_state + + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + src_rank = parallel_state.get_pipeline_model_parallel_last_rank() + + if not broadcast: + pp_ranks = torch.distributed.get_process_group_ranks(parallel_state.get_pipeline_model_parallel_group()) + if torch.distributed.get_rank() == src_rank and 0 in pp_ranks: + torch.distributed.send(value, 0) + elif torch.distributed.get_rank() == 0: + torch.distributed.recv(value, src_rank) + else: + torch.distributed.broadcast( + value, + src_rank, + group=parallel_state.get_pipeline_model_parallel_group(), + ) diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 37978510972d..56146498b539 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -286,17 +286,11 @@ def forward( self.callbacks.event("on_megatron_reduce_microbatches_end", **context) else: # we're not on the last pipeline stage so no losses - if forward_only: - loss_mean = cast(torch.Tensor, []) - else: - loss_mean = torch.tensor(0.0, device=torch.cuda.current_device()) + loss_mean = torch.tensor(0.0, device=torch.cuda.current_device()) self.callbacks.event("on_megatron_log_step_end", **context) self.callbacks.event("on_megatron_step_end", **context) - if loss_mean == []: - loss_mean = None - return loss_mean def wrapped_forward_step( diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index a2068468a0f7..db48ded0d10d 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -393,6 +393,21 @@ def file_exists(self, filepath: str, trainer: "pytorch_lightning.Trainer", check exists = self._fs.exists(filepath) or (check_dist_ckpt and 
self._fs.exists(ckpt_to_dir(filepath))) return trainer.strategy.broadcast(exists) + def _monitor_candidates(self, trainer: "pl.Trainer") -> Dict[str, torch.Tensor]: + """Broadcast loss from last pipeline stage.""" + monitor_candidates = super()._monitor_candidates(trainer) + + from nemo.lightning._strategy_lib import _sync_from_last_pipeline_stage + + keys = re.findall(r"[\{](.*?)[:\}]", self.filename) + for loss_name in ['reduced_train_loss']: + if loss_name in keys or loss_name == self.monitor: + if loss_name not in monitor_candidates: + monitor_candidates[loss_name] = torch.tensor(0.0, device=torch.cuda.current_device()) + _sync_from_last_pipeline_stage(monitor_candidates[loss_name], broadcast=True) + + return monitor_candidates + def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) -> None: # barrier_after=True, so all ranks continue after the unfinished checkpoint marker is placed. # if anything goes wrong during checkpointing, we should be able to detect that data is incomplete. diff --git a/nemo/lightning/pytorch/optim/base.py b/nemo/lightning/pytorch/optim/base.py index c6160fa14b0e..ef7f9d96843d 100644 --- a/nemo/lightning/pytorch/optim/base.py +++ b/nemo/lightning/pytorch/optim/base.py @@ -152,7 +152,7 @@ def optimizers(self, model) -> List[Optimizer]: def on_train_batch_start(self, trainer, pl_module, batch, batch_idx) -> None: if self._optimizers is not None: lr = self._optimizers[0].param_groups[0]['lr'] - pl_module.log('lr', lr, rank_zero_only=True, batch_size=1, prog_bar=True) + pl_module.log('lr', lr, batch_size=1, prog_bar=True) def __call__(self, model: L.LightningModule, megatron_parallel=None) -> OptimizerLRScheduler: """Calls the setup and optimizers methods. diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 08a56f854e4a..13a0caa98f0c 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -99,7 +99,6 @@ def on_megatron_step_end(self, trainer: pl.Trainer, pl_module: pl.LightningModul 'consumed_samples', consumed_samples, prog_bar=True, - rank_zero_only=True, batch_size=1, ) @@ -113,7 +112,6 @@ def on_megatron_step_end(self, trainer: pl.Trainer, pl_module: pl.LightningModul "global_batch_size", self.current_global_batch_size, prog_bar=True, - rank_zero_only=True, batch_size=1, ) self.if_first_step = 1 diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index bf35a2aeed9f..0250709a4e03 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -449,7 +449,6 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP 'global_step', self.trainer.global_step, prog_bar=True, - rank_zero_only=True, batch_size=1, ) @@ -465,31 +464,22 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP "peak_memory_usage", max_memory_reserved, prog_bar=True, - rank_zero_only=True, batch_size=1, ) self.lightning_module.log( "memory_allocated", memory_allocated, prog_bar=True, - rank_zero_only=True, batch_size=1, ) if self.log_train_loss: - from megatron.core import parallel_state - - from nemo.collections.nlp.parts.utils_funcs import get_last_rank - - # When using pipeline parallelism, loss is calculated only in the last pipeline stage and - # it should be casted to other pipeline stages for logging. 
- # we can avoid this broadcast by updating the PTL log function to accept specific ranks - if parallel_state.get_pipeline_model_parallel_world_size() > 1: - if torch.distributed.get_rank() == get_last_rank(): - torch.distributed.send(out, 0) - elif torch.distributed.get_rank() == 0: - torch.distributed.recv(out, get_last_rank()) - self.lightning_module.log('reduced_train_loss', out, prog_bar=True, rank_zero_only=True, batch_size=1) + # p2p now, broadcast later at ckpt + _strategy_lib._sync_from_last_pipeline_stage(out, broadcast=False) + if torch.distributed.get_rank() == 0: + self.lightning_module.log( + 'reduced_train_loss', out, prog_bar=True, rank_zero_only=True, batch_size=1 + ) return out @@ -501,7 +491,24 @@ def validation_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OU with self.precision_plugin.val_step_context(): # TODO: Do we need this? out = self.model(dataloader_iter, forward_only=True, *args, **kwargs) - self.lightning_module.log('val_loss', out, rank_zero_only=True, batch_size=1) + + from megatron.core import parallel_state + + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + if pp_size > 1: + # ranks that are not final pp stage have 0 for loss, and out will be mean-reduced over pp + # groups (due to sync_dist), which divides val_loss by pp_size. so we multiply by pp_size to cancel out + self.lightning_module.log( + 'val_loss', + out * pp_size, + prog_bar=True, + sync_dist=True, + sync_dist_group=parallel_state.get_pipeline_model_parallel_group(), + on_epoch=True, + ) + else: + self.lightning_module.log('val_loss', out, prog_bar=True, on_epoch=True) + return out @override From 70cc9cb95b5b0ea76dcf12a31d0fc708858e9187 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Fri, 16 Aug 2024 12:44:50 -0700 Subject: [PATCH 006/664] Update recipes for Llama 3 8b and 70b (#10147) * Update recipes for Llama 3 Signed-off-by: Hemil Desai * Update values Signed-off-by: Hemil Desai * Remove unused import Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai --- nemo/collections/llm/gpt/model/llama.py | 37 ++++- nemo/collections/llm/recipes/__init__.py | 7 +- nemo/collections/llm/recipes/llama2_7b.py | 61 --------- nemo/collections/llm/recipes/llama3_70b.py | 113 ++++++++++++++++ nemo/collections/llm/recipes/llama3_8b.py | 128 ++++++++++++------ nemo/collections/llm/recipes/llama3_8b_16k.py | 78 ++++++----- nemo/collections/llm/recipes/llama3_8b_64k.py | 78 ++++++----- nemo/collections/llm/recipes/log/default.py | 55 ++++++-- nemo/collections/llm/recipes/mistral.py | 6 +- .../llm/recipes/mixtral_8x22b_4k.py | 6 +- .../llm/recipes/mixtral_8x7b_4k.py | 6 +- nemo/collections/llm/recipes/optim/adam.py | 35 +++-- nemo/collections/llm/utils.py | 13 +- 13 files changed, 423 insertions(+), 200 deletions(-) delete mode 100644 nemo/collections/llm/recipes/llama2_7b.py create mode 100644 nemo/collections/llm/recipes/llama3_70b.py diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 14bed9de6e7e..425170c07707 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -62,17 +62,48 @@ class Llama2Config70B(LlamaConfig): @dataclass -class Llama3Config8B(Llama2Config7B): +class Llama3Config(GPTConfig): + num_query_groups: int = 8 + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + normalization = "RMSNorm" + init_method_std: float = 0.01 + layernorm_epsilon: float = 1.0e-05 + add_bias_linear: bool = False + activation_func: Callable = F.silu + 
gated_linear_unit: bool = True + apply_query_key_layer_scaling: bool = True + # Fusions + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + apply_rope_fusion: bool = True + share_embeddings_and_output_weights: bool = False + position_embedding_type = "rope" + rotary_percent: float = 1.0 + + +@dataclass +class Llama3Config8B(Llama3Config): rotary_base: int = 500_000 seq_length: int = 8192 - num_query_groups: int = 8 + num_layers: int = 32 + hidden_size: int = 4096 ffn_hidden_size: int = 14336 + num_attention_heads: int = 32 @dataclass -class Llama3Config70B(Llama2Config70B): +class Llama3Config70B(Llama3Config): rotary_base: int = 500_000 seq_length: int = 8192 + num_layers: int = 80 + hidden_size: int = 8192 + ffn_hidden_size: int = 28672 + num_attention_heads: int = 64 + init_method_std: float = 0.008944 + make_vocab_size_divisible_by: int = 128 @dataclass diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 8d4d874362a9..d9fb5cc61f38 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -1,13 +1,14 @@ -from nemo.collections.llm.recipes import llama2_7b, llama3_8b, llama3_8b_16k, llama3_8b_64k, mistral -from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes import llama3_8b, llama3_8b_16k, llama3_8b_64k, llama3_70b, mistral +from nemo.collections.llm.recipes.log.default import default_log, default_resume from nemo.collections.llm.recipes.optim import adam __all__ = [ "llama3_8b", "llama3_8b_16k", "llama3_8b_64k", - "llama2_7b", + "llama3_70b", "mistral", "adam", "default_log", + "default_resume", ] diff --git a/nemo/collections/llm/recipes/llama2_7b.py b/nemo/collections/llm/recipes/llama2_7b.py deleted file mode 100644 index 1767dc4690c8..000000000000 --- a/nemo/collections/llm/recipes/llama2_7b.py +++ /dev/null @@ -1,61 +0,0 @@ -import pytorch_lightning as pl - -from nemo import lightning as nl -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.api import squad -from nemo.collections.llm.gpt.model.llama import Llama2Config7B, LlamaModel -from nemo.collections.llm.peft.api import gpt_lora -from nemo.collections.llm.recipes.log.default import default_log -from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing -from nemo.collections.llm.utils import Partial, factory - -NAME = "llama2_7b" - - -@factory(name=NAME) -def model() -> pl.LightningModule: - return LlamaModel(Llama2Config7B()) - - -@factory(name=NAME) -def trainer(devices=8) -> nl.Trainer: - strategy = nl.MegatronStrategy(tensor_model_parallel_size=2) - - return nl.Trainer( - devices=devices, - max_steps=100, - accelerator="gpu", - strategy=strategy, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), - ) - - -@factory(name=NAME + "_hf") -def hf_resume() -> nl.AutoResume: - return nl.AutoResume(import_path="hf://meta-llama/Llama-2-7b-hf") - - -@factory(name=NAME, for_task="llm.pretrain") -def pretrain_recipe() -> Partial: - return Partial( - pretrain, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=adam_with_cosine_annealing, - ) - - -@factory(name=NAME, for_task="llm.finetune") -def finetune_recipe() -> Partial: - return Partial( - finetune, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=adam_with_cosine_annealing, - peft=gpt_lora, - resume=hf_resume, - ) diff --git 
a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py new file mode 100644 index 000000000000..4b99aef74a30 --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -0,0 +1,113 @@ +from typing import Callable, Optional + +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.llama import Llama3Config70B, LlamaModel +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback + +NAME = "llama3_70b" + + +def model() -> Config[pl.LightningModule]: + return Config(LlamaModel, config=Config(Llama3Config70B)) + + +def trainer( + tensor_parallelism: int, + pipeline_parallelism: int, + pipeline_parallelism_type: Optional[torch.dtype], + virtual_pipeline_parallelism: Optional[int], + context_parallelism: int, + sequence_parallelism: bool, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[Config[Callback]]] = None, +) -> Config[nl.Trainer]: + strategy = Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_include_optimizer=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), + ) + + trainer = Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + gradient_clip_val=1.0, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + return Partial( + fn, + model=model(), + trainer=trainer( + tensor_parallelism=4, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=2, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], + ), + data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +def hf_resume() -> Config[nl.AutoResume]: + return Config(nl.AutoResume, import_path="hf://meta-llama/Meta-Llama-3.1-70B") + + +def 
finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune + ) + recipe.resume = hf_resume() + recipe.peft = Config(LoRA) + recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 34ce418a0701..d70366f6c5ed 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -1,61 +1,113 @@ +from typing import Callable, Optional + import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel -from nemo.collections.llm.peft.api import gpt_lora -from nemo.collections.llm.recipes.log.default import default_log -from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing -from nemo.collections.llm.utils import Partial, factory +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b" -@factory(name=NAME) -def model() -> pl.LightningModule: - return LlamaModel(Llama3Config8B(seq_length=16384)) +def model() -> Config[pl.LightningModule]: + return Config(LlamaModel, config=Config(Llama3Config8B)) -@factory(name=NAME) -def trainer(devices=8) -> nl.Trainer: - strategy = nl.MegatronStrategy(tensor_model_parallel_size=2) +def trainer( + tensor_parallelism: int, + pipeline_parallelism: int, + pipeline_parallelism_type: Optional[torch.dtype], + virtual_pipeline_parallelism: Optional[int], + context_parallelism: int, + sequence_parallelism: bool, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[Config[Callback]]] = None, +) -> Config[nl.Trainer]: + strategy = Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_include_optimizer=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), + ) - return nl.Trainer( - devices=devices, - max_steps=100, + trainer = Config( + nl.Trainer, accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + gradient_clip_val=1.0, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), 
strategy=strategy, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + use_distributed_sampler=False, + val_check_interval=2000, ) - -@factory(name=NAME + "_hf") -def hf_resume() -> nl.AutoResume: - return nl.AutoResume(import_path="hf://meta-llama/Meta-Llama-3-8B") + return trainer -@factory(name=NAME, for_task="llm.pretrain") -def pretrain_recipe() -> Partial: +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: return Partial( - pretrain, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=adam_with_cosine_annealing, + fn, + model=model(), + trainer=trainer( + tensor_parallelism=1, + pipeline_parallelism=1, + pipeline_parallelism_type=None, + virtual_pipeline_parallelism=None, + context_parallelism=2, + sequence_parallelism=False, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], + ), + data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), ) -@factory(name=NAME, for_task="llm.finetune") -def finetune_recipe() -> Partial: - return Partial( - finetune, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=adam_with_cosine_annealing, - peft=gpt_lora, - resume=hf_resume, +def hf_resume() -> Config[nl.AutoResume]: + return Config(nl.AutoResume, import_path="hf://meta-llama/Meta-Llama-3.1-8B") + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune ) + recipe.resume = hf_resume() + recipe.peft = Config(LoRA) + recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py index 3a590f26894e..8bb2b636eba0 100644 --- a/nemo/collections/llm/recipes/llama3_8b_16k.py +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -1,45 +1,59 @@ -import pytorch_lightning as pl +from typing import Callable + +import torch -from nemo import lightning as nl from nemo.collections.llm.api import pretrain -from nemo.collections.llm.gpt.data.api import squad -from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel -from nemo.collections.llm.recipes.log.default import default_log -from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing -from nemo.collections.llm.utils import Partial, factory +from nemo.collections.llm.recipes import llama3_8b +from nemo.collections.llm.utils import Partial NAME = "llama3_8b_16k" -@factory(name=NAME) -def model() -> pl.LightningModule: - return LlamaModel(Llama3Config8B(seq_length=16384)) +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + recipe = llama3_8b.pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn + ) + + trainer = llama3_8b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=2, + sequence_parallelism=True, + num_nodes=num_nodes, + 
num_gpus_per_node=num_gpus_per_node, + ) + model = llama3_8b.model() + model.config.seq_length = 16384 + recipe.model = model + recipe.trainer = trainer -@factory(name=NAME) -def trainer(devices=8) -> nl.Trainer: - strategy = nl.MegatronStrategy( - tensor_model_parallel_size=4, - context_parallel_size=2, - sequence_parallel=True, + return trainer + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = llama3_8b.finetune_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node ) - return nl.Trainer( - devices=devices, - max_steps=100, - accelerator="gpu", - strategy=strategy, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + trainer = llama3_8b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=2, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, ) + model = llama3_8b.model() + model.config.seq_length = 16384 + recipe.model = model + recipe.trainer = trainer -@factory(name=NAME, for_task="llm.pretrain") -def pretrain_recipe() -> Partial: - return Partial( - pretrain, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=adam_with_cosine_annealing, - ) + return trainer diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py index c826feb28901..b42e1e53399e 100644 --- a/nemo/collections/llm/recipes/llama3_8b_64k.py +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -1,45 +1,59 @@ -import pytorch_lightning as pl +from typing import Callable + +import torch -from nemo import lightning as nl from nemo.collections.llm.api import pretrain -from nemo.collections.llm.gpt.data.api import squad -from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel -from nemo.collections.llm.recipes.log.default import default_log -from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing -from nemo.collections.llm.utils import Partial, factory +from nemo.collections.llm.recipes import llama3_8b +from nemo.collections.llm.utils import Partial NAME = "llama3_8b_64k" -@factory(name=NAME) -def model() -> pl.LightningModule: - return LlamaModel(Llama3Config8B(seq_length=65536)) +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + recipe = llama3_8b.pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn + ) + + trainer = llama3_8b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=4, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = llama3_8b.model() + model.config.seq_length = 65536 + recipe.model = model + recipe.trainer = trainer -@factory(name=NAME) -def trainer(devices=8) -> nl.Trainer: - strategy = nl.MegatronStrategy( - tensor_model_parallel_size=8, - context_parallel_size=4, - sequence_parallel=True, + return trainer + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = llama3_8b.finetune_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node ) - return nl.Trainer( - devices=devices, - max_steps=100, - accelerator="gpu", - strategy=strategy, - 
plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + trainer = llama3_8b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=4, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, ) + model = llama3_8b.model() + model.config.seq_length = 65536 + recipe.model = model + recipe.trainer = trainer -@factory(name=NAME, for_task="llm.pretrain") -def pretrain_recipe() -> Partial: - return Partial( - pretrain, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=adam_with_cosine_annealing, - ) + return trainer diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py index a40e141bfa95..dc18565a0e06 100644 --- a/nemo/collections/llm/recipes/log/default.py +++ b/nemo/collections/llm/recipes/log/default.py @@ -1,15 +1,52 @@ +from typing import Optional + +from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger + from nemo import lightning as nl -from nemo.collections.llm.utils import factory +from nemo.collections.llm.utils import Config + +def tensorboard_logger(name: str, save_dir: str = "tb_logs") -> Config[TensorBoardLogger]: + return Config(TensorBoardLogger, save_dir=save_dir, name=name) + + +def wandb_logger(project: str, name: str) -> Config[WandbLogger]: + return Config( + WandbLogger, + project=project, + name=name, + config={}, + ) -@factory -def default_log() -> nl.NeMoLogger: - ckpt = nl.ModelCheckpoint( - save_best_model=True, + +def default_log( + ckpt_dir: str, + name: str, + tensorboard_logger: Optional[Config[TensorBoardLogger]] = None, + wandb_logger: Optional[Config[WandbLogger]] = None, +) -> Config[nl.NeMoLogger]: + ckpt = Config( + nl.ModelCheckpoint, + save_best_model=False, save_last=True, - monitor="reduced_train_loss", - save_top_k=2, - save_on_train_epoch_end=True, + save_top_k=10, + every_n_train_steps=200, + filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}", + ) + + return Config( + nl.NeMoLogger, + ckpt=ckpt, + name=name, + tensorboard=tensorboard_logger, + wandb=wandb_logger, + dir=ckpt_dir, ) - return nl.NeMoLogger(ckpt=ckpt) + +def default_resume() -> Config[nl.AutoResume]: + return Config( + nl.AutoResume, + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + ) diff --git a/nemo/collections/llm/recipes/mistral.py b/nemo/collections/llm/recipes/mistral.py index 12af8d5d18ff..061e82c9d9d2 100644 --- a/nemo/collections/llm/recipes/mistral.py +++ b/nemo/collections/llm/recipes/mistral.py @@ -6,7 +6,7 @@ from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel from nemo.collections.llm.peft.api import gpt_lora from nemo.collections.llm.recipes.log.default import default_log -from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.utils import Partial, factory NAME = "mistral" @@ -43,7 +43,7 @@ def pretrain_recipe() -> Partial: trainer=trainer, data=squad, log=default_log, - optim=adam_with_cosine_annealing, + optim=distributed_fused_adam_with_cosine_annealing(), ) @@ -55,7 +55,7 @@ def finetune_recipe() -> Partial: trainer=trainer, data=squad, log=default_log, - optim=adam_with_cosine_annealing, + optim=distributed_fused_adam_with_cosine_annealing(), peft=gpt_lora, resume=hf_resume, ) diff --git a/nemo/collections/llm/recipes/mixtral_8x22b_4k.py 
b/nemo/collections/llm/recipes/mixtral_8x22b_4k.py index 4385e5a54827..5a29cca38506 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b_4k.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b_4k.py @@ -6,7 +6,7 @@ from nemo.collections.llm.gpt.model.llama import MixtralConfig8x22B, MixtralModel from nemo.collections.llm.peft.api import gpt_lora from nemo.collections.llm.recipes.log.default import default_log -from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.utils import Partial, factory NAME = "mixtral_8x22b_4k" @@ -46,7 +46,7 @@ def pretrain_recipe() -> Partial: trainer=trainer, data=squad, log=default_log, - optim=adam_with_cosine_annealing, + optim=distributed_fused_adam_with_cosine_annealing(), ) @@ -58,7 +58,7 @@ def finetune_recipe() -> Partial: trainer=trainer, data=squad, log=default_log, - optim=adam_with_cosine_annealing, + optim=distributed_fused_adam_with_cosine_annealing(), peft=gpt_lora, resume=hf_resume, ) diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_4k.py b/nemo/collections/llm/recipes/mixtral_8x7b_4k.py index d7543e51812e..5afa3cd072f6 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_4k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_4k.py @@ -6,7 +6,7 @@ from nemo.collections.llm.gpt.model.llama import MixtralConfig8x7B, MixtralModel from nemo.collections.llm.peft.api import gpt_lora from nemo.collections.llm.recipes.log.default import default_log -from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.utils import Partial, factory NAME = "mixtral_8x7b_4k" @@ -46,7 +46,7 @@ def pretrain_recipe() -> Partial: trainer=trainer, data=squad, log=default_log, - optim=adam_with_cosine_annealing, + optim=distributed_fused_adam_with_cosine_annealing(), ) @@ -58,7 +58,7 @@ def finetune_recipe() -> Partial: trainer=trainer, data=squad, log=default_log, - optim=adam_with_cosine_annealing, + optim=distributed_fused_adam_with_cosine_annealing(), peft=gpt_lora, resume=hf_resume, ) diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py index 4229001b2130..d46f7d5d36d6 100644 --- a/nemo/collections/llm/recipes/optim/adam.py +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -1,16 +1,33 @@ from megatron.core.optimizer import OptimizerConfig -from nemo import lightning as nl -from nemo.collections.llm.utils import factory +from nemo.collections.llm.utils import Config +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule, OptimizerModule -@factory -def adam_with_cosine_annealing() -> nl.OptimizerModule: - return nl.MegatronOptimizerModule( - config=OptimizerConfig(optimizer="adam", lr=0.001, use_distributed_optimizer=True), - lr_scheduler=nl.lr_scheduler.CosineAnnealingScheduler(), +def distributed_fused_adam_with_cosine_annealing(max_lr: float = 1e-4) -> Config[OptimizerModule]: + opt_cfg = Config( + OptimizerConfig, + optimizer="adam", + lr=max_lr, + weight_decay=0.1, + bf16=True, + adam_beta1=0.9, + adam_beta2=0.95, + adam_eps=1e-5, + use_distributed_optimizer=True, + overlap_grad_reduce=True, + overlap_param_gather=True, ) + sched = Config( + CosineAnnealingScheduler, + warmup_steps=2000, + constant_steps=0, + min_lr=0.1 * max_lr, + ) -# TODO: Fix the name-arg inside the 
factory-function so we don't need to do this -with_cosine_annealing = adam_with_cosine_annealing + return Config( + MegatronOptimizerModule, + config=opt_cfg, + lr_scheduler=sched, + ) diff --git a/nemo/collections/llm/utils.py b/nemo/collections/llm/utils.py index 26b511fcb26d..5ff01a9b0a86 100644 --- a/nemo/collections/llm/utils.py +++ b/nemo/collections/llm/utils.py @@ -1,6 +1,7 @@ +import logging from typing import Any, Callable, Generic, TypeVar, Union, overload -T = TypeVar('T', bound=Callable[..., Any]) +T = TypeVar("T", bound=Callable[..., Any]) try: import nemo_run as run @@ -8,7 +9,11 @@ Config = run.Config Partial = run.Partial except ImportError: - _T = TypeVar('_T') + logging.warning( + "Trying to use Config or Partial, but NeMo-Run is not installed. Please install NeMo-Run before proceeding." + ) + + _T = TypeVar("_T") class Config(Generic[_T]): pass @@ -22,7 +27,7 @@ def task(*args: Any, **kwargs: Any) -> Callable[[T], T]: import nemo_run as run return run.task(*args, **kwargs) - except ImportError: + except (ImportError, AttributeError): # Return a no-op function def noop_decorator(func: T) -> T: return func @@ -47,7 +52,7 @@ def factory(*args: Any, **kwargs: Any) -> Union[Callable[[T], T], T]: else: # Used as @factory(*args, **kwargs) return run.factory(*args, **kwargs) - except ImportError: + except (ImportError, AttributeError): # Return a no-op function def noop_decorator(func: T) -> T: return func From 509e140319af98054e0b714060dc129174b45bad Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Fri, 16 Aug 2024 15:08:31 -0700 Subject: [PATCH 007/664] NeMo's Mixtral-8x3B config (#9993) * rebase to main Signed-off-by: Alexandros Koumparoulis * refactor MixtralConfig8x7B to base from MixtralConfig Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Apply isort and black reformatting Signed-off-by: akoumpa * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/collections/llm/__init__.py | 2 + nemo/collections/llm/gpt/model/__init__.py | 8 ++- nemo/collections/llm/gpt/model/mixtral.py | 60 +++++++++++++--------- 3 files changed, 45 insertions(+), 25 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index ac3ad56ff28e..3ccecce96044 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -44,6 +44,7 @@ MaskedTokenLossReduction, MistralConfig7B, MistralModel, + MixtralConfig8x3B, MixtralConfig8x7B, MixtralConfig8x22B, MixtralModel, @@ -61,6 +62,7 @@ "MaskedTokenLossReduction", "MistralConfig7B", "MistralModel", + "MixtralConfig8x3B", "MixtralConfig8x7B", "MixtralConfig8x22B", "MixtralModel", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 12cfbd87ac09..e2d940e02d32 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -31,13 +31,19 @@ LlamaModel, ) from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x7B, MixtralConfig8x22B, MixtralModel +from nemo.collections.llm.gpt.model.mixtral import ( + MixtralConfig8x3B, + MixtralConfig8x7B, + MixtralConfig8x22B, + MixtralModel, +) __all__ = [ "GPTConfig", "GPTModel", "MistralConfig7B", "MistralModel", + "MixtralConfig8x3B", "MixtralConfig8x7B", 
"MixtralModel", "LlamaConfig", diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index 35005f95614c..7100b62c2aa6 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -18,10 +18,9 @@ @dataclass -class MixtralConfig8x7B(GPTConfig): +class MixtralConfig(GPTConfig): """ - Config for Mixtral-8x7B model - Official announcement: https://mistral.ai/news/mixtral-of-experts/ + Base config for Mixtral models. """ normalization: str = "RMSNorm" @@ -36,8 +35,8 @@ class MixtralConfig8x7B(GPTConfig): num_attention_heads: int = 32 num_query_groups: int = 8 ffn_hidden_size: int = 14336 - max_position_embeddings: int = 4096 # 32768 - seq_length: int = 4096 # 32768 + max_position_embeddings: int = 4096 + seq_length: int = 4096 attention_dropout: float = 0.0 hidden_dropout: float = 0.0 share_embeddings_and_output_weights: bool = False @@ -57,38 +56,51 @@ class MixtralConfig8x7B(GPTConfig): @dataclass -class MixtralConfig8x22B(GPTConfig): +class MixtralConfig8x3B(MixtralConfig): + """ + NeMo's Mixtral-8x3B model variant + https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/main/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml + """ + + num_layers: int = 32 + hidden_size: int = 2560 + num_attention_heads: int = 32 + ffn_hidden_size: int = 8960 + max_position_embeddings: int = 4096 + seq_length: int = 4096 + + +@dataclass +class MixtralConfig8x7B(MixtralConfig): """ Config for Mixtral-8x7B model - Official announcement: https://mistral.ai/news/mixtral-8x22b/ + Official announcement: https://mistral.ai/news/mixtral-of-experts/ """ - normalization: str = "RMSNorm" - activation_func: Callable = F.silu - position_embedding_type: str = "rope" - add_bias_linear: bool = False - gated_linear_unit: bool = True - apply_query_key_layer_scaling: bool = False + num_layers: int = 32 + hidden_size: int = 4096 + ffn_hidden_size: int = 14336 + max_position_embeddings: int = 4096 + seq_length: int = 4096 + + +@dataclass +class MixtralConfig8x22B(MixtralConfig): + """ + Config for Mixtral-8x7B model + Official announcement: https://mistral.ai/news/mixtral-8x22b/ + """ num_layers: int = 56 hidden_size: int = 6144 num_attention_heads: int = 48 - num_query_groups: int = 8 ffn_hidden_size: int = 16384 - max_position_embeddings: int = 65536 - seq_length: int = 4096 # 65536 + max_position_embeddings: int = 4096 + seq_length: int = 4096 # MoE num_moe_experts: int = 8 moe_router_topk: int = 2 - init_method_std: float = 0.02 - layernorm_epsilon: float = 1e-5 - # rotary - rotary_percent: float = 1.0 - rotary_base: float = 1000000.0 - bf16: bool = True - params_dtype: torch.dtype = torch.bfloat16 - class MixtralModel(GPTModel): def __init__( From a8c95dce04020d3afb71a002925474698e9da765 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 16 Aug 2024 17:49:06 -0700 Subject: [PATCH 008/664] Add deploy and REST API support to NeMo 2.0 (#9834) (#10002) * Add deploy method to nemo 2.0 * Add openai_format_response arg and store_args_to_json method * Change default Triton port in deploy task NeMo 2.0 * Add logging statements and minor fixes * Apply isort and black reformatting * Add import guard for deploy in NeMo 2.0 * Add line end of file * Add additional import guards * Add import guard for TRTLLM only * Add trt_llm_supported varibale * Add import guard for uvicorn * Apply isort and black reformatting * Remove import uvicorn outside the try except block --------- 
Signed-off-by: Abhishree Signed-off-by: athitten Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: athitten --- nemo/collections/llm/__init__.py | 11 ++ nemo/collections/llm/api.py | 167 ++++++++++++++++++++++++++++ scripts/deploy/nlp/deploy_triton.py | 4 +- 3 files changed, 180 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 3ccecce96044..7b2b38e50bc3 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -52,6 +52,13 @@ gpt_forward_step, ) from nemo.collections.llm.recipes import * # noqa +from nemo.utils import logging + +try: + from nemo.collections.llm.api import deploy +except ImportError as error: + deploy = None + logging.warning(f"The deploy module could not be imported: {error}") __all__ = [ "MockDataModule", @@ -106,3 +113,7 @@ "dolly", "peft", ] + +# add 'deploy' to __all__ if it was successfully imported +if deploy is not None: + __all__.append("deploy") diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 56da9e5496b2..46d94d26b03b 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -1,3 +1,5 @@ +import json +import os from copy import deepcopy from pathlib import Path from typing import Any, Callable, Optional, Union @@ -6,10 +8,24 @@ from typing_extensions import Annotated from nemo.collections.llm.utils import Config, task +from nemo.deploy import DeployPyTriton from nemo.lightning import AutoResume, NeMoLogger, OptimizerModule, Trainer, io from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform from nemo.utils import logging +trt_llm_supported = True +try: + from nemo.export.tensorrt_llm import TensorRTLLM +except ImportError as error: + logging.warning(f"TensorRTLLM could not be imported from nemo.export: {error}") + trt_llm_supported = False + +uvicorn_supported = True +try: + import uvicorn +except ImportError as error: + logging.warning(f"uvicorn could not be imported: {error}") + uvicorn_supported = False TokenizerType = Any @@ -225,6 +241,157 @@ def validate( return app_state.exp_dir +def get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, +): + if triton_model_repository is None: + trt_llm_path = "/tmp/trt_llm_model_dir/" + Path(trt_llm_path).mkdir(parents=True, exist_ok=True) + else: + trt_llm_path = triton_model_repository + + if nemo_checkpoint is None and triton_model_repository is None: + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine." + ) + + if nemo_checkpoint is None and not os.path.isdir(triton_model_repository): + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine." 
+ ) + + if nemo_checkpoint is not None and model_type is None: + raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") + + if not trt_llm_supported: + raise ValueError("TensorRT-LLM engine is not supported in this environment.") + trt_llm_exporter = TensorRTLLM( + model_dir=trt_llm_path, + load_model=(nemo_checkpoint is None), + ) + + if nemo_checkpoint is not None: + try: + logging.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") + trt_llm_exporter.export( + nemo_checkpoint_path=nemo_checkpoint, + model_type=model_type, + n_gpus=num_gpus, + tensor_parallelism_size=tensor_parallelism_size, + pipeline_parallelism_size=pipeline_parallelism_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + dtype=dtype, + ) + except Exception as error: + raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) + + return trt_llm_exporter + + +def store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response): + args_dict = { + "triton_service_ip": triton_http_address, + "triton_service_port": triton_port, + "triton_request_timeout": triton_request_timeout, + "openai_format_response": openai_format_response, + } + with open("nemo/deploy/service/config.json", "w") as f: + json.dump(args_dict, f) + + +@task(namespace="llm") +def deploy( + nemo_checkpoint: Path = None, + model_type: str = "llama", + triton_model_name: str = "xxx", + triton_model_version: Optional[int] = 1, + triton_port: int = 8080, + triton_http_address: str = "0.0.0.0", + triton_request_timeout: int = 60, + triton_model_repository: Path = None, + num_gpus: int = 1, + tensor_parallelism_size: int = 1, + pipeline_parallelism_size: int = 1, + dtype: str = "bfloat16", + max_input_len: int = 256, + max_output_len: int = 256, + max_batch_size: int = 8, + start_rest_service: bool = False, + rest_service_http_address: str = "0.0.0.0", + rest_service_port: int = 8000, + openai_format_response: bool = False, +): + if start_rest_service: + if triton_port == rest_service_port: + logging.error("REST service port and Triton server port cannot use the same port.") + return + # Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py + store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) + + triton_deployable = get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, + ) + + try: + nm = DeployPyTriton( + model=triton_deployable, + triton_model_name=triton_model_name, + triton_model_version=triton_model_version, + max_batch_size=max_batch_size, + port=triton_port, + address=triton_http_address, + ) + + logging.info("Triton deploy function will be called.") + nm.deploy() + except Exception as error: + logging.error("Error message has occurred during deploy function. 
Error message: " + str(error)) + return + + try: + logging.info("Model serving on Triton is will be started.") + if start_rest_service and uvicorn_supported: + try: + logging.info("REST service will be started.") + uvicorn.run( + 'nemo.deploy.service.rest_model_api:app', + host=rest_service_http_address, + port=rest_service_port, + reload=True, + ) + except Exception as error: + logging.error("Error message has occurred during REST service start. Error message: " + str(error)) + nm.serve() + except Exception as error: + logging.error("Error message has occurred during deploy function. Error message: " + str(error)) + return + + logging.info("Model serving will be stopped.") + nm.stop() + + @task(name="import", namespace="llm") def import_ckpt( model: pl.LightningModule, diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 01be9ff63a0d..c0acd97e1b50 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -185,8 +185,8 @@ def get_args(argv): parser.add_argument( "-srs", "--start_rest_service", - default="False", - type=str, + default=False, + type=bool, help="Starts the REST service for OpenAI API support", ) parser.add_argument( From 5e91738de7cb0f6ac61a2685a53cdb67e648a52b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 16 Aug 2024 17:49:36 -0700 Subject: [PATCH 009/664] Disable nvFuser setup with PyTorch 23.11 and later (#9837) (#9870) * Disable nvFuser setup with PyTorch 23.11 and later * Apply isort and black reformatting --------- Signed-off-by: Abhishree Signed-off-by: athitten Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: athitten --- .../nlp/models/language_modeling/megatron_base_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 7042b0d35ad9..788e9bd059f6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -408,7 +408,9 @@ def is_official_release_version(nvidia_torch_version): self.cfg.persist_layer_norm = False # NVFUSER available starting with 21.11 - if NVIDIA_TORCH_MAJOR >= 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR >= 11): + if (NVIDIA_TORCH_MAJOR >= 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR >= 11)) and ( + NVIDIA_TORCH_MAJOR < 23 or (NVIDIA_TORCH_MAJOR == 23 and NVIDIA_TORCH_MINOR < 11) + ): # NVFUSER torch._C._jit_set_profiling_executor(True) From 96b5bc9c5e228364f534ecebc5bfc602934864e1 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Sat, 17 Aug 2024 19:44:01 -0700 Subject: [PATCH 010/664] fix a version bug in nemologger (#10090) Signed-off-by: ashors1 --- nemo/lightning/nemo_logger.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index 1df40cf659ae..5ba2c39f9cff 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -102,14 +102,15 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], resume_if_exists: bool = self.name = "default" version = self.version or os.environ.get(NEMO_ENV_VARNAME_VERSION, None) - if is_global_rank_zero(): - if self.use_datetime_version: - version = 
time.strftime('%Y-%m-%d_%H-%M-%S') - if resume_if_exists: - logging.warning( - "No version folders would be created under the log folder as 'resume_if_exists' is enabled." - ) - version = None + if not version: + if resume_if_exists: + logging.warning( + "No version folders would be created under the log folder as 'resume_if_exists' is enabled." + ) + version = None + elif is_global_rank_zero(): + if self.use_datetime_version: + version = time.strftime('%Y-%m-%d_%H-%M-%S') if version: if is_global_rank_zero(): os.environ[NEMO_ENV_VARNAME_VERSION] = version From ff7778921bc92e8ed3839a5d3c05326e795afcc5 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 19 Aug 2024 14:29:03 +0200 Subject: [PATCH 011/664] Remove max_num_tokens heuristic from test scripts (#10074) Signed-off-by: Jan Lasek --- nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py | 2 +- tests/deploy/nemo_deploy.py | 10 ++++++++-- tests/export/nemo_export.py | 10 ++++++++-- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py index 2a8a9a91e46d..921c6535a57a 100644 --- a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py +++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py @@ -105,7 +105,7 @@ def qnemo_to_tensorrt_llm( if max_seq_len: build_cmd += f"--max_seq_len {max_seq_len} " - if max_num_tokens: + if max_num_tokens is not None: build_cmd += f"--max_num_tokens {max_num_tokens} " else: build_cmd += f"--max_num_tokens {max_batch_size * max_input_len} " diff --git a/tests/deploy/nemo_deploy.py b/tests/deploy/nemo_deploy.py index 5193fe951138..88efb2374555 100644 --- a/tests/deploy/nemo_deploy.py +++ b/tests/deploy/nemo_deploy.py @@ -164,6 +164,7 @@ def run_trt_llm_inference( use_embedding_sharing=False, max_input_len=128, max_output_len=128, + max_num_tokens=None, ptuning=False, p_tuning_checkpoint=None, lora=False, @@ -249,7 +250,7 @@ def run_trt_llm_inference( max_prompt_embedding_table_size=max_prompt_embedding_table_size, use_lora_plugin=use_lora_plugin, lora_target_modules=lora_target_modules, - max_num_tokens=int(max_input_len * max_batch_size * 0.2), + max_num_tokens=max_num_tokens, opt_num_tokens=60, use_embedding_sharing=use_embedding_sharing, ) @@ -424,6 +425,7 @@ def run_existing_checkpoints( use_embedding_sharing=use_embedding_sharing, max_input_len=512, max_output_len=model_info["max_output_len"], + max_num_tokens=None, ptuning=ptuning, p_tuning_checkpoint=p_tuning_checkpoint, lora=lora, @@ -448,7 +450,6 @@ def get_args(): formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=f"Deploy nemo models to Triton and benchmark the models", ) - parser.add_argument( "--model_name", type=str, @@ -499,6 +500,10 @@ def get_args(): type=int, default=128, ) + parser.add_argument( + "--max_num_tokens", + type=int, + ) parser.add_argument( "--p_tuning_checkpoint", type=str, @@ -646,6 +651,7 @@ def run_inference_tests(args): max_batch_size=args.max_batch_size, max_input_len=args.max_input_len, max_output_len=args.max_output_len, + max_num_tokens=args.max_num_tokens, ptuning=args.ptuning, p_tuning_checkpoint=args.p_tuning_checkpoint, lora=args.lora, diff --git a/tests/export/nemo_export.py b/tests/export/nemo_export.py index 6a296fdb92eb..557d6c07613d 100644 --- a/tests/export/nemo_export.py +++ b/tests/export/nemo_export.py @@ -223,6 +223,7 @@ def run_inference( use_embedding_sharing=False, max_input_len=128, max_output_len=128, + max_num_tokens=None, use_parallel_embedding=False, ptuning=False, 
p_tuning_checkpoint=None, @@ -322,7 +323,7 @@ def run_inference( max_prompt_embedding_table_size=max_prompt_embedding_table_size, use_lora_plugin=use_lora_plugin, lora_target_modules=lora_target_modules, - max_num_tokens=int(max_input_len * max_batch_size * 0.2), + max_num_tokens=max_num_tokens, use_embedding_sharing=use_embedding_sharing, ) @@ -511,6 +512,7 @@ def run_existing_checkpoints( use_parallel_embedding=use_parallel_embedding, max_input_len=512, max_output_len=model_info["max_output_len"], + max_num_tokens=None, ptuning=ptuning, p_tuning_checkpoint=p_tuning_checkpoint, lora=lora, @@ -596,7 +598,6 @@ def get_args(): formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=f"Deploy nemo models to Triton and benchmark the models", ) - parser.add_argument( "--model_name", type=str, @@ -652,6 +653,10 @@ def get_args(): type=int, default=128, ) + parser.add_argument( + "--max_num_tokens", + type=int, + ) parser.add_argument( "--use_parallel_embedding", type=str, @@ -856,6 +861,7 @@ def run_inference_tests(args): max_batch_size=args.max_batch_size, max_input_len=args.max_input_len, max_output_len=args.max_output_len, + max_num_tokens=args.max_num_tokens, use_parallel_embedding=args.use_parallel_embedding, ptuning=args.ptuning, p_tuning_checkpoint=args.p_tuning_checkpoint, From 0f492216c057aad6d5db6ebc80c06e17c33b4143 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Mon, 19 Aug 2024 15:29:16 +0300 Subject: [PATCH 012/664] Add missing imports for torch dist ckpt in export (#10071) * fix minor import bug Signed-off-by: Onur Yilmaz * torch distributed file read Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz Co-authored-by: Jan Lasek --- nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py index 479d93498475..1b711b5edbf3 100644 --- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py +++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py @@ -25,7 +25,9 @@ import torch import yaml import zarr -from torch.distributed.checkpoint import FileSystemReader +from tensorrt_llm._utils import np_bfloat16 +from torch.distributed.checkpoint import FileSystemReader, TensorStorageMetadata +from torch.distributed.checkpoint.state_dict_loader import load_state_dict from transformers import AutoTokenizer, PreTrainedTokenizer from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer @@ -56,9 +58,11 @@ class TarFileSystemReader(FileSystemReader): """ def __init__(self, path: Union[Path, TarPath]) -> None: - """No call to super().__init__ because it expects pure Path.""" - self.path = path - self.storage_data = dict() + """Makes sure that super().__init__ gets a pure path as expected.""" + super_path = str(path) if isinstance(path, TarPath) else path + super().__init__(super_path) + if isinstance(path, TarPath): + self.path = path # overwrites path set in super().__init__ call def load_sharded_metadata_torch_dist(checkpoint_dir: Union[Path, TarPath], torch_tensor=True): From 72a39f468663879d123e501a4adb7afdd99d6692 Mon Sep 17 00:00:00 2001 From: Asha Anoosheh Date: Mon, 19 Aug 2024 14:56:17 +0200 Subject: [PATCH 013/664] Minor formatting fix (#10196) Signed-off-by: Asha Anoosheh --- docs/source/nlp/distillation.rst | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/source/nlp/distillation.rst 
b/docs/source/nlp/distillation.rst index 01e5d599cb8c..22b2f3dd8a1c 100644 --- a/docs/source/nlp/distillation.rst +++ b/docs/source/nlp/distillation.rst @@ -27,25 +27,25 @@ The script must be launched correctly with the number of processes equal to tens .. code-block:: bash -STUDENT_CKPT="path/to/student.nemo" # can also be None (will use default architecture found in examples/nlp/language_modeling/conf/megatron_llama_distill.yaml) -TEACHER_CKPT="path/to/teacher.nemo" -TOKENIZER="path/to/tokenizer.model" -DATA_PATHS="[1.0,path/to/documents]" -FINAL_SAVE_FILE="final_checkpoint.nemo" -TP=4 - -NPROC=$TP -launch_config="torchrun --nproc_per_node=$NPROC" - -${launch_config} examples/nlp/language_modeling/megatron_gpt_distillation.py \ - model.restore_from_path=$STUDENT_CKPT \ - model.kd_teacher_restore_from_path=$TEACHER_CKPT \ - model.tensor_model_parallel_size=${TP} \ - model.tokenizer.model=$TOKENIZER \ - model.data.data_prefix=$DATA_PATHS \ - model.nemo_path=$FINAL_SAVE_FILE \ - trainer.precision=bf16 \ - trainer.devices=$NPROC + STUDENT_CKPT="path/to/student.nemo" # can also be None (will use default architecture found in examples/nlp/language_modeling/conf/megatron_llama_distill.yaml) + TEACHER_CKPT="path/to/teacher.nemo" + TOKENIZER="path/to/tokenizer.model" + DATA_PATHS="[1.0,path/to/tokenized/data]" + FINAL_SAVE_FILE="final_checkpoint.nemo" + TP=4 + + NPROC=$TP + launch_config="torchrun --nproc_per_node=$NPROC" + + ${launch_config} examples/nlp/language_modeling/megatron_gpt_distillation.py \ + model.restore_from_path=$STUDENT_CKPT \ + model.kd_teacher_restore_from_path=$TEACHER_CKPT \ + model.tensor_model_parallel_size=$TP \ + model.tokenizer.model=$TOKENIZER \ + model.data.data_prefix=$DATA_PATHS \ + model.nemo_path=$FINAL_SAVE_FILE \ + trainer.precision=bf16 \ + trainer.devices=$NPROC For large models, the command can be used in multi-node setting. For example, this can be done with `NeMo Framework Launcher `_ using Slurm. 
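As a quick illustration of the requirement above (the launcher must start exactly one process per tensor-parallel rank), the same command also scales down for a small sanity run; this is only a sketch, with placeholder paths and an assumed TP=2 rather than values taken from the documentation:

.. code-block:: bash

    # Illustrative sketch: all paths are placeholders; TP is assumed to be 2 here.
    STUDENT_CKPT="path/to/student.nemo"
    TEACHER_CKPT="path/to/teacher.nemo"
    TOKENIZER="path/to/tokenizer.model"
    DATA_PATHS="[1.0,path/to/tokenized/data]"
    TP=2   # torchrun must launch exactly TP processes

    torchrun --nproc_per_node=$TP examples/nlp/language_modeling/megatron_gpt_distillation.py \
        model.restore_from_path=$STUDENT_CKPT \
        model.kd_teacher_restore_from_path=$TEACHER_CKPT \
        model.tensor_model_parallel_size=$TP \
        model.tokenizer.model=$TOKENIZER \
        model.data.data_prefix=$DATA_PATHS \
        trainer.precision=bf16 \
        trainer.devices=$TP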
From eb1c811ab6b607194233d11d9db0ec6a1408c7ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 19 Aug 2024 16:17:52 +0200 Subject: [PATCH 014/664] tests: Fix L2_Community_vita_Checkpoints_tests_Llama3 (#10197) Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index c77e135125c2..a4d65c5a4dc0 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -221,6 +221,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | + mkdir /home/TestData/multimodal/video_neva/llama3-ci-hf/${{ github.run_id }} export PYTHONPATH=/home/TestData/multimodal/video_neva/LLaVA:$PYTHONPATH CUDA_VISIBLE_DEVICES=0 python examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py \ --in-file /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/llm \ @@ -228,12 +229,11 @@ jobs: --mm-vision-tower /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/vision_tower \ --tokenizer-model /home/TestData/multimodal/video_neva/vita-tokenizer/ \ --config-file vita_config.yaml \ - --out-file=/home/TestData/multimodal/video_neva/llama3-ci-hf/llama3_ci.nemo \ + --out-file=/home/TestData/multimodal/video_neva/llama3-ci-hf/${{ github.run_id }}/llama3_ci.nemo \ --model-type VITA \ --conv-template llama_3 AFTER_SCRIPT: | - rm -f /home/TestData/multimodal/video_neva/llama3-ci-hf/llama3_ci.nemo - rm -rf /home/TestData/multimodal/video_neva/llama3-ci-hf/model_weights + rm -rf /home/TestData/multimodal/video_neva/llama3-ci-hf/${{ github.run_id }} # this test is using a 7B model which is too large for GitHub CI # replace the model in this test with a toy model or move the test From cbcc222ac0dc22457a5af7a86ada6726b8e25f3f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:41:13 -0600 Subject: [PATCH 015/664] fix (#9759) (#9781) Signed-off-by: Chen Cui Co-authored-by: Chen Cui --- scripts/nlp_language_modeling/preprocess_data_for_megatron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/nlp_language_modeling/preprocess_data_for_megatron.py b/scripts/nlp_language_modeling/preprocess_data_for_megatron.py index e1f89182279b..cde14d83ec4b 100644 --- a/scripts/nlp_language_modeling/preprocess_data_for_megatron.py +++ b/scripts/nlp_language_modeling/preprocess_data_for_megatron.py @@ -338,7 +338,7 @@ def main(): if json_file.endswith('.gz'): fin = gzip.open(json_file, 'r') else: - fin = open(args.input, 'r', encoding='utf-8') + fin = open(json_file, 'r', encoding='utf-8') encoded_docs = pool.imap(encoder.encode, fin, 25) From c73f677b8711affc34b13f36e75d3c9ea6e8dfb7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 19 Aug 2024 10:39:11 -0600 Subject: [PATCH 016/664] Gemma 2 (#9708) * Gemma 2 (#9672) * gemma2 initial commit Signed-off-by: Chen Cui * enable conversion on cpu Signed-off-by: Chen Cui * fix code scanning Signed-off-by: Chen Cui * typo in config Signed-off-by: Chen Cui * fix output layer and add comments Signed-off-by: Chen Cui * refactor model customize to one function Signed-off-by: Chen Cui * unpin transformers version Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx * typo Signed-off-by: Chen Cui * import in function to fix test 
Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: Chen Cui Co-authored-by: cuichenx Co-authored-by: Eric Harper --- .../conf/megatron_gemma2_config.yaml | 225 +++++++++++++ .../megatron/gemma2/__init__.py | 13 + .../megatron/gemma2/gemma2_modules.py | 280 ++++++++++++++++ .../megatron/gemma2/gemma2_spec.py | 51 +++ .../language_modeling/megatron_bert_model.py | 4 +- .../language_modeling/megatron_gpt_model.py | 25 +- .../convert_gemma2_hf_to_nemo.py | 311 ++++++++++++++++++ 7 files changed, 901 insertions(+), 8 deletions(-) create mode 100644 examples/nlp/language_modeling/conf/megatron_gemma2_config.yaml create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/gemma2/__init__.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_modules.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_spec.py create mode 100644 scripts/checkpoint_converters/convert_gemma2_hf_to_nemo.py diff --git a/examples/nlp/language_modeling/conf/megatron_gemma2_config.yaml b/examples/nlp/language_modeling/conf/megatron_gemma2_config.yaml new file mode 100644 index 000000000000..374223c20daf --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_gemma2_config.yaml @@ -0,0 +1,225 @@ +name: megatron_gemma2 +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: 32 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_llama + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + +model: + mcore_gpt: True + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 4 # limited by GPU memory + global_batch_size: 8 # will use more micro batches to reach global batch size + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + # model architecture + encoder_seq_length: 8192 + max_position_embeddings: ${.encoder_seq_length} + num_layers: 42 # 9b: 18 | 27b: 46 + hidden_size: 3584 # 9b: 
3584 | 27b: 4608 + ffn_hidden_size: 28672 # Transformer FFN hidden size. Usually 4 * hidden_size. | 9b: 28672 | 27b: 72728 + num_attention_heads: 16 # 9b: 16 | 27b: 32 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.0 # Dropout probability for hidden state transformer. + attention_dropout: 0.0 # Dropout probability for attention + ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. + kv_channels: 256 # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null | 9b: 256 | 27b: 128 + apply_embedding_scaling: True # scale sqrt(hidden_size) + apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. + normalization: 'rmsnorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' + layernorm_zero_centered_gamma: True + layernorm_epsilon: 1e-6 + do_layer_norm_weight_decay: False # True means weight decay on all params + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: False # Whether to use bias terms in all weight matrices. + activation: 'fast-geglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. + transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] + openai_gelu: True # Use OpenAI's GELU instead of the default GeLU + normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. + position_embedding_type: 'rope' # Position embedding type. Options ['learned_absolute', 'rope'] + rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. + attention_type: 'multihead' # Attention type. Options ['multihead'] + share_embeddings_and_output_weights: True # Share embedding and output layer weights. + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: 8 # Number of query groups for group query attention. If None, normal attention is used. | 9b: 8 | 27b: 16 + mcore_customization_config: + query_pre_attn_scalar: 224 # Custom scale factor (normally sqrt dim) in SDPA 9b: 224 | 27b: 144 + attn_logit_softcapping: 50.0 # Prevents attention outputs from growing excessively by scaling them to a fixed range + final_logit_softcapping: 30.0 # Prevents final logits from growing excessively by scaling them to a fixed range + + tokenizer: + library: 'sentencepiece' + type: null + model: ??? # /path/to/tokenizer.model + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. 
+ + # Mixed precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + + + # Miscellaneous + seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. 
The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Transformer Engine + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + + data: + # Path to data must be specified by the user. + # Supports List, String and Dictionary + # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} + # Or see example below: + # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" + # data_prefix: ??? 
+ index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: mmap + splits_string: 900,50,50 + seq_length: ${model.encoder_seq_length} + skip_warmup: True + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token + pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size + shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gemma2/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_modules.py b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_modules.py new file mode 100644 index 000000000000..fd2472c5fe49 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_modules.py @@ -0,0 +1,280 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +from typing import Callable, Optional + +import torch +from megatron.core import parallel_state, tensor_parallel +from megatron.core.fusions.fused_softmax import FusedScaleMaskSoftmax +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.tensor_parallel import ColumnParallelLinear +from megatron.core.transformer import MegatronModule, TransformerConfig +from megatron.core.transformer.custom_layers.transformer_engine import TENorm, TERowParallelLinear +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.utils import attention_mask_func +from megatron.core.utils import divide +from torch import Tensor + + +def get_swa(seq_q, seq_kv, w): + """Create the equivalent attention mask fro SWA in [seq_q, seq_kv] shape""" + m = torch.ones(seq_q, seq_kv, dtype=torch.bool, device="cuda") + mu = torch.triu(m, diagonal=seq_kv - seq_q - w[0]) + ml = torch.tril(mu, diagonal=seq_kv - seq_q + w[1]) + ml = ~ml + return ml + + +def logit_softcapping(logits: torch.Tensor, scale: Optional[float]): + """Prevents logits from growing excessively by scaling them to a fixed range""" + if not scale: + return logits + return scale * torch.tanh(logits / scale) + + +class Gemma2DotProductAttention(MegatronModule): + """ + Region where selective activation recomputation is applied. + This region is memory intensive but less compute intensive which + makes activation checkpointing more efficient for LLMs (20B+). + See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + + We use the following notation: + h: hidden size + n: number of attention heads + p: number of tensor model parallel partitions + b: batch size + s: sequence length + """ + + def __init__( + self, + config: TransformerConfig, + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + attention_dropout: float = None, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + + assert ( + self.config.context_parallel_size == 1 + ), "Context parallelism is only supported by TEDotProductAttention!" + + self.layer_number = max(1, layer_number) + + self.window_size = None + if self.layer_number % 2 == 0: + self.window_size = config.window_size + + self.attn_mask_type = attn_mask_type + self.attention_type = attention_type # unused for now + + projection_size = self.config.kv_channels * self.config.num_attention_heads + + # Per attention head and per partition values. + world_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_partition = divide(projection_size, world_size) + self.hidden_size_per_attention_head = divide(projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size) + self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size) + + coeff = None + self.norm_factor = math.sqrt(config.query_pre_attn_scalar) + + if self.config.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + + self.scale_mask_softmax = FusedScaleMaskSoftmax( + input_in_fp16=self.config.fp16, + input_in_bf16=self.config.bf16, + attn_mask_type=self.attn_mask_type, + scaled_masked_softmax_fusion=self.config.masked_softmax_fusion, + mask_func=attention_mask_func, + softmax_in_fp32=self.config.attention_softmax_in_fp32, + scale=coeff, + ) + + # Dropout. 
Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout( + self.config.attention_dropout if attention_dropout is None else attention_dropout + ) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Tensor, + attn_mask_type: AttnMaskType = None, + packed_seq_params: PackedSeqParams = None, + ): + assert packed_seq_params is None, ( + "Packed sequence is not supported by DotProductAttention." "Please use TEDotProductAttention instead." + ) + + # =================================== + # Raw attention scores. [b, n/p, s, s] + # =================================== + + # expand the key and value [sk, b, ng, hn] -> [sk, b, np, hn] + # This is a noop for normal attention where ng == np. When using group query attention this + # creates a view that has the keys and values virtually repeated along their dimension to + # match the number of queries. + + # attn_mask_type is not used. + if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1: + key = key.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 + ) + value = value.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 + ) + + # [b, np, sq, sk] + output_size = ( + query.size(1), + query.size(2), + query.size(0), + key.size(0), + ) + + # [sq, b, np, hn] -> [sq, b * np, hn] + # This will be a simple view when doing normal attention, but in group query attention + # the key and value tensors are repeated to match the queries so you can't use simple strides + # to extract the queries. + query = query.reshape(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key = key.view(output_size[3], output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor( + (output_size[0] * output_size[1], output_size[2], output_size[3]), + query.dtype, + "mpu", + ) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query.transpose(0, 1), # [b * np, sq, hn] + key.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + # Gemma 2 specific: + matmul_result = logit_softcapping(matmul_result, self.config.attn_logit_softcapping) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # sliding window attention + if attention_mask is not None and self.window_size is not None: + attention_mask = get_swa(query.size(0), key.size(0), self.window_size) + + # attention scores and attention mask [b, np, sq, sk] + attention_probs: Tensor = self.scale_mask_softmax(attention_scores, attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + + if not self.config.sequence_parallel: + with tensor_parallel.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. 
[sq, b, hp] + # ========================= + + # value -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = ( + value.size(1), + value.size(2), + query.size(0), + value.size(3), + ) + + # change view [sk, b * np, hn] + value = value.view(value.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + + # matmul: [b * np, sq, hn] + context = torch.bmm(attention_probs, value.transpose(0, 1)) + + # change view [b, np, sq, hn] + context = context.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context = context.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_shape = context.size()[:-2] + (self.hidden_size_per_partition,) + context = context.view(*new_context_shape) + return context + + +class TERowParallelLinearLayerNorm(TERowParallelLinear): + def __init__( + self, + input_size: int, + output_size: int, + *, + config: TransformerConfig, + init_method: Callable, + bias: bool, + input_is_parallel: bool, + skip_bias_add: bool, + is_expert: bool, + tp_comm_buffer_name: str = None, + ): + super().__init__( + input_size, + output_size, + config=config, + init_method=init_method, + bias=bias, + input_is_parallel=input_is_parallel, + skip_bias_add=skip_bias_add, + is_expert=is_expert, + tp_comm_buffer_name=tp_comm_buffer_name, + ) + self.post_layernorm = TENorm(config, output_size) + + def forward(self, x): + output, bias = super().forward(x) + return self.post_layernorm(output), bias + + +class Gemma2OutputLayer(ColumnParallelLinear): + def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): + output, bias = super().forward(input_, weight) + output = logit_softcapping(output, self.config.final_logit_softcapping) + return output, bias diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_spec.py new file mode 100644 index 000000000000..32b2535c1010 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_spec.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.transformer import ModuleSpec, TransformerLayer, TransformerLayerSubmodules +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.custom_layers.transformer_engine import TELayerNormColumnParallelLinear +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.mlp import MLP, MLPSubmodules + +from nemo.collections.nlp.models.language_modeling.megatron.gemma2.gemma2_modules import ( + Gemma2DotProductAttention, + TERowParallelLinearLayerNorm, +) + + +def get_gemma2_layer_spec(): + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=Gemma2DotProductAttention, # use unfused SDPA for attn logit softcapping + linear_proj=TERowParallelLinearLayerNorm, # post attn RMSNorm + ), + ), + self_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, + linear_fc2=TERowParallelLinearLayerNorm, # post mlp RMSNorm + ), + ), + mlp_bda=get_bias_dropout_add, + ), + ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index 093fb2b8d688..0eb5ea1c0048 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -1169,7 +1169,9 @@ def build_transformer_config(self) -> TransformerConfig: normalization = self.cfg.get('normalization', 'layernorm') - layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' + layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' or self.cfg.get( + "layernorm_zero_centered_gamma", False + ) if normalization == 'layernorm': normalization = 'LayerNorm' elif normalization == 'rmsnorm': diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 0198c07c82d2..49f749e4a40e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -154,6 +154,8 @@ def mcore_supports_moe() -> bool: ## TODO: This function will not work if TE is not installed def get_specs(spec_name, transformer_config=None, use_te=True, hyena_cfg: Dict = None): + from nemo.collections.nlp.models.language_modeling.megatron.gemma2.gemma2_spec import get_gemma2_layer_spec + # else cases for backwards compatibility with neva num_experts = transformer_config.num_moe_experts if transformer_config else None moe_grouped_gemm = transformer_config.moe_grouped_gemm if transformer_config else False @@ -167,6 +169,7 @@ def get_specs(spec_name, transformer_config=None, use_te=True, hyena_cfg: Dict = "": get_gpt_layer_local_spec(num_experts, moe_grouped_gemm), "te_gpt": get_gpt_layer_with_transformer_engine_spec(num_experts, moe_grouped_gemm), "megatron_falcon_gpt": get_falcon_layer_spec(), + "megatron_gemma2": get_gemma2_layer_spec(), "megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(transformer_config), "modelopt": 
get_gpt_layer_modelopt_spec(num_experts), "te_gpt_hyena": get_gpt_layer_with_te_and_hyena_spec(hyena_cfg), @@ -176,6 +179,17 @@ def get_specs(spec_name, transformer_config=None, use_te=True, hyena_cfg: Dict = return name_spec_dict[spec_name] +def mcore_model_customize(cfg, model): + if cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage(): + extend_instance(model.embedding, EmbeddingScalingMixin) + if cfg.get('scale_positional_embedding', False): + model.rotary_pos_emb.inv_freq = apply_rope_scaling(model.rotary_pos_emb.inv_freq) + if cfg.get("mcore_customization_config", {}).get("final_logit_softcapping", 0): + from nemo.collections.nlp.models.language_modeling.megatron.gemma2.gemma2_modules import Gemma2OutputLayer + + extend_instance(model.output_layer, Gemma2OutputLayer) + + class EmbeddingScalingMixin(torch.nn.Module): """ A mixin class for scaling embeddings in Megatron GPT. @@ -450,12 +464,7 @@ def model_provider_func(self, pre_process, post_process): seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), rotary_base=self.cfg.get('rotary_base', 10000), ) - - if self.cfg.get('scale_positional_embedding', False): - model.rotary_pos_emb.inv_freq = apply_rope_scaling(model.rotary_pos_emb.inv_freq) - - if self.cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage(): - extend_instance(model.embedding, EmbeddingScalingMixin) + mcore_model_customize(self.cfg, model) else: assert self.cfg.get('num_query_groups', None) is None or self.cfg.get( 'num_query_groups', None @@ -2081,7 +2090,9 @@ def build_transformer_config(self) -> TransformerConfig: ) normalization = self.cfg.get('normalization', 'layernorm').lower() - layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' + layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' or self.cfg.get( + "layernorm_zero_centered_gamma", False + ) if normalization == 'layernorm': normalization = 'LayerNorm' elif normalization == 'rmsnorm': diff --git a/scripts/checkpoint_converters/convert_gemma2_hf_to_nemo.py b/scripts/checkpoint_converters/convert_gemma2_hf_to_nemo.py new file mode 100644 index 000000000000..fb296cf25c68 --- /dev/null +++ b/scripts/checkpoint_converters/convert_gemma2_hf_to_nemo.py @@ -0,0 +1,311 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Requires HF transformers updated to v4.42 to support Gemma 2 Models + + huggingface-cli login + >>> from huggingface_hub import snapshot_download + >>> snapshot_download(repo_id="google/gemma-2-9b", local_dir="/path/to/gemma2/checkpoints/hf/9b") + + python3 /opt/NeMo/scripts/checkpoint_converters/convert_gemma2_hf_to_nemo.py \ + --input_name_or_path /path/to/gemma2/checkpoints/hf/9b \ + --output_path /path/to/gemma2-9b.nemo \ + --tokenizer_path /path/to/gemma2/checkpoints/hf/9b/tokenizer.model + [--cpu] + +If you encounter a torch.cuda.OutOfMemoryError, try converting on CPU with --cpu. +""" + +import os +from argparse import ArgumentParser + +import torch + +from megatron.core import parallel_state +from omegaconf import OmegaConf +from transformers import AutoModelForCausalLM, AutoTokenizer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils import logging + + +def create_rename_keys(num_hidden_layers): + rename_keys = [] + for i in range(num_hidden_layers): + # Attention layers + rename_keys.extend( + [ + ( + f"model.layers.{i}.self_attn.o_proj.weight", + f"model.decoder.layers.{i}.self_attention.linear_proj.weight", + ), + ( + f"model.layers.{i}.self_attn.q_proj.weight", + f"model.decoder.layers.{i}.self_attention.linear_q.weight", + ), + ( + f"model.layers.{i}.self_attn.k_proj.weight", + f"model.decoder.layers.{i}.self_attention.linear_k.weight", + ), + ( + f"model.layers.{i}.self_attn.v_proj.weight", + f"model.decoder.layers.{i}.self_attention.linear_v.weight", + ), + # MLP and LayerNorm + (f"model.layers.{i}.mlp.gate_proj.weight", f"model.decoder.layers.{i}.mlp.linear_fc1_gate.weight"), + (f"model.layers.{i}.mlp.up_proj.weight", f"model.decoder.layers.{i}.mlp.linear_fc1_proj.weight"), + (f"model.layers.{i}.mlp.down_proj.weight", f"model.decoder.layers.{i}.mlp.linear_fc2.weight"), + ( + f"model.layers.{i}.input_layernorm.weight", + f"model.decoder.layers.{i}.self_attention.linear_qkv.layer_norm_weight", + ), + ( + f"model.layers.{i}.pre_feedforward_layernorm.weight", + f"model.decoder.layers.{i}.mlp.linear_fc1.layer_norm_weight", + ), + ( + f"model.layers.{i}.post_attention_layernorm.weight", + f"model.decoder.layers.{i}.self_attention.linear_proj.post_layernorm.weight", + ), + ( + f"model.layers.{i}.post_feedforward_layernorm.weight", + f"model.decoder.layers.{i}.mlp.linear_fc2.post_layernorm.weight", + ), + ] + ) + + # Non layer dependent keys + rename_keys.extend( + [ + ("model.embed_tokens.weight", "model.embedding.word_embeddings.weight"), + ("model.norm.weight", "model.decoder.final_layernorm.weight"), + ] + ) + + return rename_keys + + +def rename_model_keys(model_state_dict, rename_keys): + """ + Rename keys in the model's state dictionary based on the provided mappings. + + Parameters: + model_state_dict (dict): The state dictionary of the model. + rename_keys (list): A list of tuples with the mapping (old_key, new_key). + + Returns: + dict: A new state dictionary with updated key names. 
+ """ + + # Create a new state dictionary with updated key names + new_state_dict = {} + + # Track keys from the original state dict to ensure all are processed + remaining_keys = set(model_state_dict.keys()) + + # Iterate over the rename mappings + for old_key, new_key in rename_keys: + if old_key in model_state_dict: + # Rename the key and remove it from the tracking set + new_state_dict[new_key] = model_state_dict[old_key] + remaining_keys.remove(old_key) + + # Check if any keys were not converted from old to new + for old_key in remaining_keys: + print(f"Warning: Key '{old_key}' was not converted.") + + return new_state_dict + + +def adjust_tensor_shapes(model, nemo_state_dict): + """ + Adapt tensor shapes in the state dictionary to ensure compatibility with a different model structure. + + Parameters: + nemo_state_dict (dict): The state dictionary of the model. + + Returns: + dict: The updated state dictionary with modified tensor shapes for compatibility. + """ + model_config = model.cfg + num_query_groups = model_config["num_query_groups"] + head_num = model_config["num_attention_heads"] + hidden_size = model_config["hidden_size"] + head_size = model_config["kv_channels"] + heads_per_group = head_num // num_query_groups + + # Note: For 'key' and 'value' weight and biases, NeMo uses a consolidated tensor 'query_key_value'. + for key_ in list(nemo_state_dict.keys()): + if 'mlp.linear_fc1_gate.weight' in key_: + key_gate = key_ + key_proj = key_.replace('mlp.linear_fc1_gate.weight', 'mlp.linear_fc1_proj.weight') + new_key = key_.replace('mlp.linear_fc1_gate.weight', 'mlp.linear_fc1.weight') + gate_weight = nemo_state_dict[key_gate] + proj_weight = nemo_state_dict[key_proj] + nemo_state_dict[new_key] = torch.cat((gate_weight, proj_weight)) + if 'layernorm.weight' in key_ or 'layer_norm_weight' in key_: + nemo_state_dict[key_] = nemo_state_dict[key_] + if 'self_attention.linear_q.weight' in key_: + key_q = key_ + key_k = key_.replace('linear_q', 'linear_k') + key_v = key_.replace('linear_q', 'linear_v') + key_qkv = key_.replace('linear_q', 'linear_qkv') + + # [(head_num + 2 * num_query_groups) * head_size, hidden_size] + # -> [head_num, head_size, hidden_size], 2 * [num_query_groups, head_size, hidden_size] + q_weight, k_weight, v_weight = nemo_state_dict[key_q], nemo_state_dict[key_k], nemo_state_dict[key_v] + q_weight = q_weight.reshape(head_num, head_size, hidden_size) + k_weight = k_weight.reshape(num_query_groups, head_size, hidden_size) + v_weight = v_weight.reshape(num_query_groups, head_size, hidden_size) + + qkv_weight = torch.empty((0, head_size, hidden_size), device=q_weight.device) + for i in range(num_query_groups): + qkv_weight = torch.cat((qkv_weight, q_weight[i * heads_per_group : (i + 1) * heads_per_group, :, :])) + qkv_weight = torch.cat((qkv_weight, k_weight[i : i + 1, :, :])) + qkv_weight = torch.cat((qkv_weight, v_weight[i : i + 1, :, :])) + qkv_weight = qkv_weight.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + nemo_state_dict[key_qkv] = qkv_weight + del nemo_state_dict[key_q], nemo_state_dict[key_k], nemo_state_dict[key_v] + + return nemo_state_dict + + +def adjust_nemo_config(model_config, ref_config): + model_config["encoder_seq_length"] = ref_config["max_position_embeddings"] + model_config["num_layers"] = ref_config["num_hidden_layers"] + model_config["ffn_hidden_size"] = ref_config["intermediate_size"] + model_config["hidden_size"] = ref_config["hidden_size"] + model_config["num_attention_heads"] = ref_config["num_attention_heads"] + 
model_config["num_query_groups"] = ref_config["num_key_value_heads"] + model_config["kv_channels"] = ref_config["head_dim"] + model_config["layernorm_epsilon"] = ref_config["rms_norm_eps"] + model_config["window_size"] = (ref_config["sliding_window_size"], 0) + model_config["layernorm_zero_centered_gamma"] = True + model_config["name"] = 'megatron_gemma2' + model_config['mcore_customization_config'] = { + "attn_logit_softcapping": ref_config["attn_logit_softcapping"], + "final_logit_softcapping": ref_config["final_logit_softcapping"], + "query_pre_attn_scalar": ref_config["query_pre_attn_scalar"], + } + return model_config + + +def get_args(): + parser = ArgumentParser() + parser.add_argument("--input_name_or_path", type=str) + parser.add_argument("--tokenizer_path", type=str) + parser.add_argument( + "--hparams_file", + type=str, + default=os.path.join( + os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_gemma2_config.yaml' + ), + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--output_path", type=str, default=None, help="Path to output .nemo file.") + parser.add_argument( + "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weight saved" + ) + parser.add_argument("--run_verification", action="store_true") + parser.add_argument("--cpu", action="store_true") + + args = parser.parse_args() + return args + + +def verify(nemo_model, hf_tokenizer, hf_model): + # Verifications + input_texts = [ + 'query: how much protein should a female eat', + ] + logging.info(f"Running verifications {input_texts} ...") + + # Tokenize the input texts + hf_tokenizer.pad_token = hf_tokenizer.eos_token + batch_dict = hf_tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt') + batch_dict_cuda = {k: v.cuda() for k, v in batch_dict.items()} + hf_model = hf_model.cuda().eval() + nemo_model = nemo_model.eval() + + hf_outputs = hf_model(**batch_dict_cuda, output_hidden_states=True) + + parallel_state._set_global_memory_buffer() + ids = batch_dict_cuda['input_ids'] + + id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids.cpu()] + + masks_and_position_ids = [ + get_ltor_masks_and_position_ids(id_tensor, hf_tokenizer.eos_token, False, False, False) + for id_tensor in id_tensors + ] + for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): + attn_mask, _, pos_ids = attn_mask_and_pos_ids + outputs = nemo_model( + tokens=tokens.cuda(), text_position_ids=pos_ids.cuda(), attention_mask=attn_mask.cuda(), labels=None + ) + + hf_next_token = hf_outputs.logits[0, -1].argmax() + next_token = outputs.squeeze()[-1].argmax() + + logging.info(f"HF predicted next token is: '{hf_tokenizer._convert_id_to_token(hf_next_token)}'.") + logging.info(f"NeMo predicted next token is: '{hf_tokenizer._convert_id_to_token(next_token)}'.") + assert ( + hf_next_token == next_token + ), f'prediction mismatch: {hf_tokenizer.decode(hf_next_token)} != {hf_tokenizer.decode(next_token)}' + + +def convert(args): + logging.info(f"Loading checkpoint from HF Gemma 2: `{args.input_name_or_path}`") + hf_tokenizer = AutoTokenizer.from_pretrained(args.input_name_or_path) + hf_model = AutoModelForCausalLM.from_pretrained(args.input_name_or_path) + logging.info("HF Model loading done.") + + nemo_config 
= OmegaConf.load(args.hparams_file) + nemo_config.model = adjust_nemo_config(nemo_config.model, hf_model.config.__dict__) + nemo_config.model.tokenizer["model"] = args.tokenizer_path + + nemo_config.trainer["precision"] = args.precision + if args.cpu: + nemo_config.model['use_cpu_initialization'] = True + nemo_config.trainer['accelerator'] = 'cpu' + trainer = MegatronTrainerBuilder(nemo_config).create_trainer() + model = MegatronGPTModel(nemo_config.model, trainer) + + rename_keys = create_rename_keys(nemo_config.model.num_layers) + old_state_dict = hf_model.state_dict() + new_state_dict = rename_model_keys(model_state_dict=old_state_dict, rename_keys=rename_keys) + + nemo_state_dict = adjust_tensor_shapes(model, new_state_dict) + model.load_state_dict(nemo_state_dict, strict=False) + + if args.run_verification and not args.cpu: + logging.info(f'=' * 100) + verify(model, hf_tokenizer, hf_model) + logging.info(f'=' * 100) + + dtype = torch_dtype_from_precision(args.precision) + model = model.to(dtype=dtype) + model.cfg.use_cpu_initialization = False + model.save_to(args.output_path) + logging.info(f'NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + convert(args) From 49376671cc185ac1fbc24d04552c2a102f6bdbb6 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:39:54 -0700 Subject: [PATCH 017/664] Use packaging & fall-back to pkg_resources if not available (#9958) Signed-off-by: Alexandros Koumparoulis --- .../megatron/gpt_full_te_layer_autocast_spec.py | 2 +- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- .../nlp/models/language_modeling/megatron_retro_model.py | 2 +- .../nlp/modules/common/megatron/adapters/parallel_adapters.py | 2 +- nemo/collections/nlp/modules/common/megatron/adapters/qlora.py | 2 +- nemo/collections/nlp/modules/common/megatron/transformer.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py index f3299d488fd0..a2d85ebe3006 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py @@ -15,8 +15,8 @@ from importlib.metadata import version from typing import Any, Callable, Optional +import packaging import torch -from pkg_resources import packaging from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults from nemo.collections.nlp.parts import utils_funcs diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 49f749e4a40e..67310d59db45 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -22,10 +22,10 @@ from importlib.metadata import version from typing import Any, Dict, Iterator, List, Optional, Union +import packaging import torch from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig -from pkg_resources import packaging from pytorch_lightning.accelerators import CPUAccelerator from pytorch_lightning.loops.fetchers import _DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer diff --git 
a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py index 6dfe022d0275..9061f430e722 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py @@ -431,7 +431,7 @@ def build_retro_config(self) -> RetroConfig: # Validate Transformer Engine version. from importlib.metadata import version - from pkg_resources import packaging + import packaging te_version = packaging.version.Version(version("transformer-engine")) if te_version >= packaging.version.Version("1.3"): diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 7167eefda637..4f9f04527038 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -253,7 +253,7 @@ def __init__( if self._sequence_parallel and not input_is_parallel: from importlib.metadata import version - from pkg_resources import packaging + import packaging te_version = packaging.version.Version(version("transformer-engine")) if te_version >= packaging.version.Version("1.5.0dev") and ( diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/qlora.py b/nemo/collections/nlp/modules/common/megatron/adapters/qlora.py index a834b9a3fb49..4a180234e3cf 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/qlora.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/qlora.py @@ -15,9 +15,9 @@ from importlib.metadata import version from typing import TYPE_CHECKING, Dict, Optional +import packaging import torch import torch.nn.functional as F -from pkg_resources import packaging from torch import Tensor, nn from nemo.collections.nlp.parts.peft_config import LORA_CONFIG_TO_MCORE_MAP, get_target_modules diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index c5907873bac3..e803a622f75d 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -18,10 +18,10 @@ from importlib.metadata import version from typing import Any, Callable, Optional +import packaging import torch import torch.nn as nn from einops import rearrange -from pkg_resources import packaging from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( From 9c86009fded25da5b8192f2de5010fa026efeb0d Mon Sep 17 00:00:00 2001 From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:47:22 -0700 Subject: [PATCH 018/664] [Docs] Fixing warnings, focusing on SpeechLLM docs + copy editing (#10141) * various simple docs source fixes Signed-off-by: Elena Rastorgueva * fix docstrings and typing with forward reference Signed-off-by: Elena Rastorgueva * Apply isort and black reformatting Signed-off-by: erastorgueva-nv * fix typing forward reference for PromptedAudioToTextLhotseDataset Signed-off-by: Elena Rastorgueva * copy edit docs/source/core/core Signed-off-by: Elena Rastorgueva * copy edit docs/source/core/exp_manager.rst Signed-off-by: Elena Rastorgueva * copy edit docs/source/features/mixed_precision.rst Signed-off-by: Elena Rastorgueva * copy edit 
docs/source/features/mixed_precision.rst forgot to add Signed-off-by: Elena Rastorgueva * copy edit docs/source/features/parallelisms.rst Signed-off-by: Elena Rastorgueva * copy edit docs/source/multimodal/mllm/datasets.rst Signed-off-by: Elena Rastorgueva * copy edit docs/source/multimodal/speech_llm/configs.rst Signed-off-by: Elena Rastorgueva * copy edit docs/source/multimodal/speech_llm/intro.rst Signed-off-by: Elena Rastorgueva * copy edit docs/source/starthere/fundamentals.rst Signed-off-by: Elena Rastorgueva * update url and heading Signed-off-by: Elena Rastorgueva * add back import Union Signed-off-by: Elena Rastorgueva * update speech llm intro Signed-off-by: Elena Rastorgueva * update speech llm examples readme Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva Signed-off-by: erastorgueva-nv Signed-off-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Co-authored-by: erastorgueva-nv --- docs/source/checkpoints/intro.rst | 6 +- docs/source/collections.rst | 2 +- docs/source/core/core.rst | 72 +++++------ docs/source/core/exp_manager.rst | 115 +++++++++--------- docs/source/features/mixed_precision.rst | 23 ++-- docs/source/features/moe.rst | 2 +- docs/source/features/parallelisms.rst | 91 +++++++------- docs/source/multimodal/mllm/datasets.rst | 36 +++--- docs/source/multimodal/speech_llm/configs.rst | 32 ++--- docs/source/multimodal/speech_llm/intro.rst | 35 +++--- docs/source/starthere/fundamentals.rst | 18 +-- examples/multimodal/speech_llm/README.md | 9 +- .../asr/data/audio_to_text_lhotse_prompted.py | 4 +- .../submodules/multitask_beam_decoding.py | 4 +- .../submodules/multitask_greedy_decoding.py | 4 +- .../speech_llm/data/audio_text_dataset.py | 76 ++++++++---- .../speech_llm/data/lhotse_dataset.py | 18 ++- .../speech_llm/modules/perception_modules.py | 93 +++++++------- 18 files changed, 343 insertions(+), 297 deletions(-) diff --git a/docs/source/checkpoints/intro.rst b/docs/source/checkpoints/intro.rst index 7c7154d64015..37d3bd7051f9 100644 --- a/docs/source/checkpoints/intro.rst +++ b/docs/source/checkpoints/intro.rst @@ -4,8 +4,8 @@ Checkpoints In this section, we present key functionalities of NVIDIA NeMo related to checkpoint management. -Understanding Checkpoint Formats --------------------------------- +Checkpoint Formats +------------------ A ``.nemo`` checkpoint is fundamentally a tar file that bundles the model configurations (specified inside a YAML file), model weights (inside a ``.ckpt`` file), and other artifacts like tokenizer models or vocabulary files. This consolidated design streamlines sharing, loading, tuning, evaluating, and inference. @@ -43,7 +43,7 @@ The following example shows the contents of a quantized model intended to be ser └── tokenizer_config.yaml Community Checkpoint Converter ------------------------------ +------------------------------ We provide easy-to-use tools that enable users to convert community checkpoints into the NeMo format. These tools facilitate various operations, including resuming training, Supervised Fine-Tuning (SFT), Parameter-Efficient Fine-Tuning (PEFT), and deployment. For detailed instructions and guidelines, please refer to our documentation. 
We offer comprehensive guides to assist both end users and developers: diff --git a/docs/source/collections.rst b/docs/source/collections.rst index 0198ef250ce3..2f04d1557628 100644 --- a/docs/source/collections.rst +++ b/docs/source/collections.rst @@ -25,7 +25,7 @@ Documentation for the individual collections multimodal/vlm/intro multimodal/text2img/intro multimodal/nerf/intro - mumtimoda/speech_llm/intro + multimodal/speech_llm/intro .. toctree:: :maxdepth: 1 diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst index 3c1a496993bd..6bdd18559902 100644 --- a/docs/source/core/core.rst +++ b/docs/source/core/core.rst @@ -4,7 +4,7 @@ NeMo Models Basics ------ -NeMo models contain everything needed to train and reproduce Conversational AI models: +NeMo models contain everything needed to train and reproduce conversational AI models: - neural network architectures - datasets/data loaders @@ -35,7 +35,7 @@ As an example, we can instantiate QuartzNet with the following: model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En") -To see all available pretrained models for a specific NeMo model, use the ``list_available_models()`` method. +To see all available pretrained models for a specific NeMo model, use the ``list_available_models()`` method: .. code-block:: Python @@ -52,7 +52,7 @@ Training NeMo leverages `PyTorch Lightning `__ for model training. PyTorch Lightning lets NeMo decouple the conversational AI code from the PyTorch training code. This means that NeMo users can focus on their domain (ASR, NLP, TTS) and -build complex AI applications without having to rewrite boiler plate code for PyTorch training. +build complex AI applications without having to rewrite boilerplate code for PyTorch training. When using PyTorch Lightning, NeMo users can automatically train with: @@ -168,7 +168,7 @@ While validation logic can be found in ``validation_step``: return {'val_loss': val_loss, 'tp': tp, 'fn': fn, 'fp': fp} -PyTorch Lightning then handles all of the boiler plate code needed for training. Virtually any aspect of training can be customized +PyTorch Lightning then handles all of the boilerplate code needed for training. Virtually any aspect of training can be customized via PyTorch Lightning `hooks `_, `Plugins `_, `callbacks `_, or by overriding `methods `_. @@ -239,8 +239,8 @@ Every NeMo example YAML has the same underlying configuration structure: - exp_manager - model -Model configuration always contain ``train_ds``, ``validation_ds``, ``test_ds``, and ``optim``. Model architectures vary across -domains, therefore, refer to the ASR, NLP, and TTS Collections documentation for more detailed information on Model architecture configuration. +The model configuration always contains ``train_ds``, ``validation_ds``, ``test_ds``, and ``optim``. Model architectures, however, can vary across domains. +Refer to the documentation of specific collections (LLM, ASR etc.) for detailed information on model architecture configuration. A NeMo configuration file should look similar to the following: @@ -288,15 +288,11 @@ A NeMo configuration file should look similar to the following: decoder: ... -More specific details about configuration files for each collection can be found on the following pages: - -:ref:`NeMo ASR Configuration Files` - CLI ~~~ With NeMo and Hydra, every aspect of model training can be modified from the command-line. 
This is extremely helpful for running lots -of experiments on compute clusters or for quickly testing parameters while developing. +of experiments on compute clusters or for quickly testing parameters during development. All NeMo `examples `_ come with instructions on how to run the training/inference script from the command-line (see `here `__ @@ -374,7 +370,7 @@ be instantiated and modified like any Python `Dataclass YAML > Dataclass +.. note:: Configuration with Hydra always has the following precedence CLI > YAML > Dataclass. .. _optimization-label: @@ -382,7 +378,7 @@ Optimization ------------ Optimizers and learning rate schedules are configurable across all NeMo models and have their own namespace. Here is a sample YAML -configuration for a Novograd optimizer with Cosine Annealing learning rate schedule. +configuration for a Novograd optimizer with a Cosine Annealing learning rate schedule. .. code-block:: yaml @@ -408,7 +404,7 @@ configuration for a Novograd optimizer with Cosine Annealing learning rate sched warmup_ratio: null min_lr: 1e-9: -.. note:: `NeMo Examples `_ has optimizer and scheduler configurations for every NeMo model. +.. note:: `NeMo Examples `_ has optimizer and scheduler configurations for every NeMo model. Optimizers can be configured from the CLI as well: @@ -596,7 +592,7 @@ as shown below we can update this config prior to restoring the model. Register Artifacts ------------------ -Conversational AI models can be complicated to restore as more information is needed than just the checkpoint weights in order to use the model. +Restoring conversational AI models can be complicated because it requires more than just the checkpoint weights; additional information is also needed to use the model. NeMo models can save additional artifacts in the .nemo file by calling ``.register_artifact``. When restoring NeMo models using ``.restore_from`` or ``.from_pretrained``, any artifacts that were registered will be available automatically. @@ -643,7 +639,7 @@ Push to Hugging Face Hub NeMo models can be pushed to the `Hugging Face Hub `_ with the :meth:`~nemo.core.classes.mixins.hf_io_mixin.HuggingFaceFileIO.push_to_hf_hub` method. This method performs the same actions as ``save_to()`` and then uploads the model to the HuggingFace Hub. It offers an additional ``pack_nemo_file`` argument that allows the user to upload the entire NeMo file or just the ``.nemo`` file. This is useful for large language models that have a massive number of parameters, and a single NeMo file could exceed the max upload size of Hugging Face Hub. -Upload a model to the hub +Upload a model to the Hub ~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -688,15 +684,15 @@ Use a Custom Model Card Template for the Hub Nested NeMo Models ------------------ -In some cases, it may be helpful to use NeMo models inside other NeMo models. For example, we can incorporate language models into ASR models to use in a decoding process to improve accuracy or use hybrid ASR-TTS models to generate audio from the text on the fly to train or finetune the ASR model. +In some cases, it may be helpful to use NeMo models inside other NeMo models. For example, we can incorporate language models into ASR models to use in a decoding process to improve accuracy or use hybrid ASR-TTS models to generate audio from the text on the fly to train or fine-tune the ASR model. 
-There are 3 ways to instantiate child models inside parent models: +There are three ways to instantiate child models inside parent models: - use subconfig directly - use the ``.nemo`` checkpoint path to load the child model - use a pretrained NeMo model -To register a child model, use the ``register_nemo_submodule`` method of the parent model. This method will add the child model to a provided model attribute and, in the serialization process, will handle child artifacts correctly and store the child model config in the parent model config in ``config_field``. +To register a child model, use the ``register_nemo_submodule`` method of the parent model. This method will add the child model to a specified model attribute. During serialization, it will correctly handle child artifacts and store the child model’s configuration in the parent model’s ``config_field``. .. code-block:: python @@ -746,30 +742,38 @@ To register a child model, use the ``register_nemo_submodule`` method of the par Profiling --------- -NeMo offers users two options for profiling: Nsys & CUDA memory profiling. These two options allow users +NeMo offers users two options for profiling: Nsys and CUDA memory profiling. These two options allow users to debug performance issues as well as memory issues such as memory leaks. To enable Nsys profiling, add the following options to the model config: -nsys_profile: False - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - ranks: [0] # Global rank IDs to profile - gen_shape: False # Generate model and kernel details including input shapes -Finally, the model training script with: +.. code-block:: yaml + + nsys_profile: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + +Finally, run the model training script with: + +.. code-block:: bash + + nsys profile -s none -o -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/... -nsys profile -s none -o -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/... See more options at `nsight user guide `_. To enable CUDA memory profiling, add the following options to the model config: -memory_profile: - enabled: True - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - rank: 0 # Global rank ID to profile - output_path: None # Path to store the profile output file +.. code-block:: yaml + + memory_profile: + enabled: True + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + rank: 0 # Global rank ID to profile + output_path: None # Path to store the profile output file -And invoke your NeMo script without any changes in the invocation command. +Then invoke your NeMo script without any changes in the invocation command. diff --git a/docs/source/core/exp_manager.rst b/docs/source/core/exp_manager.rst index 6daa5070a16e..50ff94bfcb80 100644 --- a/docs/source/core/exp_manager.rst +++ b/docs/source/core/exp_manager.rst @@ -4,16 +4,16 @@ Experiment Manager ================== -NeMo's Experiment Manager leverages PyTorch Lightning for model checkpointing, TensorBoard Logging, Weights and Biases, DLLogger and MLFlow logging. 
The +The NeMo Framework Experiment Manager leverages PyTorch Lightning for model checkpointing, TensorBoard Logging, Weights and Biases, DLLogger and MLFlow logging. The Experiment Manager is included by default in all NeMo example scripts. -To use the experiment manager simply call :class:`~nemo.utils.exp_manager.exp_manager` and pass in the PyTorch Lightning ``Trainer``. +To use the Experiment Manager, call :class:`~nemo.utils.exp_manager.exp_manager` and pass in the PyTorch Lightning ``Trainer``. .. code-block:: python exp_dir = exp_manager(trainer, cfg.get("exp_manager", None)) -And is configurable via YAML with Hydra. +The Experiment Manager is configurable using YAML with Hydra. .. code-block:: bash @@ -23,7 +23,7 @@ And is configurable via YAML with Hydra. create_tensorboard_logger: True create_checkpoint_callback: True -Optionally, launch TensorBoard to view the training results in ``./nemo_experiments`` (by default). +Optionally, launch TensorBoard to view the training results in ``exp_dir``, which by default is set to ``./nemo_experiments``. .. code-block:: bash @@ -33,7 +33,7 @@ Optionally, launch TensorBoard to view the training results in ``./nemo_experime If ``create_checkpoint_callback`` is set to ``True``, then NeMo automatically creates checkpoints during training using PyTorch Lightning's `ModelCheckpoint `_. -We can configure the ``ModelCheckpoint`` via YAML or CLI. +We can configure the ``ModelCheckpoint`` via YAML or CLI: .. code-block:: yaml @@ -51,9 +51,8 @@ We can configure the ``ModelCheckpoint`` via YAML or CLI. Resume Training --------------- -We can auto-resume training as well by configuring the ``exp_manager``. Being able to auto-resume is important when doing long training -runs that are premptible or may be shut down before the training procedure has completed. To auto-resume training, set the following -via YAML or CLI: +To auto-resume training, configure the ``exp_manager``. This feature is important for long training runs that might be interrupted or +shut down before the procedure has completed. To auto-resume training, set the following parameters via YAML or CLI: .. code-block:: yaml @@ -73,7 +72,7 @@ via YAML or CLI: Experiment Loggers ------------------ -Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow, DLLogger, ClearML and NeptuneLogger. To use these loggers, simply set the following +Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow, DLLogger, ClearML and NeptuneLogger. To use these loggers, set the following via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`. @@ -179,7 +178,7 @@ Exponential Moving Average .. _exp_manager_ema-label: NeMo supports using exponential moving average (EMA) for model parameters. This can be useful for improving model generalization -and stability. To use EMA, simply set the following via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`. +and stability. To use EMA, set the following parameters via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`. .. code-block:: yaml @@ -201,7 +200,7 @@ and stability. To use EMA, simply set the following via YAML or :class:`~nemo.ut NeMo adds support for a callback upon preemption while running the models on clusters. The callback takes care of saving the current state of training via the ``.ckpt`` file followed by a graceful exit from the run. The checkpoint saved upon preemption has the ``*last.ckpt`` suffix and replaces the previously saved last checkpoints. This feature is useful to increase utilization on clusters. 
- The ``PreemptionCallback`` is enabled by default. To disable it simply add ``create_preemption_callback: False`` under exp_manager in the config YAML file. + The ``PreemptionCallback`` is enabled by default. To disable it, add ``create_preemption_callback: False`` under exp_manager in the config YAML file. Stragglers Detection ---------------------- @@ -211,7 +210,7 @@ and stability. To use EMA, simply set the following via YAML or :class:`~nemo.ut .. note:: Stragglers Detection feature is included in the optional NeMo resiliency package. - Distributed training can be affected by stragglers, which are slow workers that slow down the overall training process. + Distributed training can be affected by stragglers, which are workers that slow down the overall training process. NeMo provides a straggler detection feature that can identify slower GPUs. This feature is implemented in the ``StragglerDetectionCallback``, which is disabled by default. @@ -220,12 +219,12 @@ and stability. To use EMA, simply set the following via YAML or :class:`~nemo.ut A performance score can be interpreted as the ratio of current performance to reference performance. There are two types of performance scores provided by the callback: - - Relative GPU performance score: The best-performing GPU in the workload is used as a reference. - - Individual GPU performance score: The best historical performance of the GPU is used as a reference. + * Relative GPU performance score: The best-performing GPU in the workload is used as a reference. + * Individual GPU performance score: The best historical performance of the GPU is used as a reference. Examples: - - If the relative performance score is 0.5, it means that a GPU is twice slower than the fastest GPU. - - If the individual performance score is 0.5, it means that a GPU is twice slower than its best observed performance. + * If the relative performance score is 0.5, it means that a GPU is twice slower than the fastest GPU. + * If the individual performance score is 0.5, it means that a GPU is twice slower than its best observed performance. If a GPU performance score drops below the specified threshold, it is identified as a straggler. @@ -246,7 +245,7 @@ and stability. To use EMA, simply set the following via YAML or :class:`~nemo.ut gpu_individual_perf_threshold: 0.7 # Threshold for individual GPU performance scores stop_if_detected: True # Terminate the workload if stragglers are detected - Straggler detection might involve inter-rank synchronization, and should be invoked with reasonable frequency (e.g. every few minutes). + Straggler detection may require inter-rank synchronization and should be performed at regular intervals, such as every few minutes. .. Fault Tolerance @@ -257,7 +256,7 @@ and stability. To use EMA, simply set the following via YAML or :class:`~nemo.ut .. note:: Fault Tolerance feature is included in the optional NeMo resiliency package. - When training DNN models, faults may occur, hindering the progress of the entire training process. + When training Deep Neural Network (DNN models), faults may occur, hindering the progress of the entire training process. This is particularly common in distributed, multi-node training scenarios, with many nodes and GPUs involved. NeMo incorporates a fault tolerance mechanism to detect training halts. @@ -292,11 +291,11 @@ and stability. 
To use EMA, simply set the following via YAML or :class:`~nemo.ut **Importantly, `heartbeats` are not sent during checkpoint loading and saving**, so time for checkpointing related operations should be taken into account. - If ``calculate_timeouts: True`` timeouts will be automatically estimated based on observed intervals. - Estimated timeouts take precedence over timeouts defined in the config file. **Timeouts are estimated - at the end of a training run, when checkpoint loading and saving were observed**. Hence, in a multi-part - training started from scratch, estimated timeouts won't be available during initial two runs. - Estimated timeouts are stored in a separate JSON file. + If ``calculate_timeouts: True``, timeouts will be automatically estimated based on observed intervals. + Estimated timeouts take precedence over timeouts defined in the config file. **Timeouts are estimated + at the end of a training run when checkpoint loading and saving were observed.** Hence, in a multi-part + training started from scratch, estimated timeouts won't be available during the initial two runs. + Estimated timeouts are stored in a separate JSON file. ``max_subsequent_job_failures`` allows for the automatic continuation of training on a SLURM cluster. This feature requires SLURM job to be scheduled with ``NeMo-Framework-Launcher``. If ``max_subsequent_job_failures`` @@ -317,24 +316,25 @@ and stability. To use EMA, simply set the following via YAML or :class:`~nemo.ut * ``max_rank_restarts`` (int, default=0) Used by FT launcher. Max number of restarts for a rank. If ``>0`` ranks will be restarted on existing nodes in case of a failure. * ``max_subsequent_job_failures`` (int, default=0) Used by FT launcher. How many subsequent job failures are allowed until stopping autoresuming. - ``0`` means do not autoresume. + ``0`` means do not auto-resume. * ``additional_ft_launcher_args`` (str, default='') Additional FT launcher params (for advanced use). .. _nemo_multirun-label: + Hydra Multi-Run with NeMo ------------------------- -When training neural networks, it is common to perform hyper parameter search in order to improve the performance of a model -on some validation data. However, it can be tedious to manually prepare a grid of experiments and management of all checkpoints -and their metrics. In order to simplify such tasks, NeMo integrates with `Hydra Multi-Run support `_ in order to provide a unified way to run a set of experiments all -from the config. +When training neural networks, it is common to perform a hyperparameter search to improve the model’s performance on validation data. +However, manually preparing a grid of experiments and managing all checkpoints and their metrics can be tedious. +To simplify these tasks, NeMo integrates with `Hydra Multi-Run support `_, +providing a unified way to run a set of experiments directly from the configuration. There are certain limitations to this framework, which we list below: * All experiments are assumed to be run on a single GPU, and multi GPU for single run (model parallel models are not supported as of now). -* NeMo Multi-Run supports only grid search over a set of hyper-parameters, but we will eventually add support for advanced hyper parameter search strategies. -* **NeMo Multi-Run only supports running on one or more GPUs** and will not work if no GPU devices are present. +* NeMo Multi-Run currently supports only grid search over a set of hyperparameters. Support for advanced hyperparameter search strategies will be added in the future. 
+* **NeMo Multi-Run requires one or more GPUs** to function and will not work without GPU devices. Config Setup ~~~~~~~~~~~~ @@ -405,10 +405,10 @@ name as shown below - resume_ignore_no_checkpoint: true -Running a Multi-Run config -~~~~~~~~~~~~~~~~~~~~~~~~~~ +Run a NeMo Multi-Run Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once the config has been updated, we can now run it just like any normal Hydra script -- with one special flag (``-m``) ! +Once the config has been updated, we can now run it just like any normal Hydra script, with one special flag (``-m``). .. code-block:: bash @@ -417,21 +417,24 @@ Once the config has been updated, we can now run it just like any normal Hydra s ... Tips and Tricks -~~~~~~~~~~~~~~~ +--------------- -* Preserving disk space for large number of experiments +This section provides recommendations for using the Experiment Manager. -Some models may have a large number of parameters, and it may be very expensive to save a large number of checkpoints on -physical storage drives. For example, if you use Adam optimizer, each PyTorch Lightning ".ckpt" file will actually be 3x the -size of just the model parameters - per ckpt file ! This can be exhorbitant if you have multiple runs. +Preserving disk space for a large number of experiments +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In the above config, we explicitly set ``save_top_k: 1`` and ``always_save_nemo: True`` - what this does is limit the number of -ckpt files to just 1, and also save a NeMo file (which will contain just the model parameters without optimizer state) and -can be restored immediately for further work. +Some models may have a large number of parameters, making it very expensive to save numerous checkpoints on physical storage drives. +For example, if you use the Adam optimizer, each PyTorch Lightning ".ckpt" file will be three times the size of just the model +parameters. This can become exorbitant if you have multiple runs. -We can further reduce the storage space by utilizing some utility functions of NeMo to automatically delete either -ckpt or NeMo files after a training run has finished. This is sufficient in case you are collecting results in some experiment -tracking tool and can simply rerun the best config after the search is finished. +In the above configuration, we explicitly set ``save_top_k: 1`` and ``always_save_nemo: True``. This limits the number of ".ckpt" +files to just one and also saves a NeMo file, which contains only the model parameters without the optimizer state. +This NeMo file can be restored immediately for further work. + +We can further save storage space by using NeMo's utility functions to automatically delete either ".ckpt" or NeMo files +after a training run has finished. This is sufficient if you are collecting results in an experiment tracking tool and can +simply rerun the best configuration after the search is completed. .. code-block:: python @@ -452,24 +455,26 @@ tracking tool and can simply rerun the best config after the search is finished. clean_exp_ckpt(exp_log_dir, remove_ckpt=True, remove_nemo=False) -* Debugging Multi-Run Scripts +Debugging Multi-Run Scripts +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When running Hydra scripts, you may encounter configuration issues that crash the program. In NeMo Multi-Run, a crash in +any single run will not crash the entire program. Instead, we will note the error and proceed to the next job. 
Once all +jobs are completed, we will raise the errors in the order they occurred, crashing the program with the first error’s stack trace. -When running hydra scripts, you may sometimes face config issues which crash the program. In NeMo Multi-Run, a crash in -any one run will **not** crash the entire program, we will simply take note of it and move onto the next job. Once all -jobs are completed, we then raise the error in the order that it occurred (it will crash the program with the first error's -stack trace). -In order to debug Muti-Run, we suggest to comment out the full hyper parameter config set inside ``sweep.params`` -and instead run just a single experiment with the config - which would immediately raise the error. +To debug NeMo Multi-Run, we recommend commenting out the entire hyperparameter configuration set inside ``sweep.params``. +Instead, run a single experiment with the configuration, which will immediately raise the error. -* Experiment name cannot be parsed by Hydra +Experiment name cannot be parsed by Hydra +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Sometimes our hyper parameters include PyTorch Lightning ``trainer`` arguments - such as number of steps, number of epochs -whether to use gradient accumulation or not etc. When we attempt to add these as keys to the expriment manager's ``name``, +Sometimes our hyperparameters include PyTorch Lightning ``trainer`` arguments, such as the number of steps, number of epochs, +and whether to use gradient accumulation. When we attempt to add these as keys to the experiment manager's ``name``, Hydra may complain that ``trainer.xyz`` cannot be resolved. -A simple solution is to finalize the hydra config before you call ``exp_manager()`` as follows - +A simple solution is to finalize the Hydra config before you call ``exp_manager()`` as follows: .. code-block:: python diff --git a/docs/source/features/mixed_precision.rst b/docs/source/features/mixed_precision.rst index 7e1e8c2f05fc..b1ec196c567e 100644 --- a/docs/source/features/mixed_precision.rst +++ b/docs/source/features/mixed_precision.rst @@ -3,15 +3,15 @@ Mixed Precision Training ------------------------ -Mixed precision training significantly enhances computational efficiency by conducting operations in low-precision format, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. NeMo now supports FP16, BF16, and FP8 (via Transformer Engine) across most models. Further details will be provided shortly. +Mixed precision training significantly enhances computational efficiency by conducting operations in low-precision format, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. NeMo Framework now supports FP16, BF16, and FP8 via Transformer Engine (TE) across most models. Half-precision Training ======================= -NeMo supports half-precision (FP16 and BF16) computation training via Megatron Core and the distributed optimizer. +NeMo Framework supports half-precision FP16 and BF16 computation training via Megatron Core and the distributed optimizer. This training recipe uses half-precision in all layer computation keeping the model states (optimizer states and master parameters) in single-precision. -To avoid repeated data type casting at each layer computation, Megatron Core keeps a separate copy of half-precision parameters that is updated after each optimizer.step. 
+To avoid repeated data type casting at each layer computation, Megatron Core keeps a separate copy of half-precision parameters that is updated after each optimizer step. Half-precision training is enabled when setting ``precision`` to either of ``fp16-mixed`` or ``bf16-mixed`` along with ``megatron_amp_O2=true``. The parameter gradients are computed in the same half-precision, and the precision of gradient reduce-scatter across data-parallel GPUs can be set by ``optim.grad_sync_dtype``. @@ -19,13 +19,10 @@ The parameter gradients are computed in the same half-precision, and the precisi FP8 Training ============ -Overview -^^^^^^^^ - -NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point), enabling higher throughput of matrix multiplies and convolutions. NeMo uses the NVIDIA `TransformerEngine `_ (TE) in order to leverage speedups from FP8. The following table summarizes the FP8 related arguments that can be configured in NeMo (`example config setting `_). For a more detailed overview, refer to the TE `documentation `_, specifically the FP8 `format `_ and `recipe `_. +NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point), enabling higher throughput of matrix multiplies and convolutions. NeMo Framework uses the NVIDIA `TransformerEngine `_ (TE) to leverage speedups from FP8. The following table summarizes the FP8 related arguments that can be configured in NeMo (`example config setting `_). For a more detailed overview, refer to the TE `documentation `_, specifically the FP8 `format `_ and `recipe `_. .. list-table:: FP8 arguments - :widths: 25 25 50 + :widths: 10 20 :header-rows: 1 * - Argument @@ -33,7 +30,7 @@ NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point * - transformer_engine - TE and related functionality can be enabled by setting this boolean argument to True. If this argument is not set to True, all subsequent arguments will be ignored. * - fp8 - - Enables FP8 training. For transformer networks, the QKV, projection, FC1, and FC2 matrix multiplications are executed using the 4th generation H100 tensor cores with FP8 support. + - Enables FP8 training. For transformer networks, the QKV, projection, FC1, and FC2 matrix multiplications are executed using the fourth-generation NVIDIA H100 Tensor Cores with FP8 support. * - fp8_e4m3 - Training recipe format for FP8. Activations, weights, and gradient tensors use the E4M3 format. * - fp8_hybrid @@ -47,12 +44,12 @@ NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point * - reduce_amax - Indicates whether or not to perform an allreduce on the amax (absolute max) values for the FP8 tensors. Since the amax is directly used to compute the scaling factor for FP8 tensors, setting this argument ensures that the scaling factors for a tensor remain synchronized across devices in multi-GPU training configurations. * - fp8_params - - Indicates whether or not to store module level parameters in FP8. Enabling this option can lead to reduced memory consumption. It eliminates the need to store a copy of weights in higher precision (> half) for cases where these weights are externally maintained, such as master parameters in the optimizer. For more information, refer to the `fp8_model_init `_ API in TE. + - Indicates whether to store module-level parameters in FP8. 
Enabling this option can reduce memory consumption by eliminating the need to store a copy of weights in higher precision for cases where these weights are externally maintained, such as master parameters in the optimizer. For more information, refer to the `fp8_model_init `_ API in TE. Resources ^^^^^^^^^ -- `TE documentation `_ +- `Transformer Engine documentation `_ - `Intro to FP8, floating point formats, and mixed precision training `_ -- `Performance optimizations `_ that are natively supported in NeMo by enabling FP8 training with TE -- `TE installation `_ +- `Performance optimizations `_ that are natively supported in NeMo Framework by enabling FP8 training with TE +- `Transformer Engine installation `_ diff --git a/docs/source/features/moe.rst b/docs/source/features/moe.rst index 4c935f9f16a7..4457043777c6 100644 --- a/docs/source/features/moe.rst +++ b/docs/source/features/moe.rst @@ -60,7 +60,7 @@ Other options include: 2. ``moe_token_dropping`` enables selectively dropping and padding tokens for each expert to achieve a specified capacity, similar to GShard, Switch-Transformer, and DeepSpeed-MoE. Briefly, if the number of tokens routed to an expert exceeds its capacity, then the exceeding tokens are dropped. Note that this is - currently unsupported so should remain False. + currently unsupported so should remain False. 3. ``moe_token_dispatcher_type`` specifies the token dispatcher type, options include 'allgather' and 'alltoall'. diff --git a/docs/source/features/parallelisms.rst b/docs/source/features/parallelisms.rst index bf327fb18331..3cf7f7a8fdf9 100644 --- a/docs/source/features/parallelisms.rst +++ b/docs/source/features/parallelisms.rst @@ -3,7 +3,7 @@ Parallelisms ============ -NeMo Megatron supports various data- and model-parallel deep learning workload deployment methods (which can be mixed together arbitrarily). +NeMo Megatron supports various data-parallel and model-parallel deep learning workload deployment methods, which can be mixed together arbitrarily. Data Parallelism ---------------- @@ -31,17 +31,17 @@ It shards the optimizer states and the high-precision master parameters across d At the parameter optimizer step, each data-parallel GPU updates its shard of parameters. Since each GPU needs its own gradient shard, the distributed optimizer conducts reduce-scatter of the parameter gradients instead of all-reduce of them. Then, the updated parameter shards are all-gathered across data-parallel GPUs. -This approach significantly reduces the memory need of large scale LLM training. +This approach significantly reduces the memory need of large-scale LLM training. Also, when the precision of the gradient is higher than the parameter precision, the split execution of gradient reduce-scatter and parameter all-gather can reduce the total communication volume. This split collective execution increases the total computation to overlap with the communication, which improves the overlap opportunity. Enable Data Parallelism ~~~~~~~~~~~~~~~~~~~~~~~ -In NeMo, DDP is the default parallel deployment method. -This means that the total number of GPUs corresponds to the size of the DP group and training a LLM with model parallelism decreases the size of the DP group. +In NeMo Framework, DDP is the default parallel deployment method. +This means that the total number of GPUs corresponds to the size of the DP group, and training an LLM with model parallelism decreases the size of the DP group. -Currently, NeMo supports optimizer distribution only for Adam optimizer. 
+Currently, NeMo Framework supports optimizer distribution only for Adam optimizer. To enable the distributed adam optimizer, set ``model.optim.name=distributed_fused_adam`` in the model configuration. It can be configured with the following options: @@ -63,27 +63,28 @@ See the keyword arguments in `Apex DistributedFusedAdam `_ (default) or a custom implementation (if custom multi-precision training is enabled with ``megatron_amp_O2``). -The distributed optimizer in NeMo is built on top of +The distributed optimizer in NeMo Framework is built on top of `DistributedFusedAdam `_ from Apex. -Fully-Shared Data Parallelism (FSDP) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Fully-Shared Data Parallelism +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -NeMo supports Fully-Sharded Data Parallelism (FSDP) that shards parameter gradients and low-precision parameters for computation on top of the model states that Distributed optimizer shards (optimizer states and high-precision parameters). -Since FSDP shards the entire model states, it ensures linear model state memory saving with increasing DP size. -FSDP can be preferred for the LLM training with unbalanced workload between pipeline stages (or Transformer layers) or with a large vocabulary size, where pipelining would cause huge computation bubbles due to the workload imbalance. -Also, FSDP unloads the effort to search the performance-optimal mappings with 3D parallelism (TP/PP/DP) because it has a single parallelization domain. +NeMo Framework supports Fully-Sharded Data Parallelism (FSDP), which shards parameter gradients and low-precision parameters for computation. This is in addition to the model states that the distributed optimizer shards, including optimizer states and high-precision parameters. +Since FSDP shards the entire model states, it ensures linear model state memory savings with increasing DP size. +FSDP is preferred for LLM training with unbalanced workloads between pipeline stages (or Transformer layers) or with a large vocabulary size, where pipelining would cause significant computation bubbles due to workload imbalance. +Additionally, FSDP eliminates the need to search for performance-optimal mappings with 3D parallelism (TP/PP/DP) because it operates within a single parallelization domain. -NeMo uses `pytorch's FSDP interface `_ to shard LLM model states, which flattens the parameters of each Transformer layer and partitions across datap-parallel GPUs. -FSDP introduces collectives across data-parallel GPUs; all-gather of the parameters for computation and reduce-scatter of parameter gradients. -The parameter all-gather occurs in both network forward- and back-propagation phases. The gradient reduce-scatter happens only in the back-propagation. -These FSDP communications are overlapped with Transformer layer computations. + +NeMo Framework uses `PyTorch's FSDP interface `_ to shard LLM model states, flattening the parameters of each transformer layer and partitioning them across data-parallel GPUs. +FSDP introduces collective operations across data-parallel GPUs, including all-gather for parameter computation and reduce-scatter for parameter gradients. +The all-gather operation occurs during both the network forward and back-propagation phases, while the gradient reduce-scatter operation happens only during back-propagation. +These FSDP communications are overlapped with transformer layer computations. Setting ``fsdp=true`` enables FSDP. 
The mixed precision recipe can be set by ``precision`` knob, which determines both the computation and communication precisions. @@ -93,15 +94,15 @@ Also, one can use ``grad_reduce_dtype`` to override the gradient reduction preci Model Parallelism ----------------- -Model parallelism (MP) is a distributed model deployment method that partitions the model parameters across GPUs to reduce the need of per-GPU memory. -NeMo supports various model-parallel methods, which can be mixed to maximize LLM training performance. +Model Parallelism (MP) is a distributed model deployment method that partitions the model parameters across GPUs to reduce the need of per-GPU memory. +NeMo Framework supports various model-parallel methods, which can be mixed to maximize LLM training performance. Tensor Parallelism ^^^^^^^^^^^^^^^^^^ Tensor Parallelism (TP) is a model-parallel partitioning method that distributes the parameter tensor of an individual layer across GPUs. -On top of reducing the model state memory usage, it also saves the activation memory as per-GPU tensor sizes shrinks. -However, the reduced per-GPU tensor lowers per-GPU-kernel workload sizes that increases CPU overhead. +In addition to reducing model state memory usage, it also saves activation memory as the per-GPU tensor sizes shrink. +However, the reduced per-GPU tensor size increases CPU overhead due to smaller per-GPU kernel workloads. .. image:: ../nlp/nemo_megatron/images/tp.gif :align: center @@ -111,9 +112,7 @@ However, the reduced per-GPU tensor lowers per-GPU-kernel workload sizes that in Enable Tensor Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~ -To enable TP in the NeMo framework, configure the ``tensor_model_parallel_size`` parameter in the model configuration. This parameter determines the number of GPUs among which the model's tensors are partitioned. - -**For Tensor Parallelism**: +To enable TP in the NeMo Framework, configure the ``tensor_model_parallel_size`` parameter in the model configuration. This parameter determines the number of GPUs among which the model's tensors are partitioned. Set ``tensor_model_parallel_size`` to greater than ``1`` to enable intra-layer model parallelism. @@ -121,24 +120,24 @@ Set ``tensor_model_parallel_size`` to greater than ``1`` to enable intra-layer m tensor_model_parallel_size: 1 # Example to enable Tensor Parallelism -The configuration file can be adjusted here: `NeMo Megatron GPT Config `_. +The configuration file can be adjusted here: `NeMo Megatron GPT Config `__. Implement Tensor Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -NeMo integrates Tensor Parallelism through the implementation from Megatron Core. To understand how TP is activated within transformer blocks, refer to the code in the following repository: `Megatron-LM Transformer Block `_. +NeMo Framework integrates TP through the implementation from Megatron Core. To understand how TP is activated within transformer blocks, refer to the code in the following repository: `Megatron-LM Transformer Block `__. For detailed API usage and additional configurations, consult the `Megatron Core Developer Guide `_. FSDP with Tensor Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -NeMo supports FSDP along with tensor parallelism. This is done by restricting the model state sharding to the data-parallel domain. -Using FSDP with tensor parallelism can be helpful when the model doesn't have sufficient parallelism to deploy on a large scale training system with the data-parallel mapping. 
For example, running a model with the global batch size of 1024 on 2048 GPUs. -Also, tensor parallelism enables FSDP feasibility by reducing the model state size and the activation size per GPU, thus lower the FSDP communication overhead and the activation memory overhead. +NeMo Framework supports FSDP along with TP. This is done by restricting the model state sharding to the data-parallel domain. +Using FSDP with TP can be helpful when the model doesn't have sufficient parallelism to deploy on a large-scale training system with the data-parallel mapping. For example, running a model with the global batch size of 1024 on 2048 GPUs. +Also, TP enables FSDP feasibility by reducing the model state size and the activation size per GPU, thus lowering the FSDP communication overhead and the activation memory overhead. Using both FSDP and TP works by enabling FSDP (``fsdp=true``) and setting ``tensor_model_parallel_size > 1``. -The user should unset ``CUDA_DEVICE_MAX_CONNECTIONS`` environment variable to enable that sets the number of GPU kernel queue to overlap of the FSDP communication with computation kernels. +Unset the ``CUDA_DEVICE_MAX_CONNECTIONS`` environment variable to set the number of GPU kernel queues, allowing the overlap of FSDP communication with computation kernels. Pipeline Parallelism ^^^^^^^^^^^^^^^^^^^^ @@ -154,9 +153,7 @@ Pipeline Parallelism (PP) is a technique that assigns consecutive layers or segm Enable Pipeline Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To utilize PP in the NeMo framework, you need to set the ``pipeline_model_parallel_size`` parameter in the model's configuration. This parameter specifies the number of GPUs among which the model's layers are distributed. - -**For Pipeline Parallelism**: +To utilize Pipeline Parallelism (PP) in NeMo Framework, set the ``pipeline_model_parallel_size`` parameter in the model's configuration. This parameter specifies the number of GPUs among which the model's layers are distributed. Set ``pipeline_model_parallel_size`` to a value greater than ``1`` to enable inter-layer model parallelism. .. code-block:: yaml pipeline_model_parallel_size: 1 # Example to enable Pipeline Parallelism -Adjust the configuration accordingly here: `NeMo Megatron GPT Config `_. +Adjust the configuration accordingly here: `NeMo Megatron GPT Config `__. Interleaved Pipeline Parallel Schedule ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -180,7 +177,7 @@ For more insights into this approach, see our detailed blog: `Scaling Language M Implement Pipeline Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The NeMo implementation of PP leverages functionalities from Megatron Core. For a practical example of how PP is implemented within transformer blocks in NeMo, you can inspect the following codebase: `Megatron-LM Transformer Block `_. +The NeMo Framework implementation of PP leverages functionalities from Megatron Core. For a practical example of how PP is implemented within transformer blocks in NeMo, you can inspect the following codebase: `Megatron-LM Transformer Block `_. For more detailed API usage and configurations related to PP, visit the `Megatron Core Developer Guide `_. @@ -197,31 +194,31 @@ Unlike other model-parallel techniques, EP is applied to only the expert layers Enable Expert Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~ -To enable EP, set ``model.expert_model_parallel_size`` to the desired expert parallel size. 
For example, if the model has six experts (``model.num_moe_experts=6``), then setting ``model.expert_model_parallel_size=3`` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size. +To enable EP, set ``model.expert_model_parallel_size`` to the expert parallel size you want. For example, if the model has six experts (``model.num_moe_experts=6``), then setting ``model.expert_model_parallel_size=3`` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size. .. code-block:: yaml expert_model_parallel_size: 3 # Set EP to 3 -For further information on configuration, refer to the following documentation: `NeMo Megatron GPT Config `_. +For further information on configuration, refer to the following documentation: `NeMo Megatron GPT Config `__. Implement Expert Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The NeMo implementation of Expert Parallelism uses functionality from Megatron Core. Please consult the `Megatron Core MoE layer `_ for more MoE implementation details. +The NeMo Framework implementation of EP uses functionality from Megatron Core. Please consult the `Megatron Core MoE layer `_ for more MoE implementation details. Activation Partitioning ----------------------- In LLM training, a large memory space is needed to store the input activations of the network layers. -NeMo provides effective activation distribution methods, which is critical in training LLM with a large sequence length or large per-GPU micro-batch size. +NeMo Framework provides effective activation distribution methods, which is critical in training LLM with a large sequence length or large per-GPU micro-batch size. Sequence Parallelism ^^^^^^^^^^^^^^^^^^^^ -Sequence Parallelism extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency. +Sequence Parallelism (SP) extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency. .. image:: ../nlp/nemo_megatron/images/sp.gif :align: center @@ -231,31 +228,29 @@ Sequence Parallelism extends tensor-level model parallelism by distributing comp Enable Sequence Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To utilize Sequence Parallelism in NeMo, set the ``sequence_parallel`` parameter to ``True`` in the model's configuration. Note that this feature is effective only when the tensor parallel size (``tensor_model_parallel_size``) is greater than ``1``. +To utilize SP in NeMo Framework, set the ``sequence_parallel`` parameter to ``True`` in the model's configuration. Note that this feature is effective only when the tensor parallel size (``tensor_model_parallel_size``) is greater than ``1``. .. code-block:: yaml sequence_parallel: True # Enable Sequence Parallelism -For further information on configuration, refer to the following documentation: `NeMo Megatron GPT Config `_. +For further information on configuration, refer to the following documentation: `NeMo Megatron GPT Config `__. 
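Because sequence parallelism takes effect only when tensor parallelism is enabled, the two settings are typically configured together. An illustrative sketch (the values shown are examples only):

.. code-block:: yaml

    tensor_model_parallel_size: 2   # SP requires a tensor parallel size greater than 1
    sequence_parallel: True         # shard activations along the sequence dimension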
Implement Sequence Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The NeMo implementation of Sequence Parallelism utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code here: `Megatron-LM Sequence Parallel Source Code `_. +The NeMo Framework implementation of SP utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code here: `Megatron-LM Sequence Parallel Source Code `_. Context Parallelism ^^^^^^^^^^^^^^^^^^^ Context Parallelism (CP) is a method for parallelizing the processing of neural network activations across multiple GPUs, partitioning the input tensors in the sequence dimension. -Unlike Sequence Parallelism (SP) that partitions the activations of specific layers, CP divides the activations of all layers. +Unlike SP, which partitions the activations of specific layers, CP divides the activations of all layers. Enable Context Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~ -To activate CP in the NeMo framework, set the ``context_parallel_size`` parameter in the model configuration. This parameter specifies the number of GPUs among which the model's sequence activations are distributed. - -**For Context Parallelism**: +To activate CP in the NeMo Framework, set the ``context_parallel_size`` parameter in the model configuration. This parameter specifies the number of GPUs among which the model's sequence activations are distributed. Set ``context_parallel_size`` to a value greater than ``1`` to enable sequence-wide model parallelism. @@ -268,7 +263,7 @@ The configuration can be found and modified here: `NeMo Megatron Core Context Co Implement Context Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -NeMo leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. +NeMo Framework leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. Visit our source code for more insights into the implementation: - `Megatron Core wrappers for Transformer Engine `_ diff --git a/docs/source/multimodal/mllm/datasets.rst b/docs/source/multimodal/mllm/datasets.rst index 2f2000124e4d..60ee00b7f7f0 100644 --- a/docs/source/multimodal/mllm/datasets.rst +++ b/docs/source/multimodal/mllm/datasets.rst @@ -1,12 +1,12 @@ Multimodal Language Model Datasets ================================== -The NeMo multimodal language model supports the conversation data format, drawing inspiration from and designed based on `LLaVA `_. 
Sample datasets can be explored at `LLaVA's data documentation `_. +The NeMo Framework multimodal language model supports the conversation data format, drawing inspiration from and designed based on `LLaVA `_. Sample datasets can be explored at `LLaVA's data documentation `_. -Preparing the Training Dataset ------------------------------- +Prepare the Training Dataset +---------------------------- -The NeVA model training encompasses two phases: pretraining and finetuning. Each phase mandates a unique dataset. +The NeVA model training encompasses two phases: pretraining and fine-tuning. Each phase mandates a unique dataset. For **pretraining**, utilize the *LAION/CC/SBU BLIP-Caption Concept-balanced 558K* dataset. Access this dataset via `LLaVA's GitHub `_. After procuring the dataset, extract it to: @@ -14,13 +14,13 @@ For **pretraining**, utilize the *LAION/CC/SBU BLIP-Caption Concept-balanced 558 /path/to/neva/datasets/LLaVA-Pretrain-LCS-558K/blip_laion_cc_sbu_558k.json -Acquire the image data from `HuggingFace `_ and extract to: +Acquire the image data from `Hugging Face `__ and extract to: .. code-block:: bash /path/to/neva/datasets/LLaVA-Pretrain-LCS-558K/images -For **fine-tuning**, deploy the *LLaVA-Instruct-150K* dataset. This is also available on `LLaVA's GitHub `_. You can download the prompts from `HuggingFace `_: +For **fine-tuning**, deploy the *LLaVA-Instruct-150K* dataset. This is also available on `LLaVA's GitHub `_. You can download the prompts from `HuggingFace `__: .. code-block:: bash @@ -32,19 +32,19 @@ Image data for this phase can be obtained from the `COCO Dataset `_. After downloading the desired HuggingFace checkpoint, extract and store it on your local system to prep for pretraining. +Support is available for both the 7B and 13B chat models. Both can be downloaded from `LLaVA's Model Zoo `__. After downloading the checkpoint you want from Hugging Face, extract and store it on your local system to prepare for pretraining. To convert the LLaMA-2 checkpoints to NeMo's format, follow these steps: -1. Adjust the default yaml file at `megatron_llama_config.yaml `_. Ensure ``model.mcore_gpt`` and ``model.transformer_engine`` are set to `False` before the checkpoint conversion. +1. Adjust the default YAML file at `megatron_llama_config.yaml `__. Ensure ``model.mcore_gpt`` and ``model.transformer_engine`` are set to `False` before the checkpoint conversion. 2. For the 7B chat model, use this conversion command: @@ -56,7 +56,7 @@ To convert the LLaMA-2 checkpoints to NeMo's format, follow these steps: For the 13B model, adjust the paths in the `--in-file` and `--out-file` parameters accordingly. -3. Execute the subsequent command to divide the checkpoint for tensor model parallel sizes of 4 or 8. It's advisable to use TP=4 for the 7B model and TP=8 for the 13B model to ensure both pretraining and finetuning operate without memory complications. +3. Execute the subsequent command to divide the checkpoint for tensor model parallel sizes of 4 or 8. It's advisable to use TP=4 for the 7B model and TP=8 for the 13B model to ensure both pretraining and fine-tuning operate without memory complications. .. 
code-block:: bash @@ -73,10 +73,10 @@ For the 13B model, adjust the paths in the `--in-file` and `--out-file` paramete --model_class="nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel" \ --tokenizer_model_path=/tokenizer.model -Tokenizer Configuration -^^^^^^^^^^^^^^^^^^^^^^^ +Configure Tokenizer +^^^^^^^^^^^^^^^^^^^ -For NeVA training, integrating special tokens into the tokenizer is vital. After obtaining the 7B/13B model from Huggingface, also procure the corresponding tokenizer model. Referring to the 7B-chat model: +For NeVA training, it is vital that you integrate special tokens into the tokenizer. After obtaining the 7B/13B model from Hugging Face, you need to procure the corresponding tokenizer model. Referring to the 7B-chat model: 1. Download the `tokenizer.model `_ to: @@ -84,7 +84,7 @@ For NeVA training, integrating special tokens into the tokenizer is vital. After /path/to/neva/tokenizers/tokenizer.model -2. Executing the next script necessitates the NeMo dependency. It's more convenient to run the script within the NeMo container. +2. Step 3 requires NeMo Framework to be installed. For quick setup, we recommend running it within the NeMo Framework container. 3. Employ the command below to infuse special tokens into the tokenizer: diff --git a/docs/source/multimodal/speech_llm/configs.rst b/docs/source/multimodal/speech_llm/configs.rst index 5edd169eed25..b48a99612049 100644 --- a/docs/source/multimodal/speech_llm/configs.rst +++ b/docs/source/multimodal/speech_llm/configs.rst @@ -1,9 +1,9 @@ Common Configuration Files ========================== -This section provides a detailed overview of the NeMo configuration file setup specific to models within the NeMo SpeechLLM collection. For foundational knowledge about setting up and executing experiments common to all NeMo models, such as the Experiment Manager and PyTorch Lightning trainer parameters, refer to the :doc:`core <../../core/core>` documentation. +This section provides a detailed overview of the NeMo Framework configuration file setup, specifically for models within the NeMo Speech-augmented Large Language Models (SpeechLLM) collection. For foundational knowledge on setting up and executing experiments common to all NeMo Framework models, including the Experiment Manager and PyTorch Lightning trainer parameters, refer to the :doc:`core <../../core/core>` documentation. -Within the configuration files of the NeMo SpeechLLMs, details concerning dataset(s), augmentation, optimization parameters, and model architectural specifications are central. This page explores each of these aspects. +The configuration files for NeMo SpeechLLMs focus on key details such as datasets, augmentation, optimization parameters, and model architectural specifications. This page explores each of these aspects. Discover exemplary configuration files for all SpeechLLMs in the `config directory of the examples `_. @@ -11,9 +11,9 @@ Discover exemplary configuration files for all SpeechLLMs in the `config directo Dataset Configuration --------------------- -The dataset configuration is based on the NeMo ASR data configuration and the NLP data configuration +The dataset configuration is based on the NeMo ASR data configuration and the NLP data configuration. -The configuration file allows setting any initialization parameter accepted by the Dataset class used in the experiment. For a comprehensive list of Datasets and their parameters, visit the `Datasets <./api.html#Datasets>`__ section of the API. 
+The configuration file enables you to set any initialization parameter accepted by the Dataset class used in the experiment. For a comprehensive list of datasets and their parameters, refer to the Datasets section of the :doc:`API <./api>`. A typical training configuration is as follows: @@ -55,9 +55,9 @@ A typical training configuration is as follows: audio_locator: null -Key parameters include: +The key configuration parameters include: -- ``manifest_filepath``: The path to the dataset in JSON lines format, where each line in the file is a python dictionary. This can either be a single file or a list of files. +- ``manifest_filepath``: The path to the dataset in JSON lines format, where each line in the file is a Python dictionary. This can either be a single file or a list of files. - ``global_batch_size``: The global batch size that takes consideration of gradient accumulation, data parallelism. - ``micro_batch_size``: The micro batch size that fits on each GPU. - ``shuffle``: Whether to shuffle the dataset. @@ -115,7 +115,7 @@ For a detailed list of arguments, refer to the `Pytorch Lightning Trainer `. Model Configurations -------------------- Each configuration file should detail the model architecture used for the experiment. -The parameters commonly shared across most multimodal language models include: +The following table shows the parameters commonly shared across most multimodal language models. +------------------------------------------+--------------+---------------------------------------------------------------------------------------+ | **Parameter** | **Datatype** | **Description** | @@ -185,13 +187,13 @@ The parameters commonly shared across most multimodal language models include: | :code:`seed` | int | seed used in training | +------------------------------------------+--------------+---------------------------------------------------------------------------------------+ -SALM -~~~~ +Speech-Augmented Language Model (SALM) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For model-specific configurations, refer to `the examples `_. +For information about SALM model-specific configurations, refer to `the examples `__. -BESTOW -~~~~~~ +BESt features from TwO Worlds (BESTOW) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For model-specific configurations, refer to `the examples `_. +For information about BESTOW model-specific configurations, refer to `the examples `__. diff --git a/docs/source/multimodal/speech_llm/intro.rst b/docs/source/multimodal/speech_llm/intro.rst index 55ea13d7d411..1f73ed9ed249 100644 --- a/docs/source/multimodal/speech_llm/intro.rst +++ b/docs/source/multimodal/speech_llm/intro.rst @@ -1,41 +1,42 @@ Speech-agumented Large Language Models (SpeechLLM) ================================================== -The endeavor to extend Language Models (LLMs) with the ability to understand speech and audio inputs, detailed examples can be found in the `SpeechLLM example `_.. +SpeechLLM is a multi-modal Large Language Model (LLM) designed to understand and process speech and audio inputs. Detailed information can be found in the `SpeechLLM examples README `_. .. toctree:: :maxdepth: 1 + datasets configs api -In general, there're three main components of a modular SpeechLLM: +In general, there are three main components of a modular SpeechLLM: + - An audio encoder that processes the input audio and produces a sequence of audio embeddings. 
-- A modality adapter that processes the audio embeddings and produces a sequence of embeddings in the same latent space as the token embeddings of a pretrained large language model (LLM). -- A pretrained large language model (LLM) that processes embeddings from the modality adapter as well as token embeddings of input prompt, and produces the text output. The audio embeddings and text token embeddings are concatenated in time dimension before going into the LLM. +- A modality adapter that processes the audio embeddings and produces a sequence of embeddings in the same latent space as the token embeddings of a pretrained LLM. +- A pretrained LLM that processes embeddings from the modality adapter and token embeddings from the input prompt, then produces the text output. The audio embeddings and text token embeddings are concatenated in time dimension before going into the LLM. - The LLM produces text outputs based on the concatenated input audio and text embedding. Model Architecture ^^^^^^^^^^^^^^^^^^ -One way to incorporate speech into LLM is to concatenate speech features with the token embeddings of the input text prompt before being fed into the LLM. In this way, the LLM can have direct access to the speech information when generating the output text. - .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/salm.png - :align: center - :alt: SALM model - :scale: 50% - +One way to incorporate speech into an LLM is to concatenate speech features with the token embeddings of the input text prompt before feeding them into the LLM. In this way, the LLM can have direct access to the speech information when generating the output text. The `Speech-Augmented Language Model `__ (SALM) follows this approach. - -Another way is to use cross-attention mechanism, by using text embeddings to attend to speech embeddings to extract task-specific information from the speech embeddings. In order to minimize the computational cost of cross-attention, we add a cross-attention module only before the LLM. - - .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/bestow.png - :align: center - :alt: BESTOW model - :scale: 50% +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/salm.png + :align: center + :alt: SALM model + :scale: 50% +Another approach is to use a cross-attention mechanism, where text embeddings attend to speech embeddings to extract task-specific information. To minimize the computational cost, we add a cross-attention module only before the LLM. The `BESt features from TwO Worlds `__ (BESTOW) model follows this approach. + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/bestow.png + :align: center + :alt: BESTOW model + :scale: 50% +NeMo Framework contains `example scripts `__ for training and evaluating for both SALM and BESTOW models. A pre-trained `checkpoint `__ for SALM is also available. diff --git a/docs/source/starthere/fundamentals.rst b/docs/source/starthere/fundamentals.rst index 6413cb9d376a..e3014e0f5a03 100644 --- a/docs/source/starthere/fundamentals.rst +++ b/docs/source/starthere/fundamentals.rst @@ -190,10 +190,10 @@ NeMo Inference Scripts The examples scripts directory also contains many inference scripts such as `transcribe_speech.py `_. These inference scripts typically differ in structure from training scripts, as they include additional utilities for file I/O (reading and saving files). While inference scripts still use configurations (configs), they don’t require the ``trainer`` and ``model`` sections. 
Additionally, the default configs for inference scripts are usually specified as dataclasses rather than separate files. You can also modify elements via the command line. -Specifying training data +Specifying Training Data ------------------------ -NeMo will handle creation of data loaders for you, as long as you put your data into the expected input format. You may also need to train a tokenizer before starting training. To learn more about data formats, see :doc:`LLM <../nlp/nemo_megatron/gpt/gpt_training>`, :doc:`Multimodal <../multimodal/mllm/datasets>`, :ref:`Speech AI `, and :doc:`Vision models <../vision/datasets>`. +NeMo will handle the creation of data loaders for you, as long as you put your data into the expected input format. You may also need to train a tokenizer before starting training. To learn more about data formats, see :doc:`LLM <../nlp/nemo_megatron/gpt/gpt_training>`, :doc:`Multimodal <../multimodal/mllm/datasets>`, :ref:`Speech AI `, and :doc:`Vision models <../vision/datasets>`. Model Checkpoints @@ -209,7 +209,7 @@ The NeMo team also releases pretrained models which you can browse on `NGC `_ or :doc:`tutorials <./tutorials>` +* Explore examples or tutorials: dive into NeMo by exploring our `examples `_ or :doc:`tutorials <./tutorials>` -* Domain-Specific Documentation: +* Domain-specific documentation: - * For Large Language Models (LLMs), checkout out the :doc:`LLM <../nlp/nemo_megatron/intro>` documentation. + * For Large Language Models (LLMs), checkout the :doc:`LLM <../nlp/nemo_megatron/intro>` documentation. * For Multimodal tasks, refer to the :doc:`Multimodal <../multimodal/mllm/intro>` documentation. * If you’re interested in Automatic Speech Recognition (ASR), explore the :doc:`ASR <../asr/intro>` documentation. * For Text-to-Speech (TTS), find details in the :doc:`TTS <../tts/intro>` documentation. - * Lastly, for Vision Models, consult the :doc:`Vision Models <../vision/intro>` documentation. + * For Vision Models, consult the :doc:`Vision Models <../vision/intro>` documentation. * `NeMo Primer `__: This tutorial provides a hands-on introduction to NeMo, PyTorch Lightning, and OmegaConf. It covers how to use, modify, save, and restore NeMo models. * `NeMo Models `__: In this tutorial, you'll learn the fundamentals of creating NeMo models. -* NeMo Core Documentation: Explore the :doc:`NeMo Core <../core/core>` documentation for NeMo, which explains the inner workings of the framework. +* NeMo Core Documentation: Explore the :doc:`NeMo Core <../core/core>` documentation for NeMo, which explains the inner workings of NeMo Framework. diff --git a/examples/multimodal/speech_llm/README.md b/examples/multimodal/speech_llm/README.md index b6a9c7486331..3d7e37d05828 100644 --- a/examples/multimodal/speech_llm/README.md +++ b/examples/multimodal/speech_llm/README.md @@ -1,9 +1,6 @@ # Modular SpeechLLM -This directory contains example scripts to train and evaluate modular SpeechLLM (e.g, SALM[1], etc). - -## Requirements -You will need to install this specific branch of NeMo, or use the provided Dockerfile in the root directory of this repository to build a Docker image with all the necessary dependencies. +This directory contains example scripts to train and evaluate modular SpeechLLM (e.g, SALM[1], BESTOW[2] etc). ## Architecture @@ -186,4 +183,6 @@ If you have a local `.nemo` file, you can use `model.restore_from_path=/path/to/ ## Reference -[1] Chen, Z.\*, Huang, H.\*, Andrusenko, A., Hrinchuk, O., Puvvada, K.C., Li, J., Ghosh, S., Balam, J. 
and Ginsburg, B., 2023. SALM: Speech-augmented Language Model with In-context Learning for Speech Recognition and Translation. ICASSP'24. \ No newline at end of file +[1] Chen, Z.\*, Huang, H.\*, Andrusenko, A., Hrinchuk, O., Puvvada, K.C., Li, J., Ghosh, S., Balam, J. and Ginsburg, B., 2023. SALM: Speech-augmented Language Model with In-context Learning for Speech Recognition and Translation. ICASSP'24. + +[2] Chen, Z., Huang, H., Hrinchuk, O., Puvvada, K.C., Koluguri, N.R., .Zelasko, P., Balam, J., & Ginsburg, B. ,2024. BESTOW: Efficient and Streamable Speech Language Model with the Best of Two Worlds in GPT and T5. ArXiv, abs/2406.19954. \ No newline at end of file diff --git a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py index 89dcc61655e8..f7c6b6adff7f 100644 --- a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py +++ b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Callable +from typing import Callable, Union import torch.utils.data from lhotse import CutSet @@ -100,7 +100,7 @@ def __getitem__(self, cuts: CutSet) -> PromptedAudioToTextMiniBatch: prompted_transcript_lens=prompts_with_answers_lens, ) - def _collate_tokens(self, tokens: list[list[int] | torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: + def _collate_tokens(self, tokens: list[Union[list[int], torch.Tensor]]) -> tuple[torch.Tensor, torch.Tensor]: tokens = [torch.as_tensor(t) for t in tokens] token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) tokens = collate_vectors(tokens, padding_value=self.padding_value) diff --git a/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py b/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py index de2d63cd99de..e181772b7f18 100644 --- a/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py @@ -14,7 +14,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import List, Optional +from typing import List, Optional, Union import torch @@ -212,7 +212,7 @@ def forward( return (packed_result,) - def format_hypotheses(self, packed_result: List[Hypothesis], decoder_input_ids: torch.Tensor | None) -> None: + def format_hypotheses(self, packed_result: List[Hypothesis], decoder_input_ids: Union[torch.Tensor, None]) -> None: """ For each hypothesis in the mini-batch: * Remove the decoder input ids (prompt) from the predictions diff --git a/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py b/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py index 891d003bd001..b38c02574d5b 100644 --- a/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py @@ -14,7 +14,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import List, Optional +from typing import List, Optional, Union import torch @@ -205,7 +205,7 @@ def forward( return (packed_result,) - def format_hypotheses(self, packed_result: List[Hypothesis], decoder_input_ids: torch.Tensor | None) -> None: + def format_hypotheses(self, packed_result: List[Hypothesis], decoder_input_ids: Union[torch.Tensor, None]) -> None: """ For each hypothesis in the mini-batch: * Remove the 
decoder input ids (prompt) from the predictions diff --git a/nemo/collections/multimodal/speech_llm/data/audio_text_dataset.py b/nemo/collections/multimodal/speech_llm/data/audio_text_dataset.py index a433a5a6badf..ee194b74f993 100644 --- a/nemo/collections/multimodal/speech_llm/data/audio_text_dataset.py +++ b/nemo/collections/multimodal/speech_llm/data/audio_text_dataset.py @@ -198,21 +198,26 @@ class AudioTextDataset(TextProcessing, Dataset): """ Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds). Each new line is a different sample. Example below: - {"audio_filepath": "1.wav", "duration": 1.12, "question": "what is the capital of France?", "answer": "Paris"} - {"audio_filepath": "2.wav", "duration": 2.15, "question": "what is the capital of Italy?", "answer": "Rome"} + + .. code-block:: json + + {"audio_filepath": "1.wav", "duration": 1.12, "question": "what is the capital of France?", "answer": "Paris"} + {"audio_filepath": "2.wav", "duration": 2.15, "question": "what is the capital of Italy?", "answer": "Rome"} + Args: manifest_filepath: Path to manifest json as described above. Can be comma-separated paths. tokenizer: text tokenizer object sample_rate (int): Sample rate to resample loaded audio to int_values (bool): If true, load samples as 32-bit integers. Defauts to False. - augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor object used to augment loaded - audio + augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor object used to augment loaded audio max_duration: If audio exceeds this length, do not include in dataset min_duration: If audio is less than this length, do not include in dataset max_utts: Limit number of utterances trim: whether or not to trim silence. Defaults to False channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - --------- NLP SPECIFIC ARGS ------------- + + :note: below args are NLP-specific + max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. add_bos (bool): Whether to add a beginning of sentence token to each data example @@ -228,9 +233,16 @@ class AudioTextDataset(TextProcessing, Dataset): answer_only_loss: If True, will compute the loss only on the answer part of the input. If False, will compute the loss on the entire input. truncation_field: Field to use for truncation. (Options: "answer", "context"). Field to be used for truncation if the combined length exceeds the max sequence length. pad_to_max_length: Whether to pad the input to the max sequence length. If False, will pad to the max length of the current batch. - prompt_template: Prompt template to inject via an fstring. Formatted like Q: {input}\n\nA: {output} + prompt_template: Prompt template to inject via an fstring. Formatted like: + + .. code-block:: text + + Q: {input}\\n\\nA: {output} + end_string: Optional[str] = None, if not None, add this string to the end of the answer. 
- --------------- additional args for misc purposes ---------------- + + :note: below args are for miscellaneous purposes + context_file: Optional[Union[List[str], str]] = None, if provided, will use this file to load random questions from, if question is not in manifest. sample_alpha: Optional[float] = None, for SPE subword sampling audio_locator: Optional[str] = None, a special string to split the context into multiple audio segments. @@ -583,26 +595,30 @@ class TarredAudioTextDataset(TextProcessing, IterableDataset): pad_id (id): Token used to pad when collating samples in batches. If this is None, pads using 0s. Defaults to None. - shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp. - - `scatter`: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - `replicate`: Optional shard strategy, where each node gets all of the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tarfiles, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. + shard_strategy (str): Tarred dataset shard distribution strategy chosen as a + str value during ddp. + + - `scatter`: The default shard strategy applied by WebDataset, where each node gets + a unique set of shards, which are permanently pre-allocated and never changed at runtime. + - `replicate`: Optional shard strategy, where each node gets all of the set of shards + available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. + The benefit of replication is that it allows each node to sample data points from the entire + dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. + + :warning: Replicated strategy allows every node to sample the entire set of available tarfiles, + and therefore more than one node may sample the same tarfile, and even sample the same + data points! As such, there is no assured guarantee that all samples in the dataset will be + sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific + occasions (when the number of shards is not divisible with ``world_size``), will not sample + the entire dataset. For these reasons it is not advisable to use tarred datasets as validation + or test datasets. + shard_manifests (bool): Whether or not to try / shard manifests. Defaults to False. global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. - --------- NLP SPECIFIC ARGS ------------- + + :note: Below args are NLP-specific + max_seq_length (int): maximum sequence length for each dataset examples. 
Examples will either be truncated to fit this length or dropped if they cannot be truncated. min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. add_bos (bool): Whether to add a beginning of sentence token to each data example @@ -617,11 +633,19 @@ class TarredAudioTextDataset(TextProcessing, IterableDataset): answer_only_loss: If True, will compute the loss only on the answer part of the input. If False, will compute the loss on the entire input. truncation_field: Field to use for truncation. (Options: "answer", "context"). Field to be used for truncation if the combined length exceeds the max sequence length. pad_to_max_length: Whether to pad the input to the max sequence length. If False, will pad to the max length of the current batch. - prompt_template: Prompt template to inject via an fstring. Formatted like Q: {input}\n\nA: {output} + prompt_template: Prompt template to inject via an fstring. Formatted like: + + .. code-block:: text + + Q: {input}\\n\\nA: {output} + end_string: Optional[str] = None, if not None, add this string to the end of the answer. - --------------- additional args for misc purposes ---------------- + + :note: Below args are for miscellaneous purposes + context_file: Optional[Union[List[str], str]] = None, if provided, will use this file to load random questions from, if question is not in manifest. sample_alpha: Optional[float] = None, for SPE subword sampling + """ def __init__( diff --git a/nemo/collections/multimodal/speech_llm/data/lhotse_dataset.py b/nemo/collections/multimodal/speech_llm/data/lhotse_dataset.py index d3e70343d507..204a92e5b7ab 100644 --- a/nemo/collections/multimodal/speech_llm/data/lhotse_dataset.py +++ b/nemo/collections/multimodal/speech_llm/data/lhotse_dataset.py @@ -1,3 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + import torch.utils.data from lhotse.dataset import AudioSamples from lhotse.dataset.collation import collate_vectors as collate_vectors_lhotse @@ -63,7 +79,7 @@ def __init__( self.context_key = context_key self.default_context_key = default_context_key - def __getitem__(self, cuts) -> dict[str, torch.Tensor | list[str] | dict]: + def __getitem__(self, cuts) -> dict[str, Union[torch.Tensor, list[str], dict]]: cuts = cuts.sort_by_duration() audio, audio_lens, cuts = self.load_audio(cuts) diff --git a/nemo/collections/multimodal/speech_llm/modules/perception_modules.py b/nemo/collections/multimodal/speech_llm/modules/perception_modules.py index 021ac1ff3dad..20c478825946 100644 --- a/nemo/collections/multimodal/speech_llm/modules/perception_modules.py +++ b/nemo/collections/multimodal/speech_llm/modules/perception_modules.py @@ -200,54 +200,56 @@ class MultiAudioPerceptionModule(NeuralModule, Exportable): """ Audio perception module that consists of multiple audio encoders and shared modality adapter. This module is experimental. 
An example perception cfg is: - ------------------- - perception: - modality_adapter: - _target_: nemo.collections.multimodal.speechllm.modules.PoolingMLPConnectors - hidden_dim: 512 - pooling: 'cat' - pooling_factor: 2 - num_layers: 4 - input_dim: -1 - output_dim: -1 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoders: - asr_model: - _target_: nemo.collections.asr.models.ASRModel - output_key: d_model - freeze: True - pretrained_model: stt_en_fastconformer_transducer_large - ssl_model: - _target_: nemo.collections.asr.models.SpeechEncDecSelfSupervisedModel - output_key: d_model - freeze: True - pretrained_model: ssl_en_conformer_large - use_multi_layer_feat: True - multi_layer_feat: - layer_idx_list: [0,16] + + .. code-block:: yaml + + perception: + modality_adapter: + _target_: nemo.collections.multimodal.speechllm.modules.PoolingMLPConnectors + hidden_dim: 512 + pooling: 'cat' + pooling_factor: 2 + num_layers: 4 + input_dim: -1 + output_dim: -1 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoders: + asr_model: + _target_: nemo.collections.asr.models.ASRModel + output_key: d_model + freeze: True + pretrained_model: stt_en_fastconformer_transducer_large + ssl_model: + _target_: nemo.collections.asr.models.SpeechEncDecSelfSupervisedModel + output_key: d_model + freeze: True + pretrained_model: ssl_en_conformer_large + use_multi_layer_feat: True + multi_layer_feat: + layer_idx_list: [0,16] + aggregator: + mode: "cat" + pooling: "avg" + rounding: "floor" + + speaker_model: + segment_length_in_secs: 0.4 + freeze: True + pretrained_model: titanet_large + + ref_model: asr_model aggregator: mode: "cat" - pooling: "avg" + pooling: "mean" rounding: "floor" - speaker_model: - segment_length_in_secs: 0.4 - freeze: True - pretrained_model: titanet_large - - ref_model: asr_model - aggregator: - mode: "cat" - pooling: "mean" - rounding: "floor" - ------------------- """ def __init__(self, cfg: DictConfig): @@ -441,9 +443,10 @@ def lens_to_mask(lens, max_length): class TransformerCrossAttention(NeuralModule, Exportable): """Transformer module for cross-attention between speech and text embeddings. The module allows optional projection from the input embeddings to a lower dimension before feeding them to the transformer. 
+ Args: cfg: DictConfig, configuration object for the module which should include: - xattn: DictConfig, configuration object for the transformer decoder + xattn: DictConfig, configuration object for the transformer decoder """ def __init__(self, cfg: DictConfig, *args, **kwargs): From 397ca258fa98dfbde7e9d17aa076707e1ebd78ce Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 19 Aug 2024 10:40:09 -0700 Subject: [PATCH 019/664] Add Mistral-NeMo 2407 configs (#10184) * Add Mistral-NeMo 2407 configs Signed-off-by: Alexandros Koumparoulis * Make mistral importer support models without window size Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/collections/llm/gpt/model/mistral.py | 41 +++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index b8eed78141c9..61a96917537c 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -44,6 +44,40 @@ class MistralConfig7B(GPTConfig): window_size: List[int] = field(default_factory=lambda: [4096, 0]) +@dataclass +class MistralNeMo2407Config12B(MistralConfig7B): + """ + https://mistral.ai/news/mistral-nemo/ + """ + + num_layers: int = 40 + hidden_size: int = 5120 + kv_channels: int = 128 + seq_length: int = 4096 # but "max_position_embeddings": 1024000, + + window_size: List[int] = None + rotary_percent: float = 1.0 + rotary_base: float = 1000000.0 + + +@dataclass +class MistralNeMo2407Config123B(MistralConfig7B): + """ + https://mistral.ai/news/mistral-large-2407/ + """ + + num_layers: int = 88 + hidden_size: int = 12288 + ffn_hidden_size: int = 28672 + num_attention_heads: int = 96 + kv_channels: int = 128 + seq_length: int = 4096 # but "max_position_embeddings": 131072, + + window_size: List[int] = None + rotary_percent: float = 1.0 + rotary_base: float = 1000000.0 + + class MistralModel(GPTModel): def __init__( self, @@ -109,6 +143,9 @@ def make_vocab_size_divisible_by(mistral_vocab_size): base //= 2 return base + window_size = None + if getattr(source, 'sliding_window', None) is not None: + window_size = [source.sliding_window, 0] output = MistralConfig7B( seq_length=source.sliding_window, num_layers=source.num_hidden_layers, @@ -123,7 +160,7 @@ def make_vocab_size_divisible_by(mistral_vocab_size): rotary_base=source.rope_theta, gated_linear_unit=True, make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), - window_size=[source.sliding_window, 0], + window_size=window_size, share_embeddings_and_output_weights=False, ) @@ -176,7 +213,7 @@ def config(self) -> "MistralConfig": from transformers import MistralConfig as HfMistralConfig return HfMistralConfig( - sliding_window=source.window_size[0], + sliding_window=source.window_size[0] if source.window_size is not None else None, num_hidden_layers=source.num_layers, hidden_size=source.hidden_size, intermediate_size=source.ffn_hidden_size, From 05c5124862b0d035bc2fff01cbf6dfc4254572ab Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Mon, 19 Aug 2024 13:51:43 -0700 Subject: [PATCH 020/664] [NeMo UX] Disable softmax fusion for non-te/non-apex test (#10205) * disable softmax fusion for non-te/non-apex test Signed-off-by: ashors1 * add dest Signed-off-by: ashors1 * 
Apply isort and black reformatting Signed-off-by: ashors1 --------- Signed-off-by: ashors1 Signed-off-by: ashors1 Co-authored-by: ashors1 --- .github/workflows/cicd-main.yml | 6 ++++-- examples/llm/megatron_gpt_pretraining.py | 7 +++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a4d65c5a4dc0..797b7888b01e 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4722,7 +4722,8 @@ jobs: --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --index-mapping-dir=examples/llm/gpt_index_mappings + --index-mapping-dir=examples/llm/gpt_index_mappings \ + --no-masked-softmax-fusion python examples/llm/megatron_gpt_pretraining.py \ --devices=2 \ @@ -4731,7 +4732,8 @@ jobs: --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --index-mapping-dir=examples/llm/gpt_index_mappings + --index-mapping-dir=examples/llm/gpt_index_mappings \ + --no-masked-softmax-fusion AFTER_SCRIPT: | rm -rf examples/llm/gpt_pretrain_results rm -rf examples/llm/gpt_index_mappings diff --git a/examples/llm/megatron_gpt_pretraining.py b/examples/llm/megatron_gpt_pretraining.py index 73e96a23bf81..cfdb6a6acb4b 100644 --- a/examples/llm/megatron_gpt_pretraining.py +++ b/examples/llm/megatron_gpt_pretraining.py @@ -25,6 +25,12 @@ def get_args(): parser.add_argument('--vocab-path', type=str, help="Path to vocab file") parser.add_argument('--merges-path', type=str, help="Path to merges file") parser.add_argument('--index-mapping-dir', type=str, help="directory to write index mappings to") + parser.add_argument( + '--no-masked-softmax-fusion', + action='store_false', + help='Disable fusion of softmax.', + dest='masked_softmax_fusion', + ) return parser.parse_args() @@ -59,6 +65,7 @@ def get_args(): attention_dropout=0.1, layernorm_epsilon=1e-5, make_vocab_size_divisible_by=128, + masked_softmax_fusion=args.masked_softmax_fusion, ) model = llm.GPTModel(gpt_config, tokenizer=data.tokenizer) strategy = nl.MegatronStrategy() From 54458fa9c1c913b2b0ea80f072b32d011c063e67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Mon, 19 Aug 2024 17:47:50 -0400 Subject: [PATCH 021/664] Enable specifying kwargs to DDPStrategy via config for most models (#10046) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Enable specifying kwargs to DDPStrategy via config for most models Signed-off-by: Piotr Żelasko * fix Signed-off-by: Piotr Żelasko * Add test and fix Signed-off-by: Piotr Żelasko * Revert accidental change Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko --- examples/asr/asr_ctc/speech_to_text_ctc.py | 3 ++- .../asr/asr_ctc/speech_to_text_ctc_bpe.py | 3 ++- .../speech_to_text_hybrid_rnnt_ctc_bpe.py | 3 ++- .../speech_to_text_hybrid_rnnt_ctc_char.py | 3 ++- .../asr/asr_transducer/speech_to_text_rnnt.py | 3 ++- .../asr_transducer/speech_to_text_rnnt_bpe.py | 3 ++- .../speech_to_text_bpe_with_text.py | 3 ++- .../speech_to_text_bpe_with_text_finetune.py | 3 ++- .../speech_multitask/fast-conformer_aed.yaml | 4 +++- .../speech_multitask/speech_to_text_aed.py | 3 ++- 
.../speech_pretraining/speech_pre_training.py | 4 ++-- examples/asr/speech_to_text_finetune.py | 3 ++- .../speech_to_text_transformer.py | 3 ++- nemo/utils/trainer_utils.py | 20 +++++++++++++++++++ tests/utils/test_trainer_utils.py | 20 +++++++++++++++++++ 15 files changed, 67 insertions(+), 14 deletions(-) create mode 100644 nemo/utils/trainer_utils.py create mode 100644 tests/utils/test_trainer_utils.py diff --git a/examples/asr/asr_ctc/speech_to_text_ctc.py b/examples/asr/asr_ctc/speech_to_text_ctc.py index a39a0eab078a..87b1b11633f7 100644 --- a/examples/asr/asr_ctc/speech_to_text_ctc.py +++ b/examples/asr/asr_ctc/speech_to_text_ctc.py @@ -75,13 +75,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="../conf", config_name="config") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py b/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py index 5f36f3b0382f..b4e3be5f650a 100644 --- a/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py +++ b/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py @@ -71,13 +71,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="../conf/citrinet/", config_name="config_bpe") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py b/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py index 2de150c71328..796005a8fcee 100644 --- a/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py +++ b/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py @@ -65,6 +65,7 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner( @@ -73,7 +74,7 @@ def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecHybridRNNTCTCBPEModel(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_char.py b/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_char.py index 532e2c9ed0be..423e005d8f02 100644 --- a/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_char.py +++ b/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_char.py @@ -76,13 +76,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="../conf/conformer/hybrid_transducer_ctc/", 
config_name="conformer_hybrid_transducer_ctc") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecHybridRNNTCTCModel(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/asr_transducer/speech_to_text_rnnt.py b/examples/asr/asr_transducer/speech_to_text_rnnt.py index bc75a0189dd0..5b4f1e8a985d 100644 --- a/examples/asr/asr_transducer/speech_to_text_rnnt.py +++ b/examples/asr/asr_transducer/speech_to_text_rnnt.py @@ -74,13 +74,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="experimental/contextnet_rnnt", config_name="config_rnnt") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecRNNTModel(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py b/examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py index 339f65aa1eb6..1fffea55686f 100644 --- a/examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py +++ b/examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py @@ -66,13 +66,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="experimental/contextnet_rnnt", config_name="config_rnnt_bpe") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecRNNTBPEModel(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py index 946202364c53..b435d418fda2 100644 --- a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py +++ b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py @@ -56,6 +56,7 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="examples/asr/conf/conformer", config_name="conformer_transducer_bpe") @@ -67,7 +68,7 @@ def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') OmegaConf.resolve(cfg) - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = ASRWithTTSModel.from_asr_config( diff --git a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py index 5ded1ff3dfa3..99bc41ba966b 100644 --- a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py +++ b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py @@ -52,6 +52,7 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="examples/asr/asr_tts", config_name="hybrid_asr_tts") @@ -59,7 +60,7 @@ def 
main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') OmegaConf.resolve(cfg) - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = ASRWithTTSModel(cfg.model, trainer=trainer) diff --git a/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml b/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml index 4c6aa643c19d..3d1a8c8bdf47 100644 --- a/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml +++ b/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml @@ -248,7 +248,9 @@ trainer: max_steps: 100000 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 precision: bf16-mixed # Should be set to bf16-mixed/16-mixed for O1 and O2 to enable the AMP. diff --git a/examples/asr/speech_multitask/speech_to_text_aed.py b/examples/asr/speech_multitask/speech_to_text_aed.py index b0e5333249f4..0c13e5289d86 100644 --- a/examples/asr/speech_multitask/speech_to_text_aed.py +++ b/examples/asr/speech_multitask/speech_to_text_aed.py @@ -57,13 +57,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging, model_utils from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="../conf/speech_multitask/", config_name="fast-conformer_aed") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) # Check for spl tokens to create spl_tokenizer. 
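
[Editorial note, not part of the patch] A minimal usage sketch of the resolve_trainer_cfg helper this change introduces (the implementation and unit test appear later in this patch); the devices/accelerator keys are illustrative only, and the mapping form of "strategy" is exactly what the fast-conformer_aed.yaml hunk above switches to.

from omegaconf import OmegaConf
from pytorch_lightning import Trainer

from nemo.utils.trainer_utils import resolve_trainer_cfg

# A plain string strategy passes through unchanged.
cfg = OmegaConf.create({"devices": 2, "accelerator": "gpu", "strategy": "ddp"})
trainer = Trainer(**resolve_trainer_cfg(cfg))

# A mapping with a _target_ key is instantiated via hydra.utils.instantiate,
# so DDPStrategy kwargs (e.g. gradient_as_bucket_view) can be set from the config.
cfg = OmegaConf.create(
    {
        "devices": 2,
        "accelerator": "gpu",
        "strategy": {
            "_target_": "pytorch_lightning.strategies.DDPStrategy",
            "gradient_as_bucket_view": True,
        },
    }
)
trainer = Trainer(**resolve_trainer_cfg(cfg))
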
diff --git a/examples/asr/speech_pretraining/speech_pre_training.py b/examples/asr/speech_pretraining/speech_pre_training.py index a7200a19a92b..cec9444096c3 100644 --- a/examples/asr/speech_pretraining/speech_pre_training.py +++ b/examples/asr/speech_pretraining/speech_pre_training.py @@ -20,7 +20,7 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager - +from nemo.utils.trainer_utils import resolve_trainer_cfg """ # Example of unsupervised pre-training of a model @@ -54,7 +54,7 @@ def main(cfg): logging.info(f"Hydra config: {OmegaConf.to_yaml(cfg)}") - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = SpeechEncDecSelfSupervisedModel(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/speech_to_text_finetune.py b/examples/asr/speech_to_text_finetune.py index ee043c0bd131..148b11d8b70f 100644 --- a/examples/asr/speech_to_text_finetune.py +++ b/examples/asr/speech_to_text_finetune.py @@ -62,6 +62,7 @@ from nemo.utils import logging, model_utils from nemo.utils.exp_manager import exp_manager from nemo.utils.get_rank import is_global_rank_zero +from nemo.utils.trainer_utils import resolve_trainer_cfg def get_base_model(trainer, cfg): @@ -193,7 +194,7 @@ def setup_dataloaders(asr_model, cfg): def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) if hasattr(cfg, 'init_from_ptl_ckpt') and cfg.init_from_ptl_ckpt is not None: diff --git a/examples/asr/speech_translation/speech_to_text_transformer.py b/examples/asr/speech_translation/speech_to_text_transformer.py index 56b600e0b4e0..ac4dc4334164 100644 --- a/examples/asr/speech_translation/speech_to_text_transformer.py +++ b/examples/asr/speech_translation/speech_to_text_transformer.py @@ -47,13 +47,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="../conf/speech_translation/", config_name="fast-conformer_transformer") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecTransfModelBPE(cfg=cfg.model, trainer=trainer) diff --git a/nemo/utils/trainer_utils.py b/nemo/utils/trainer_utils.py new file mode 100644 index 000000000000..790ccb819069 --- /dev/null +++ b/nemo/utils/trainer_utils.py @@ -0,0 +1,20 @@ +from typing import Mapping + +_HAS_HYDRA = True + +try: + import hydra + from omegaconf import DictConfig, OmegaConf +except ModuleNotFoundError: + DictConfig = Mapping + OmegaConf = None + _HAS_HYDRA = False + + +def resolve_trainer_cfg(trainer_cfg: DictConfig) -> DictConfig: + trainer_cfg = OmegaConf.to_container(trainer_cfg, resolve=True) + if not _HAS_HYDRA: + return trainer_cfg + if (strategy := trainer_cfg.get("strategy", None)) is not None and isinstance(strategy, Mapping): + trainer_cfg["strategy"] = hydra.utils.instantiate(strategy) + return trainer_cfg diff --git a/tests/utils/test_trainer_utils.py b/tests/utils/test_trainer_utils.py new file mode 100644 index 000000000000..ed13b0c4ac38 --- /dev/null +++ b/tests/utils/test_trainer_utils.py @@ -0,0 +1,20 @@ 
+from omegaconf import OmegaConf +from pytorch_lightning.strategies import DDPStrategy + +from nemo.utils.trainer_utils import resolve_trainer_cfg + + +def test_resolve_trainer_cfg_strategy(): + cfg = OmegaConf.create({"strategy": "ddp"}) + ans = resolve_trainer_cfg(cfg) + assert isinstance(ans, dict) + assert ans["strategy"] == "ddp" + + cfg = OmegaConf.create( + {"strategy": {"_target_": "pytorch_lightning.strategies.DDPStrategy", "gradient_as_bucket_view": True}} + ) + ans = resolve_trainer_cfg(cfg) + assert isinstance(ans, dict) + assert isinstance(ans["strategy"], DDPStrategy) + assert "gradient_as_bucket_view" in ans["strategy"]._ddp_kwargs + assert ans["strategy"]._ddp_kwargs["gradient_as_bucket_view"] == True From d4f02b55325ff6f7d5e9d49ec5b07d66af10e6f7 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Tue, 20 Aug 2024 07:57:34 +0200 Subject: [PATCH 022/664] Set parallelism correctly when using VPP (#10204) --- nemo/lightning/_strategy_lib.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index 9dd59939fa8a..9b4aaa8d0330 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -140,6 +140,8 @@ def set_model_parallel_attributes(model, parallelism): if not hasattr(config, attr_name): continue setattr(config, attr_name, getattr(parallelism, attr_name)) + if hasattr(config, "__io__"): + setattr(config.__io__, attr_name, getattr(parallelism, attr_name)) return config From 60442c2f5341309726be6844d2f529983bd7eb63 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:57:09 -0700 Subject: [PATCH 023/664] Akoumparouli/nemo ux precision plugin refactor (#10129) * rename mixed_precision.py to precision.py Signed-off-by: Alexandros Koumparoulis * replace print with logging.warning Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * also patch ddp_config Signed-off-by: Alexandros Koumparoulis * Rename patch_dtype_config to update_config_with_dtype_overrides Signed-off-by: Alexandros Koumparoulis * Add GradScaler's args to constructor's arg list Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * fix import Signed-off-by: Alexandros Koumparoulis * Leverage mcore's fp16 grad scaler Signed-off-by: Alexandros Koumparoulis * remove unused param Signed-off-by: Alexandros Koumparoulis * Add precision plugin test Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Also update __io__ configs Signed-off-by: Alexandros Koumparoulis * remove unused imports Signed-off-by: Alexandros Koumparoulis * fix fabric to ptl converter mcore precision plugin Signed-off-by: Alexandros Koumparoulis * fix test Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/lightning/fabric/plugins.py | 2 - .../pytorch/plugins/mixed_precision.py | 159 ++++++++++++------ nemo/lightning/pytorch/strategies.py | 10 ++ tests/lightning/test_precision_plugin.py | 95 +++++++++++ 4 files changed, 212 insertions(+), 54 deletions(-) create mode 100644 tests/lightning/test_precision_plugin.py diff --git a/nemo/lightning/fabric/plugins.py b/nemo/lightning/fabric/plugins.py index 79e1455cb33f..dba103abf2a4 100644 --- a/nemo/lightning/fabric/plugins.py +++ b/nemo/lightning/fabric/plugins.py @@ -124,6 +124,4 @@ def forward_context(self) -> 
Generator[None, None, None]: def _convert_megatron_mixed_precision(plugin: MegatronMixedPrecision) -> FabricMegatronMixedPrecision: return FabricMegatronMixedPrecision( precision=plugin.precision, - device=plugin.device, - scaler=plugin.scaler, ) diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index 65b7c6292249..79394cc4bbb1 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -13,15 +13,16 @@ # limitations under the License. from contextlib import contextmanager +from dataclasses import dataclass, fields from typing import Any, Callable, Generator, List, Literal, Tuple, TypeVar, Union import pytorch_lightning as pl import torch -from pytorch_lightning.plugins.precision import MixedPrecision +from pytorch_lightning.plugins.precision import Precision from torch.nn import Module from torch.optim import Optimizer -from nemo.lightning._strategy_lib import GradScaler +from nemo.utils import logging AnyT = TypeVar("AnyT") @@ -33,18 +34,93 @@ def get_optim_config(optimizer: Optimizer): raise ValueError("Failed to extract optimizer config from module.") -class MegatronMixedPrecision(MixedPrecision): +@dataclass +class DtypeConfig: + fp32: bool = False + fp16: bool = False + bf16: bool = False + params_dtype: torch.dtype = None + pipeline_dtype: torch.dtype = None + autocast_dtype: torch.dtype = None + autocast_enabled: bool = False + grad_reduce_in_fp32: bool = True + # fp8 related + fp8: str = None + fp8_margin: int = 0 + fp8_interval: int = 1 + fp8_amax_history_len: int = 1 + fp8_amax_compute_algo: str = "most_recent" + fp8_wgrad: bool = True + fp8_dot_product_attention: bool = False + fp8_multi_head_attention: bool = False + # FP16 Loss scaling + loss_scale: float = (None,) + initial_loss_scale: float = (None,) + min_loss_scale: float = (None,) + loss_scale_window: float = (None,) + hysteresis: float = (None,) + + +class MegatronMixedPrecision(Precision): def __init__( self, - precision: Literal["16-mixed", "bf16-mixed"], - device="cuda", + precision: Literal["16-mixed", "bf16-mixed", "32"], + params_dtype: torch.dtype = None, + pipeline_dtype: torch.dtype = None, + autocast_dtype: torch.dtype = None, + autocast_enabled: bool = False, + grad_reduce_in_fp32: bool = True, + # fp8 related, + fp8: str = None, + fp8_margin: int = 0, + fp8_interval: int = 1, + fp8_amax_history_len: int = 1, + fp8_amax_compute_algo: str = "most_recent", + fp8_wgrad: bool = True, + fp8_dot_product_attention: bool = False, + fp8_multi_head_attention: bool = False, + fp16_loss_scale: float = None, + fp16_initial_loss_scale: float = 4294967296, + fp16_min_loss_scale: float = 1.0, + fp16_loss_scale_window: int = 1000, + fp16_hysteresis: int = 2, ) -> None: - if precision == "bf16-mixed": - scaler = None - else: - scaler = GradScaler(init_scale=2**32, growth_interval=1000, hysteresis=2) - super().__init__(precision, device, scaler) + if isinstance(precision, int): + precision = str(precision) + + dtype = torch.bfloat16 if precision in ['bf16', 'bf16-mixed'] else torch.float32 + self.dtype_config = DtypeConfig( + fp32=precision in ['fp32', '32'], + fp16=precision in ['fp16', 'fp16-mixed', '16', '16-mixed'], + bf16=precision in ['bf16', 'bf16-mixed'], + params_dtype=params_dtype or torch.float32, + pipeline_dtype=pipeline_dtype or dtype, + autocast_dtype=autocast_dtype or dtype, + autocast_enabled=autocast_enabled, + grad_reduce_in_fp32=grad_reduce_in_fp32, + fp8=fp8, + 
fp8_margin=fp8_margin, + fp8_interval=fp8_interval, + fp8_amax_history_len=fp8_amax_history_len, + fp8_amax_compute_algo=fp8_amax_compute_algo, + fp8_wgrad=fp8_wgrad, + fp8_dot_product_attention=fp8_dot_product_attention, + fp8_multi_head_attention=fp8_multi_head_attention, + # fp16 loss scale + loss_scale=fp16_loss_scale, + initial_loss_scale=fp16_initial_loss_scale, + min_loss_scale=fp16_min_loss_scale, + loss_scale_window=fp16_loss_scale_window, + hysteresis=fp16_hysteresis, + ) + super().__init__() + if self.dtype_config.fp16: + self.precision = "16-mixed" + elif self.dtype_config.bf16: + self.precision = "bf16-mixed" + else: + self.precision = "32-true" def convert_module(self, module: Module) -> Module: """Convert the module parameters to the precision type this plugin handles. @@ -55,11 +131,11 @@ def convert_module(self, module: Module) -> Module: from megatron.core.transformer.module import Float16Module from megatron.core.utils import get_model_config - if self.precision in ["16-mixed", "bf16-mixed"]: + if self.dtype_config.fp16 or self.dtype_config.bf16: + # Patch config options config = get_model_config(module.module) - config.fp16 = self.precision == "16-mixed" - config.bf16 = self.precision == "bf16-mixed" - config.autocast = False + config.fp16 = self.dtype_config.fp16 + config.bf16 = self.dtype_config.bf16 if hasattr(module, 'module'): module.module = Float16Module(config, module.module) else: @@ -74,8 +150,8 @@ def convert_optimizer(self, optimizer: Optimizer) -> Optimizer: """ optim_config = get_optim_config(optimizer) - assert optim_config.bf16 == (self.precision == "bf16-mixed"), "BF16 enabled on model but not on optimizer" - assert optim_config.fp16 == (self.precision == "fp16-mixed"), "BF16 enabled on model but not on optimizer" + assert optim_config.bf16 == self.dtype_config.bf16, "BF16 enabled on model but not on optimizer" + assert optim_config.fp16 == self.dtype_config.fp16, "BF16 enabled on model but not on optimizer" return optimizer def convert_input(self, data: AnyT) -> AnyT: @@ -96,42 +172,6 @@ def convert_output(self, data: AnyT) -> AnyT: """ return data - def optimizer_step( - self, - optimizer: torch.optim.Optimizer, - model: Union["pl.LightningModule", torch.nn.Module], - closure: Callable[[], Any], - **kwargs: Any, - ) -> None: - from nemo.core.optim import MainParamsOptimizerWrapper - - if not isinstance(optimizer, MainParamsOptimizerWrapper): - return super().optimizer_step(optimizer, model, closure, **kwargs) - - if self.scaler is None: - assert optimizer.fp32_grad_accumulation, "BF16 uses FP32 grad accumulation" - _ = closure() - self._after_closure(model, optimizer) - return optimizer.step(**kwargs) - - assert not optimizer.fp32_grad_accumulation, "FP16 uses FP16 grad accumulation" - closure_result = closure() - - # TODO: Add an option for merged all-reduce - - # cast fp16 grads to fp32 and copy to main grads, which are used for unscale and param update - optimizer.copy_model_grads_to_main_grads() - # `unscale` after the closure is executed but before the `on_before_optimizer_step` hook. 
- # unscale main (fp32) gradients - self.scaler.unscale_(optimizer) - self._after_closure(model, optimizer) - skipped_backward = closure_result is None - # in manual optimization, the closure does not return a value - if not isinstance(model, pl.LightningModule) or not model.automatic_optimization or not skipped_backward: - # note: the scaler will skip the `optimizer.step` if nonfinite gradients are found - self.scaler.step(optimizer, **kwargs) - self.scaler.update() - @contextmanager def forward_context(self) -> Generator[None, None, None]: """No explicit precision casting. Inputs are supposed to be manually casted.""" @@ -141,4 +181,19 @@ def forward_context(self) -> Generator[None, None, None]: pass +def update_config_with_dtype_overrides(dtype_config, config): + if hasattr(config, "__io__"): + config.__io__ = update_config_with_dtype_overrides(dtype_config, config.__io__) + for field in fields(dtype_config): + if not hasattr(config, field.name): + continue + # If we overwrote a value, throw a warning. + old_val = getattr(config, field.name) + new_val = getattr(dtype_config, field.name) + if old_val != new_val: + setattr(config, field.name, new_val) + logging.warning(f"Overwrote {type(config).__name__}.{field.name} {old_val} -> {new_val}") + return config + + __all__ = ["MegatronMixedPrecision"] diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 0250709a4e03..668b088a4864 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -219,6 +219,12 @@ def connect(self, model: pl.LightningModule) -> None: if _maybe_mcore_config: self._mcore_config = _maybe_mcore_config + dtype_config = getattr(self._precision_plugin, 'dtype_config', None) + if dtype_config: + from nemo.lightning.pytorch.plugins.mixed_precision import update_config_with_dtype_overrides + + model.config = update_config_with_dtype_overrides(dtype_config, model.config) + has_optim = getattr(model, "optim", None) if has_optim: opt_config = getattr(model.optim, "config", None) @@ -228,6 +234,10 @@ def connect(self, model: pl.LightningModule) -> None: raise ValueError("PyTorch DDP is not enabled for mcore optimizer") ddp_config = cast(DistributedDataParallelConfig, self.ddp_config) + if dtype_config: + model.optim.config = update_config_with_dtype_overrides(dtype_config, model.optim.config) + self.ddp_config = update_config_with_dtype_overrides(dtype_config, self.ddp_config) + if mcore_opt_config.use_distributed_optimizer != ddp_config.use_distributed_optimizer: from nemo.utils import logging diff --git a/tests/lightning/test_precision_plugin.py b/tests/lightning/test_precision_plugin.py new file mode 100644 index 000000000000..bdd834c3bf7a --- /dev/null +++ b/tests/lightning/test_precision_plugin.py @@ -0,0 +1,95 @@ +import pytest +import pytorch_lightning as pl +import torch +from megatron.core.optimizer import OptimizerConfig + +from nemo import lightning as nl +from nemo.collections import llm + + +class DummyTokenizer: + def __init__(self): + self.vocab_size = 30000 + + +class TestMegatronMixedPrecision: + """Unit tests for the MegatronMixedPrecision class.""" + + @pytest.mark.run_only_on('GPU') + def test_precision_plugin_fp8_passed(self): + """Test __init__ with default parameters.""" + + class TrainerHook(nl.Trainer): + def connect(self, model: pl.LightningModule) -> None: + assert model.config.bf16 == False + assert model.config.fp8 is None + super().connect(model) + assert model.config.fp8 == 'e4m3' + assert model.config.bf16 == True + + 
trainer = TrainerHook( + devices=2, + accelerator="gpu", + max_steps=2, + strategy=nl.MegatronStrategy( + tensor_model_parallel_size=2, + sequence_parallel=True, + ckpt_include_optimizer=False, + ), + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed", fp8='e4m3'), + limit_val_batches=0.0, + num_sanity_val_steps=0, + ) + + optim = nl.MegatronOptimizerModule( + config=OptimizerConfig( + optimizer="adam", + lr=1e-5, + use_distributed_optimizer=False, + fp16=True, + params_dtype=torch.float32, + ), + ) + config = llm.Llama2Config7B() + config.num_layers = 2 + model = llm.LlamaModel(config, tokenizer=DummyTokenizer(), optim=optim) + trainer.strategy.connect(model) + + @pytest.mark.run_only_on('GPU') + def test_precision_plugin_precision_params_override(self): + """Test __init__ with default parameters.""" + trainer = nl.Trainer( + devices=2, + accelerator="gpu", + max_steps=2, + strategy=nl.MegatronStrategy( + tensor_model_parallel_size=2, + sequence_parallel=True, + ckpt_include_optimizer=False, + ), + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + limit_val_batches=0.0, + num_sanity_val_steps=0, + ) + + optim = nl.MegatronOptimizerModule( + config=OptimizerConfig( + optimizer="adam", + lr=1e-5, + use_distributed_optimizer=False, + fp16=True, + params_dtype=torch.float32, + ), + ) + config = llm.Llama2Config7B() + config.num_layers = 2 + config.fp16 = True + config.bf16 = False + model = llm.LlamaModel(config, tokenizer=DummyTokenizer(), optim=optim) + trainer.strategy.connect(model) + assert optim.config.bf16 is not None + assert optim.config.fp16 is not None + assert optim.config.bf16 == True + assert optim.config.fp16 == False + assert model.config.fp16 == False + assert model.config.bf16 == True From 8d2e43af5962eb7ea8251811271e3b504820c1b8 Mon Sep 17 00:00:00 2001 From: Vivian Chen <140748220+xuanzic@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:49:47 -0700 Subject: [PATCH 024/664] Add blend dataset in NeVA (#10000) * inital commit on adding blenddataset for neva Signed-off-by: Vivian Chen * sequence packing support try 1 Signed-off-by: Vivian Chen * clean up Signed-off-by: Vivian Chen * remove unused Signed-off-by: Vivian Chen * Apply isort and black reformatting Signed-off-by: xuanzic * fix config Signed-off-by: Vivian Chen * fix based on reviews Signed-off-by: Vivian Chen * Apply isort and black reformatting Signed-off-by: xuanzic * fix for neva config Signed-off-by: Vivian Chen * modify nemo config for neva tutorial Signed-off-by: Vivian Chen * address comments Signed-off-by: Vivian Chen * Apply isort and black reformatting Signed-off-by: xuanzic --------- Signed-off-by: Vivian Chen Signed-off-by: Vivian Chen Signed-off-by: xuanzic Co-authored-by: Vivian Chen Co-authored-by: xuanzic Co-authored-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> --- .../multimodal/mllm/sequence_packing.rst | 4 +- .../multimodal_llm/neva/conf/neva_config.yaml | 20 ++- .../sequence_packing/preprocess_dataset.py | 1 + .../multimodal/data/neva/neva_dataset.py | 9 +- .../models/multimodal_llm/neva/neva_model.py | 127 +++++++++++++++++- .../megatron/blendable_dataset.py | 11 +- tutorials/multimodal/NeVA Tutorial.ipynb | 2 + 7 files changed, 160 insertions(+), 14 deletions(-) mode change 100644 => 100755 examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py diff --git a/docs/source/multimodal/mllm/sequence_packing.rst b/docs/source/multimodal/mllm/sequence_packing.rst index b061ee1d89c6..c5587e3f7173 100644 --- 
a/docs/source/multimodal/mllm/sequence_packing.rst +++ b/docs/source/multimodal/mllm/sequence_packing.rst @@ -103,15 +103,13 @@ To train with packed sequences, modify four items in the SFT/PEFT config file. .. code-block:: bash - ++model.data.data_prefix=/lustre/fsw/coreai_dlalgo_genai/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset - ++model.data.crop_size=[224,224] ++model.data.packed_sequence=True 2. Use the new dataset file instead of the original JSONL file and ensure the crop sizes are correctly specified since images are now cached: .. code-block:: bash - ++model.data.data_prefix=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset + ++model.data.data_path=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset ++model.data.crop_size=[336,336] 4. Adjust batch sizes: diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml index 9315b0fa3712..89e61a8b917c 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml @@ -38,7 +38,7 @@ exp_manager: save_top_k: 10 mode: min always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}' model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} ema: @@ -60,6 +60,7 @@ model: tensor_model_parallel_size: 1 # intra-layer model parallelism pipeline_model_parallel_size: 1 # inter-layer model parallelism + context_parallel_size: 1 # kqv model parallelism virtual_pipeline_model_parallel_size: null # interleaved pipeline restore_from_path: null # used in fine-tuning @@ -185,7 +186,22 @@ model: packed_sequence: False num_workers: 8 dataloader_type: cyclic - data_path: + data_path: + # This configuration can either be a single string pointing to a data path, or a list of data paths for data blending. + # When using a blendable dataset, be aware of the following: + # - The sampling of data across datasets depends on both the relative sizes of the datasets and the concat_sampling_probabilities. + # - For example, if there are two datasets with lengths of 100 and 10, and the sampling probabilities are set to 0.5 for each, + # then 55 samples would be taken from the dataset of length 100 and 55 from the dataset of length 10 (with repetition). + # - This means not all data might be seen in one epoch, and smaller datasets may need to be repeated to match the number of samples. + # Please adjust your concat_sampling_probabilities accordingly to ensure balanced and effective training. 
+ + # - /path/to/json + # - /path/to/json + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + concat_sampling_probabilities: null + # - 0.5 + # - 0.5 lazy_preprocess: True is_multimodal: True media_type: image # currently supported: image diff --git a/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py b/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py old mode 100644 new mode 100755 index 60f882fa9821..b670d171fd1d --- a/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py +++ b/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py @@ -271,6 +271,7 @@ def main(): logging.info(f"Output directory: {output_dir}") prefix_path = f"{output_dir}/packed_seq_dataset" + os.makedirs(prefix_path, exist_ok=True) # Original Datasets to Sequence Lengths Files builders = {} for item_dict in tqdm(train_dl, desc="Building indexed datasets"): diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 17cb6e6cf644..8102d179757e 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -1004,6 +1004,8 @@ def __len__(self): return len(self.list_data_dict) def __getitem__(self, i) -> Dict[str, torch.Tensor]: + if isinstance(i, np.integer): + i = int(i) sources = self.list_data_dict[i] if isinstance(i, int): sources = [sources] @@ -1190,7 +1192,6 @@ class NevaDataset(LazySupervisedDataset): """Dataset for supervised fine-tuning.""" def __init__(self, data_path: str, tokenizer, multimodal_cfg: dict, data_cfg: dict): - if data_path.endswith(".json"): super(NevaDataset, self).__init__(data_path, tokenizer, multimodal_cfg, data_cfg) @@ -1313,7 +1314,7 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: return batch -def make_supervised_data_module(tokenizer, image_processor, model_cfg) -> Dict: +def make_supervised_data_module(tokenizer, image_processor, model_cfg, each_file_from_path=None) -> Dict: """Make dataset and collator for supervised fine-tuning.""" data_cfg = model_cfg.data mm_cfg = model_cfg.mm_cfg @@ -1321,10 +1322,10 @@ def make_supervised_data_module(tokenizer, image_processor, model_cfg) -> Dict: if getattr(model_cfg, 'no_seqlen_plus_one_input_tokens', False): add_extra_token = 0 crop_size = mm_cfg.vision_encoder.get("crop_size", (224, 224)) - + data_path = each_file_from_path if each_file_from_path is not None else data_cfg.data_path train_dataset = NevaDataset( tokenizer=tokenizer, - data_path=data_cfg.data_path, + data_path=data_path, multimodal_cfg=dict( is_multimodal=data_cfg.is_multimodal, sep_image_conv_front=data_cfg.sep_image_conv_front, diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 40b1b4ed9a02..6218332c2bde 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -21,7 +21,7 @@ import torch import torch.nn.functional as F from einops import rearrange, reduce, repeat -from omegaconf.dictconfig import DictConfig +from omegaconf import DictConfig, ListConfig from pkg_resources import packaging from pytorch_lightning.trainer.trainer import Trainer from transformers import CLIPVisionModel, SiglipVisionModel @@ -38,6 +38,10 @@ MegatronCLIPModel, ) from 
nemo.collections.multimodal.parts.utils import create_image_processor, load_nemo_model_weights +from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( + get_datasets_weights_and_num_samples, +) +from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import MegatronPretrainingSampler from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel, get_specs @@ -1242,15 +1246,132 @@ def setup(self, stage=None): if self.cfg.get('transformer_engine', False): self.setup_transformer_engine_tp_groups() + def build_train_valid_test_datasets_blend(self): + logging.info('Building Blending Neva datasets.') + + train_datasets = [] + valid_datasets = [] + + data_cfg = self.cfg.data + is_packed_sequence = data_cfg.get("packed_sequence", False) + + if is_packed_sequence: + assert self.cfg.micro_batch_size == 1, "Micro batch size must be 1 if using packed sequence" + + # Check if concat_sampling_probabilities is properly set + if data_cfg.get('concat_sampling_probabilities') is None or not isinstance( + data_cfg.concat_sampling_probabilities, ListConfig + ): + raise ValueError( + "concat_sampling_probabilities must be a ListConfig with the same number of entries as data_path." + ) + + if len(data_cfg.concat_sampling_probabilities) != len(data_cfg.data_path): + raise ValueError( + f"concat_sampling_probabilities must be of the same size as number of files from data path. " + f"Provided size {len(data_cfg.concat_sampling_probabilities)}, number of datasets {len(data_cfg.data_path)}" + ) + + for each_file_from_path in data_cfg.data_path: + if is_packed_sequence: + train_dataset = NevaPackedSeqDatatset( + each_file_from_path, self.cfg.mm_cfg.vision_encoder.get("crop_size") + ) + valid_dataset = NevaPackedSeqDatatset( + each_file_from_path, self.cfg.mm_cfg.vision_encoder.get("crop_size") + ) + else: + ds_dict = make_supervised_data_module( + tokenizer=self.tokenizer, + image_processor=( + self.model.module.image_processor + if hasattr(self.model, "module") + else self.model.image_processor + ), + model_cfg=self.cfg, + each_file_from_path=each_file_from_path, + ) + train_dataset = ds_dict["train_dataset"] + valid_dataset = ds_dict["eval_dataset"] + + train_datasets.append(train_dataset) + valid_datasets.append(valid_dataset) + + # Create BlendableDataset for training + if self.trainer.max_steps is None or self.trainer.max_steps <= 0: + raise ValueError(f'Trainer max_steps must be set to a positive integer. 
Found {self.trainer.max_steps}') + + num_train_samples = self.trainer.max_steps * data_cfg.global_batch_size + _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples( + data_prefix=[ + weight for pair in zip(data_cfg.concat_sampling_probabilities, data_cfg.data_path) for weight in pair + ], + num_samples=[num_train_samples], + ) + num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) + + logging.info(f"Number of train datasets: {len(train_datasets)}") + logging.info(f"Lengths of train datasets: {[len(ds) for ds in train_datasets]}") + logging.info(f"Number of train datasets after blending: {num_train_samples_after_blend}") + + if is_packed_sequence: + num_train_samples_after_blend = sum([len(ds) for ds in train_datasets]) + + self._train_ds = BlendableDataset( + datasets=train_datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend + ) + + self._validation_ds = BlendableDataset( + datasets=valid_datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend + ) + + logging.info(f'Length of train dataset: {len(self._train_ds)}') + logging.info(f'Length of validation dataset: {len(self._validation_ds)}') + + return self._train_ds, self._validation_ds + def build_train_valid_test_datasets(self): logging.info('Building Neva datasets.') + + if isinstance(self.cfg.data.data_path, (list, ListConfig)): + if len(self.cfg.data.data_path) > 1: + # Only consider data blending if there are multiple dataset paths + if self.cfg.data.get('concat_sampling_probabilities') is None: + logging.warning("No sampling probabilities provided. Defaulting to uniform sampling.") + self.cfg.data.concat_sampling_probabilities = [1 / len(self.cfg.data.data_path)] * len( + self.cfg.data.data_path + ) + else: + # Normalize the sampling probabilities if they don't sum to 1 + total = sum(self.cfg.data.concat_sampling_probabilities) + if total != 1: + logging.warning(f"Concat_sampling_probabilities sum to {total}. Normalizing to sum to 1.") + self.cfg.data.concat_sampling_probabilities = [ + prob / total for prob in self.cfg.data.concat_sampling_probabilities + ] + return self.build_train_valid_test_datasets_blend() + elif len(self.cfg.data.data_path) == 1: + if self.cfg.data.concat_sampling_probabilities is not None: + logging.warning( + "Using sampling probabilities with a single dataset has no effect. Defaulting to None and not using blend dataset." 
+ ) + self.cfg.data.concat_sampling_probabilities = None + self.cfg.data.data_path = self.cfg.data.data_path[0] + else: + raise ValueError("data_path must contain at least one valid path.") + elif isinstance(self.cfg.data.data_path, str): + pass + else: + raise TypeError("data_path must be a list of paths or a single string") + if self.cfg.data.get("packed_sequence", False): assert self.cfg.micro_batch_size == 1, "Micro batch size must be 1 if using packed sequence" + self._train_ds = NevaPackedSeqDatatset( - self.cfg.data.data_prefix, self.cfg.mm_cfg.vision_encoder.get("crop_size") + self.cfg.data.data_path, self.cfg.mm_cfg.vision_encoder.get("crop_size") ) self._validation_ds = NevaPackedSeqDatatset( - self.cfg.data.data_prefix, self.cfg.mm_cfg.vision_encoder.get("crop_size") + self.cfg.data.data_path, self.cfg.mm_cfg.vision_encoder.get("crop_size") ) else: ds_dict = make_supervised_data_module( diff --git a/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py index ae2b5fff6be1..39b64ae89865 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py @@ -25,7 +25,6 @@ class BlendableDataset(torch.utils.data.Dataset): def __init__(self, datasets, weights, size): - self.datasets = datasets num_datasets = len(datasets) assert num_datasets == len(weights) @@ -43,6 +42,7 @@ def __init__(self, datasets, weights, size): assert num_datasets < 255 self.dataset_index = np.zeros(self.size, dtype=np.uint8) self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) + app_state = AppState() try: if app_state.local_rank == 0: @@ -74,6 +74,13 @@ def __len__(self): def __getitem__(self, idx): dataset_idx = self.dataset_index[idx] sample_idx = self.dataset_sample_index[idx] + dataset_size = len(self.datasets[dataset_idx]) + # Ensure the sample index doesn't exceed the dataset size + if sample_idx >= dataset_size: + logging.warning(f"Index {sample_idx} out of bounds for dataset {dataset_idx}. Reusing existing examples.") + sample_idx = sample_idx % dataset_size + logging.warning(f"Reusing index {sample_idx} for dataset {dataset_idx}.") + return self.datasets[dataset_idx][sample_idx] def create_data_mmap(self): @@ -85,7 +92,7 @@ class MemoryEfficientBlendableDataset(torch.utils.data.Dataset): """ A BlendableDataset implementation that uses less memory than the original implementation. Indices are computed algorithmically instead of storing them in memory. 
- + To test call: MemoryEfficientBlendableDataset.test_index_blending() """ diff --git a/tutorials/multimodal/NeVA Tutorial.ipynb b/tutorials/multimodal/NeVA Tutorial.ipynb index 921452ac08c0..4914ccd6fcb1 100644 --- a/tutorials/multimodal/NeVA Tutorial.ipynb +++ b/tutorials/multimodal/NeVA Tutorial.ipynb @@ -186,6 +186,7 @@ " model.mm_cfg.vision_encoder.from_hf=True \\\n", " model.optim.name=\"fused_adam\" \\\n", " exp_manager.create_checkpoint_callback=True \\\n", + " exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \\\n", " exp_manager.create_wandb_logger=False" ] }, @@ -255,6 +256,7 @@ " model.mm_cfg.vision_encoder.from_pretrained='openai/clip-vit-large-patch14' \\\n", " model.mm_cfg.vision_encoder.from_hf=True \\\n", " exp_manager.create_checkpoint_callback=True \\\n", + " exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \\\n", " exp_manager.name=\"nemo_neva_finetune\" \\\n", " model.optim.name=\"fused_adam\"" ] From 276aaf0b7ee3e89fa425f40165c8f0e2a58c473f Mon Sep 17 00:00:00 2001 From: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Date: Tue, 20 Aug 2024 19:14:27 -0500 Subject: [PATCH 025/664] Fix train loss broadcasting (#10212) * log train loss on all ranks Signed-off-by: Maanu Grover * clarity and doc choice Signed-off-by: Maanu Grover --------- Signed-off-by: Maanu Grover --- nemo/lightning/pytorch/strategies.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 668b088a4864..d6ef18770fa4 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -484,12 +484,10 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP ) if self.log_train_loss: - # p2p now, broadcast later at ckpt + # p2p now, broadcast later at ckpt. 
only with pp, some ranks will log 0.0 + # WHICH IS OK because we broadcast later at checkpoint time _strategy_lib._sync_from_last_pipeline_stage(out, broadcast=False) - if torch.distributed.get_rank() == 0: - self.lightning_module.log( - 'reduced_train_loss', out, prog_bar=True, rank_zero_only=True, batch_size=1 - ) + self.lightning_module.log('reduced_train_loss', out, prog_bar=True, batch_size=1, sync_dist=False) return out From aa25424051edb2b6b47fff2a46c8310ee47b96aa Mon Sep 17 00:00:00 2001 From: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Date: Tue, 20 Aug 2024 19:14:45 -0500 Subject: [PATCH 026/664] Precision plugin recipes (#10187) * create mixed precision plugin recipes Signed-off-by: Maanu Grover * update llama recipes Signed-off-by: Maanu Grover * cleanup Signed-off-by: Maanu Grover * fix annotation Signed-off-by: Maanu Grover * full definition instead of attach Signed-off-by: Maanu Grover * remove fp8 Signed-off-by: Maanu Grover --------- Signed-off-by: Maanu Grover --- nemo/collections/llm/recipes/llama3_70b.py | 8 ++---- nemo/collections/llm/recipes/llama3_8b.py | 8 ++---- .../llm/recipes/precision/__init__.py | 0 .../llm/recipes/precision/mixed_precision.py | 26 +++++++++++++++++++ 4 files changed, 30 insertions(+), 12 deletions(-) create mode 100644 nemo/collections/llm/recipes/precision/__init__.py create mode 100644 nemo/collections/llm/recipes/precision/mixed_precision.py diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index 4b99aef74a30..c784989ac370 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -13,6 +13,7 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback @@ -47,11 +48,6 @@ def trainer( ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, - ddp=Config( - DistributedDataParallelConfig, - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - ), ) trainer = Config( @@ -66,7 +62,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + plugins=bf16_mixed_plugin(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index d70366f6c5ed..340cfbdf6e26 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -13,6 +13,7 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback @@ -47,11 +48,6 @@ def trainer( ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, - ddp=Config( - DistributedDataParallelConfig, - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - ), ) trainer = 
Config( @@ -66,7 +62,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + plugins=bf16_mixed_plugin(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, diff --git a/nemo/collections/llm/recipes/precision/__init__.py b/nemo/collections/llm/recipes/precision/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/llm/recipes/precision/mixed_precision.py b/nemo/collections/llm/recipes/precision/mixed_precision.py new file mode 100644 index 000000000000..6a9cb64404ce --- /dev/null +++ b/nemo/collections/llm/recipes/precision/mixed_precision.py @@ -0,0 +1,26 @@ +import torch + +from nemo.collections.llm.utils import Config +from nemo.lightning.pytorch.plugins.mixed_precision import MegatronMixedPrecision + + +def bf16_mixed_plugin() -> Config[MegatronMixedPrecision]: + return Config( + MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + pipeline_dtype=torch.bfloat16, + autocast_enabled=False, + grad_reduce_in_fp32=True, + ) + + +def fp16_mixed_plugin() -> Config[MegatronMixedPrecision]: + return Config( + MegatronMixedPrecision, + precision="16-mixed", + params_dtype=torch.half, + pipeline_dtype=torch.half, + autocast_enabled=False, + grad_reduce_in_fp32=False, + ) From 90e146e522329a2f3842b85903e0da366132bd3c Mon Sep 17 00:00:00 2001 From: BoxiangW <45734921+BoxiangW@users.noreply.github.com> Date: Tue, 20 Aug 2024 20:28:44 -0700 Subject: [PATCH 027/664] Add Long Context Recipe for NeMo 2.0 (#10140) --- nemo/collections/llm/gpt/model/__init__.py | 1 + nemo/collections/llm/recipes/__init__.py | 26 +++- .../collections/llm/recipes/llama3_70b_16k.py | 59 +++++++++ .../collections/llm/recipes/llama3_70b_64k.py | 59 +++++++++ nemo/collections/llm/recipes/llama3_8b_16k.py | 4 +- nemo/collections/llm/recipes/llama3_8b_64k.py | 4 +- nemo/collections/llm/recipes/mixtral_8x22b.py | 113 +++++++++++++++++ .../llm/recipes/mixtral_8x22b_4k.py | 64 ---------- nemo/collections/llm/recipes/mixtral_8x3b.py | 116 ++++++++++++++++++ .../llm/recipes/mixtral_8x3b_16k.py | 61 +++++++++ .../llm/recipes/mixtral_8x3b_64k.py | 61 +++++++++ nemo/collections/llm/recipes/mixtral_8x7b.py | 116 ++++++++++++++++++ .../llm/recipes/mixtral_8x7b_16k.py | 61 +++++++++ .../llm/recipes/mixtral_8x7b_4k.py | 64 ---------- .../llm/recipes/mixtral_8x7b_64k.py | 61 +++++++++ 15 files changed, 737 insertions(+), 133 deletions(-) create mode 100644 nemo/collections/llm/recipes/llama3_70b_16k.py create mode 100644 nemo/collections/llm/recipes/llama3_70b_64k.py create mode 100644 nemo/collections/llm/recipes/mixtral_8x22b.py delete mode 100644 nemo/collections/llm/recipes/mixtral_8x22b_4k.py create mode 100644 nemo/collections/llm/recipes/mixtral_8x3b.py create mode 100644 nemo/collections/llm/recipes/mixtral_8x3b_16k.py create mode 100644 nemo/collections/llm/recipes/mixtral_8x3b_64k.py create mode 100644 nemo/collections/llm/recipes/mixtral_8x7b.py create mode 100644 nemo/collections/llm/recipes/mixtral_8x7b_16k.py delete mode 100644 nemo/collections/llm/recipes/mixtral_8x7b_4k.py create mode 100644 nemo/collections/llm/recipes/mixtral_8x7b_64k.py diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index e2d940e02d32..d657b63f779a 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -45,6 +45,7 @@ "MistralModel", "MixtralConfig8x3B", 
"MixtralConfig8x7B", + "MixtralConfig8x22B", "MixtralModel", "LlamaConfig", "Llama2Config7B", diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index d9fb5cc61f38..950ca6db7ac6 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -1,4 +1,19 @@ -from nemo.collections.llm.recipes import llama3_8b, llama3_8b_16k, llama3_8b_64k, llama3_70b, mistral +from nemo.collections.llm.recipes import ( + llama3_8b, + llama3_8b_16k, + llama3_8b_64k, + llama3_70b, + llama3_70b_16k, + llama3_70b_64k, + mistral, + mixtral_8x3b, + mixtral_8x3b_16k, + mixtral_8x3b_64k, + mixtral_8x7b, + mixtral_8x7b_16k, + mixtral_8x7b_64k, + mixtral_8x22b, +) from nemo.collections.llm.recipes.log.default import default_log, default_resume from nemo.collections.llm.recipes.optim import adam @@ -7,7 +22,16 @@ "llama3_8b_16k", "llama3_8b_64k", "llama3_70b", + "llama3_70b_16k", + "llama3_70b_64k", "mistral", + "mixtral_8x3b", + "mixtral_8x3b_16k", + "mixtral_8x3b_64k", + "mixtral_8x7b", + "mixtral_8x7b_16k", + "mixtral_8x7b_64k", + "mixtral_8x22b", "adam", "default_log", "default_resume", diff --git a/nemo/collections/llm/recipes/llama3_70b_16k.py b/nemo/collections/llm/recipes/llama3_70b_16k.py new file mode 100644 index 000000000000..8829aa6b407b --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_70b_16k.py @@ -0,0 +1,59 @@ +from typing import Callable + +import torch + +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.recipes import llama3_70b +from nemo.collections.llm.utils import Partial + +NAME = "llama3_70b_16k" + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + recipe = llama3_70b.pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn + ) + + trainer = llama3_70b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=2, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = llama3_70b.model() + model.config.seq_length = 16384 + + recipe.model = model + recipe.trainer = trainer + + return recipe + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = llama3_70b.finetune_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node + ) + + trainer = llama3_70b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=2, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = llama3_70b.model() + model.config.seq_length = 16384 + + recipe.model = model + recipe.trainer = trainer + + return recipe diff --git a/nemo/collections/llm/recipes/llama3_70b_64k.py b/nemo/collections/llm/recipes/llama3_70b_64k.py new file mode 100644 index 000000000000..33f46f767a4d --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_70b_64k.py @@ -0,0 +1,59 @@ +from typing import Callable + +import torch + +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.recipes import llama3_70b +from nemo.collections.llm.utils import Partial + +NAME = "llama3_70b_64k" + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = 
pretrain +) -> Partial: + recipe = llama3_70b.pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn + ) + + trainer = llama3_70b.trainer( + tensor_parallelism=8, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=8, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = llama3_70b.model() + model.config.seq_length = 65536 + + recipe.model = model + recipe.trainer = trainer + + return recipe + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = llama3_70b.finetune_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node + ) + + trainer = llama3_70b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=2, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = llama3_70b.model() + model.config.seq_length = 65536 + + recipe.model = model + recipe.trainer = trainer + + return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py index 8bb2b636eba0..a57b4ef37298 100644 --- a/nemo/collections/llm/recipes/llama3_8b_16k.py +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -32,7 +32,7 @@ def pretrain_recipe( recipe.model = model recipe.trainer = trainer - return trainer + return recipe def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: @@ -56,4 +56,4 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: recipe.model = model recipe.trainer = trainer - return trainer + return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py index b42e1e53399e..d06c9b08a716 100644 --- a/nemo/collections/llm/recipes/llama3_8b_64k.py +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -32,7 +32,7 @@ def pretrain_recipe( recipe.model = model recipe.trainer = trainer - return trainer + return recipe def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: @@ -56,4 +56,4 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: recipe.model = model recipe.trainer = trainer - return trainer + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py new file mode 100644 index 000000000000..aaf0149dbdac --- /dev/null +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -0,0 +1,113 @@ +from typing import Callable, Optional + +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x22B, MixtralModel +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from 
nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback + +NAME = "mixtral_8x22b" + + +def model() -> Config[pl.LightningModule]: + return Config(MixtralModel, config=Config(MixtralConfig8x22B)) + + +def trainer( + tensor_parallelism: int, + pipeline_parallelism: int, + pipeline_parallelism_type: Optional[torch.dtype], + virtual_pipeline_parallelism: Optional[int], + context_parallelism: int, + sequence_parallelism: bool, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[Config[Callback]]] = None, +) -> Config[nl.Trainer]: + strategy = Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_include_optimizer=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), + ) + + trainer = Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + gradient_clip_val=1.0, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + return Partial( + fn, + model=model(), + trainer=trainer( + tensor_parallelism=8, + pipeline_parallelism=1, + pipeline_parallelism_type=None, + virtual_pipeline_parallelism=None, + context_parallelism=1, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], + ), + data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +def hf_resume() -> Config[nl.AutoResume]: + return Config(nl.AutoResume, import_path="hf://mistralai/Mixtral-8x22B-v0.1") + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune + ) + recipe.resume = hf_resume() + recipe.peft = Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x22b_4k.py b/nemo/collections/llm/recipes/mixtral_8x22b_4k.py deleted file mode 100644 index 5a29cca38506..000000000000 --- a/nemo/collections/llm/recipes/mixtral_8x22b_4k.py +++ /dev/null @@ -1,64 +0,0 @@ -import pytorch_lightning as pl - -from nemo import lightning as nl -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.api import squad -from nemo.collections.llm.gpt.model.llama import MixtralConfig8x22B, MixtralModel -from 
nemo.collections.llm.peft.api import gpt_lora -from nemo.collections.llm.recipes.log.default import default_log -from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.utils import Partial, factory - -NAME = "mixtral_8x22b_4k" - - -@factory(name=NAME) -def model() -> pl.LightningModule: - return MixtralModel(MixtralConfig8x22B(seq_length=4096)) - - -@factory(name=NAME) -def trainer(devices=8) -> nl.Trainer: - strategy = nl.MegatronStrategy( - tensor_model_parallel_size=8, - sequence_parallel=True, - ) - - return nl.Trainer( - devices=devices, - max_steps=100, - accelerator="gpu", - strategy=strategy, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), - ) - - -@factory(name=NAME + "_hf") -def hf_resume() -> nl.AutoResume: - return nl.AutoResume(import_path="hf://mistralai/Mixtral-8x22B-v0.1") - - -@factory(name=NAME, for_task="llm.pretrain") -def pretrain_recipe() -> Partial: - return Partial( - pretrain, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=distributed_fused_adam_with_cosine_annealing(), - ) - - -@factory(name=NAME, for_task="llm.finetune") -def finetune_recipe() -> Partial: - return Partial( - finetune, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=distributed_fused_adam_with_cosine_annealing(), - peft=gpt_lora, - resume=hf_resume, - ) diff --git a/nemo/collections/llm/recipes/mixtral_8x3b.py b/nemo/collections/llm/recipes/mixtral_8x3b.py new file mode 100644 index 000000000000..223fe68af05d --- /dev/null +++ b/nemo/collections/llm/recipes/mixtral_8x3b.py @@ -0,0 +1,116 @@ +from typing import Callable, Optional + +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x3B, MixtralModel +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback + +NAME = "mixtral_8x3b" + + +def model() -> Config[pl.LightningModule]: + return Config(MixtralModel, config=Config(MixtralConfig8x3B)) + + +def trainer( + tensor_parallelism: int, + pipeline_parallelism: int, + pipeline_parallelism_type: Optional[torch.dtype], + virtual_pipeline_parallelism: Optional[int], + context_parallelism: int, + sequence_parallelism: bool, + expert_parallelism: int, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[Config[Callback]]] = None, +) -> Config[nl.Trainer]: + strategy = Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + expert_model_parallel_size=expert_parallelism, + gradient_as_bucket_view=True, + ckpt_include_optimizer=True, + 
ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), + ) + + trainer = Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + gradient_clip_val=1.0, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + return Partial( + fn, + model=model(), + trainer=trainer( + tensor_parallelism=4, + pipeline_parallelism=1, + pipeline_parallelism_type=None, + virtual_pipeline_parallelism=None, + context_parallelism=1, + sequence_parallelism=True, + expert_parallelism=1, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], + ), + data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +def hf_resume() -> Config[nl.AutoResume]: + return Config(nl.AutoResume, import_path="hf://mistralai/Mixtral-8x7B-v0.1") + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune + ) + recipe.resume = hf_resume() + recipe.peft = Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py b/nemo/collections/llm/recipes/mixtral_8x3b_16k.py new file mode 100644 index 000000000000..e496349a35d6 --- /dev/null +++ b/nemo/collections/llm/recipes/mixtral_8x3b_16k.py @@ -0,0 +1,61 @@ +from typing import Callable + +import torch + +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.recipes import mixtral_8x3b +from nemo.collections.llm.utils import Partial + +NAME = "mixtral_8x3b_16k" + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + recipe = mixtral_8x3b.pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn + ) + + trainer = mixtral_8x3b.trainer( + tensor_parallelism=2, + pipeline_parallelism=2, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=8, + context_parallelism=2, + sequence_parallelism=True, + expert_parallelism=2, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = mixtral_8x3b.model() + model.config.seq_length = 16384 + + recipe.model = model + recipe.trainer = trainer + + return recipe + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = mixtral_8x3b.finetune_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node + ) + + trainer = mixtral_8x3b.trainer( + tensor_parallelism=2, + pipeline_parallelism=2, + 
pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=8, + context_parallelism=2, + sequence_parallelism=True, + expert_parallelism=2, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = mixtral_8x3b.model() + model.config.seq_length = 16384 + + recipe.model = model + recipe.trainer = trainer + + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py b/nemo/collections/llm/recipes/mixtral_8x3b_64k.py new file mode 100644 index 000000000000..f034f30ecd94 --- /dev/null +++ b/nemo/collections/llm/recipes/mixtral_8x3b_64k.py @@ -0,0 +1,61 @@ +from typing import Callable + +import torch + +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.recipes import mixtral_8x3b +from nemo.collections.llm.utils import Partial + +NAME = "mixtral_8x3b_64k" + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + recipe = mixtral_8x3b.pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn + ) + + trainer = mixtral_8x3b.trainer( + tensor_parallelism=4, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=8, + context_parallelism=4, + sequence_parallelism=True, + expert_parallelism=4, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = mixtral_8x3b.model() + model.config.seq_length = 65536 + + recipe.model = model + recipe.trainer = trainer + + return recipe + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = mixtral_8x3b.finetune_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node + ) + + trainer = mixtral_8x3b.trainer( + tensor_parallelism=2, + pipeline_parallelism=2, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=8, + context_parallelism=4, + sequence_parallelism=True, + expert_parallelism=2, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = mixtral_8x3b.model() + model.config.seq_length = 65536 + + recipe.model = model + recipe.trainer = trainer + + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py new file mode 100644 index 000000000000..1710727bd711 --- /dev/null +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -0,0 +1,116 @@ +from typing import Callable, Optional + +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x7B, MixtralModel +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback + +NAME = "mixtral_8x7b" + + +def model() -> Config[pl.LightningModule]: + return Config(MixtralModel, config=Config(MixtralConfig8x7B)) + + +def trainer( + tensor_parallelism: int, + pipeline_parallelism: 
int, + pipeline_parallelism_type: Optional[torch.dtype], + virtual_pipeline_parallelism: Optional[int], + context_parallelism: int, + sequence_parallelism: bool, + expert_parallelism: int, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[Config[Callback]]] = None, +) -> Config[nl.Trainer]: + strategy = Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + expert_model_parallel_size=expert_parallelism, + gradient_as_bucket_view=True, + ckpt_include_optimizer=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), + ) + + trainer = Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + gradient_clip_val=1.0, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + return Partial( + fn, + model=model(), + trainer=trainer( + tensor_parallelism=8, + pipeline_parallelism=1, + pipeline_parallelism_type=None, + virtual_pipeline_parallelism=None, + context_parallelism=1, + sequence_parallelism=True, + expert_parallelism=1, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], + ), + data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +def hf_resume() -> Config[nl.AutoResume]: + return Config(nl.AutoResume, import_path="hf://mistralai/Mixtral-8x7B-v0.1") + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune + ) + recipe.resume = hf_resume() + recipe.peft = Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py new file mode 100644 index 000000000000..352069fc6831 --- /dev/null +++ b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py @@ -0,0 +1,61 @@ +from typing import Callable + +import torch + +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.recipes import mixtral_8x7b +from nemo.collections.llm.utils import Partial + +NAME = "mixtral_8x7b_16k" + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + recipe = mixtral_8x7b.pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn + 
) + + trainer = mixtral_8x7b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=8, + context_parallelism=4, + sequence_parallelism=True, + expert_parallelism=8, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = mixtral_8x7b.model() + model.config.seq_length = 16384 + + recipe.model = model + recipe.trainer = trainer + + return recipe + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = mixtral_8x7b.finetune_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node + ) + + trainer = mixtral_8x7b.trainer( + tensor_parallelism=2, + pipeline_parallelism=2, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=8, + context_parallelism=2, + sequence_parallelism=True, + expert_parallelism=8, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = mixtral_8x7b.model() + model.config.seq_length = 16384 + + recipe.model = model + recipe.trainer = trainer + + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_4k.py b/nemo/collections/llm/recipes/mixtral_8x7b_4k.py deleted file mode 100644 index 5afa3cd072f6..000000000000 --- a/nemo/collections/llm/recipes/mixtral_8x7b_4k.py +++ /dev/null @@ -1,64 +0,0 @@ -import pytorch_lightning as pl - -from nemo import lightning as nl -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.api import squad -from nemo.collections.llm.gpt.model.llama import MixtralConfig8x7B, MixtralModel -from nemo.collections.llm.peft.api import gpt_lora -from nemo.collections.llm.recipes.log.default import default_log -from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.utils import Partial, factory - -NAME = "mixtral_8x7b_4k" - - -@factory(name=NAME) -def model() -> pl.LightningModule: - return MixtralModel(MixtralConfig8x7B(seq_length=4096)) - - -@factory(name=NAME) -def trainer(devices=8) -> nl.Trainer: - strategy = nl.MegatronStrategy( - tensor_model_parallel_size=8, - sequence_parallel=True, - ) - - return nl.Trainer( - devices=devices, - max_steps=100, - accelerator="gpu", - strategy=strategy, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), - ) - - -@factory(name=NAME + "_hf") -def hf_resume() -> nl.AutoResume: - return nl.AutoResume(import_path="hf://mistralai/Mixtral-8x7B-v0.1") - - -@factory(name=NAME, for_task="llm.pretrain") -def pretrain_recipe() -> Partial: - return Partial( - pretrain, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=distributed_fused_adam_with_cosine_annealing(), - ) - - -@factory(name=NAME, for_task="llm.finetune") -def finetune_recipe() -> Partial: - return Partial( - finetune, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=distributed_fused_adam_with_cosine_annealing(), - peft=gpt_lora, - resume=hf_resume, - ) diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py new file mode 100644 index 000000000000..503c83ecb66a --- /dev/null +++ b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py @@ -0,0 +1,61 @@ +from typing import Callable + +import torch + +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.recipes import mixtral_8x7b +from nemo.collections.llm.utils import Partial + +NAME = "mixtral_8x7b_64k" + + +def pretrain_recipe( + 
name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + recipe = mixtral_8x7b.pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn + ) + + trainer = mixtral_8x7b.trainer( + tensor_parallelism=4, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=8, + context_parallelism=4, + sequence_parallelism=True, + expert_parallelism=8, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = mixtral_8x7b.model() + model.config.seq_length = 65536 + + recipe.model = model + recipe.trainer = trainer + + return recipe + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = mixtral_8x7b.finetune_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node + ) + + trainer = mixtral_8x7b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=8, + context_parallelism=2, + sequence_parallelism=True, + expert_parallelism=8, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = mixtral_8x7b.model() + model.config.seq_length = 65536 + + recipe.model = model + recipe.trainer = trainer + + return recipe From e0ebd6ff5c0ede0f040ac4e1785e7ff6798b99a6 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Tue, 20 Aug 2024 22:03:14 -0700 Subject: [PATCH 028/664] [NeMo-UX] Add LLM and Lightning READMEs (#9894) * add LLM and lightning readmes Signed-off-by: ashors1 * minor improvement Signed-off-by: ashors1 * address some comments Signed-off-by: ashors1 * fix typo Signed-off-by: ashors1 * address comment Signed-off-by: ashors1 * update NeMoLogger docstring Signed-off-by: ashors1 * fix format Signed-off-by: ashors1 * update hyperlinks Signed-off-by: ashors1 * remove TODO Signed-off-by: ashors1 * small typo Signed-off-by: ashors1 --------- Signed-off-by: ashors1 --- nemo/collections/llm/README.md | 11 +++++++++++ nemo/lightning/README.md | 13 +++++++++++++ nemo/lightning/nemo_logger.py | 5 +++++ 3 files changed, 29 insertions(+) create mode 100644 nemo/collections/llm/README.md create mode 100644 nemo/lightning/README.md diff --git a/nemo/collections/llm/README.md b/nemo/collections/llm/README.md new file mode 100644 index 000000000000..3e25f84a0c54 --- /dev/null +++ b/nemo/collections/llm/README.md @@ -0,0 +1,11 @@ +NeMo LLM Collection +=================== + +The NeMo LLM Collection introduces NeMo 2.0, a redesign that enhances the user experience by adopting a more PyTorch Lightning-like approach. This redesign aims to simplify NeMo and make it more modular. + +The following models are currently reimplemented in 2.0 as part of this collection: +- **GPT** +- **LLaMA** +- **Mixtral** + +For detailed tutorials and documentation on NeMo 2.0, refer to the [docs](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo_2.0/index.html). diff --git a/nemo/lightning/README.md b/nemo/lightning/README.md new file mode 100644 index 000000000000..7b9266d3fa30 --- /dev/null +++ b/nemo/lightning/README.md @@ -0,0 +1,13 @@ +# NeMo Lightning + +The NeMo Lightning directory provides custom PyTorch Lightning-compatible objects for seamlessly training NeMo 2.0 models using PTL. NeMo 2.0 models +are implemented using [Megatron Core](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). 
NeMo Lightning provides the bridge between higher-level, object-oriented PTL APIs and lower-level Megatron APIs. +For detailed tutorials and documentation on NeMo 2.0, refer to the [docs](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo_2.0/index.html). + +Some of the helpful classes provided here include: +- [`Trainer`](./pytorch/trainer.py): A lightweight wrapper around PTL's `Trainer` object which provides some additional support for capturing the arguments used to initialized the trainer. More information on NeMo 2's serialization mechanisms is available [here](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo_2.0/design/serialization.html). +- [`MegatronStrategy`](./pytorch/strategies.py): A PTL strategy that enables training of Megatron models on NVIDIA GPUs. +- [`MegatronParallel`](./megatron_parallel.py): Class which sets up and manages Megatron's distributed model parallelism. +- [`MegatronMixedPrecision`](./pytorch/plugins/mixed_precision.py): A specialized precision plugin for training Megatron-based models in PTL. + +More information on `MegatronStrategy`, `MegatronParallel`, and `MegatronMixedPrecision` can be found in [this document](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo_2.0/design/megatron.html). diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index 5ba2c39f9cff..6509c384f8cf 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -36,6 +36,11 @@ class NeMoLogger(IOMixin): save_dir will not be overwritten. This argument applies only to TensorBoardLogger and WandbLogger instances. ckpt (Optional[ModelCheckpoint]): Model checkpoint callback. + tensorboard: (Optional[TensorBoardLogger]): A PyTorch Lightning TensorBoardLogger instance + to add to the trainer. + wandb (Optional[WandbLogger]): A PyTorch Lightning WandBLogger instance + to add to the trainer. + extra_loggers(Optional[List[Logger]]): Any additional loggers to add to the trainer. """ name: str = "default" From cef98dbaa61971b889bb2484916b90c11a4c2a2d Mon Sep 17 00:00:00 2001 From: tomlifu Date: Tue, 20 Aug 2024 22:24:40 -0700 Subject: [PATCH 029/664] Remove Attention Mask in SFT Chat Dataset (#10210) * remove attention mask in SFT chat dataset * Apply isort and black reformatting Signed-off-by: tomlifu * change get_attention_mask_from_fusion default value to True --------- Signed-off-by: tomlifu Co-authored-by: tomlifu --- .../megatron/gpt_sft_chat_dataset.py | 13 ++++++++----- .../models/language_modeling/megatron_gpt_model.py | 2 +- .../language_modeling/megatron_gpt_sft_model.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py index 3d5d7effc9de..ef09c7ff068e 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py @@ -73,7 +73,7 @@ def get_prompt_template_example(special_tokens): def identify_start_index_of_subsequence(subsequence, sequence): - """ find the location of the small tensor in the large tensor. + """find the location of the small tensor in the large tensor. e.g. 
small = [1,3], large = [2,3,1,3], returns 2 small = [3,2], large = [2,3,1,3], returns -1 Args: @@ -100,7 +100,7 @@ def _mask_targets( label_start_ids, num_turn_start_tokens, ): - """ This function masks the tokens so the loss is computed only on the non-masked role's responses. + """This function masks the tokens so the loss is computed only on the non-masked role's responses. For 'TEXT_TO_VALUE' type, the loss is computed on the value attributes. Args: @@ -373,8 +373,9 @@ def collate_fn(self, batch): max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 8)) assert max_length <= self.max_seq_length - attention_mask = [self._create_attention_mask(max_length) for _ in batch] - attention_mask = torch.stack(attention_mask) + if not self.get_attention_mask_from_fusion: + attention_mask = [self._create_attention_mask(max_length) for _ in batch] + attention_mask = torch.stack(attention_mask) position_ids = [list(range(max_length)) for _ in batch] position_ids = torch.LongTensor(position_ids) input_ids = torch.LongTensor( @@ -389,7 +390,6 @@ def collate_fn(self, batch): processed_batch = { 'tokens': input_ids, 'labels': labels, - 'attention_mask': attention_mask, 'loss_mask': loss_mask, 'position_ids': position_ids, 'contexts': contexts, @@ -398,4 +398,7 @@ def collate_fn(self, batch): 'metadata': metadata, } + if not self.get_attention_mask_from_fusion: + processed_batch['attention_mask'] = attention_mask + return processed_batch diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 67310d59db45..ee2d891e83e4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1197,7 +1197,7 @@ def get_batch_on_this_context_parallel_rank(self, batch): if cp_size > 1: cp_rank = parallel_state.get_context_parallel_rank() for key, val in batch.items(): - if val is not None: + if val is not None and key != "context_lengths": seq_dim = 1 if key != 'attention_mask' else 2 val = val.view( *val.shape[0:seq_dim], diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 9c2372ef38ca..08bc5501363c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -301,7 +301,7 @@ def _build_dataset(self, data_cfg, is_train=True): index_mapping_dir=data_cfg.get('index_mapping_dir', None), prompt_template=data_cfg.get('prompt_template', None), ceil_to_power_2=data_cfg.get('ceil_to_power_2', False), - get_attention_mask_from_fusion=data_cfg.get('get_attention_mask_from_fusion', False), + get_attention_mask_from_fusion=data_cfg.get('get_attention_mask_from_fusion', True), global_sample_mapping=data_cfg.get('global_sample_mapping', False), virtual_tokens=self.virtual_tokens, tokens_to_generate=data_cfg.get( From 860635030526c7709fbb3734c8bedbd2a0a471d1 Mon Sep 17 00:00:00 2001 From: Aleksandr Laptev Date: Wed, 21 Aug 2024 19:46:20 +0700 Subject: [PATCH 030/664] Riva and k2 ASR WFST decoding (2) (#9391) * upload Signed-off-by: Aleksandr Laptev * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add comments and use case Signed-off-by: Aleksandr Laptev * Apply isort and black reformatting Signed-off-by: GNroy * add initial doc Signed-off-by: Aleksandr 
Laptev * fix doc and k2+cuda eval Signed-off-by: Aleksandr Laptev * isolate decoder components installation and fix suggestions Signed-off-by: Aleksandr Laptev * Apply isort and black reformatting Signed-off-by: GNroy * fix trailing newline Signed-off-by: Aleksandr Laptev --------- Signed-off-by: Aleksandr Laptev Signed-off-by: GNroy Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: GNroy Co-authored-by: Vladimir Bataev --- ...sr_language_modeling_and_customization.rst | 63 + .../asr/parts/k2/graph_decoders.py | 352 +++- nemo/collections/asr/parts/k2/utils.py | 118 +- .../asr/parts/submodules/ctc_beam_decoding.py | 340 +++- .../asr/parts/submodules/ctc_decoding.py | 95 +- .../asr/parts/submodules/wfst_decoder.py | 791 +++++++++ .../collections/asr/parts/utils/wfst_utils.py | 1478 +++++++++++++++++ nemo/core/utils/k2_utils.py | 2 +- .../ngram_lm/eval_wfst_decoding_ctc.py | 439 +++++ scripts/installers/install_riva_decoder.sh | 17 + .../asr/test_asr_ctc_encoder_model_bpe.py | 29 + 11 files changed, 3658 insertions(+), 66 deletions(-) create mode 100644 nemo/collections/asr/parts/submodules/wfst_decoder.py create mode 100644 nemo/collections/asr/parts/utils/wfst_utils.py create mode 100644 scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py create mode 100755 scripts/installers/install_riva_decoder.sh diff --git a/docs/source/asr/asr_language_modeling_and_customization.rst b/docs/source/asr/asr_language_modeling_and_customization.rst index d5a748e2379e..02fed8b89760 100644 --- a/docs/source/asr/asr_language_modeling_and_customization.rst +++ b/docs/source/asr/asr_language_modeling_and_customization.rst @@ -547,6 +547,69 @@ The following is the list of the arguments for the opengrm script: | force | bool | ``False`` | Whether to recompile and rewrite all files | +----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ +.. _wfst-ctc-decoding: + +WFST CTC decoding +================= +Weighted Finite-State Transducers (WFST) are finite-state machines with input and output symbols on each transition and some weight element of a semiring. WFSTs can act as N-gram LMs in a special type of LM-forced beam search, called WFST decoding. + +.. note:: + + More precisely, WFST decoding is more of a greedy N-depth search with LM. + Thus, it is asymptotically worse than conventional beam search decoding algorithms, but faster. + +**WARNING** +At the moment, NeMo supports WFST decoding only for CTC models and word-based LMs. + +To run WFST decoding in NeMo, one needs to provide a NeMo ASR model and either an ARPA LM or a WFST LM (advanced). An ARPA LM can be built from source text with KenLM as follows: ``/lmplz -o --arpa --prune ``. + +The script to evaluate an ASR model with WFST decoding and N-gram models can be found at +`scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py +`__. + +This script has a large number of possible argument overrides, therefore it is advised to use ``python eval_wfst_decoding_ctc.py --help`` to see the full list of arguments. + +You may evaluate an ASR model as the following: + +.. code-block:: + + python eval_wfst_decoding_ctc.py nemo_model_file= \ + input_manifest= \ + arpa_model_file= \ + decoding_wfst_file= \ + beam_width=[] \ + lm_weight=[] \ + open_vocabulary_decoding= \ + decoding_mode= \ + decoding_search_type= \ + preds_output_folder= \ + probs_cache_file=null + +.. 
note:: + + Since WFST decoding is LM-forced (the search goes over the WIDEST graph), only word sequences accepted by the WFST can appear in the decoding results. + To circumvent this restriction, one can pass ``open_vocabulary_decoding=true`` (experimental feature). + + +Quick start example +------------------- + +.. code-block:: + + wget -O - https://www.openslr.org/resources/11/3-gram.pruned.1e-7.arpa.gz | \ + gunzip -c | tr '[:upper:]' '[:lower:]' > 3-gram.pruned.1e-7.arpa && \ + python eval_wfst_decoding_ctc.py nemo_model_file="stt_en_conformer_ctc_small_ls" \ + input_manifest="/Librispeech/test_other.json" \ + arpa_model_file="3-gram.pruned.1e-7.arpa" \ + decoding_wfst_file="3-gram.pruned.1e-7.fst" \ + beam_width=[8] \ + lm_weight=[0.5,0.6,0.7,0.8,0.9] + +.. note:: + + Building a decoding WFST is a long process, so it is better to provide a ``decoding_wfst_file`` path even if you don't have it. + This way, the decoding WFST will be buffered to the specified file path and there will be no need to re-build it on the next run. + *************************************************** Context-biasing (word boosting) without external LM diff --git a/nemo/collections/asr/parts/k2/graph_decoders.py b/nemo/collections/asr/parts/k2/graph_decoders.py index 33218588b79f..981025e7c418 100644 --- a/nemo/collections/asr/parts/k2/graph_decoders.py +++ b/nemo/collections/asr/parts/k2/graph_decoders.py @@ -13,14 +13,28 @@ # limitations under the License. from abc import abstractmethod +from collections import defaultdict +from pathlib import Path from typing import List, Optional, Tuple, Union import torch +from jiwer import wer as word_error_rate from omegaconf import DictConfig from nemo.collections.asr.parts.k2.classes import GraphIntersectDenseConfig from nemo.collections.asr.parts.k2.loss_mixins import CtcK2Mixin, RnntK2Mixin -from nemo.collections.asr.parts.k2.utils import invert_permutation, load_graph +from nemo.collections.asr.parts.k2.utils import ( + create_supervision, + invert_permutation, + levenshtein_graph_k2, + load_graph, +) +from nemo.collections.asr.parts.submodules.wfst_decoder import ( + AbstractWFSTDecoder, + WfstNbestHypothesis, + collapse_tokenword_hypotheses, +) +from nemo.core.utils.k2_guard import k2 from nemo.utils import logging @@ -121,7 +135,8 @@ def _decode_impl( return lats else: shortest_path_fsas = k2.index_fsa( - k2.shortest_path(lats, True), invert_permutation(order).to(device=log_probs.device), + k2.shortest_path(lats, True), + invert_permutation(order).to(device=log_probs.device), ) return self._extract_labels_and_probabilities(shortest_path_fsas, return_ilabels, output_aligned) @@ -336,3 +351,336 @@ def update_graph(self, graph: 'k2.Fsa'): self.num_classes, self.blank, self.topo_type, self.topo_with_self_loops, self.device, token_lm ) self.base_graph = k2.create_fsa_vec([self.graph_compiler.base_graph]).to(self.device) + + +class K2WfstDecoder(AbstractWFSTDecoder): + """ + Used for performing WFST decoding of the logprobs with the k2 WFST decoder. + + Args: + lm_fst: + Kaldi-type language model WFST or its path. + + decoding_mode: + Decoding mode. Choices: `nbest`, `lattice`. + + beam_size: + Beam width (float) for the WFST decoding. + + config: + Riva Decoder config. + + tokenword_disambig_id: + Tokenword disambiguation index. Set to -1 to disable the tokenword mode. + + lm_weight: + Language model weight in decoding. + + nbest_size: + N-best size for decoding_mode == `nbest` + + device: + Device for running decoding. Choices: `cuda`, `cpu`. 
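+
+    A minimal usage sketch (illustrative only; `TLG.pt` is an assumed path to a k2 decoding
+    graph saved with `torch.save`, and `log_probs`/`log_probs_length` come from a CTC model)::
+
+        decoder = K2WfstDecoder("TLG.pt", decoding_mode="nbest", nbest_size=1, device="cuda")
+        hypotheses = decoder.decode(log_probs, log_probs_length)
+        best_words = hypotheses[0][0].words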
+ """ + + def __init__( + self, + lm_fst: Union['k2.Fsa', Path, str], + decoding_mode: str = 'nbest', + beam_size: float = 10.0, + config: Optional[GraphIntersectDenseConfig] = None, + tokenword_disambig_id: int = -1, + lm_weight: float = 1.0, + nbest_size: int = 1, + device: str = "cuda", + ): + self._nbest_size = nbest_size + self._device = device + super().__init__(lm_fst, decoding_mode, beam_size, config, tokenword_disambig_id, lm_weight) + + def _set_decoder_config(self, config: Optional[GraphIntersectDenseConfig] = None): + if config is None: + config = GraphIntersectDenseConfig() + config.search_beam = 20.0 + config.output_beam = self._beam_size + config.max_active_states = 10000 + self._config = config + + def _set_decoding_mode(self, decoding_mode: str): + if decoding_mode not in ('nbest', 'lattice'): + raise ValueError(f"Unsupported mode: {decoding_mode}") + self._decoding_mode = decoding_mode + + @torch.inference_mode(False) + def _init_decoder(self): + lm_fst = load_graph(self._lm_fst) if isinstance(self._lm_fst, (Path, str)) else self._lm_fst.clone() + lm_fst.lm_scores = lm_fst.scores.clone() + self._lm_fst = lm_fst.to(device=self._device) + + if self._id2word is None: + self._id2word = { + int(line.split()[1]): line.split()[0] + for line in self._lm_fst.aux_labels_sym.to_str().strip().split("\n") + } + word2id = self._id2word.__class__(map(reversed, self._id2word.items())) + word_unk_id = word2id[""] + self._word2id = defaultdict(lambda: word_unk_id) + for k, v in word2id.items(): + self._word2id[k] = v + if self._id2token is None: + self._id2token = { + int(line.split()[1]): line.split()[0] for line in self._lm_fst.labels_sym.to_str().strip().split("\n") + } + token2id = self._id2token.__class__(map(reversed, self._id2token.items())) + token_unk_id = token2id[""] + self._token2id = defaultdict(lambda: token_unk_id) + for k, v in token2id.items(): + self._token2id[k] = v + + def _beam_size_setter(self, value: float): + if self._beam_size != value: + self._config.output_beam = value + self._beam_size = value + + def _lm_weight_setter(self, value: float): + if self._lm_weight != value: + self._lm_weight = value + + @property + def nbest_size(self): + return self._nbest_size + + @nbest_size.setter + def nbest_size(self, value: float): + self._nbest_size_setter(value) + + def _nbest_size_setter(self, value: float): + if self._nbest_size != value: + self._nbest_size = value + + def _decoding_mode_setter(self, value: str): + if self._decoding_mode != value: + self._set_decoding_mode(value) + + @torch.inference_mode(False) + def _decode_lattice(self, emissions_fsas: 'k2.DenseFsaVec', order: torch.Tensor) -> 'k2.Fsa': + """ + Decodes logprobs into k2-type lattices. + + Args: + emissions_fsas: + A k2.DenseFsaVec of the predicted log-probabilities. + order: + A torch.Tensor that stores the order of the emissions_fsas elements. + + Returns: + k2-type FsaVec. 
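+
+        Input preparation sketch (this mirrors what `decode` does before calling this helper)::
+
+            supervisions = create_supervision(log_probs_length)
+            order = supervisions[:, 0]
+            emissions_fsas = k2.DenseFsaVec(log_probs.to(device=self._device), supervisions)
+            lats = self._decode_lattice(emissions_fsas, order)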
+ """ + lats = k2.intersect_dense_pruned( + a_fsas=self._lm_fst, + b_fsas=emissions_fsas, + search_beam=self._config.search_beam, + output_beam=self._config.output_beam, + min_active_states=self._config.min_active_states, + max_active_states=self._config.max_active_states, + frame_idx_name="frame_idx", + allow_partial=True, + ) + lats = k2.connect(k2.expand_ragged_attributes(lats)) + lats.am_scores = lats.scores - lats.lm_scores + if self._lm_weight != 1.0: + lats.scores = lats.am_scores + self._lm_weight * lats.lm_scores + # just in case + lats.__dict__["_properties"] = None + return k2.index_fsa(lats, invert_permutation(order).to(device=self._device)) + + @torch.inference_mode(False) + def decode( + self, log_probs: torch.Tensor, log_probs_length: torch.Tensor + ) -> Union[List[WfstNbestHypothesis], List['k2.Fsa']]: + """ + Decodes logprobs into recognition hypotheses. + + Args: + log_probs: + A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary]. + + log_probs_length: + A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements. + + Returns: + List of recognition hypotheses. + """ + supervisions = create_supervision(log_probs_length) + order = supervisions[:, 0] + emissions_fsas = k2.DenseFsaVec(log_probs.to(device=self._device), supervisions) + lats = self._decode_lattice(emissions_fsas, order) + hypotheses = self._post_decode(lats) + return hypotheses + + @torch.inference_mode(False) + def _post_decode(self, hypotheses: 'k2.Fsa') -> Union[List[WfstNbestHypothesis], List['k2.Fsa']]: + """ + Does various post-processing of the recognition hypotheses. + + Args: + hypotheses: + FsaVec of k2-type lattices. + + Returns: + List of processed recognition hypotheses. + """ + if self._decoding_mode == 'nbest': + hypotheses_fsa = hypotheses + hypotheses = [] + if self._nbest_size == 1: + shortest_path_fsas = k2.shortest_path(hypotheses_fsa, True) + scores = shortest_path_fsas.get_tot_scores(True, False).tolist() + # direct iterating does not work as expected + for i in range(shortest_path_fsas.shape[0]): + fsa = shortest_path_fsas[i] + non_eps_mask = fsa.aux_labels > 0 + words = [self._id2word[l] for l in fsa.aux_labels[non_eps_mask].tolist()] + alignment = fsa.labels[fsa.labels > 0].tolist() + # some timesteps may be 0 if self.open_vocabulary_decoding + timesteps = fsa.frame_idx[non_eps_mask] + timesteps_left = timesteps[:-1] + timesteps_right = timesteps[1:] + timesteps_right_zero_mask = timesteps_right == 0 + timesteps_right[timesteps_right_zero_mask] = timesteps_left[timesteps_right_zero_mask] + timesteps[1:] = timesteps_right + timesteps = timesteps.tolist() + hypotheses.append( + WfstNbestHypothesis( + tuple( + [ + tuple([tuple(words), tuple(timesteps), tuple(alignment), -scores[i]]), + ] + ) + ) + ) + else: + nbest_fsas = k2.Nbest.from_lattice(hypotheses_fsa, self._nbest_size) + nbest_fsas.fsa.frame_idx = k2.index_select(hypotheses_fsa.frame_idx, nbest_fsas.kept_path.values) + scores = nbest_fsas.fsa.get_tot_scores(True, False).tolist() + nbest_hypothesis_list = [[] for _ in range(nbest_fsas.shape.dim0)] + for i, j in enumerate(nbest_fsas.shape.row_ids(1)): + fsa = nbest_fsas.fsa[i] + non_eps_mask = fsa.aux_labels > 0 + words = [self._id2word[l] for l in fsa.aux_labels[non_eps_mask].tolist()] + alignment = fsa.labels[fsa.labels > 0].tolist() + # some timesteps may be 0 if self.open_vocabulary_decoding + timesteps = fsa.frame_idx[non_eps_mask] + timesteps_left = timesteps[:-1] + timesteps_right = timesteps[1:] + 
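+                    # the masked assignment below replaces zero frame indices with the
+                    # frame index of the preceding word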
timesteps_right_zero_mask = timesteps_right == 0 + timesteps_right[timesteps_right_zero_mask] = timesteps_left[timesteps_right_zero_mask] + timesteps[1:] = timesteps_right + timesteps = timesteps.tolist() + nbest_hypothesis_list[j].append( + tuple([tuple(words), tuple(timesteps), tuple(alignment), -scores[i]]) + ) + for nbest_hypothesis in nbest_hypothesis_list: + hypotheses.append(WfstNbestHypothesis(tuple(nbest_hypothesis))) + return ( + collapse_tokenword_hypotheses(hypotheses, self._id2word[self._tokenword_disambig_id]) + if self._open_vocabulary_decoding + else hypotheses + ) + else: + return [hypotheses[i].to(device="cpu") for i in range(len(hypotheses))] + + @torch.inference_mode(False) + def calibrate_lm_weight( + self, log_probs: torch.Tensor, log_probs_length: torch.Tensor, reference_texts: List[str] + ) -> Tuple[float, float]: + """ + Calibrates LM weight to achieve the best WER for given logprob-text pairs. + + Args: + log_probs: + A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary]. + + log_probs_length: + A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements. + + reference_texts: + List of reference word sequences. + + Returns: + Pair of (best_lm_weight, best_wer). + """ + assert len(log_probs) == len(reference_texts) + decoding_mode_backup = self.decoding_mode + lm_weight_backup = self.lm_weight + nbest_size_backup = self.nbest_size + self.decoding_mode = "lattice" + lattices = self.decode(log_probs, log_probs_length) + best_lm_weight, best_wer = -1.0, float('inf') + self.decoding_mode = "nbest" + self.nbest_size = 1 + for lm_weight in range(1, 21): # enough for most cases + lm_weight_act = lm_weight / 10 + for lat in lattices: + lat.scores = lat.am_scores + lm_weight_act * lat.lm_scores + hypotheses = self._post_decode(lattices) + wer = word_error_rate([" ".join(h[0].words) for h in hypotheses], reference_texts) + if wer < best_wer: + best_lm_weight, best_wer = lm_weight_act, wer + self.nbest_size = nbest_size_backup + self.decoding_mode = decoding_mode_backup + self.lm_weight = lm_weight_backup + return best_lm_weight, best_wer + + @torch.inference_mode(False) + def calculate_oracle_wer( + self, log_probs: torch.Tensor, log_probs_length: torch.Tensor, reference_texts: List[str] + ) -> Tuple[float, List[float]]: + """ + Calculates the oracle (the best possible WER for given logprob-text pairs. + + Args: + log_probs: + A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary]. + + log_probs_length: + A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements. + + reference_texts: + List of reference word sequences. + + Returns: + Pair of (oracle_wer, oracle_wer_per_utterance). 
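+
+            A value of `(-1.0, [])` is returned if the oracle alignment could not be built.
+
+        Illustrative call on a `K2WfstDecoder` instance `decoder` (`log_probs`,
+        `log_probs_length` and `reference_texts` are assumed to describe the same batch)::
+
+            oracle_wer, oracle_wer_per_utt = decoder.calculate_oracle_wer(
+                log_probs, log_probs_length, reference_texts
+            )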
+ """ + if self._open_vocabulary_decoding: + raise NotImplementedError + assert len(log_probs) == len(reference_texts) + word_ids = [[self._word2id[w] for w in text.split()] for text in reference_texts] + counts = torch.tensor([len(wid) for wid in word_ids]) + decoding_mode_backup = self.decoding_mode + self.decoding_mode = "lattice" + lattices = self.decode(log_probs, log_probs_length) + oracle_disambig = max(self._id2word.keys()) + 1 + lattices.aux_labels[lattices.aux_labels == 0] = oracle_disambig + lattices = lattices.invert() + delattr(lattices, 'aux_labels') + hyps = levenshtein_graph_k2(lattices).invert() + refs = levenshtein_graph_k2(k2.linear_fsa(word_ids)) + refs, arc_map = k2.add_epsilon_self_loops(refs, ret_arc_map=True) + labels = refs.labels.clone() + labels[arc_map == -1] = oracle_disambig + refs.labels = labels + refs.__dict__["_properties"] = None + refs = k2.arc_sort(refs) + ali_lats = k2.compose(hyps, refs, treat_epsilons_specially=False) + ali_lats = k2.remove_epsilon_self_loops(ali_lats) + # TODO: find out why it fails for some utterances + try: + alignment = k2.shortest_path(ali_lats, use_double_scores=True) + except RuntimeError as e: + logging.warning("calculate_oracle_wer failed") + return -1.0, [] + scores = -alignment.get_tot_scores(True, True).to(dtype=torch.int64) + wer_per_utt = scores / counts + self.decoding_mode = decoding_mode_backup + return (scores.sum() / counts.sum()).item(), wer_per_utt.tolist() diff --git a/nemo/collections/asr/parts/k2/utils.py b/nemo/collections/asr/parts/k2/utils.py index f55620a81356..eca2b2379b43 100644 --- a/nemo/collections/asr/parts/k2/utils.py +++ b/nemo/collections/asr/parts/k2/utils.py @@ -42,7 +42,12 @@ def create_supervision(input_lengths: torch.Tensor) -> torch.Tensor: These supervisions are required for some k2 methods. """ supervisions = torch.stack( - (torch.tensor(range(input_lengths.shape[0])), torch.zeros(input_lengths.shape[0]), input_lengths.cpu(),), 1, + ( + torch.tensor(range(input_lengths.shape[0])), + torch.zeros(input_lengths.shape[0]), + input_lengths.cpu(), + ), + 1, ).to(dtype=torch.int32) # the duration column has to be sorted in decreasing order return supervisions[torch.argsort(supervisions[:, -1], descending=True)] @@ -50,7 +55,7 @@ def create_supervision(input_lengths: torch.Tensor) -> torch.Tensor: def invert_permutation(indices: torch.Tensor) -> torch.Tensor: """Produces a tensor of reverse permutation for a given indices. - + Based on https://github.com/k2-fsa/snowfall/blob/master/snowfall/common.py """ ans = torch.zeros(indices.shape, device=indices.device, dtype=indices.dtype) @@ -59,8 +64,7 @@ def invert_permutation(indices: torch.Tensor) -> torch.Tensor: def make_non_pad_mask(input_lengths: torch.Tensor, seq_len: int): - """Converts input_lengths to a non-padding mask. The mask is 2D. - """ + """Converts input_lengths to a non-padding mask. The mask is 2D.""" batch_size = input_lengths.shape[0] seq_range = torch.arange(0, seq_len, device=input_lengths.device) seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, seq_len) @@ -72,8 +76,7 @@ def make_non_pad_mask(input_lengths: torch.Tensor, seq_len: int): def make_non_pad_mask_3d( lengths_x: torch.Tensor, lengths_y: torch.Tensor, max_length_x: int, max_length_y: int ) -> torch.Tensor: - """Converts two orthogonal input_lengths to a non-padding mask. The mask is 3D. - """ + """Converts two orthogonal input_lengths to a non-padding mask. 
The mask is 3D.""" assert lengths_x.size() == lengths_y.size() return make_non_pad_mask(lengths_x, max_length_x).unsqueeze(2) & make_non_pad_mask( lengths_y, max_length_y @@ -81,8 +84,7 @@ def make_non_pad_mask_3d( def ragged_to_tensor_2axes_simple(rt: k2.RaggedTensor) -> Optional[torch.Tensor]: - """Converts k2.RaggedTensor to torch.Tensor if the RaggedTensor is shallow (has two axes). - """ + """Converts k2.RaggedTensor to torch.Tensor if the RaggedTensor is shallow (has two axes).""" rt_list = rt.tolist() result_list = [] for e in rt_list: @@ -96,8 +98,7 @@ def ragged_to_tensor_2axes_simple(rt: k2.RaggedTensor) -> Optional[torch.Tensor] def load_graph(graph_path: str) -> 'k2.Fsa': - """Fsa graph loading helper function. Loads graphs stored in different formats. - """ + """Fsa graph loading helper function. Loads graphs stored in different formats.""" if os.path.exists(graph_path): errors = [] try: @@ -122,8 +123,7 @@ def load_graph(graph_path: str) -> 'k2.Fsa': def intersect_with_self_loops(base_graph: 'k2.Fsa', aux_graph: 'k2.Fsa') -> 'k2.Fsa': - """Intersection helper function. - """ + """Intersection helper function.""" assert hasattr(base_graph, "aux_labels") assert not hasattr(aux_graph, "aux_labels") aux_graph_with_self_loops = k2.arc_sort(k2.add_epsilon_self_loops(aux_graph)).to(base_graph.device) @@ -133,8 +133,7 @@ def intersect_with_self_loops(base_graph: 'k2.Fsa', aux_graph: 'k2.Fsa') -> 'k2. def compose_with_self_loops(base_graph: 'k2.Fsa', aux_graph: 'k2.Fsa') -> 'k2.Fsa': - """Composition helper function. - """ + """Composition helper function.""" aux_graph_with_self_loops = k2.arc_sort(k2.add_epsilon_self_loops(aux_graph)).to(base_graph.device) return k2.compose(base_graph, aux_graph_with_self_loops, treat_epsilons_specially=False, inner_labels="phones") @@ -145,13 +144,16 @@ def create_sparse_wrapped( size: Optional[Union[Tuple[int, int], Tuple[int, int, int]]] = None, min_col_index: Optional[int] = None, ) -> torch.Tensor: - """Wraps up k2.create_sparse to create 2- or 3-dimensional sparse tensors. - """ + """Wraps up k2.create_sparse to create 2- or 3-dimensional sparse tensors.""" assert size is None or len(indices) == len(size) if len(indices) == 2: return k2.create_sparse( - rows=indices[0], cols=indices[1], values=values, size=size, min_col_index=min_col_index, + rows=indices[0], + cols=indices[1], + values=values, + size=size, + min_col_index=min_col_index, ) elif len(indices) == 3: assert indices[0].ndim == indices[1].ndim == indices[2].ndim == 1 @@ -164,28 +166,43 @@ def create_sparse_wrapped( values = values[kept_indices] if size is not None: return torch.sparse_coo_tensor( - torch.stack(indices), values, size=size, device=values.device, requires_grad=values.requires_grad, + torch.stack(indices), + values, + size=size, + device=values.device, + requires_grad=values.requires_grad, ) else: return torch.sparse_coo_tensor( - torch.stack(indices), values, device=values.device, requires_grad=values.requires_grad, + torch.stack(indices), + values, + device=values.device, + requires_grad=values.requires_grad, ) else: raise ValueError(f"len(indices) = {len(indices)}") def prep_padded_densefsavec(log_softmax: torch.Tensor, supervisions: torch.Tensor) -> 'k2.DenseFsaVec': - """Performs special epsilon-padding required for composition with some of the topologies. 
- """ + """Performs special epsilon-padding required for composition with some of the topologies.""" log_softmax_eps = torch.cat( [ log_softmax, - torch.full((log_softmax.shape[0], log_softmax.shape[1], 1), -float("inf"), device=log_softmax.device,), + torch.full( + (log_softmax.shape[0], log_softmax.shape[1], 1), + -float("inf"), + device=log_softmax.device, + ), ], axis=-1, ) log_softmax_padded = torch.zeros( - (log_softmax_eps.shape[0], log_softmax_eps.shape[1] * 2, log_softmax_eps.shape[2],), device=log_softmax.device, + ( + log_softmax_eps.shape[0], + log_softmax_eps.shape[1] * 2, + log_softmax_eps.shape[2], + ), + device=log_softmax.device, ) log_softmax_padded[:, ::2] = log_softmax_eps supervisions_padded = supervisions.clone() @@ -235,8 +252,7 @@ def add_self_loops(graph: 'k2.Fsa', label: int = 0, mode: str = "auto"): def get_arc_weights(graph: 'k2.Fsa') -> torch.Tensor: - """Returns 1d torch.Tensor with arc weights of a given graph. - """ + """Returns 1d torch.Tensor with arc weights of a given graph.""" if len(graph.shape) > 2: raise NotImplementedError("FsaVec is not supported at the moment.") weights_int = graph.arcs.values()[:, -1].tolist() @@ -254,7 +270,7 @@ def get_tot_objf_and_finite_mask(tot_scores: torch.Tensor, reduction: str) -> Tu Returns: Returns a tuple of 2 scalar tensors: (tot_score, finite_mask) where finite_mask is a tensor containing successful segment mask. - + Based on get_tot_objf_and_num_frames from https://github.com/k2-fsa/snowfall/blob/master/snowfall/objectives/common.py """ @@ -324,3 +340,53 @@ def apply_rnnt_prune_ranges( index=ranges.reshape((B, T, window_size_with_blank, 1)).expand((B, T, window_size_with_blank, D2)), ) return encoder_outputs_pruned, decoder_outputs_pruned + + +def levenshtein_graph_k2(fsa: 'k2.Fsa', ins_del_score: float = -0.501) -> 'k2.Fsa': + """Construct the levenshtein graph from a k2-type WFST or a lattice. + + See also levenshtein_graph from k2. + + Args: + fst: + K2-type source WFST or lattice. + + ins_del_score: + Insertion and deletion penalty. + Should be more than 0.5 for substitutions to be preferred over insertions/deletions, or less otherwise. + + Returns: + K2-type levenshtein WFST. 
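+
+    A rough usage sketch, following `K2WfstDecoder.calculate_oracle_wer` (`lattices`
+    and `word_ids` are assumed to be prepared by the caller)::
+
+        hyps = levenshtein_graph_k2(lattices).invert()
+        refs = levenshtein_graph_k2(k2.linear_fsa(word_ids))
+        # composing hyps with refs (after adding epsilon self-loops to refs) gives
+        # alignment lattices whose shortest-path scores encode the edit distances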
+ """ + sub_score = -0.5 + sub_score_int = struct.unpack('@i', struct.pack('@f', sub_score))[0] + arcs = fsa.arcs.values() + final_indices = (fsa.labels == -1).nonzero() + template_mask = ~torch.zeros(len(arcs) * 2, dtype=bool) + no_duplicate_final_mask = template_mask.clone() + no_duplicate_final_mask[final_indices * 2 + 1] = False + new_mask = ~template_mask + new_mask[1::2] = True + new_mask = new_mask[no_duplicate_final_mask] + duplicate_indices = torch.arange(len(arcs)).repeat_interleave(2)[no_duplicate_final_mask] + new_arcs = arcs[duplicate_indices] + new_arcs[:, -1] = torch.where(new_mask, sub_score_int, 0) + if len(fsa.shape) == 3: + new_shape, _ = fsa.arcs.shape().index(2, duplicate_indices.to(dtype=torch.int32)) + # apparently k2 does not support indexing RaggedArc with RaggedShape + new_splits = new_shape.row_splits(2)[new_shape.row_splits(1)] + levenshtein_fsa = k2.create_fsa_vec([k2.Fsa(new_arcs[i:j]) for i, j in zip(new_splits[:-1], new_splits[1:])]) + else: + levenshtein_fsa = k2.Fsa(new_arcs) + levenshtein_fsa.aux_labels = levenshtein_fsa.labels.clone() + labels = levenshtein_fsa.labels.clone() + labels[new_mask] = 0 + levenshtein_fsa.labels = labels + levenshtein_fsa.__dict__["_properties"] = None + levenshtein_fsa, arc_map = k2.add_epsilon_self_loops(levenshtein_fsa, ret_arc_map=True) + scores = levenshtein_fsa.scores.clone() + scores[arc_map == -1] = ins_del_score + levenshtein_fsa.scores = scores + levenshtein_fsa.__dict__["_properties"] = None + levenshtein_fsa = k2.arc_sort(levenshtein_fsa) + return levenshtein_fsa diff --git a/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py b/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py index 5ed504fd9c45..0beab5f54cb1 100644 --- a/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py @@ -19,6 +19,8 @@ import torch +from nemo.collections.asr.parts.k2.classes import GraphIntersectDenseConfig +from nemo.collections.asr.parts.submodules.wfst_decoder import RivaDecoderConfig from nemo.collections.asr.parts.utils import rnnt_utils from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.core.classes import Typing, typecheck @@ -29,7 +31,8 @@ def pack_hypotheses( - hypotheses: List[rnnt_utils.NBestHypotheses], logitlen: torch.Tensor, + hypotheses: List[rnnt_utils.NBestHypotheses], + logitlen: torch.Tensor, ) -> List[rnnt_utils.NBestHypotheses]: if logitlen is not None: @@ -51,6 +54,39 @@ def pack_hypotheses( return hypotheses +def pack_wfst_hypotheses( + hypotheses: List['WfstNbestHypothesis'], + logits: torch.Tensor, + logitlen: torch.Tensor, +) -> List[rnnt_utils.NBestHypotheses]: + + logitlen_cpu = logitlen.to('cpu') + + new_hypotheses = [] + for idx, nbest_hyp in enumerate(hypotheses): # type: WfstNbestHypothesis + new_hyp = [] + y_sequence = logits[idx, : logitlen[idx]].to('cpu') + length = logitlen_cpu[idx] + for candidate_idx, cand in enumerate(nbest_hyp): + cand_hyp = rnnt_utils.Hypothesis( + y_sequence=[], + score=cand.score, + text=" ".join(cand.words), + timestep=list(cand.timesteps), + alignments=list(cand.alignment), + ) + cand_hyp.y_sequence = y_sequence + + if logitlen is not None: + cand_hyp.length = length + + new_hyp.append(cand_hyp) + + new_hypotheses.append(rnnt_utils.NBestHypotheses(new_hyp)) + + return new_hypotheses + + def _states_to_device(dec_state, device='cpu'): if torch.is_tensor(dec_state): dec_state = dec_state.to(device) @@ -74,8 +110,7 @@ class AbstractBeamCTCInfer(Typing): @property 
def input_types(self): - """Returns definitions of module input ports. - """ + """Returns definitions of module input ports.""" return { "decoder_output": NeuralType(('B', 'T', 'D'), LogprobsType()), "decoder_lengths": NeuralType(tuple('B'), LengthsType()), @@ -83,8 +118,7 @@ def input_types(self): @property def output_types(self): - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return {"predictions": [NeuralType(elements_type=HypothesisType())]} def __init__(self, blank_id: int, beam_size: int): @@ -147,7 +181,9 @@ def set_tokenizer(self, tokenizer: TokenizerSpec): @typecheck() def forward( - self, decoder_output: torch.Tensor, decoder_lengths: torch.Tensor, + self, + decoder_output: torch.Tensor, + decoder_lengths: torch.Tensor, ) -> Tuple[List[Union[rnnt_utils.Hypothesis, rnnt_utils.NBestHypotheses]]]: """Returns a list of hypotheses given an input batch of the encoder hidden embedding. Output token is generated auto-repressively. @@ -246,7 +282,9 @@ def __init__( @typecheck() def forward( - self, decoder_output: torch.Tensor, decoder_lengths: torch.Tensor, + self, + decoder_output: torch.Tensor, + decoder_lengths: torch.Tensor, ) -> Tuple[List[Union[rnnt_utils.Hypothesis, rnnt_utils.NBestHypotheses]]]: """Returns a list of hypotheses given an input batch of the encoder hidden embedding. Output token is generated auto-repressively. @@ -568,6 +606,276 @@ def set_decoding_type(self, decoding_type: str): self.token_offset = DEFAULT_TOKEN_OFFSET +class WfstCTCInfer(AbstractBeamCTCInfer): + """A WFST-based beam CTC decoder. + + Provides a common abstraction for sample level and batch level beam decoding. + + Args: + TBD + + """ + + def __init__( + self, + blank_id: int, + beam_size: int, + search_type: str = "riva", # 'riva', 'k2' + return_best_hypothesis: bool = True, + preserve_alignments: bool = False, + compute_timestamps: bool = False, + decoding_mode: str = 'nbest', # 'nbest', 'mbr' ('mbr' works only for search_type == 'riva' and beam_size == 1) + open_vocabulary_decoding: bool = False, + beam_width: float = 10.0, + lm_weight: float = 1.0, + device: str = "cuda", + arpa_lm_path: str = None, + wfst_lm_path: str = None, + riva_decoding_cfg: Optional['RivaDecoderConfig'] = None, + k2_decoding_cfg: Optional['GraphIntersectDenseConfig'] = None, + ): + super().__init__(blank_id=blank_id, beam_size=beam_size) + + self.search_type = search_type + self.return_best_hypothesis = return_best_hypothesis + self.preserve_alignments = preserve_alignments + self.compute_timestamps = compute_timestamps + + self.decoding_algorithm = None + if search_type in ("default", "riva"): + self.decoding_algorithm = self._riva_decoding + elif search_type == "k2": + self.decoding_algorithm = self._k2_decoding + + # Log the WFST search_type + logging.info(f"WFST beam search search_type: {search_type}") + self.search_type = search_type + + if beam_size > 1 and decoding_mode != 'nbest': + logging.warning( + f"`beam_size` > 1 is supported only for `decoding_mode` == `nbest`\n" + f"(provided: `{decoding_mode}`).\n" + f"`beam_size` rewritten as 1" + ) + self.beam_size = 1 + self.decoding_mode = decoding_mode + + self.open_vocabulary_decoding = open_vocabulary_decoding + self._tokenword_disambig_id = -1 + self.beam_width = beam_width + self.lm_weight = lm_weight + self.device = device + + # Default beam search args + self.arpa_lm_path = arpa_lm_path + self.wfst_lm_path = wfst_lm_path + + self.riva_decoding_cfg = riva_decoding_cfg + self.k2_decoding_cfg = 
k2_decoding_cfg + + # Default beam search scorer functions + self.riva_decoder = None + self.k2_decoder = None + + @typecheck() + def forward( + self, + decoder_output: torch.Tensor, + decoder_lengths: torch.Tensor, + ) -> Tuple[List[Union[rnnt_utils.Hypothesis, rnnt_utils.NBestHypotheses]]]: + """Returns a list of hypotheses given an input batch of the encoder hidden embedding. + Output token is generated auto-repressively. + + Args: + decoder_output: A tensor of size (batch, timesteps, features). + decoder_lengths: list of int representing the length of each sequence + output sequence. + + Returns: + packed list containing batch number of sentences (Hypotheses). + """ + if self.vocab is None: + raise RuntimeError("Please set the vocabulary with `set_vocabulary()` before calling this function.") + + if self.decoding_type != 'subword': + raise ValueError( + f"`decoding_type` other than `subword` is not supported. Provided: `{self.decoding_type}`" + ) + elif self.tokenizer is None: + raise ValueError("Tokenizer must be provided for subword decoding. Use set_tokenizer().") + if self.decoding_algorithm is None: + raise NotImplementedError( + f"The decoding search_type ({self.search_type}) supplied is not supported!\n" + f"Please use one of : (default, riva, k2)" + ) + + with torch.no_grad(), torch.inference_mode(): + # Process each sequence independently + prediction_tensor = decoder_output + + if prediction_tensor.ndim != 3: + raise ValueError( + f"`decoder_output` must be a tensor of shape [B, T, V] (log probs, float). " + f"Provided shape = {prediction_tensor.shape}" + ) + + hypotheses = self.decoding_algorithm(prediction_tensor, decoder_lengths) + + # Pack results into Hypotheses + packed_result = pack_wfst_hypotheses(hypotheses, prediction_tensor, decoder_lengths) + + # Pack the result + if self.return_best_hypothesis and isinstance(packed_result[0], rnnt_utils.NBestHypotheses): + packed_result = [res.n_best_hypotheses[0] for res in packed_result] # type: Hypothesis + + return (packed_result,) + + def _prepare_decoding_lm_wfst(self) -> Union[str, 'kaldifst.StdFst', 'k2.Fsa']: + """TBD""" + arpa_lm_path_exists = self.arpa_lm_path is not None and os.path.exists(self.arpa_lm_path) + wfst_lm_path_exists = self.wfst_lm_path is not None and os.path.exists(self.wfst_lm_path) + lm_fst = None + if wfst_lm_path_exists: + if self.search_type == "riva" and not self.wfst_lm_path.endswith(".fst"): + raise ValueError( + f"Search type `riva` expects WFSTs in the `.fst` format. Provided: `{self.wfst_lm_path}`" + ) + if self.search_type == "k2" and not self.wfst_lm_path.endswith(".pt"): + raise ValueError( + f"Search type `k2` expects WFSTs in the `.pt` format. Provided: `{self.wfst_lm_path}`" + ) + if arpa_lm_path_exists: + logging.warning( + "Both `arpa_lm_path` and `wfst_lm_path` are provided and not empty. The latter will be used." + ) + lm_fst = self.wfst_lm_path + elif not arpa_lm_path_exists: + raise FileNotFoundError( + f"Arpa LM file not found at `{self.arpa_lm_path}` and WFST LM is not found at `{self.wfst_lm_path}`.\n" + f"Please set a valid path in the decoding config for at least one of those." + ) + else: + logging.warning( + f"Since WFST LM is not found at `{self.wfst_lm_path}`, " + f"it will be made from the Arpa LM at `{self.arpa_lm_path}`.\n" + f"This procedure will take some time." 
+ ) + if self.wfst_lm_path is not None: + logging.info(f"WFST LM will be buffered at `{self.wfst_lm_path}`.") + write_tlg_path = self.wfst_lm_path + else: + logging.warning("Consider providing a write-permitted `wfst_lm_path` for WFST LM buffering.") + write_tlg_path = None + ctc_topology = "default" # there is no way to indicate the need of other topologies + target = "kaldi" if self.search_type == "riva" else "k2" + + from nemo.collections.asr.parts.utils.wfst_utils import mkgraph_ctc_ov + + lm_fst, tokenword_disambig_id = mkgraph_ctc_ov( + tokenizer=self.tokenizer, + lm_path=self.arpa_lm_path, + topology_name=ctc_topology, + write_tlg_path=write_tlg_path, + open_vocabulary=self.open_vocabulary_decoding, + target=target, + ) + self._tokenword_disambig_id = tokenword_disambig_id + + return lm_fst + + @torch.no_grad() + def _riva_decoding(self, x: torch.Tensor, out_len: torch.Tensor) -> List['WfstNbestHypothesis']: + """ + Riva Asrlib WFST decoder Algorithm. + + Args: + x: Tensor of shape [B, T, V+1], where B is the batch size, T is the maximum sequence length, + and V is the vocabulary size. The tensor contains log-probabilities. + out_len: Tensor of shape [B], contains lengths of each sequence in the batch. + + Returns: + A list of WfstNbestHypothesis objects, one for each sequence in the batch. + """ + if self.riva_decoder is None: + lm_fst = self._prepare_decoding_lm_wfst() + if self.open_vocabulary_decoding and self._tokenword_disambig_id == -1: + # trying to extract tokenword_disambig_id from the lm_fst + if isinstance(lm_fst, str): + # use importer instead of direct import to possibly get an installation message + from nemo.collections.asr.parts.utils.wfst_utils import kaldifst_importer + + kaldifst = kaldifst_importer() + lm_fst = kaldifst.StdVectorFst.read(self.wfst_lm_path) + tokenword_disambig_id = lm_fst.output_symbols.find("#1") + if tokenword_disambig_id == -1: + raise ValueError( + "Cannot determine `tokenword_disambig_id` " + "which is required if `open_vocabulary_decoding` == True" + ) + self._tokenword_disambig_id = tokenword_disambig_id + if not self.device.startswith("cuda"): + raise ValueError(f"Riva decoder does not support non-cuda device. Provided: `{self.device}`") + + from nemo.collections.asr.parts.submodules.wfst_decoder import RivaGpuWfstDecoder + + self.riva_decoder = RivaGpuWfstDecoder( + lm_fst=lm_fst, + decoding_mode=self.decoding_mode, + beam_size=self.beam_width, + config=self.riva_decoding_cfg, + tokenword_disambig_id=self._tokenword_disambig_id, + lm_weight=self.lm_weight, + nbest_size=self.beam_size, + ) + + return self.riva_decoder.decode(x.to(device=self.device), out_len.to(device=self.device)) + + @torch.no_grad() + def _k2_decoding(self, x: torch.Tensor, out_len: torch.Tensor) -> List['WfstNbestHypothesis']: + """ + K2 WFST decoder Algorithm. + + Args: + x: Tensor of shape [B, T, V+1], where B is the batch size, T is the maximum sequence length, + and V is the vocabulary size. The tensor contains log-probabilities. + out_len: Tensor of shape [B], contains lengths of each sequence in the batch. + + Returns: + A list of WfstNbestHypothesis objects, one for each sequence in the batch. 
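+
+        Note:
+            If `wfst_lm_path` is set, search type `k2` expects the graph to be
+            stored in the `.pt` format; otherwise the WFST is built from
+            `arpa_lm_path` on the first call (see `_prepare_decoding_lm_wfst`).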
+ """ + if self.k2_decoder is None: + lm_fst = self._prepare_decoding_lm_wfst() + if self.open_vocabulary_decoding and self._tokenword_disambig_id == -1: + if isinstance(lm_fst, str): + from nemo.collections.asr.parts.k2.utils import load_graph + + with torch.inference_mode(False): + lm_fst = load_graph(lm_fst) + try: + tokenword_disambig_id = lm_fst.aux_labels_sym.get("#1") + self._tokenword_disambig_id = tokenword_disambig_id + except KeyError: + raise ValueError( + "Cannot determine `tokenword_disambig_id` " + "which is required if `open_vocabulary_decoding` == True" + ) + + from nemo.collections.asr.parts.k2.graph_decoders import K2WfstDecoder + + self.k2_decoder = K2WfstDecoder( + lm_fst=lm_fst, + decoding_mode=self.decoding_mode, + beam_size=self.beam_width, + config=self.k2_decoding_cfg, + tokenword_disambig_id=self._tokenword_disambig_id, + lm_weight=self.lm_weight, + nbest_size=self.beam_size, + device=self.device, + ) + + return self.k2_decoder.decode(x.to(device=self.device), out_len.to(device=self.device)) + + @dataclass class PyCTCDecodeConfig: # These arguments cannot be imported from pyctcdecode (optional dependency) @@ -604,3 +912,21 @@ class BeamCTCInferConfig: flashlight_cfg: Optional[FlashlightConfig] = field(default_factory=lambda: FlashlightConfig()) pyctcdecode_cfg: Optional[PyCTCDecodeConfig] = field(default_factory=lambda: PyCTCDecodeConfig()) + + +@dataclass +class WfstCTCInferConfig: + beam_size: int + search_type: str = "riva" # 'riva', 'k2' + return_best_hypothesis: bool = True + preserve_alignments: bool = False + compute_timestamps: bool = False + decoding_mode: str = 'nbest' # 'nbest', 'mbr' ('mbr' works only for search_type == 'riva' and beam_size == 1) + open_vocabulary_decoding: bool = False + beam_width: float = 10.0 + lm_weight: float = 1.0 + device: str = "cuda" + arpa_lm_path: Optional[str] = None + wfst_lm_path: Optional[str] = None + riva_decoding_cfg: Optional['RivaDecoderConfig'] = field(default_factory=lambda: RivaDecoderConfig()) + k2_decoding_cfg: Optional['GraphIntersectDenseConfig'] = field(default_factory=lambda: GraphIntersectDenseConfig()) diff --git a/nemo/collections/asr/parts/submodules/ctc_decoding.py b/nemo/collections/asr/parts/submodules/ctc_decoding.py index d2bfb629293e..ec27d3dbbd22 100644 --- a/nemo/collections/asr/parts/submodules/ctc_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_decoding.py @@ -213,7 +213,7 @@ def __init__(self, decoding_cfg, blank_id: int): self.batch_dim_index = self.cfg.get('batch_dim_index', 0) self.word_seperator = self.cfg.get('word_seperator', ' ') - possible_strategies = ['greedy', 'greedy_batch', 'beam', 'pyctcdecode', 'flashlight'] + possible_strategies = ['greedy', 'greedy_batch', 'beam', 'pyctcdecode', 'flashlight', 'wfst'] if self.cfg.strategy not in possible_strategies: raise ValueError(f"Decoding strategy must be one of {possible_strategies}. 
Given {self.cfg.strategy}") @@ -314,6 +314,28 @@ def __init__(self, decoding_cfg, blank_id: int): self.decoding.override_fold_consecutive_value = False + elif self.cfg.strategy == 'wfst': + + self.decoding = ctc_beam_decoding.WfstCTCInfer( + blank_id=blank_id, + beam_size=self.cfg.wfst.get('beam_size', 1), + search_type=self.cfg.wfst.get('search_type', 'riva'), + return_best_hypothesis=self.cfg.wfst.get('return_best_hypothesis', True), + preserve_alignments=self.preserve_alignments, + compute_timestamps=self.compute_timestamps, + decoding_mode=self.cfg.wfst.get('decoding_mode', 'nbest'), + open_vocabulary_decoding=self.cfg.wfst.get('open_vocabulary_decoding', False), + beam_width=self.cfg.wfst.get('beam_width', 10.0), + lm_weight=self.cfg.wfst.get('lm_weight', 1.0), + device=self.cfg.wfst.get('device', 'cuda'), + arpa_lm_path=self.cfg.wfst.get('arpa_lm_path', None), + wfst_lm_path=self.cfg.wfst.get('wfst_lm_path', None), + riva_decoding_cfg=self.cfg.wfst.get('riva_decoding_cfg', None), + k2_decoding_cfg=self.cfg.wfst.get('k2_decoding_cfg', None), + ) + + self.decoding.override_fold_consecutive_value = False + else: raise ValueError( f"Incorrect decoding strategy supplied. Must be one of {possible_strategies}\n" @@ -374,48 +396,56 @@ def ctc_decoder_predictions_tensor( hypotheses_list = hypotheses_list[0] # type: List[Hypothesis] if isinstance(hypotheses_list[0], NBestHypotheses): - hypotheses = [] - all_hypotheses = [] + if self.cfg.strategy == 'wfst': + all_hypotheses = [hyp.n_best_hypotheses for hyp in hypotheses_list] + hypotheses = [hyp[0] for hyp in all_hypotheses] + else: + hypotheses = [] + all_hypotheses = [] - for nbest_hyp in hypotheses_list: # type: NBestHypotheses - n_hyps = nbest_hyp.n_best_hypotheses # Extract all hypotheses for this sample - decoded_hyps = self.decode_hypothesis( - n_hyps, fold_consecutive - ) # type: List[Union[Hypothesis, NBestHypotheses]] + for nbest_hyp in hypotheses_list: # type: NBestHypotheses + n_hyps = nbest_hyp.n_best_hypotheses # Extract all hypotheses for this sample + decoded_hyps = self.decode_hypothesis( + n_hyps, fold_consecutive + ) # type: List[Union[Hypothesis, NBestHypotheses]] - # If computing timestamps - if self.compute_timestamps is True: - timestamp_type = self.cfg.get('ctc_timestamp_type', 'all') - for hyp_idx in range(len(decoded_hyps)): - decoded_hyps[hyp_idx] = self.compute_ctc_timestamps(decoded_hyps[hyp_idx], timestamp_type) + # If computing timestamps + if self.compute_timestamps is True: + timestamp_type = self.cfg.get('ctc_timestamp_type', 'all') + for hyp_idx in range(len(decoded_hyps)): + decoded_hyps[hyp_idx] = self.compute_ctc_timestamps(decoded_hyps[hyp_idx], timestamp_type) - hypotheses.append(decoded_hyps[0]) # best hypothesis - all_hypotheses.append(decoded_hyps) + hypotheses.append(decoded_hyps[0]) # best hypothesis + all_hypotheses.append(decoded_hyps) if return_hypotheses: return hypotheses, all_hypotheses best_hyp_text = [h.text for h in hypotheses] + # alaptev: The line below might contain a bug. Do we really want all_hyp_text to be flat? 
all_hyp_text = [h.text for hh in all_hypotheses for h in hh] return best_hyp_text, all_hyp_text else: - hypotheses = self.decode_hypothesis( - hypotheses_list, fold_consecutive - ) # type: List[Union[Hypothesis, NBestHypotheses]] + if self.cfg.strategy == 'wfst': + hypotheses = hypotheses_list + else: + hypotheses = self.decode_hypothesis( + hypotheses_list, fold_consecutive + ) # type: List[Union[Hypothesis, NBestHypotheses]] - # If computing timestamps - if self.compute_timestamps is True: - # greedy decoding, can get high-level confidence scores - if return_hypotheses and (self.preserve_word_confidence or self.preserve_token_confidence): - hypotheses = self.compute_confidence(hypotheses) - else: - # remove unused token_repetitions from Hypothesis.text - for hyp in hypotheses: - hyp.text = hyp.text[:2] - timestamp_type = self.cfg.get('ctc_timestamp_type', 'all') - for hyp_idx in range(len(hypotheses)): - hypotheses[hyp_idx] = self.compute_ctc_timestamps(hypotheses[hyp_idx], timestamp_type) + # If computing timestamps + if self.compute_timestamps is True: + # greedy decoding, can get high-level confidence scores + if return_hypotheses and (self.preserve_word_confidence or self.preserve_token_confidence): + hypotheses = self.compute_confidence(hypotheses) + else: + # remove unused token_repetitions from Hypothesis.text + for hyp in hypotheses: + hyp.text = hyp.text[:2] + timestamp_type = self.cfg.get('ctc_timestamp_type', 'all') + for hyp_idx in range(len(hypotheses)): + hypotheses[hyp_idx] = self.compute_ctc_timestamps(hypotheses[hyp_idx], timestamp_type) if return_hypotheses: return hypotheses, None @@ -1324,6 +1354,11 @@ class CTCDecodingConfig: default_factory=lambda: ctc_beam_decoding.BeamCTCInferConfig(beam_size=4) ) + # wfst decoding config + wfst: ctc_beam_decoding.WfstCTCInferConfig = field( + default_factory=lambda: ctc_beam_decoding.WfstCTCInferConfig(beam_size=4) + ) + # confidence config confidence_cfg: ConfidenceConfig = field(default_factory=lambda: ConfidenceConfig()) diff --git a/nemo/collections/asr/parts/submodules/wfst_decoder.py b/nemo/collections/asr/parts/submodules/wfst_decoder.py new file mode 100644 index 000000000000..373e041da1be --- /dev/null +++ b/nemo/collections/asr/parts/submodules/wfst_decoder.py @@ -0,0 +1,791 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import gc +import tempfile +from abc import ABC, abstractmethod +from collections import defaultdict +from pathlib import Path +from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union + +import torch +from jiwer import wer as word_error_rate +from omegaconf import DictConfig + +from nemo.collections.asr.parts.utils.wfst_utils import TW_BREAK, kaldifst_importer + +RIVA_DECODER_INSTALLATION_MESSAGE = ( + "riva decoder is not installed or is installed incorrectly.\n" + "please run `bash scripts/installers/install_riva_decoder.sh` or `pip install riva-asrlib-decoder` to install." 
+) + + +def riva_decoder_importer(): + """Import helper function that returns Riva asrlib decoder package or raises ImportError exception.""" + try: + import riva.asrlib.decoder.python_decoder as riva_decoder + except (ImportError, ModuleNotFoundError): + raise ImportError(RIVA_DECODER_INSTALLATION_MESSAGE) + return riva_decoder + + +def _riva_config_to_dict(conf: Any) -> Dict[str, Any]: + """ + Helper function for parsing Riva configs (namely BatchedMappedDecoderCudaConfig) into a dictionary. + + Args: + conf: + Inner Riva config. + + Returns: + Dictionary corresponding to the Riva config. + """ + result = {} + for name in conf.__dir__(): + if not name.startswith("__"): + attribute = getattr(conf, name) + result[name] = ( + attribute if attribute.__class__.__module__ == 'builtins' else _riva_config_to_dict(attribute) + ) + return result + + +def _fill_inner_riva_config_(riva_conf, nemo_conf): + """ + Helper function for filling Riva configs (namely BatchedMappedDecoderCudaConfig) + according to the corresponding NeMo config. + + Note: in-place for the first argument. + + Args: + riva_conf: + Inner Riva config. + + nemo_conf: + Corresponding NeMo config. + """ + for nemo_k, nemo_v in nemo_conf.items(): + if isinstance(nemo_v, DictConfig): + _fill_inner_riva_config_(getattr(riva_conf, nemo_k), nemo_v) + else: + setattr(riva_conf, nemo_k, nemo_v) + + +class RivaDecoderConfig(DictConfig): + """ + NeMo config for the RivaGpuWfstDecoder. + """ + + def __init__(self): + try: + riva_decoder = riva_decoder_importer() + + config = riva_decoder.BatchedMappedDecoderCudaConfig() + config.online_opts.lattice_postprocessor_opts.acoustic_scale = 10.0 + config.n_input_per_chunk = 50 + config.online_opts.decoder_opts.default_beam = 20.0 + config.online_opts.decoder_opts.max_active = 10000 + config.online_opts.determinize_lattice = True + config.online_opts.max_batch_size = 800 + config.online_opts.num_channels = 800 + config.online_opts.frame_shift_seconds = 1 # not actual frame shift + config.online_opts.lattice_postprocessor_opts.word_ins_penalty = 0.0 + + content = _riva_config_to_dict(config) + except ImportError: + content = {} + super().__init__(content) + + +class WfstNbestUnit(NamedTuple): + """ + Container for a single RivaGpuWfstDecoder n-best hypothesis. + """ + + words: Tuple[str] + timesteps: Tuple[int] + alignment: Tuple[int] + score: float + + +class WfstNbestHypothesis: + """ + Container for the RivaGpuWfstDecoder n-best results represented as a list of WfstNbestUnit objects. 
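+
+    Example (illustrative; the words, timesteps, and score below are made up, and
+    the empty tuple stands for a missing frame-level alignment):
+
+        raw = ((("hi", "there"), (0, 12), (), -3.5),)
+        hyp = WfstNbestHypothesis(raw)
+        len(hyp), hyp[0].words  # -> 1, ('hi', 'there')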
+ """ + + def __init__(self, raw_hypotheses: Tuple[Tuple[Tuple[str], Tuple[int], Tuple[int], float]]): + for i, rh in enumerate(raw_hypotheses): + assert isinstance(rh[0], tuple), f"{rh[0]}" + assert isinstance(rh[1], tuple), f"{rh[1]}, {rh[0]}" + assert isinstance(rh[2], tuple), f"{rh[2]}" + assert isinstance(rh[3], float), f"{rh[3]}" + assert len(rh[0]) == len(rh[1]) or len(rh[1]) == 0, "words do not match timesteps" + + self._hypotheses = sorted([WfstNbestUnit(*rh) for rh in raw_hypotheses], key=lambda hyp: hyp.score) + self._shape0 = len(self._hypotheses) + self._shape1 = [len(h.words) for h in self._hypotheses] + self._has_timesteps = len(self._hypotheses[0].timesteps) > 0 + self._has_alignment = len(self._hypotheses[0].alignment) > 0 + + def __iter__(self): + yield from self._hypotheses + + def __getitem__(self, index): + return self._hypotheses[index] + + def __len__(self): + return self.shape0 + + def replace_unit_( + self, index: int, new_unit: Union[WfstNbestUnit, Tuple[Tuple[str], Tuple[int], Tuple[int], float]] + ): + """ + Replaces a WfstNbestUnit by index. + + Note: in-place operation. + + Args: + index: + Index of the unit to be replaced. + + new_unit: + Replacement unit. + """ + assert 0 <= index < self.shape0 + assert ( + self.has_timesteps + and len(new_unit[0]) == len(new_unit[1]) + or not self.has_timesteps + and len(new_unit[1]) == 0 + ) + assert ( + index == 0 + and (len(self._hypotheses) == 1 or new_unit[3] <= self._hypotheses[index + 1].score) + or index == self.shape0 - 1 + and self._hypotheses[index - 1].score <= new_unit[3] + or self._hypotheses[index - 1].score <= new_unit[3] <= self._hypotheses[index + 1].score + ) + + if not isinstance(new_unit, WfstNbestUnit): + new_unit = WfstNbestUnit(*new_unit) + self._hypotheses[index] = new_unit + self._shape1[index] = len(new_unit.words) + + @property + def shape0(self): + return self._shape0 + + @property + def shape1(self): + return self._shape1 + + @property + def has_timesteps(self): + return self._has_timesteps + + @property + def has_alignment(self): + return self._has_alignment + + +def collapse_tokenword_hypotheses( + hypotheses: List[WfstNbestHypothesis], tokenword_disambig_str: str +) -> List[WfstNbestHypothesis]: + """ + Searches for tokenwords in the input hypotheses and collapses them into words. + + Args: + hypotheses: + List of input WfstNbestHypothesis. + + tokenword_disambig_str: + Tokenword disambiguation symbol (e.g. `#1`). + + Returns: + List of WfstNbestHypothesis. 
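+
+    Example (illustrative; assumes `tokenword_disambig_str` == "#1", the
+    SentencePiece word-begin mark "▁", and `TW_BREAK` == "‡"):
+
+        ("hi", "#1", "▁ba‡#1", "r‡#1", "#1", "there")  ->  ("hi", "bar", "there")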
+ """ + new_hypotheses = copy.deepcopy(hypotheses) + for hyp in new_hypotheses: + for k, h_unit in enumerate(hyp): + twds_list = [] + for i, word in enumerate(h_unit.words): + if word == tokenword_disambig_str: + twds_list.append(i) + if len(twds_list) > 0: + # a rare case when the recognition stopped before completing the tokenword + old_words = list(h_unit.words) + old_timesteps = list(h_unit.timesteps) + words_len = len(old_words) + if len(twds_list) % 2 == 1: + twds_list.append(words_len) + new_words, new_timesteps = [], [] + j_prev = 0 + for i, j in zip(twds_list[::2], twds_list[1::2]): + new_words += old_words[j_prev:i] + # drop tokenword disambig -> remove token disanbig suffix -> remove word begin mark + new_word = "".join(old_words[i + 1 : j]).replace(f"{TW_BREAK}{tokenword_disambig_str}", "")[1:] + new_words.append(new_word) + new_timesteps += old_timesteps[j_prev:i] + [ + old_timesteps[i], + ] + j_prev = j + 1 + if j_prev < words_len: + new_words += old_words[j_prev:words_len] + new_timesteps += old_timesteps[j_prev:words_len] + hyp.replace_unit_(k, (tuple(new_words), tuple(new_timesteps), h_unit.alignment, h_unit.score)) + return new_hypotheses + + +class AbstractWFSTDecoder(ABC): + """ + Used for performing WFST decoding of the logprobs. + + Args: + lm_fst: + Language model WFST. + + decoding_mode: + Decoding mode. E.g. `nbest`. + + beam_size: + Beam width (float) for the WFST decoding. + + config: + Decoder config. + + tokenword_disambig_id: + Tokenword disambiguation index. Set to -1 to disable the tokenword mode. + + lm_weight: + Language model weight in decoding. + """ + + def __init__( + self, + lm_fst: Any, + decoding_mode: str, + beam_size: float, + config: Optional[Any], + tokenword_disambig_id: int = -1, + lm_weight: float = 1.0, + ): + self._lm_fst = lm_fst + self._beam_size = beam_size + self._tokenword_disambig_id = tokenword_disambig_id + self._open_vocabulary_decoding = self._tokenword_disambig_id >= 0 + self._lm_weight = lm_weight + self._id2word, self._word2id = None, None + self._id2token, self._token2id = None, None + self._decoding_mode, self._config, self._decoder = None, None, None + + self._set_decoding_mode(decoding_mode) + self._set_decoder_config(config) + self._init_decoder() + + @abstractmethod + def _set_decoder_config(self, config: Optional[Any] = None): + pass + + @abstractmethod + def _set_decoding_mode(self, decoding_mode: str): + pass + + @abstractmethod + def _init_decoder(self): + pass + + @property + def decoding_mode(self): + return self._decoding_mode + + @decoding_mode.setter + def decoding_mode(self, value: str): + self._decoding_mode_setter(value) + + @abstractmethod + def _decoding_mode_setter(self, value: str): + pass + + @property + def beam_size(self): + return self._beam_size + + @beam_size.setter + def beam_size(self, value: float): + self._beam_size_setter(value) + + @abstractmethod + def _beam_size_setter(self, value: float): + pass + + @property + def lm_weight(self): + return self._lm_weight + + @lm_weight.setter + def lm_weight(self, value: float): + self._lm_weight_setter(value) + + @abstractmethod + def _lm_weight_setter(self, value: float): + pass + + @property + def tokenword_disambig_id(self): + return self._tokenword_disambig_id + + @property + def open_vocabulary_decoding(self): + return self._open_vocabulary_decoding + + @abstractmethod + def decode(self, log_probs: torch.Tensor, log_probs_length: torch.Tensor) -> List[Any]: + """ + Decodes logprobs into recognition hypotheses. 
+ + Args: + log_probs: + A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary]. + + log_probs_length: + A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements. + + Returns: + List of recognition hypotheses. + """ + pass + + @abstractmethod + def _post_decode(self, hypotheses: List[Any]) -> List[Any]: + """ + Does various post-processing of the recognition hypotheses. + + Args: + hypotheses: + List of recognition hypotheses. + + Returns: + List of processed recognition hypotheses. + """ + pass + + @abstractmethod + def calibrate_lm_weight( + self, log_probs: torch.Tensor, log_probs_length: torch.Tensor, reference_texts: List[str] + ) -> Tuple[float, float]: + """ + Calibrates LM weight to achieve the best WER for given logprob-text pairs. + + Args: + log_probs: + A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary]. + + log_probs_length: + A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements. + + reference_texts: + List of reference word sequences. + + Returns: + Pair of (best_lm_weight, best_wer). + """ + pass + + @abstractmethod + def calculate_oracle_wer( + self, log_probs: torch.Tensor, log_probs_length: torch.Tensor, reference_texts: List[str] + ) -> Tuple[float, List[float]]: + """ + Calculates the oracle (the best possible WER for given logprob-text pairs. + + Args: + log_probs: + A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary]. + + log_probs_length: + A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements. + + reference_texts: + List of reference word sequences. + + Returns: + Pair of (oracle_wer, oracle_wer_per_utterance). + """ + pass + + +class RivaGpuWfstDecoder(AbstractWFSTDecoder): + """ + Used for performing WFST decoding of the logprobs with the Riva WFST decoder. + + Args: + lm_fst: + Kaldi-type language model WFST or its path. + + decoding_mode: + Decoding mode. Choices: `nbest`, `mbr`, `lattice`. + + beam_size: + Beam width (float) for the WFST decoding. + + config: + Riva Decoder config. + + tokenword_disambig_id: + Tokenword disambiguation index. Set to -1 to disable the tokenword mode. + + lm_weight: + Language model weight in decoding. + + nbest_size: + N-best size for decoding_mode == `nbest` + """ + + def __init__( + self, + lm_fst: Union['kaldifst.StdFst', Path, str], + decoding_mode: str = 'mbr', + beam_size: float = 10.0, + config: Optional['RivaDecoderConfig'] = None, + tokenword_disambig_id: int = -1, + lm_weight: float = 1.0, + nbest_size: int = 1, + ): + self._nbest_size = nbest_size + self._load_word_lattice = None + super().__init__(lm_fst, decoding_mode, beam_size, config, tokenword_disambig_id, lm_weight) + + def _set_decoder_config(self, config: Optional['RivaDecoderConfig'] = None): + if config is None or len(config) == 0: + config = RivaDecoderConfig() + if not hasattr(config, "online_opts"): + # most likely empty config + # call importer to raise the exception + installation message + riva_decoder_importer() + # just in case + raise RuntimeError("Unexpected config error. 
Please debug manually.") + config.online_opts.decoder_opts.lattice_beam = self._beam_size + config.online_opts.lattice_postprocessor_opts.lm_scale = ( + self._lm_weight * config.online_opts.lattice_postprocessor_opts.acoustic_scale + ) + config.online_opts.lattice_postprocessor_opts.nbest = self._nbest_size + self._config = config + + def _init_decoder(self): + + # use importers instead of direct import to possibly get an installation message + kaldifst = kaldifst_importer() + riva_decoder = riva_decoder_importer() + + from nemo.collections.asr.parts.utils.wfst_utils import load_word_lattice + + self._load_word_lattice = load_word_lattice + # BatchedMappedDecoderCuda supports filepaths only + # TODO: fix when possible + lm_fst = self._lm_fst + tmp_fst = None + tmp_fst_file = None + if isinstance(lm_fst, (Path, str)): + # We only read lm_fst to extract words.txt and num_tokens_with_blank + tmp_fst = kaldifst.StdVectorFst.read(lm_fst) + elif isinstance(lm_fst, (kaldifst.StdVectorFst, kaldifst.StdConstFst)): + tmp_fst = lm_fst + tmp_fst_file = tempfile.NamedTemporaryFile(mode='w+t') + tmp_fst.write(tmp_fst_file.name) + lm_fst = tmp_fst_file.name + else: + raise ValueError(f"Unsupported lm_fst type: {type(lm_fst)}") + + # we assume that lm_fst has at least one disambig after real tokens + num_tokens_with_blank = tmp_fst.input_symbols.find('#0') - 1 + if self._id2word is None: + self._id2word = { + int(line.split("\t")[1]): line.split("\t")[0] + for line in str(tmp_fst.output_symbols).strip().split("\n") + } + word2id = self._id2word.__class__(map(reversed, self._id2word.items())) + word_unk_id = word2id[""] + self._word2id = defaultdict(lambda: word_unk_id) + for k, v in word2id.items(): + self._word2id[k] = v + if self._id2token is None: + self._id2token = { + int(line.split("\t")[1]): line.split("\t")[0] + for line in str(tmp_fst.input_symbols).strip().split("\n") + } + token2id = self._id2token.__class__(map(reversed, self._id2token.items())) + token_unk_id = token2id[""] + self._token2id = defaultdict(lambda: token_unk_id) + for k, v in token2id.items(): + self._token2id[k] = v + with tempfile.NamedTemporaryFile(mode='w+t') as words_tmp: + tmp_fst.output_symbols.write_text(words_tmp.name) + config = riva_decoder.BatchedMappedDecoderCudaConfig() + _fill_inner_riva_config_(config, self._config) + self._decoder = riva_decoder.BatchedMappedDecoderCuda( + config, lm_fst, words_tmp.name, num_tokens_with_blank + ) + if tmp_fst_file: + tmp_fst_file.close() + + def _set_decoding_mode(self, decoding_mode: str): + if decoding_mode == 'nbest': + self._decode = self._decode_nbest + elif decoding_mode == 'mbr': + self._decode = self._decode_mbr + elif decoding_mode == 'lattice': + self._decode = self._decode_lattice + else: + raise ValueError(f"Unsupported mode: {decoding_mode}") + self._decoding_mode = decoding_mode + + def _beam_size_setter(self, value: float): + if self._beam_size != value: + self._release_gpu_memory() + self._config.online_opts.decoder_opts.lattice_beam = value + self._init_decoder() + self._beam_size = value + + def _lm_weight_setter(self, value: float): + if self._lm_weight != value: + self._release_gpu_memory() + self._config.online_opts.lattice_postprocessor_opts.lm_scale = ( + value * self._config.online_opts.lattice_postprocessor_opts.acoustic_scale + ) + self._init_decoder() + self._lm_weight = value + + def _decoding_mode_setter(self, value: str): + if self._decoding_mode != value: + self._set_decoding_mode(value) + + @property + def nbest_size(self): + return 
self._nbest_size + + @nbest_size.setter + def nbest_size(self, value: float): + self._nbest_size_setter(value) + + def _nbest_size_setter(self, value: float): + if self._nbest_size != value: + self._release_gpu_memory() + self._config.online_opts.lattice_postprocessor_opts.nbest = value + self._init_decoder() + self._nbest_size = value + + def _decode_nbest( + self, log_probs: torch.Tensor, log_probs_length: torch.Tensor + ) -> List[WfstNbestHypothesis]: # words, timesteps, alignment, score + """ + Decodes logprobs into recognition hypotheses via the N-best decoding decoding. + + Args: + log_probs: + A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary]. + + log_probs_length: + A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements. + + Returns: + List of WfstNbestHypothesis with empty alignment and trivial score. + """ + hypotheses_nbest = self._decoder.decode_nbest(log_probs, log_probs_length) + hypotheses = [] + for nh in hypotheses_nbest: + nbest_container = [] + for h in nh: + words, timesteps = [], [] + for w, t in zip(h.words, h.word_start_times_seconds): + if w != 0: + words.append(self._id2word[w]) + timesteps.append(int(t)) + alignment = [ilabel - 1 for ilabel in h.ilabels] + score = h.score + nbest_container.append(tuple([tuple(words), tuple(timesteps), tuple(alignment), score])) + hypotheses.append(WfstNbestHypothesis(tuple(nbest_container))) + return hypotheses + + def _decode_mbr(self, log_probs: torch.Tensor, log_probs_length: torch.Tensor) -> List[WfstNbestHypothesis]: + """ + Decodes logprobs into recognition hypotheses via the Minimum Bayes Risk (MBR) decoding. + + Args: + log_probs: + A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary]. + + log_probs_length: + A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements. + + Returns: + List of WfstNbestHypothesis with empty alignment and trivial score. + """ + hypotheses_mbr = self._decoder.decode_mbr(log_probs, log_probs_length) + hypotheses = [] + for h in hypotheses_mbr: + words, timesteps = [], [] + for e in h: + words.append(e[0]) + timesteps.append(int(e[1])) + hypotheses.append(WfstNbestHypothesis(tuple([tuple([tuple(words), tuple(timesteps), tuple(), 0.0])]))) + return hypotheses + + def _decode_lattice(self, log_probs: torch.Tensor, log_probs_length: torch.Tensor) -> List['KaldiWordLattice']: + """ + Decodes logprobs into kaldi-type lattices. + + Args: + log_probs: + A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary]. + + log_probs_length: + A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements. + + Returns: + List of KaldiWordLattice. + """ + with tempfile.NamedTemporaryFile() as tmp_lat: + tmp_lat_name = f"{tmp_lat.name}.lats" + self._decoder.decode_write_lattice( + log_probs, log_probs_length, [str(i) for i in range(len(log_probs))], f"ark,t:{tmp_lat_name}" + ) + hypotheses_lattice = self._load_word_lattice( + tmp_lat_name, self._id2word, self._id2word + ) # input and output token ids are the same + hypotheses = [hypotheses_lattice[str(i)] for i in range(len(log_probs))] + return hypotheses + + def decode( + self, log_probs: torch.Tensor, log_probs_length: torch.Tensor + ) -> Union[List[WfstNbestHypothesis], List['KaldiWordLattice']]: + """ + Decodes logprobs into recognition hypotheses. + + Args: + log_probs: + A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary]. 
+ + log_probs_length: + A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements. + + Returns: + List of recognition hypotheses. + """ + log_probs = log_probs.contiguous() + log_probs_length = log_probs_length.to(torch.long).to('cpu').contiguous() + hypotheses = self._decode(log_probs, log_probs_length) + hypotheses = self._post_decode(hypotheses) + return hypotheses + + def _post_decode( + self, hypotheses: Union[List[WfstNbestHypothesis], List['KaldiWordLattice']] + ) -> Union[List[WfstNbestHypothesis], List['KaldiWordLattice']]: + """ + Does various post-processing of the recognition hypotheses. + + Args: + hypotheses: + List of recognition hypotheses. + + Returns: + List of processed recognition hypotheses. + """ + if self._open_vocabulary_decoding and self._decoding_mode in ('nbest', 'mbr'): + return collapse_tokenword_hypotheses(hypotheses, self._id2word[self._tokenword_disambig_id]) + else: + return hypotheses + + def calibrate_lm_weight( + self, log_probs: torch.Tensor, log_probs_length: torch.Tensor, reference_texts: List[str] + ) -> Tuple[float, float]: + """ + Calibrates LM weight to achieve the best WER for given logprob-text pairs. + + Args: + log_probs: + A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary]. + + log_probs_length: + A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements. + + reference_texts: + List of reference word sequences. + + Returns: + Pair of (best_lm_weight, best_wer). + """ + assert len(log_probs) == len(reference_texts) + decoding_mode_backup = self.decoding_mode + lm_weight_backup = self.lm_weight + self.decoding_mode = "mbr" + best_lm_weight, best_wer = -1.0, float('inf') + for lm_weight in range(1, 21): # enough for most cases + self.lm_weight = lm_weight / 10 + hypotheses = self.decode(log_probs, log_probs_length) + wer = word_error_rate([" ".join(h[0].words) for h in hypotheses], reference_texts) + print(lm_weight, wer) + if wer < best_wer: + best_lm_weight, best_wer = self.lm_weight, wer + self.decoding_mode = decoding_mode_backup + self.lm_weight = lm_weight_backup + return best_lm_weight, best_wer + + def calculate_oracle_wer( + self, log_probs: torch.Tensor, log_probs_length: torch.Tensor, reference_texts: List[str] + ) -> Tuple[float, List[float]]: + """ + Calculates the oracle (the best possible WER for given logprob-text pairs. + + Args: + log_probs: + A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary]. + + log_probs_length: + A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements. + + reference_texts: + List of reference word sequences. + + Returns: + Pair of (oracle_wer, oracle_wer_per_utterance). + """ + if self._open_vocabulary_decoding: + raise NotImplementedError + assert len(log_probs) == len(reference_texts) + decoding_mode_backup = self.decoding_mode + self.decoding_mode = "lattice" + lattices = self.decode(log_probs, log_probs_length) + scores, counts, wer_per_utt = [], [], [] + for lattice, text in zip(lattices, reference_texts): + word_ids = [self._word2id[w] for w in text.strip().split()] + counts.append(len(word_ids) if word_ids else 1) + scores.append(lattice.edit_distance(word_ids)) + wer_per_utt.append(scores[-1] / counts[-1]) + self.decoding_mode = decoding_mode_backup + return sum(scores) / sum(counts), wer_per_utt + + def _release_gpu_memory(self): + """ + Forces freeing of GPU memory by deleting the Riva decoder object. 
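+
+        Called by the `beam_size`, `lm_weight`, and `nbest_size` setters before the
+        decoder is re-initialized with the new value, e.g. (illustrative):
+
+            decoder.lm_weight = 0.5  # frees the old decoder, then rebuilds it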
+ """ + try: + del self._decoder + except Exception: + # apparently self._decoder was previously deleted, do nothing + pass + gc.collect() + + def __del__(self): + self._release_gpu_memory() diff --git a/nemo/collections/asr/parts/utils/wfst_utils.py b/nemo/collections/asr/parts/utils/wfst_utils.py new file mode 100644 index 000000000000..31f394fb60ac --- /dev/null +++ b/nemo/collections/asr/parts/utils/wfst_utils.py @@ -0,0 +1,1478 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import tempfile +from abc import ABC, abstractmethod, abstractproperty +from collections import defaultdict, namedtuple +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union + +from nemo.utils import logging + + +TW_BREAK = "‡" + + +try: + import kaldifst + + # check that kaldifst package is not empty + # Note: pytorch_lightning.utilities.imports.package_available may not help here + kaldifst.StdVectorFst() + _KALDIFST_AVAILABLE = True +except (ImportError, ModuleNotFoundError, AttributeError): + _KALDIFST_AVAILABLE = False + + +try: + import graphviz + + _GRAPHVIZ_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + _GRAPHVIZ_AVAILABLE = False + + +try: + import kaldilm + + _KALDILM_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + _KALDILM_AVAILABLE = False + + +KALDIFST_INSTALLATION_MESSAGE = ( + "kaldifst is not installed or is installed incorrectly.\n" + "please run `pip install kaldifst` or `bash scripts/installers/install_riva_decoder.sh` to install." +) + + +GRAPHVIZ_INSTALLATION_MESSAGE = ( + "graphviz is not installed.\n" "please run `bash scripts/installers/install_graphviz.sh` to install." +) + + +KALDILM_INSTALLATION_MESSAGE = ( + "kaldilm is not installed.\n" + "please run `pip install kaldilm` or `bash scripts/installers/install_riva_decoder.sh` to install." +) + + +def _kaldifst_maybe_raise(): + if _KALDIFST_AVAILABLE is False: + raise ImportError(KALDIFST_INSTALLATION_MESSAGE) + + +def kaldifst_importer(): + """Import helper function that returns kaldifst package or raises ImportError exception.""" + _kaldifst_maybe_raise() + return kaldifst + + +def _graphviz_maybe_raise(): + if _GRAPHVIZ_AVAILABLE is False: + raise ImportError(GRAPHVIZ_INSTALLATION_MESSAGE) + + +def graphviz_importer(): + """Import helper function that returns graphviz package or raises ImportError exception.""" + _graphviz_maybe_raise() + return graphviz + + +def _kaldilm_maybe_raise(): + if _KALDILM_AVAILABLE is False: + raise ImportError(KALDILM_INSTALLATION_MESSAGE) + + +def kaldilm_importer(): + """Import helper function that returns kaldifst package or raises ImportError exception.""" + _kaldilm_maybe_raise() + return kaldilm + + +@dataclass +class LexiconUnit: + """A dataclass encapsulating the name of the language unit (e.g. wordpiece) and its mark (e.g. 
word begin).""" + + name: str + mark: str = "" + + +class Lexicon: + def __init__( + self, + wordid2tokenid: Dict[int, List[List[int]]], + id2word: Union[Dict[int, str], Dict[int, LexiconUnit]], + id2token: Union[Dict[int, str], Dict[int, LexiconUnit]], + disambig_pattern: str = re.compile(r"^#\d+$"), + ): + """ + Lexicon class which contains word-to-token-sequence, word-to-id, and token-to-id mappings. + + Args: + wordid2tokenid: + Lexicon. + Mapping from word_id to token1_id token2_id ... tokenN_id. + + id2word: + Word index. + Mapping from word_id to word_str. + + id2token: + Token index. + Mapping from token_id to token_str. + + disambig_pattern: + Pattern for disambiguation symbols. + """ + is_id2token_str = not isinstance(list(id2token.values())[0], LexiconUnit) + self.id2token = {k: LexiconUnit(v) for k, v in id2token.items()} if is_id2token_str else id2token + self.token2id = {v.name: k for k, v in self.id2token.items()} + is_id2word_str = not isinstance(list(id2word.values())[0], LexiconUnit) + self.id2word = {k: LexiconUnit(v) for k, v in id2word.items()} if is_id2word_str else id2word + self.word2id = {v.name: k for k, v in self.id2word.items()} + self.wordid2tokenid = wordid2tokenid + word2tokens = defaultdict(list) + for k, v in self.wordid2tokenid.items(): + word2tokens[self.id2word[k].name] += [[self.id2token[i].name for i in vp] for vp in v] + self.word2tokens = word2tokens + self.disambig_pattern = disambig_pattern + + max_disambig_id = -1 + num_disambigs = 0 + self.has_epsilon = False + self._default_disambig_mark = "disambig" + self._default_epsilon_mark = "epsilon" + self._default_epsilon_name = "" + for i, s in self.id2token.items(): + if self.disambig_pattern.match(s.name): + if is_id2token_str or not s.mark.startswith(self._default_disambig_mark): + s.mark = self._default_disambig_mark + if i > max_disambig_id: + max_disambig_id = i + num_disambigs += 1 + if s.name == self._default_epsilon_name or s.mark == self._default_epsilon_mark: + assert i == 0 + self.has_epsilon = True + self.max_disambig_id = max_disambig_id + self.num_disambigs = num_disambigs + + if is_id2word_str: + for i, s in self.id2word.items(): + if self.disambig_pattern.match(s.name): + s.mark = self._default_disambig_mark + elif s.name == self._default_epsilon_name: + s.mark == self._default_epsilon_mark + + def __iter__(self) -> Tuple[str, List[str]]: + for wordid, tokenid_list in self.wordid2tokenid.items(): + for tokenids in tokenid_list: + yield wordid, tokenids + + def __str__(self): + return str(self.word2tokens) + + @property + def token_ids(self) -> List[int]: + """Return a list of token IDs excluding those from + disambiguation symbols. + """ + ans = [] + for i, s in self.id2token.items(): + if not s.mark.startswith(self._default_epsilon_mark) and (not self.has_epsilon or i != 0): + ans.append(i) + ans.sort() + return ans + + +def arpa2fst(lm_path: str, attach_symbol_table: bool = True) -> 'kaldifst.StdVectorFst': + """ + Compiles an ARPA LM file into a grammar WFST (G.fst). + + Args: + lm_path: + Path to the ARPA LM file. + + attach_symbol_table: + Whether to attach the words for indices of the returned WFST. + + Returns: + Kaldi-type grammar WFST. 
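+
+    Example (illustrative; assumes an ARPA LM file at `lm.arpa` whose vocabulary
+    contains the word "the"):
+
+        G = arpa2fst("lm.arpa")
+        word_id = G.output_symbols.find("the")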
+ """ + _kaldifst_maybe_raise() + _kaldilm_maybe_raise() + + with tempfile.TemporaryDirectory() as tempdirname: + output_fst = os.path.join(tempdirname, "output.fst") + words_txt = os.path.join(tempdirname, "words.txt") + # with suppress_stdout_stderr(): + kaldilm.arpa2fst( + input_arpa=lm_path, + output_fst=output_fst, + disambig_symbol="#0", + write_symbol_table=words_txt, + ) + + G = kaldifst.StdVectorFst.read(output_fst) + + if attach_symbol_table: + osym = kaldifst.SymbolTable() + with open(words_txt, encoding="utf-8") as f: + for line in f: + w, i = line.strip().split() + osym.add_symbol(symbol=w, key=int(i)) + G.output_symbols = osym + + kaldifst.arcsort(G, sort_type="ilabel") + return G + + +def add_tokenwords_( + g_fst: 'kaldifst.StdVectorFst', + tokens: List[str], + word_weight: float = 2.0, + token_unigram_weight: float = 4.0, + token_oov: str = "", +) -> int: + """ + Adds special words representing individual tokens (tokenwords). + In-place operation. + + Args: + g_fst: + Kaldi-type grammar WFST. + Will be augmented with the tokenwords. + + tokens: + Token vocabulary. + + word_weight: + The weight of an Out Of Vocabulary (OOV) word emission. + + token_unigram_weight: + The weight of a tokenword emission. + + token_oov: + OOV token. + + Returns: + The id of the tokenword disambiguation token. + """ + _kaldifst_maybe_raise() + + unigram_state = 0 + # check if 0 is the unigram state (has no outgoing epsilon arcs) + assert kaldifst.ArcIterator(g_fst, unigram_state).value.ilabel not in (0, g_fst.output_symbols.find("#0")) + + # we put tokenword self-loops in a separate state wrapped with a tokenword_disambig token + tokenword_disambig_id = g_fst.output_symbols.available_key() + tokenword_disambig = "#1" + g_fst.output_symbols.add_symbol(tokenword_disambig, tokenword_disambig_id) + tokenword_state = g_fst.add_state() + # we keep olabel !=0 to mark tokenword segments in the recognition results + g_fst.add_arc( + state=unigram_state, + arc=kaldifst.StdArc( + ilabel=tokenword_disambig_id, + olabel=tokenword_disambig_id, + weight=word_weight, + nextstate=tokenword_state, + ), + ) + g_fst.add_arc( + state=tokenword_state, + arc=kaldifst.StdArc( + ilabel=tokenword_disambig_id, + olabel=tokenword_disambig_id, + weight=0.0, + nextstate=unigram_state, + ), + ) + label = tokenword_disambig_id + 1 + for t in tokens: + if t != token_oov: + g_fst.add_arc( + state=tokenword_state, + arc=kaldifst.StdArc( + ilabel=label, + olabel=label, + weight=token_unigram_weight, + nextstate=tokenword_state, + ), + ) + g_fst.output_symbols.add_symbol(f"{t}{TW_BREAK}{tokenword_disambig}", label) + label += 1 + + return tokenword_disambig_id + + +def generate_lexicon_sentencepiece( + tokenizer: 'TokenizerSpec', + id2word: Dict[int, str], + oov: str = "", + add_epsilon: bool = False, + first_tokenword_id: int = -1, + disambig_pattern: str = re.compile(r"^#\d+$"), +) -> Lexicon: + """ + Generate a Lexicon using a SentencePiece tokenizer. + + Args: + tokenizer: + NeMo SentencePiece tokenizer. + + id2word: + Word index. + Mapping from word_id to word_str. + + oov: + Out Of Vocabulary word in lexicon. + + Returns: + Lexicon object. 
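+
+    Example (a minimal sketch; `tokenizer` is a trained SentencePiece tokenizer,
+    and a real word table would typically come from the output symbols of the
+    grammar WFST):
+
+        id2word = {0: "<eps>", 1: "hello", 2: "world", 3: "#0"}
+        lexicon = generate_lexicon_sentencepiece(tokenizer, id2word)
+        lexicon.word2tokens["hello"]  # e.g. [['▁he', 'llo']]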
+ """ + word2id = {v: k for k, v in id2word.items()} + backoff_disambig = "#0" + tokenword_disambig = "#1" + word_begin_mark = "▁" + + tokenword_mode = first_tokenword_id != -1 + if tokenword_mode: + words, tokenwords = [], [] + for k, v in id2word.items(): + if disambig_pattern.match(v): + continue + words.append(v) if k < first_tokenword_id else tokenwords.append(v) + else: + words, tokenwords = [v for v in id2word.values() if not disambig_pattern.match(v)], [] + + # Use encode to avoid OOV tokens + words_piece_ids = tokenizer.encode(words, out_type=int) + + # tokenizer.get_vocab() gives indices starting with 1 + maybe_add_one = int(add_epsilon) + maybe_subtract_one = int(not add_epsilon) + vocab = tokenizer.get_vocab() + id2token = { + v - maybe_subtract_one: LexiconUnit(k, "begin" if k.startswith(word_begin_mark) else "") + for k, v in vocab.items() + } + + # Introduce unk, blank, and the first disambig ids + unk_id = tokenizer.piece_to_id(oov) + maybe_add_one + id2token[unk_id] = LexiconUnit(oov, "unk") + # We assume blank to have the last output id of the neural network output + max_token_id = max(id2token.keys()) + id2token[max_token_id + 1] = LexiconUnit("", "blank") + id2token[max_token_id + 2] = LexiconUnit(backoff_disambig, "disambig_backoff") + if tokenword_mode: + id2token[max_token_id + 3] = LexiconUnit(tokenword_disambig, "disambig_tokenword") + if add_epsilon: + # insert first + id2token[0] = LexiconUnit("", "epsilon") + id2token = {k: v for k, v in sorted(id2token.items(), key=lambda item: item[0])} + + if tokenword_mode: + words += tokenwords + words_piece_ids += [[vocab[tw.rstrip(f"{TW_BREAK}{tokenword_disambig}")] - maybe_add_one] for tw in tokenwords] + + wordid2tokenid = defaultdict(list) + + for word, piece_ids in zip(words, words_piece_ids): + if word.startswith("<") and word != "": # not a real word, probably some tag + continue + elif word == "": # we do not need to tokelize + continue + else: + wordid2tokenid[word2id[word]].append([p + maybe_add_one for p in piece_ids]) + + lexicon = Lexicon(wordid2tokenid, id2word, id2token) + # state disambig purpose explicitly for further use + lexicon.id2word[lexicon.word2id[backoff_disambig]].mark = "disambig_backoff" + if tokenword_mode: + lexicon.id2word[lexicon.word2id[tokenword_disambig]].mark = "disambig_tokenword" + for tw in tokenwords: + lexicon.id2word[lexicon.word2id[tw]].mark = "tokenword" + return lexicon + + +def add_disambig_symbols(lexicon: Lexicon) -> Lexicon: + """ + Adds pseudo-token disambiguation symbols #1, #2 and so on + at the ends of tokens to ensure that all pronunciations are different, + and that none is a prefix of another. + + See also add_lex_disambig.pl from kaldi. + + Args: + lexicon: + Lexicon object. + + Returns: + Return Lexicon augmented with subseqence disambiguation symbols. + """ + + tokenword_mode = "#1" in lexicon.word2id + if tokenword_mode: + first_tokenword_id = lexicon.word2id["#1"] + 1 + last_used_disambig_id = lexicon.token2id["#1"] + else: + last_used_disambig_id = lexicon.token2id["#0"] + + # (1) Work out the count of each token-sequence in the lexicon. + count = defaultdict(int) + for _, token_ids in lexicon: + count[tuple(token_ids)] += 1 + + # (2) For each left sub-sequence of each token-sequence, note down + # that it exists (for identifying prefixes of longer strings). 
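+    # For example (illustrative): for "hello" -> [he, llo] the prefix (he,) is
+    # recorded here, so a word like "he" -> [he] will receive a subsequence
+    # disambiguation symbol (e.g. "#2") in step (3) below.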
+ issubseq = defaultdict(int) + for word_id, token_ids in lexicon: + if tokenword_mode and word_id >= first_tokenword_id: + continue + token_ids = token_ids.copy() + token_ids.pop() + while token_ids: + issubseq[tuple(token_ids)] = 1 + token_ids.pop() + + # (3) For each entry in the lexicon: + # if the token sequence is unique and is not a + # prefix of another word, no disambig symbol. + # Else output #1, or #2, #3, ... if the same token-seq + # has already been assigned a disambig symbol. + wordid2tokenid = defaultdict(list) + id2token = lexicon.id2token.copy() + + first_allowed_disambig = lexicon.num_disambigs + first_allowed_disambig_id = last_used_disambig_id + 1 + max_disambig = first_allowed_disambig - 1 + last_used_disambig_id_of = defaultdict(int) + + for word_id, token_ids in lexicon: + token_key = tuple(token_ids) + assert len(token_key) > 0 + if issubseq[token_key] == 0 and count[token_key] == 1 or tokenword_mode and word_id >= first_tokenword_id: + wordid2tokenid[word_id].append(token_ids) + continue + + cur_disambig_id = last_used_disambig_id_of[token_key] + if cur_disambig_id == 0: + cur_disambig = first_allowed_disambig + cur_disambig_id = first_allowed_disambig_id + else: + cur_disambig = int(id2token[cur_disambig_id].name.lstrip("#")) + 1 + + if cur_disambig > max_disambig: + max_disambig = cur_disambig + cur_disambig_id = max(id2token.keys()) + 1 + id2token[cur_disambig_id] = LexiconUnit(f"#{max_disambig}", "disambig_subsequence") + last_used_disambig_id_of[token_key] = cur_disambig_id + wordid2tokenid[word_id].append(token_ids + [cur_disambig_id]) + return Lexicon(wordid2tokenid, lexicon.id2word, id2token) + + +def make_lexicon_fst_no_silence( + lexicon: Lexicon, + attach_symbol_table: bool = True, +) -> 'kaldifst.StdVectorFst': + """ + Compiles a Lexicon into a lexicon WFST (L.fst). + + See also make_lexicon_fst.py from kaldi. + + Args: + lexicon: + Lexicon object. + + Returns: + Kaldi-type lexicon WFST. + """ + _kaldifst_maybe_raise() + + backoff_disambig = "#0" + tokenword_disambig = "#1" + tokenword_mode = tokenword_disambig in lexicon.word2id + if tokenword_mode: + first_tokenword_id = lexicon.word2id[tokenword_disambig] + 1 + + fst = kaldifst.StdVectorFst() + start_state = fst.add_state() + fst.start = start_state + fst.set_final(state=start_state, weight=0) + fst.add_arc( + state=start_state, + arc=kaldifst.StdArc( + ilabel=lexicon.token2id[backoff_disambig], + olabel=lexicon.word2id[backoff_disambig], + weight=0, + nextstate=start_state, + ), + ) + if tokenword_mode: + tokenword_state_begin = fst.add_state() + fst.add_arc( + state=start_state, + arc=kaldifst.StdArc( + ilabel=lexicon.token2id[tokenword_disambig], + olabel=lexicon.word2id[tokenword_disambig], + weight=0, + nextstate=tokenword_state_begin, + ), + ) + + for word_id, token_ids in lexicon: + cur_state = start_state + + if not tokenword_mode or word_id < first_tokenword_id - 1: + for i, token_id in enumerate(token_ids[:-1]): + next_state = fst.add_state() + fst.add_arc( + state=cur_state, + arc=kaldifst.StdArc( + ilabel=token_id, + olabel=word_id if i == 0 else 0, + weight=0, + nextstate=next_state, + ), + ) + cur_state = next_state + i = len(token_ids) - 1 # note: i == -1 if tokens is empty. 
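+            # The last token closes the path back to the start state; for words
+            # with at most one token (i <= 0) the word label is emitted on this
+            # arc, otherwise it was already emitted on the first arc of the loop above.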
+ fst.add_arc( + state=cur_state, + arc=kaldifst.StdArc( + ilabel=token_ids[-1] if i >= 0 else 0, + olabel=word_id if i <= 0 else 0, + weight=0, + nextstate=start_state, + ), + ) + if tokenword_mode: + tokenword_begin, tokenword_other = [], [] + for word_id in range(first_tokenword_id, max(lexicon.id2word) + 1): + token_id = lexicon.token2id[lexicon.id2word[word_id].name.rstrip(f"{TW_BREAK}{tokenword_disambig}")] + token_unit = lexicon.id2token[token_id] + if token_unit.mark.startswith("begin"): + tokenword_begin.append((token_id, word_id)) + elif token_unit.mark == "": + tokenword_other.append((token_id, word_id)) + else: + raise RuntimeError(f"Unexpected mark `{token_unit.mark}` for tokenword `{token_unit.name}`") + + tokenword_state_main = fst.add_state() + for token_id, word_id in tokenword_begin: + fst.add_arc( + state=tokenword_state_begin, + arc=kaldifst.StdArc( + ilabel=token_id, + olabel=word_id, + weight=0, + nextstate=tokenword_state_main, + ), + ) + tokenword_state_end = fst.add_state() + for token_id, word_id in tokenword_other: + fst.add_arc( + state=tokenword_state_main, + arc=kaldifst.StdArc( + ilabel=token_id, + olabel=word_id, + weight=0, + nextstate=tokenword_state_main, + ), + ) + fst.add_arc( + state=tokenword_state_main, + arc=kaldifst.StdArc( + ilabel=token_id, + olabel=word_id, + weight=0, + nextstate=tokenword_state_end, + ), + ) + fst.add_arc( + state=tokenword_state_end, + arc=kaldifst.StdArc( + ilabel=lexicon.token2id[tokenword_disambig], + olabel=lexicon.word2id[tokenword_disambig], + weight=0, + nextstate=start_state, + ), + ) + + if attach_symbol_table: + isym = kaldifst.SymbolTable() + for p, i in lexicon.token2id.items(): + isym.add_symbol(symbol=p, key=i) + fst.input_symbols = isym + + osym = kaldifst.SymbolTable() + for w, i in lexicon.word2id.items(): + osym.add_symbol(symbol=w, key=i) + fst.output_symbols = osym + + kaldifst.arcsort(fst, sort_type="ilabel") + return fst + + +def build_topo( + name: str, token2id: Dict[str, int], with_self_loops: bool = True, attach_symbol_table: bool = True +) -> 'kaldifst.StdVectorFst': + """Helper function to build a topology WFST (T.fst). + + Args: + name: + Topology name. Choices: default, compact, minimal + + token2id: + Token index. + Mapping from token_str to token_id. + + with_self_loops: + Whether to add token-to-epsilon self-loops to the topology. + + attach_symbol_table: + Whether to attach the token names for indices of the returned WFST. + + Returns: + Kaldi-type topology WFST. 
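+
+    Example (illustrative; a toy token index using the module's token naming):
+
+        token2id = {"<eps>": 0, "a": 1, "b": 2, "<blk>": 3, "#0": 4}
+        T = build_topo("default", token2id)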
+ """ + _kaldifst_maybe_raise() + + if name == "default": + fst = build_default_topo(token2id, with_self_loops) + elif name == "compact": + fst = build_compact_topo(token2id, with_self_loops) + elif name == "minimal": + fst = build_minimal_topo(token2id) + else: + raise ValueError(f"Unknown topo name: {name}") + + if attach_symbol_table: + isym = kaldifst.SymbolTable() + for t, i in token2id.items(): + isym.add_symbol(symbol=t, key=i) + fst.input_symbols = isym + fst.output_symbols = fst.input_symbols.copy() + return fst + + +def build_default_topo(token2id: Dict[str, int], with_self_loops: bool = True) -> 'kaldifst.StdVectorFst': + """Build the default (correct) CTC topology.""" + _kaldifst_maybe_raise() + + disambig_pattern = re.compile(r"^#\d+$") + blank_id = token2id[""] + fst = kaldifst.StdVectorFst() + start_state = fst.add_state() + fst.start = start_state + fst.set_final(state=start_state, weight=0) + fst.add_arc( + state=start_state, + arc=kaldifst.StdArc( + ilabel=blank_id, + olabel=0, + weight=0, + nextstate=start_state, # token2id[""] is always 0 + ), + ) + + disambig_ids = [] + token_ids = {} + for s, i in token2id.items(): + if s == "" or s == "": + continue + elif disambig_pattern.match(s): + disambig_ids.append(i) + else: + state = fst.add_state() + fst.set_final(state=state, weight=0) + token_ids[state] = i + fst.add_arc( + state=start_state, + arc=kaldifst.StdArc( + ilabel=i, + olabel=i, + weight=0, + nextstate=state, + ), + ) + if with_self_loops: + fst.add_arc( + state=state, + arc=kaldifst.StdArc( + ilabel=i, + olabel=0, + weight=0, + nextstate=state, # token2id[""] is always 0 + ), + ) + fst.add_arc( + state=state, + arc=kaldifst.StdArc( + ilabel=blank_id, + olabel=0, + weight=0, + nextstate=start_state, # token2id[""] is always 0 + ), + ) + + for istate in kaldifst.StateIterator(fst): + if istate > 0: + for ostate in kaldifst.StateIterator(fst): + if ostate > 0 and istate != ostate: + label = token_ids[ostate] + fst.add_arc( + state=istate, + arc=kaldifst.StdArc( + ilabel=label, + olabel=label, + weight=0, + nextstate=ostate, + ), + ) + for disambig_id in disambig_ids: + fst.add_arc( + state=istate, + arc=kaldifst.StdArc( + ilabel=0, + olabel=disambig_id, + weight=0, + nextstate=istate, # token2id[""] is always 0 + ), + ) + + return fst + + +def build_compact_topo(token2id: Dict[str, int], with_self_loops: bool = True) -> 'kaldifst.StdVectorFst': + """Build the Compact CTC topology.""" + _kaldifst_maybe_raise() + + disambig_pattern = re.compile(r"^#\d+$") + blank_id = token2id[""] + fst = kaldifst.StdVectorFst() + start_state = fst.add_state() + fst.start = start_state + fst.set_final(state=start_state, weight=0) + fst.add_arc( + state=start_state, + arc=kaldifst.StdArc( + ilabel=blank_id, + olabel=0, + weight=0, + nextstate=start_state, # token2id[""] is always 0 + ), + ) + + for s, i in token2id.items(): + if s == "" or s == "": + continue + elif disambig_pattern.match(s): + fst.add_arc( + state=start_state, + arc=kaldifst.StdArc( + ilabel=0, + olabel=i, + weight=0, + nextstate=start_state, # token2id[""] is always 0 + ), + ) + else: + state = fst.add_state() + fst.add_arc( + state=start_state, + arc=kaldifst.StdArc( + ilabel=i, + olabel=i, + weight=0, + nextstate=state, + ), + ) + if with_self_loops: + fst.add_arc( + state=state, + arc=kaldifst.StdArc( + ilabel=i, + olabel=0, + weight=0, + nextstate=state, # token2id[""] is always 0 + ), + ) + fst.add_arc( + state=state, + arc=kaldifst.StdArc( + ilabel=0, # token2id[""] is always 0 + olabel=0, # token2id[""] is 
always 0 + weight=0, + nextstate=start_state, + ), + ) + + return fst + + +def build_minimal_topo(token2id: Dict[str, int]) -> 'kaldifst.StdVectorFst': + """Build the Minimal CTC topology.""" + _kaldifst_maybe_raise() + + disambig_pattern = re.compile(r"^#\d+$") + blank_id = token2id[""] + fst = kaldifst.StdVectorFst() + start_state = fst.add_state() + fst.start = start_state + fst.set_final(state=start_state, weight=0) + fst.add_arc( + state=start_state, + arc=kaldifst.StdArc( + ilabel=blank_id, + olabel=0, + weight=0, + nextstate=start_state, # token2id[""] is always 0 + ), + ) + + for s, i in token2id.items(): + if s == "" or s == "": + continue + elif disambig_pattern.match(s): + fst.add_arc( + state=start_state, + arc=kaldifst.StdArc( + ilabel=0, + olabel=i, + weight=0, + nextstate=start_state, # token2id[""] is always 0 + ), + ) + else: + fst.add_arc( + state=start_state, + arc=kaldifst.StdArc( + ilabel=i, + olabel=i, + weight=0, + nextstate=start_state, + ), + ) + + return fst + + +def mkgraph_ctc_ov( + tokenizer: 'TokenizerSpec', + lm_path: Union[Path, str], + topology_name: str = "default", + write_tlg_path: Optional[Union[Path, str]] = None, + open_vocabulary: bool = False, + open_vocabulary_weights: Tuple[float, float] = (2.0, 4.0), + target: str = "kaldi", # "kaldi", "k2" +) -> Tuple[Union['kaldifst.StdVectorFst', 'k2.Fsa'], int]: + """ + Builds a decoding WFST (TLG.fst or TLG.pt). + + See also mkgraph.sh from kaldi. + + Args: + tokenizer: + NeMo SentencePiece tokenizer. + + lm_path: + Path to the ARPA LM file. + + topology_name: + Topology name. Choices: default, compact, minimal. + + write_tlg_path: + Where to buffer the TLG. + + open_vocabulary: + Whether to build a decoding WFST suitable for the open vocabulary decoding. + + open_vocabulary_weights: + Pair of weights (oov_word_weight, token_unigram_weight). + + target: + What type to build the WFST for. Choices: kaldi, k2. + + Returns: + A pair of kaldi- or k2-type decoding WFST and its id of the tokenword disambiguation token. 
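+
+    Example (illustrative; the tokenizer and paths are placeholders):
+
+        TLG, tokenword_disambig_id = mkgraph_ctc_ov(
+            tokenizer=asr_model.tokenizer,
+            lm_path="lm.arpa",
+            write_tlg_path="TLG.fst",
+            open_vocabulary=False,
+        )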
+ """ + _kaldifst_maybe_raise() + + logging.info("Compiling G.fst ...") + G = arpa2fst(lm_path) + if open_vocabulary: + # in-place for g_fst + tokenword_disambig_id = add_tokenwords_( + g_fst=G, + tokens=tokenizer.tokenizer.get_vocab().keys(), + word_weight=open_vocabulary_weights[0], + token_unigram_weight=open_vocabulary_weights[1], + ) + else: + tokenword_disambig_id = -1 + + logging.info("Building L.fst ...") + id2word = {int(line.split("\t")[1]): line.split("\t")[0] for line in str(G.output_symbols).strip().split("\n")} + lexicon = generate_lexicon_sentencepiece( + tokenizer.tokenizer, id2word, add_epsilon=True, first_tokenword_id=tokenword_disambig_id + ) + lexicon_disambig = add_disambig_symbols(lexicon) + + L = make_lexicon_fst_no_silence(lexicon_disambig) + kaldifst.arcsort(L, sort_type="olabel") + + logging.info("Building LG.fst ...") + LG = kaldifst.compose(L, G) + kaldifst.determinize_star(LG) + kaldifst.minimize_encoded(LG) + kaldifst.arcsort(LG, sort_type="ilabel") + + logging.info("Building TLG.fst ...") + T = build_topo(topology_name, lexicon_disambig.token2id) + kaldifst.arcsort(T, sort_type="olabel") + TLG = kaldifst.compose(T, LG) + + if target == "kaldi": + if write_tlg_path: + logging.info(f"Buffering TLG.fst into {write_tlg_path} ...") + TLG.write(write_tlg_path) + elif target == "k2": + logging.info("Converting TLG.fst to k2 ...") + import torch + + from nemo.core.utils.k2_guard import k2 + + blank_id = [i for i, t in lexicon_disambig.id2token.items() if t.mark == "blank"][0] + first_token_disambig_id = [i for i, t in lexicon_disambig.id2token.items() if t.mark == "disambig_backoff"][0] + word_disambig_id = lexicon_disambig.word2id[lexicon_disambig.id2token[first_token_disambig_id].name] + assert lexicon_disambig.id2word[word_disambig_id].mark == "disambig_backoff" + input_symbols = "\n".join( + [f"{k} {v - 1}" for k, v in lexicon_disambig.token2id.items() if 0 < v < first_token_disambig_id] + ) + output_symbols = str(TLG.output_symbols) + TLG.input_symbols = None + TLG.output_symbols = None + # k2 does not support torch.inference_mode enabled + with torch.inference_mode(False): + TLG = k2.Fsa.from_openfst(TLG.to_str(show_weight_one=True), acceptor=False) + TLG.labels[TLG.labels >= first_token_disambig_id] = blank_id + TLG.aux_labels[TLG.aux_labels.values == word_disambig_id] = 0 + TLG.__dict__["_properties"] = None + TLG = k2.arc_sort(k2.connect(k2.remove_epsilon(TLG))) + TLG.labels[TLG.labels > 0] = TLG.labels[TLG.labels > 0] - 1 + TLG.__dict__["_properties"] = None + TLG.labels_sym = k2.SymbolTable.from_str(input_symbols) + TLG.aux_labels_sym = k2.SymbolTable.from_str(output_symbols) + TLG = k2.arc_sort(TLG) + if write_tlg_path: + logging.info(f"Buffering TLG.pt into {write_tlg_path} ...") + torch.save(TLG.as_dict(), write_tlg_path) + else: + raise ValueError(f"Unsupported target: `{target}`") + + return TLG, tokenword_disambig_id + + +class KaldiFstMask(Enum): + Acceptor = 65536 + Error = 4 + TopSorted = 274877906944 + Acyclic = 34359738368 + IlabelSorted = 268435456 + OlabelSorted = 1073741824 + IlabelDeterministic = 262144 + OlabelDeterministic = 1048576 + HasEpsilons = 4194304 + HasIEpsilons = 16777216 + Accessible = 1099511627776 + Coaccessible = 4398046511104 + Weighted = 4294967296 + + +class LatticeProperties(NamedTuple): + Acceptor: bool + Valid: bool + Nonempty: bool + TopSorted: bool + Acyclic: bool + ArcSorted: bool + Deterministic: bool + EpsilonFree: bool + InputEpsilonFree: bool + Connected: bool + Weighted: bool + + +class AbstractLattice(ABC): + 
"""A lattice wrapper with high-level capabilities.""" + + def __init__(self, lattice: Any): + self._lattice = lattice + self._properties = None + + @abstractmethod + def as_tensor(self) -> 'torch.Tensor': + """Represents the lattice as a tensor. + + Returns: + torch.Tensor + """ + pass + + @abstractmethod + def draw( + self, filename: Optional[Union[Path, str]] = None, title: Optional[Union[Path, str]] = None, zoom: float = 1.0 + ) -> Union['graphviz.Digraph', 'IPython.display.HTML']: + """Render FSA as an image via graphviz, and return the Digraph object; and optionally save to file filename. + filename must have a suffix that graphviz understands, such as pdf, svg or png. + + Note: + You need to install graphviz to use this function:: + + ./scripts/installers/install_graphviz.sh + + Args: + filename: + Filename to (optionally) save to, e.g. ‘foo.png’, ‘foo.svg’, ‘foo.png’. + + title: + Title to be displayed in image, e.g. ‘A simple lattice example’. + + zoom: + Zoom-in lattice in IPython notebook (needed for large lattices). + + Returns: + graphviz.Digraph or IPython.display.HTML + """ + pass + + @abstractmethod + def edit_distance(self, reference_sequence: List[int]) -> int: + """Get the edit distance from a reference sequence to the lattice. + + Args: + reference_sequence: + List of word- or token-ids. + + Returns: + Number of edits. + """ + + @property + def lattice(self): + self._properties = None + return self._lattice + + @abstractproperty + def properties(self) -> LatticeProperties: + pass + + @abstractproperty + def symbol_table(self) -> Optional[Dict[int, str]]: + pass + + @abstractproperty + def auxiliary_tables(self) -> Optional[Tuple[Any]]: + pass + + +class KaldiWordLattice(AbstractLattice): + """A Kaldi lattice wrapper with high-level capabilities.""" + + def __init__( + self, + lattice: 'kaldifst.Lattice', + symbol_table: Optional[Dict[int, str]] = None, + auxiliary_tables: Optional[Dict[str, Any]] = None, + ): + _kaldifst_maybe_raise() + + if not isinstance(lattice, kaldifst.Lattice): + raise ValueError(f"Wrong lattice type: `{type(lattice)}`") + super().__init__(lattice) + + kaldi_symbols2dict = lambda symbols: { + int(line.split("\t")[1]): line.split("\t")[0] for line in str(symbols).strip().split("\n") + } + self._symbol_table = None + # most likely lattice will have empty input_symbols + if symbol_table is not None: + self._symbol_table = symbol_table + elif self._lattice.output_symbols is not None: + # we suppose that lattice.input_symbols will not be changed + self._symbol_table = kaldi_symbols2dict(self._lattice.output_symbols) + + self._auxiliary_tables = None + if auxiliary_tables is not None: + attributes, values = list(auxiliary_tables.keys()), list(auxiliary_tables.values()) + if "input_symbols" not in attributes and self._lattice.input_symbols is not None: + # rare but possible case + attributes.append("input_symbols") + values.append(kaldi_symbols2dict(self._lattice.input_symbols)) + self._auxiliary_tables = namedtuple("KaldiAuxiliaryTables", attributes)(*values) + elif self._lattice.input_symbols is not None: + self._auxiliary_tables = namedtuple("KaldiAuxiliaryTables", "input_symbols")( + kaldi_symbols2dict(self._lattice.input_symbols) + ) + + @property + def properties(self) -> LatticeProperties: + if self._properties is None: + acceptor = self._lattice.properties(KaldiFstMask.Acceptor.value, True) == KaldiFstMask.Acceptor.value + valid = self._lattice.properties(KaldiFstMask.Error.value, True) != KaldiFstMask.Error.value + nonempty = 
self._lattice.num_states > 0 + top_sorted = self._lattice.properties(KaldiFstMask.TopSorted.value, True) == KaldiFstMask.TopSorted.value + acyclic = self._lattice.properties(KaldiFstMask.Acyclic.value, True) == KaldiFstMask.Acyclic.value + arc_sorted = ( + self._lattice.properties(KaldiFstMask.IlabelSorted.value, True) == KaldiFstMask.IlabelSorted.value + and self._lattice.properties(KaldiFstMask.OlabelSorted.value, True) == KaldiFstMask.OlabelSorted.value + ) + deterministic = ( + self._lattice.properties(KaldiFstMask.IlabelDeterministic.value, True) + == KaldiFstMask.IlabelDeterministic.value + and self._lattice.properties(KaldiFstMask.OlabelDeterministic.value, True) + == KaldiFstMask.OlabelDeterministic.value + ) + epsilon_free = ( + self._lattice.properties(KaldiFstMask.HasEpsilons.value, True) != KaldiFstMask.HasEpsilons.value + ) + input_epsilon_free = ( + self._lattice.properties(KaldiFstMask.HasIEpsilons.value, True) != KaldiFstMask.HasIEpsilons.value + ) + connected = ( + self._lattice.properties(KaldiFstMask.Accessible.value, True) == KaldiFstMask.Accessible.value + and self._lattice.properties(KaldiFstMask.Coaccessible.value, True) == KaldiFstMask.Coaccessible.value + ) + weighted = self._lattice.properties(KaldiFstMask.Weighted.value, True) == KaldiFstMask.Weighted.value + self._properties = LatticeProperties( + Acceptor=acceptor, + Valid=valid, + Nonempty=nonempty, + TopSorted=top_sorted, + Acyclic=acyclic, + ArcSorted=arc_sorted, + Deterministic=deterministic, + EpsilonFree=epsilon_free, + InputEpsilonFree=input_epsilon_free, + Connected=connected, + Weighted=weighted, + ) + return self._properties + + @property + def symbol_table(self) -> Optional[Dict[int, str]]: + return self._symbol_table + + @property + def auxiliary_tables(self) -> Optional[Tuple[Any]]: + return self._auxiliary_tables + + def as_tensor(self) -> 'torch.Tensor': + """Represents the lattice as a tensor. + + Returns: + torch.Tensor + """ + raise NotImplementedError("Tensor representation is not supported yet.") + + def edit_distance(self, reference_sequence: List[int]) -> int: + """Get the edit distance from a reference sequence to the lattice. + + Args: + reference_sequence: + List of word- or token-ids. + + Returns: + Number of edits. + """ + _kaldifst_maybe_raise() + + if not self.properties.InputEpsilonFree: + logging.warning(f"Lattice contains input epsilons. Edit distance calculations may not be accurate.") + if not all(reference_sequence): + raise ValueError(f"reference_sequence contains zeros, which is not allowed.") + ref = levenshtein_graph_kaldi(kaldifst.make_linear_acceptor(reference_sequence)) + hyp = levenshtein_graph_kaldi(self._lattice) + kaldifst.invert(hyp) + ali_fst = kaldifst.compose(hyp, ref) + succeeded, _, _, total_weight = kaldifst.get_linear_symbol_sequence(kaldifst.shortest_path(ali_fst)) + if not succeeded: + raise RuntimeError("Something went wrong while calculating edit_distance. Please check input manually.") + return round(total_weight.value) + + def draw( + self, filename: Optional[Union[Path, str]] = None, title: Optional[Union[Path, str]] = None, zoom: float = 1.0 + ) -> Union['graphviz.Digraph', 'IPython.display.HTML']: + """Render FSA as an image via graphviz, and return the Digraph object; and optionally save to file filename. + filename must have a suffix that graphviz understands, such as pdf, svg or png. 
+ + Note: + You need to install graphviz to use this function:: + + ./scripts/installers/install_graphviz.sh + + Args: + filename: + Filename to (optionally) save to, e.g. ‘foo.png’, ‘foo.svg’, ‘foo.png’. + + title: + Title to be displayed in image, e.g. ‘A simple lattice example’. + + zoom: + Zoom-in lattice in IPython notebook (needed for large lattices). + + Returns: + graphviz.Digraph or IPython.display.HTML + """ + _kaldifst_maybe_raise() + _graphviz_maybe_raise() + + isym, osym = None, None + if self._symbol_table: + osym = kaldifst.SymbolTable() + for i, w in self._symbol_table.items(): + osym.add_symbol(symbol=w, key=i) + + if ( + self._auxiliary_tables + and hasattr(self._auxiliary_tables, "input_symbols") + and self._auxiliary_tables.input_symbols + ): + isym = kaldifst.SymbolTable() + for i, t in self._auxiliary_tables.input_symbols.items(): + isym.add_symbol(symbol=t, key=i) + + fst_dot = kaldifst.draw( + self._lattice, acceptor=False, portrait=True, isymbols=isym, osymbols=osym, show_weight_one=True + ) + source = graphviz.Source(fst_dot) + source_lines = str(source).splitlines() + # Remove 'digraph tree {' + source_lines.pop(0) + # Remove the closing brackets '}' + source_lines.pop(-1) + graph_attr = { + 'rankdir': 'LR', + 'size': '8.5,11', + 'center': '1', + 'orientation': 'Portrait', + 'ranksep': '0.4', + 'nodesep': '0.25', + 'margin': '0.0', + } + if title is not None: + graph_attr['label'] = title + digraph = graphviz.Digraph(graph_attr=graph_attr) + digraph.body += source_lines + if filename: + _, extension = os.path.splitext(filename) + if extension == '' or extension[0] != '.': + raise ValueError(f"Filename needs to have a suffix like .png, .pdf, .svg, or .gv: `{filename}`") + with tempfile.TemporaryDirectory() as tmp_dir: + temp_fn = digraph.render(filename='temp', directory=tmp_dir, format=extension[1:], cleanup=True) + + shutil.move(temp_fn, filename) + if _is_notebook(): + import warnings + + from IPython.display import HTML + + with tempfile.TemporaryDirectory() as tmp_dir: + temp_fn = digraph.render(filename='temp', directory=tmp_dir, format="svg", cleanup=True) + svg, (width, height) = _svg_srcdoc_resize(temp_fn, zoom) + # IFrame requires src file to be present when rendering + # so we use HTML with iframe srcdoc instead + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return HTML( + f"""""" + ) + return digraph + + +def _is_notebook() -> bool: + try: + shell = get_ipython().__class__.__name__ + if shell == 'ZMQInteractiveShell' or 'Shell': + return True # Jupyter notebook, Google Colab notebook, or qtconsole + elif shell == 'TerminalInteractiveShell': + return False # Terminal running IPython + else: + return False # Other type + except NameError: + return False # Probably standard Python interpreter + + +def _svg_srcdoc_resize(filename: Union[Path, str], zoom: float) -> Tuple[str, Tuple[int, int]]: + with open(filename, "rt", encoding="utf-8") as f: + line = f.readline() + while not line.startswith(" 'kaldifst.StdFst': + """Construct the levenshtein graph from a kaldi-type WFST or a lattice. + + See also levenshtein_graph from k2. + + Args: + fst: + Kaldi-type source WFST or lattice. + + ins_del_score: + Insertion and deletion penalty. + Should be more than 0.5 for substitutions to be preferred over insertions/deletions, or less otherwise. + + Returns: + Kaldi-type levenshtein WFST. 
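+
+    Example (illustrative; mirrors how KaldiWordLattice.edit_distance above uses it,
+    with `lattice` and `reference_ids` as placeholders):
+
+        ref = levenshtein_graph_kaldi(kaldifst.make_linear_acceptor(reference_ids))
+        hyp = levenshtein_graph_kaldi(lattice)
+        kaldifst.invert(hyp)
+        # composing hyp with ref and taking the shortest path yields the number of edits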
+ """ + _kaldifst_maybe_raise() + + if fst.properties(KaldiFstMask.Acceptor.value, True) != KaldiFstMask.Acceptor.value: + logging.warning( + "Levenshtein graph construction is not safe for WFSTs with different input and output symbols." + ) + if fst.properties(KaldiFstMask.Acyclic.value, True) != KaldiFstMask.Acyclic.value: + raise ValueError("Levenshtein graph is not defined for WFSTs with cycles.") + if isinstance(fst, kaldifst.StdFst): + lfst = fst.copy(safe=True) + elif isinstance(fst, kaldifst.Lattice): + # dropping lattice weights + lfst = kaldifst.compile(re.sub("[-\d.]+,[-\d.]+", "0", fst.to_str(show_weight_one=True))) + else: + raise ValueError(f"Levenshtein graph building is not supported for the type `{type(fst)}`.") + sub_score = 0.5 + eps = 0 + for state in kaldifst.StateIterator(lfst): + # epsilon self-loop for insertions and deletions + arcs_to_add = [ + kaldifst.StdArc( + ilabel=eps, + olabel=eps, + weight=ins_del_score, + nextstate=state, + ) + ] + for arc in kaldifst.ArcIterator(lfst, state): + # epsilon-to-ilabel arc for substitutions + arcs_to_add.append( + kaldifst.StdArc( + ilabel=eps, + olabel=arc.ilabel, + weight=sub_score, + nextstate=arc.nextstate, + ) + ) + # zero weight for correct ids (redundant for lattices) + arc.weight = 0.0 + for arc in arcs_to_add: + lfst.add_arc(state=state, arc=arc) + kaldifst.arcsort(lfst) + return lfst + + +def load_word_lattice( + lat_filename: Union[Path, str], id2word: Optional[Dict[int, str]] = None, id2token: Optional[Dict[int, str]] = None +) -> Dict[str, KaldiWordLattice]: + """Helper function to load riva-decoder recognition lattices. + + Args: + lat_filename: + Path to the riva-decoder recognition lattice file. + + id2word: + Word index. + Mapping from word_id to word_str. + + id2token: + Token index. + Mapping from token_id to token_str. + + Returns: + Dictionary with lattice names and corresponding lattices in KaldiWordLattice format. + """ + _kaldifst_maybe_raise() + + lattice_dict = {} + lattice = None + max_state = 0 + token_seq_list = [] + with open(lat_filename, "rt") as f: + for line in f.readlines(): + line_items = line.strip().split() + line_len = len(line_items) + if line_len == 0: # end of lattice + token_seq_list = [] + lattice = None + max_state = 0 + elif line_len == 1: # lattice identifier + assert lattice is None + assert max_state == 0 + assert len(token_seq_list) == 0 + lat_id = line_items[0] + lattice = kaldifst.Lattice() + lattice_dict[lat_id] = KaldiWordLattice( + lattice=lattice, + symbol_table=id2word, + auxiliary_tables={"token_seq_list": token_seq_list, "input_symbols": id2token}, + ) + start = lattice.add_state() + lattice.start = start + max_state += 1 + elif line_len in (3, 4): # arc + if line_len == 4: # regular arc + state, next_state, label = [int(i) for i in line_items[:-1]] + trunk = line_items[-1].split(',') + graph_cost, acoustic_cost = [float(i) for i in trunk[:-1]] + else: # arc without weight + logging.warning( + f"""An arc without weight is detected for lattice `{lat_id}`. 
+ Weights and token sequences will be set trivially.""" + ) + state, next_state, label = [int(i) for i in line_items] + trunk = [""] + graph_cost, acoustic_cost = 0.0, 0.0 + if next_state >= max_state: + for i in range(max_state, next_state + 1): + lattice.add_state() + max_state = next_state + 1 + ark = kaldifst.LatticeArc( + ilabel=label, + olabel=label, + weight=kaldifst.LatticeWeight(graph_cost=graph_cost, acoustic_cost=acoustic_cost), + nextstate=next_state, + ) + lattice.add_arc(state=state, arc=ark) + token_seq_list.append((ark, [int(i) for i in trunk[-1].split(TW_BREAK)] if trunk[-1] != "" else [])) + elif line_len == 2: # final state + state = int(line_items[0]) + trunk = line_items[-1].split(',') + graph_cost, acoustic_cost = [float(i) for i in trunk[:-1]] + lattice.set_final( + state=state, weight=kaldifst.LatticeWeight(graph_cost=graph_cost, acoustic_cost=acoustic_cost) + ) + else: + raise RuntimeError(f"Broken line: `{line}`") + return lattice_dict diff --git a/nemo/core/utils/k2_utils.py b/nemo/core/utils/k2_utils.py index 3dff6a35d3e3..3e7c2a6f5a70 100644 --- a/nemo/core/utils/k2_utils.py +++ b/nemo/core/utils/k2_utils.py @@ -16,7 +16,7 @@ K2_INSTALLATION_MESSAGE = ( "Could not import `k2`.\n" "Please install k2 in one of the following ways:\n" - "1) (recommended) Run `bash scripts/speech_recognition/k2/setup.sh`\n" + "1) (recommended) Run `bash scripts/installers/install_k2.sh`\n" "2) Use any approach from https://k2-fsa.github.io/k2/installation/index.html " "if your your cuda and pytorch versions are supported.\n" "It is advised to always install k2 using setup.sh only, " diff --git a/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py b/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py new file mode 100644 index 000000000000..a1db7cec4f23 --- /dev/null +++ b/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py @@ -0,0 +1,439 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +# This script would evaluate an N-gram language model in ARPA format in +# fusion with WFST decoders on top of a trained ASR model with CTC decoder. +# NeMo's WFST decoders use WFST decoding graphs made from ARPA LMs +# to find the best candidates. This script supports BPE level encodings only +# and models which is detected automatically from the type of the model. +# You may train the LM model with e.g. SRILM. + +# Config Help + +To discover all arguments of the script, please run : +python eval_wfst_decoding_ctc.py --help +python eval_wfst_decoding_ctc.py --cfg job + +# USAGE + +python eval_wfst_decoding_ctc.py nemo_model_file= \ + input_manifest= \ + arpa_model_file= \ + decoding_wfst_file= \ + beam_width=[] \ + lm_weight=[] \ + decoding_mode= \ + decoding_search_type= \ + open_vocabulary_decoding= \ + preds_output_folder= \ + probs_cache_file=null + ... 
+ + +# Grid Search for Hyper parameters + +For grid search, you can provide a list of arguments as follows - + + beam_width=[5.0,10.0,15.0,20.0] \ + lm_weight=[0.1,0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2,1.3,1.4,1.5,2.0] \ + +""" + + +import contextlib +import json +import os +import pickle +from dataclasses import dataclass, field, is_dataclass +from pathlib import Path +from typing import List, Optional + +import editdistance +import numpy as np +import torch +from omegaconf import MISSING, OmegaConf +from sklearn.model_selection import ParameterGrid +from tqdm.auto import tqdm + +import nemo.collections.asr as nemo_asr +from nemo.collections.asr.models import EncDecHybridRNNTCTCModel +from nemo.collections.asr.parts.submodules import ctc_beam_decoding +from nemo.collections.asr.parts.utils.transcribe_utils import PunctuationCapitalization, TextProcessingConfig +from nemo.core.config import hydra_runner +from nemo.utils import logging + +# fmt: off + + +@dataclass +class EvalWFSTNGramConfig: + """ + Evaluate an ASR model with WFST decoding and n-gram ARPA language model. + """ + # # The path of the '.nemo' file of the ASR model or the name of a pretrained model (ngc / huggingface) + nemo_model_file: str = MISSING + + # File paths + input_manifest: str = MISSING # The manifest file of the evaluation set + arpa_model_file: Optional[str] = None # The path of the ARPA model file + decoding_wfst_file: Optional[str] = None # The path of the decoding WFST file + preds_output_folder: Optional[str] = None # The optional folder where the predictions are stored + probs_cache_file: Optional[str] = None # The cache file for storing the logprobs of the model + + # Parameters for inference + acoustic_batch_size: int = 16 # The batch size to calculate log probabilities + beam_batch_size: int = 512 # The batch size to be used for beam search decoding + device: str = "cuda" # The device to load the model onto to calculate log probabilities and run WFST decoding + use_amp: bool = False # Whether to use AMP if available to calculate log probabilities + + # WFST decoding hyperparameters + + beam_width: List[float] = field(default_factory=lambda: [10]) # The width or list of the beam widths for the WFST decoding + lm_weight: List[float] = field(default_factory=lambda: [1.0]) # The language model weight parameter or list of parameters for the WFST decoding + + open_vocabulary_decoding: bool = False # Whether to use open vocabulary mode for WFST decoding + decoding_mode: str = "nbest" + decoding_search_type: str = "riva" + decoding: ctc_beam_decoding.WfstCTCInferConfig = field( + default_factory=lambda: ctc_beam_decoding.WfstCTCInferConfig(beam_size=1) + ) + + text_processing: Optional[TextProcessingConfig] = field(default_factory=lambda: TextProcessingConfig( + punctuation_marks = ".,?", + separate_punctuation = False, + do_lowercase = False, + rm_punctuation = False, + )) +# fmt: on + + +def beam_search_eval( + model: nemo_asr.models.ASRModel, + cfg: EvalWFSTNGramConfig, + all_probs: List[torch.Tensor], + target_transcripts: List[str], + preds_output_file: str = None, + lm_weight: float = 1.0, + beam_width: float = 10.0, + beam_batch_size: int = 512, + progress_bar: bool = True, + punctuation_capitalization: PunctuationCapitalization = None, +): + level = logging.getEffectiveLevel() + logging.setLevel(logging.CRITICAL) + # Reset config + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(decoding_cfg=None, decoder_type="ctc") + else: + model.change_decoding_strategy(None) + + # Override 
the beam search config with current search candidate configuration + cfg.decoding.beam_width = beam_width + cfg.decoding.lm_weight = lm_weight + cfg.decoding.open_vocabulary_decoding = cfg.open_vocabulary_decoding + cfg.decoding.return_best_hypothesis = False + cfg.decoding.arpa_lm_path = cfg.arpa_model_file + cfg.decoding.wfst_lm_path = cfg.decoding_wfst_file + cfg.decoding.device = cfg.device + cfg.decoding.decoding_mode = cfg.decoding_mode + cfg.decoding.search_type = cfg.decoding_search_type + + # Update model's decoding strategy config + model.cfg.decoding.strategy = "wfst" + model.cfg.decoding.wfst = cfg.decoding + + # Update model's decoding strategy + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(model.cfg.decoding, decoder_type='ctc') + decoding = model.ctc_decoding + else: + model.change_decoding_strategy(model.cfg.decoding) + decoding = model.decoding + logging.setLevel(level) + + wer_dist_first = cer_dist_first = 0 + wer_dist_best = cer_dist_best = 0 + words_count = 0 + chars_count = 0 + sample_idx = 0 + if preds_output_file: + out_file = open(preds_output_file, 'w', encoding='utf_8', newline='\n') + + if progress_bar: + it = tqdm( + range(int(np.ceil(len(all_probs) / beam_batch_size))), + desc=f"Beam search decoding with width={beam_width}, lm_weight={lm_weight}", + ncols=120, + ) + else: + it = range(int(np.ceil(len(all_probs) / beam_batch_size))) + for batch_idx in it: + # disabling type checking + probs_batch = all_probs[batch_idx * beam_batch_size : (batch_idx + 1) * beam_batch_size] + probs_lens = torch.tensor([prob.shape[0] for prob in probs_batch]) + with torch.no_grad(): + packed_batch = torch.zeros(len(probs_batch), max(probs_lens), probs_batch[0].shape[-1], device='cpu') + + for prob_index in range(len(probs_batch)): + packed_batch[prob_index, : probs_lens[prob_index], :] = probs_batch[prob_index].to( + device=packed_batch.device, dtype=packed_batch.dtype + ) + + _, beams_batch = decoding.ctc_decoder_predictions_tensor( + packed_batch, + decoder_lengths=probs_lens, + return_hypotheses=True, + ) + + for beams_idx, beams in enumerate(beams_batch): + target = target_transcripts[sample_idx + beams_idx] + target_split_w = target.split() + target_split_c = list(target) + words_count += len(target_split_w) + chars_count += len(target_split_c) + wer_dist_min = cer_dist_min = 10000 + for candidate_idx, candidate in enumerate(beams): # type: (int, ctc_beam_decoding.rnnt_utils.Hypothesis) + pred_text = candidate.text + if cfg.text_processing.do_lowercase: + pred_text = punctuation_capitalization.do_lowercase([pred_text])[0] + if cfg.text_processing.rm_punctuation: + pred_text = punctuation_capitalization.rm_punctuation([pred_text])[0] + if cfg.text_processing.separate_punctuation: + pred_text = punctuation_capitalization.separate_punctuation([pred_text])[0] + pred_split_w = pred_text.split() + wer_dist = editdistance.eval(target_split_w, pred_split_w) + pred_split_c = list(pred_text) + cer_dist = editdistance.eval(target_split_c, pred_split_c) + + wer_dist_min = min(wer_dist_min, wer_dist) + cer_dist_min = min(cer_dist_min, cer_dist) + + if candidate_idx == 0: + # first candidate + wer_dist_first += wer_dist + cer_dist_first += cer_dist + + score = candidate.score + if preds_output_file: + out_file.write(f'{pred_text}\t{score}\n') + wer_dist_best += wer_dist_min + cer_dist_best += cer_dist_min + sample_idx += len(probs_batch) + + if preds_output_file: + out_file.close() + logging.info(f"Stored the predictions of beam search decoding at 
'{preds_output_file}'.") + + logging.info( + 'WER/CER with beam search decoding and N-gram model = {:.2%}/{:.2%}'.format( + wer_dist_first / words_count, cer_dist_first / chars_count + ) + ) + logging.info( + 'Oracle WER/CER in candidates with perfect LM= {:.2%}/{:.2%}'.format( + wer_dist_best / words_count, cer_dist_best / chars_count + ) + ) + logging.info(f"=================================================================================") + + return wer_dist_first / words_count, cer_dist_first / chars_count + + +@hydra_runner(config_path=None, config_name='EvalWFSTNGramConfig', schema=EvalWFSTNGramConfig) +def main(cfg: EvalWFSTNGramConfig): + if is_dataclass(cfg): + cfg = OmegaConf.structured(cfg) # type: EvalWFSTNGramConfig + + if cfg.nemo_model_file.endswith('.nemo'): + asr_model = nemo_asr.models.ASRModel.restore_from(cfg.nemo_model_file, map_location=torch.device(cfg.device)) + else: + logging.warning( + "nemo_model_file does not end with .nemo, therefore trying to load a pretrained model with this name." + ) + asr_model = nemo_asr.models.ASRModel.from_pretrained( + cfg.nemo_model_file, map_location=torch.device(cfg.device) + ) + + target_transcripts = [] + manifest_dir = Path(cfg.input_manifest).parent + with open(cfg.input_manifest, 'r', encoding='utf_8') as manifest_file: + audio_file_paths = [] + for line in tqdm(manifest_file, desc=f"Reading Manifest {cfg.input_manifest} ...", ncols=120): + data = json.loads(line) + audio_file = Path(data['audio_filepath']) + if not audio_file.is_file() and not audio_file.is_absolute(): + audio_file = manifest_dir / audio_file + target_transcripts.append(data['text']) + audio_file_paths.append(str(audio_file.absolute())) + + punctuation_capitalization = PunctuationCapitalization(cfg.text_processing.punctuation_marks) + if cfg.text_processing.do_lowercase: + target_transcripts = punctuation_capitalization.do_lowercase(target_transcripts) + if cfg.text_processing.rm_punctuation: + target_transcripts = punctuation_capitalization.rm_punctuation(target_transcripts) + if cfg.text_processing.separate_punctuation: + target_transcripts = punctuation_capitalization.separate_punctuation(target_transcripts) + + if cfg.probs_cache_file and os.path.exists(cfg.probs_cache_file): + logging.info(f"Found a pickle file of probabilities at '{cfg.probs_cache_file}'.") + logging.info(f"Loading the cached pickle file of probabilities from '{cfg.probs_cache_file}' ...") + with open(cfg.probs_cache_file, 'rb') as probs_file: + all_probs = pickle.load(probs_file) + + if len(all_probs) != len(audio_file_paths): + raise ValueError( + f"The number of samples in the probabilities file '{cfg.probs_cache_file}' does not " + f"match the manifest file. You may need to delete the probabilities cached file." 
+ ) + else: + + @contextlib.contextmanager + def default_autocast(): + yield + + if cfg.use_amp: + if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): + logging.info("AMP is enabled!\n") + autocast = torch.cuda.amp.autocast + + else: + autocast = default_autocast + else: + + autocast = default_autocast + + with autocast(): + with torch.no_grad(): + if isinstance(asr_model, EncDecHybridRNNTCTCModel): + asr_model.cur_decoder = 'ctc' + all_hyps = asr_model.transcribe( + audio_file_paths, batch_size=cfg.acoustic_batch_size, return_hypotheses=True + ) + all_logits = [h.y_sequence for h in all_hyps] + + all_probs = all_logits + if cfg.probs_cache_file: + os.makedirs(os.path.split(cfg.probs_cache_file)[0], exist_ok=True) + logging.info(f"Writing pickle files of probabilities at '{cfg.probs_cache_file}'...") + with open(cfg.probs_cache_file, 'wb') as f_dump: + pickle.dump(all_probs, f_dump) + + wer_dist_greedy = 0 + cer_dist_greedy = 0 + words_count = 0 + chars_count = 0 + for batch_idx, probs in enumerate(all_probs): + preds = np.argmax(probs, axis=1) + preds_tensor = preds.to(device='cpu').unsqueeze(0) + preds_lens = torch.tensor([preds_tensor.shape[1]], device='cpu') + if isinstance(asr_model, EncDecHybridRNNTCTCModel): + pred_text = asr_model.ctc_decoding.ctc_decoder_predictions_tensor(preds_tensor, preds_lens)[0][0] + else: + pred_text = asr_model.decoding.ctc_decoder_predictions_tensor(preds_tensor, preds_lens)[0][0] + + if cfg.text_processing.do_lowercase: + pred_text = punctuation_capitalization.do_lowercase([pred_text])[0] + if cfg.text_processing.rm_punctuation: + pred_text = punctuation_capitalization.rm_punctuation([pred_text])[0] + if cfg.text_processing.separate_punctuation: + pred_text = punctuation_capitalization.separate_punctuation([pred_text])[0] + + pred_split_w = pred_text.split() + target_split_w = target_transcripts[batch_idx].split() + pred_split_c = list(pred_text) + target_split_c = list(target_transcripts[batch_idx]) + + wer_dist = editdistance.eval(target_split_w, pred_split_w) + cer_dist = editdistance.eval(target_split_c, pred_split_c) + + wer_dist_greedy += wer_dist + cer_dist_greedy += cer_dist + words_count += len(target_split_w) + chars_count += len(target_split_c) + + logging.info('Greedy WER/CER = {:.2%}/{:.2%}'.format(wer_dist_greedy / words_count, cer_dist_greedy / chars_count)) + + asr_model = asr_model.to('cpu') + + if (cfg.arpa_model_file is None or not os.path.exists(cfg.arpa_model_file)) and ( + cfg.decoding_wfst_file is None or not os.path.exists(cfg.decoding_wfst_file) + ): + raise FileNotFoundError( + f"Could not find both the ARPA model file `{cfg.arpa_model_file}` " + f"and the decoding WFST file `{cfg.decoding_wfst_file}`." 
+ ) + + if cfg.beam_width is None or cfg.lm_weight is None: + raise ValueError("beam_width and lm_weight are needed to perform WFST decoding.") + params = {'beam_width': cfg.beam_width, 'lm_weight': cfg.lm_weight} + hp_grid = ParameterGrid(params) + hp_grid = list(hp_grid) + + best_wer_beam_width, best_cer_beam_width = None, None + best_wer_lm_weight, best_cer_lm_weight = None, None + best_wer, best_cer = 1e6, 1e6 + + logging.info(f"==============================Starting the beam search decoding===============================") + logging.info(f"Grid search size: {len(hp_grid)}") + logging.info(f"It may take some time...") + logging.info(f"==============================================================================================") + + if cfg.preds_output_folder and not os.path.exists(cfg.preds_output_folder): + os.mkdir(cfg.preds_output_folder) + for hp in hp_grid: + if cfg.preds_output_folder: + preds_output_file = os.path.join( + cfg.preds_output_folder, + f"preds_out_beam_width{hp['beam_width']}_lm_weight{hp['lm_weight']}.tsv", + ) + else: + preds_output_file = None + + candidate_wer, candidate_cer = beam_search_eval( + asr_model, + cfg, + all_probs=all_probs, + target_transcripts=target_transcripts, + preds_output_file=preds_output_file, + beam_width=hp["beam_width"], + lm_weight=hp["lm_weight"], + beam_batch_size=cfg.beam_batch_size, + progress_bar=True, + punctuation_capitalization=punctuation_capitalization, + ) + + if candidate_cer < best_cer: + best_cer_beam_width = hp["beam_width"] + best_cer_lm_weight = hp["lm_weight"] + best_cer = candidate_cer + + if candidate_wer < best_wer: + best_wer_beam_width = hp["beam_width"] + best_wer_lm_weight = hp["lm_weight"] + best_wer = candidate_wer + + logging.info( + f'Best WER Candidate = {best_wer:.2%} :: Beam size = {best_wer_beam_width}, LM weight = {best_wer_lm_weight}' + ) + + logging.info( + f'Best CER Candidate = {best_cer:.2%} :: Beam size = {best_cer_beam_width}, LM weight = {best_cer_lm_weight}' + ) + logging.info(f"=================================================================================") + + +if __name__ == '__main__': + main() diff --git a/scripts/installers/install_riva_decoder.sh b/scripts/installers/install_riva_decoder.sh new file mode 100755 index 000000000000..4e6e99b570ab --- /dev/null +++ b/scripts/installers/install_riva_decoder.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
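+
+# Rough summary of what these packages are used for in this PR (see the WFST utilities
+# and the "riva" search_type in ctc_beam_decoding): kaldifst for FST construction and
+# composition, kaldilm (presumably) for ARPA LM conversion, and riva-asrlib-decoder
+# for the decoder backend itself.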
+ +pip install kaldifst kaldilm riva-asrlib-decoder diff --git a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py index 0d7c555ee778..247906247091 100644 --- a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py +++ b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py @@ -24,6 +24,7 @@ from nemo.collections.asr.data import audio_to_text from nemo.collections.asr.models import configs from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE +from nemo.collections.asr.parts.submodules import ctc_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig from nemo.collections.common import tokenizers from nemo.utils.config_utils import assert_dataclass_signature_match @@ -279,6 +280,34 @@ def test_decoding_change(self, asr_model): assert asr_model.decoding.preserve_alignments is True assert asr_model.decoding.compute_timestamps is True + new_strategy = DictConfig({}) + new_strategy.strategy = 'beam' + new_strategy.beam = DictConfig({'beam_size': 1}) + asr_model.change_decoding_strategy(decoding_cfg=new_strategy) + assert isinstance(asr_model.decoding.decoding, beam_decode.BeamCTCInfer) + assert asr_model.decoding.decoding.search_type == "default" + + new_strategy = DictConfig({}) + new_strategy.strategy = 'pyctcdecode' + new_strategy.beam = DictConfig({'beam_size': 1}) + asr_model.change_decoding_strategy(decoding_cfg=new_strategy) + assert isinstance(asr_model.decoding.decoding, beam_decode.BeamCTCInfer) + assert asr_model.decoding.decoding.search_type == "pyctcdecode" + + new_strategy = DictConfig({}) + new_strategy.strategy = 'flashlight' + new_strategy.beam = DictConfig({'beam_size': 1}) + asr_model.change_decoding_strategy(decoding_cfg=new_strategy) + assert isinstance(asr_model.decoding.decoding, beam_decode.BeamCTCInfer) + assert asr_model.decoding.decoding.search_type == "flashlight" + + new_strategy = DictConfig({}) + new_strategy.strategy = 'wfst' + new_strategy.beam = DictConfig({'beam_size': 1}) + asr_model.change_decoding_strategy(decoding_cfg=new_strategy) + assert isinstance(asr_model.decoding.decoding, beam_decode.WfstCTCInfer) + assert asr_model.decoding.decoding.search_type == "riva" + @pytest.mark.unit def test_ASRDatasetConfig_for_AudioToBPEDataset(self): # ignore some additional arguments as dataclass is generic From 3a8081fef634686881d7464c5b847212a16c5431 Mon Sep 17 00:00:00 2001 From: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Date: Thu, 22 Aug 2024 00:20:40 +0530 Subject: [PATCH 031/664] 24.07 vboost numbers (#10221) * 24.07 vboost numbers Signed-off-by: Malay Nagda * 175b 512gpus Signed-off-by: Malay Nagda --------- Signed-off-by: Malay Nagda Co-authored-by: Sangkug Lym --- .../source/performance/performance_summary.md | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/source/performance/performance_summary.md b/docs/source/performance/performance_summary.md index c5bdda7b040d..eca42f2d0695 100644 --- a/docs/source/performance/performance_summary.md +++ b/docs/source/performance/performance_summary.md @@ -11,18 +11,18 @@ | Model | #-GPUs | GBS | MBS | Sequence Length| TP | PP | CP | VP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. 
time to train in days (10T tokens, 1K GPUs)*** | | ----- | ------ | --- | --- | ---------------| -- | -- | -- | -- | ------------------ | ----------------------- | ------------------------------------------------------ | -| GPT3-5B | 64 | 2048 | 4 | 2048 | 1 | 1 | 1 | 1 | 22521 | 736 | ***5*** | -| GPT3-20B | 64 | 256 | 2 | 2048 | 2 | 1 | 1 | 1 | 5851 | 750 | ***19*** | -| GPT3-175B | 128 | 256 | 1 | 2048 | 4 | 8 | 1 | 6 | 726 | 782 | **156** | -| GPT3-175B | 512 | 2048 | 2 | 2048 | 4 | 8 | 1 | 6 | 782 | [842](https://mlcommons.org/benchmarks/training/) | **145** | -| LLAMA2-7B | 8 | 128 | 1 | 4096 | 1 | 1 | 1 | 1 | 16847 | 776 | ***7*** | -| LLAMA2-13B | 16 | 128 | 1 | 4096 | 1 | 4 | 1 | 10 | 8646 | 754 | ***13*** | -| LLAMA2-70B | 64 | 128 | 1 | 4096 | 4 | 4 | 1 | 20 | 1707 | 759 | ***66*** | -| Nemotron-8B | 64 | 256 | 4 | 4096 | 2 | 1 | 1 | 1 | 12701 | 653 | ***9*** | -| Nemotron-22B | 64 | 256 | 2 | 4096 | 2 | 4 | 1 | 10 | 4256 | 554 | ***27*** | -| Nemotron-340B | 128 | 32 | 1 | 4096 | 8 | 8 | 1 | 12 | 322 | 678 | ***351*** | -| LLAMA3-8B | 8 | 128 | 1 | 8192 | 1 | 1 | 2 | 1 | 12036 | 697 | ***9*** | -| LLAMA3-70B | 64 | 128 | 1 | 8192 | 4 | 4 | 2 | 5 | 1533 | 738 | ***74*** | +| GPT3-5B | 64 | 2048 | 4 | 2048 | 1 | 1 | 1 | 1 | 23574 | 770 | ***5*** | +| GPT3-20B | 64 | 256 | 2 | 2048 | 2 | 1 | 1 | 1 | 5894 | 755 | ***19*** | +| GPT3-175B | 128 | 256 | 1 | 2048 | 4 | 8 | 1 | 6 | 745 | 802 | **152** | +| GPT3-175B | 512 | 2048 | 2 | 2048 | 4 | 8 | 1 | 6 | 832 | [895](https://mlcommons.org/benchmarks/training/) | **136** | +| LLAMA2-7B | 8 | 128 | 1 | 4096 | 1 | 1 | 1 | 1 | 16634 | 767 | ***7*** | +| LLAMA2-13B | 16 | 128 | 1 | 4096 | 1 | 4 | 1 | 10 | 8715 | 760 | ***13*** | +| LLAMA2-70B | 64 | 128 | 1 | 4096 | 4 | 4 | 1 | 20 | 1717 | 763 | ***66*** | +| Nemotron-8B | 64 | 256 | 4 | 4096 | 2 | 1 | 1 | 1 | 12507 | 643 | ***9*** | +| Nemotron-22B | 64 | 256 | 2 | 4096 | 2 | 4 | 1 | 10 | 4289 | 559 | ***26*** | +| Nemotron-340B | 128 | 32 | 1 | 4096 | 8 | 8 | 1 | 12 | 328 | 691 | ***344*** | +| LLAMA3-8B | 8 | 128 | 1 | 8192 | 1 | 1 | 2 | 1 | 11883 | 688 | ***10*** | +| LLAMA3-70B | 64 | 128 | 1 | 8192 | 4 | 4 | 2 | 5 | 1549 | 746 | ***73*** | ### Finetuning @@ -34,9 +34,9 @@ | Model | Task | #-GPUs | GBS | MBS | Packed Sequence Length | TP | PP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. 
time to finetune in mins (10M tokens)*** | | ----- | ---- | --- | --- | --- | --------------- | -- | -- | ------------------ | ----------------------- | -------------------------------------------------- | -| LLAMA2-7B | SFT | 8 | 32 | 1 | 4096 | 1 | 1 | 17120 | 682 | ***1.2*** | -| LLAMA2-13B | SFT | 8 | 32 | 1 | 4096 | 1 | 4 | 9741 | 754 | ***2.1*** | -| LLAMA2-70B | SFT | 16 | 32 | 1 | 4096 | 4 | 4 | 1833 | 756 | ***5.7*** | +| LLAMA2-7B | SFT | 8 | 32 | 1 | 4096 | 1 | 1 | 17617 | 702 | ***1.2*** | +| LLAMA2-13B | SFT | 8 | 32 | 1 | 4096 | 1 | 4 | 10176 | 787 | ***2.0*** | +| LLAMA2-70B | SFT | 16 | 32 | 1 | 4096 | 4 | 4 | 1812 | 747 | ***5.7*** | | LLAMA2-7B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 25206 | 673 | ***0.8*** | -| LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14161 | 733 | ***1.5*** | -| LLAMA2-70B | LoRA | 8 | 32 | 1 | 4096 | 2 | 4 | 2557 | 705 | ***8.1*** | +| LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14760 | 764 | ***1.4*** | +| LLAMA2-70B | LoRA | 8 | 32 | 1 | 4096 | 2 | 4 | 2621 | 722 | ***7.9*** | From b88b662c8753e51629fce3245b8ad1a4d126e03f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Wed, 21 Aug 2024 15:02:04 -0400 Subject: [PATCH 032/664] Enable DDP memory optimization by default (#10214) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Żelasko --- examples/asr/asr_adapters/train_asr_adapter.py | 3 ++- examples/asr/conf/asr_adapters/asr_adaptation.yaml | 4 +++- examples/asr/conf/asr_adapters/asr_adaptation_hp.yaml | 4 +++- examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml | 4 +++- .../asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml | 4 +++- .../fastconformer_ctc_bpe_streaming.yaml | 4 +++- .../fastconformer_ctc_char_streaming.yaml | 4 +++- .../fastconformer_transducer_bpe_streaming.yaml | 4 +++- .../fastconformer_transducer_char_streaming.yaml | 4 +++- examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml | 4 +++- .../asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml | 4 +++- .../fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml | 4 +++- .../fastconformer_hybrid_transducer_ctc_char_streaming.yaml | 4 +++- .../fastconformer_hybrid_transducer_ctc_bpe.yaml | 4 +++- .../fastconformer_hybrid_transducer_ctc_char.yaml | 4 +++- .../long_fastconformer/fast-conformer-long_ctc_bpe.yaml | 4 +++- .../fast-conformer-long_transducer_bpe.yaml | 4 +++- 17 files changed, 50 insertions(+), 17 deletions(-) diff --git a/examples/asr/asr_adapters/train_asr_adapter.py b/examples/asr/asr_adapters/train_asr_adapter.py index 5a94e2bb332d..3f82ef8fe554 100644 --- a/examples/asr/asr_adapters/train_asr_adapter.py +++ b/examples/asr/asr_adapters/train_asr_adapter.py @@ -92,6 +92,7 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import clean_exp_ckpt, exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg def update_model_config_to_support_adapter(model_cfg, current_cfg): @@ -154,7 +155,7 @@ def main(cfg): if cfg.model.pretrained_model is not None and cfg.model.nemo_model is not None: raise ValueError("Cannot set both `cfg.model.nemo_model` and `cfg.model.pretrained_model`. 
Select one only.") - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_log_dir = exp_manager(trainer, cfg.get("exp_manager", None)) if cfg.model.pretrained_model is not None: diff --git a/examples/asr/conf/asr_adapters/asr_adaptation.yaml b/examples/asr/conf/asr_adapters/asr_adaptation.yaml index 6ab3f12d6a1a..b9a2a003217e 100644 --- a/examples/asr/conf/asr_adapters/asr_adaptation.yaml +++ b/examples/asr/conf/asr_adapters/asr_adaptation.yaml @@ -181,7 +181,9 @@ trainer: max_steps: 1000 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: null precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. diff --git a/examples/asr/conf/asr_adapters/asr_adaptation_hp.yaml b/examples/asr/conf/asr_adapters/asr_adaptation_hp.yaml index 4afbc3b51c29..958e6d23375c 100644 --- a/examples/asr/conf/asr_adapters/asr_adaptation_hp.yaml +++ b/examples/asr/conf/asr_adapters/asr_adaptation_hp.yaml @@ -181,7 +181,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: null precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. diff --git a/examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml b/examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml index 6808f4941916..3b5717efddf9 100644 --- a/examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml +++ b/examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml @@ -80,7 +80,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 precision: 32 # 16, 32, or bf16 diff --git a/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml b/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml index 172d09ccd60b..e6d9b0b49c65 100644 --- a/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml +++ b/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml @@ -138,7 +138,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 precision: 32 # 16, 32, or bf16 diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml index acb499f18ffb..4c80d2f2e9d4 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml @@ -171,7 +171,9 @@ trainer: max_steps: -1 # computed at runtime if not set 
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 precision: 32 # 16, 32, or bf16 diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml index 8dd978bb00e4..0796a60260a1 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml @@ -176,7 +176,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 precision: 32 # 16, 32, or bf16 diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml index 9f199c2dd488..4edcc38396fa 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml @@ -227,7 +227,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 precision: 32 # 16, 32, or bf16 diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml index c7f83216aa0b..97b64ef93402 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml @@ -233,7 +233,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 precision: 32 # 16, 32, or bf16 diff --git a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml index 9b51edf614b8..d8808b83069c 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml @@ -195,7 +195,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 precision: 32 # 16, 32, or bf16 diff --git 
a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml index 680d96e1afaf..90a77dee2913 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml @@ -248,7 +248,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 precision: 32 # 16, 32, or bf16 diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml index 6f356ce91caa..daef1ed67a9f 100644 --- a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml @@ -244,7 +244,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 precision: 32 # 16, 32, or bf16 diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml index 870bb0190c03..96aee4af1803 100644 --- a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml @@ -249,7 +249,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 precision: 32 # 16, 32, or bf16 diff --git a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml index 3fc91cc1e436..4ba55e368bb9 100644 --- a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml @@ -223,7 +223,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 precision: 32 # 16, 32, or bf16 diff --git a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml 
b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml index e99ba69df57a..ed2ad8ca9c0d 100644 --- a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml +++ b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml @@ -228,7 +228,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 1.0 precision: 32 # 16, 32, or bf16 diff --git a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml index 3e3d2bf6788e..773a500ef2db 100644 --- a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml @@ -168,7 +168,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 precision: 32 # 16, 32, or bf16 diff --git a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml index 5f6c37288ae9..fec2a2839efa 100644 --- a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml @@ -222,7 +222,9 @@ trainer: max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 precision: 32 # 16, 32, or bf16 From c87e542c9be15966b2b4510161abf73752cfe80f Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:30:14 -0500 Subject: [PATCH 033/664] fix mamba convert/ add test (#10224) * fix mamba convert/ add test * Apply isort and black reformatting Signed-off-by: JRD971000 * add mamba test * fix ngroup in cicd --------- Signed-off-by: JRD971000 Co-authored-by: JRD971000 --- .github/workflows/cicd-main.yml | 16 +++++++++++ .../convert_mamba2_pyt_to_nemo.py | 28 +++++++++++-------- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 797b7888b01e..3fc2b1a127e7 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -159,6 +159,21 @@ jobs: rm -f /home/TestData/nlp/megatron_ir/sbert/sbert.nemo rm -rf /home/TestData/nlp/megatron_ir/sbert/model_weights + L2_Community_LLM_Checkpoints_tests_Mamba2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \ + 
--input_name_or_path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt \ + --output_path /home/TestData/nlp/megatron_mamba/converted_mamba.nemo \ + --precision=bf16 \ + --mamba_ssm_ngroups 1 + AFTER_SCRIPT: | + rm -f /home/TestData/nlp/megatron_mamba/converted_mamba.nemo + rm -rf /home/TestData/nlp/megatron_mamba/model_weights + L2_Community_LLM_Checkpoints_tests_Llama: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4745,6 +4760,7 @@ jobs: - L0_Unit_Tests_GPU #- OPTIONAL_L0_Unit_Tests_CPU - L2_Community_LLM_Checkpoints_tests_Bert + - L2_Community_LLM_Checkpoints_tests_Mamba2 - L2_Community_LLM_Checkpoints_tests_Llama - L2_Community_LLM_Checkpoints_tests_StarCoder - L2_Community_LLM_Checkpoints_tests_Falcon diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py index 1a0a13709421..7a7484bf9c20 100644 --- a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py @@ -26,7 +26,7 @@ ''' Example -CUDA_VISIBLE_DEVICES="0" python /NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \ +CUDA_VISIBLE_DEVICES="0" python /opt/NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \ --input_name_or_path \ --output_path \ --mamba_ssm_ngroups 8 \ @@ -63,10 +63,24 @@ def get_args(): def convert(args): - checkpoint_weights = torch.load(args.input_name_or_path, map_location='cpu') + checkpoint_weights = torch.load(args.input_name_or_path, map_location='cpu')['model'] new_state_dict = {} if 'backbone' in list(checkpoint_weights.keys())[0]: + if 'model' in list(checkpoint_weights.keys())[0]: + checkpoint_weights = {key.replace('model.', '', 1): value for key, value in checkpoint_weights.items()} + + # Codestral Mamba Model Tokenizer Settings + tokenizer_library = 'megatron' + tokenizer_type = 'GPTSentencePieceTokenizer' + tokenizer_model = args.tokenizer_model_dir + + else: + + # Tri Dao and Albert Gu Mamba Model Tokenizer Settings + tokenizer_library = 'huggingface' + tokenizer_type = 'EleutherAI/gpt-neox-20b' + tokenizer_model = None layer_keys = [key for key in checkpoint_weights.keys() if re.match(r'backbone\.layers\.\d+\.', key)] layer_numbers = set(int(re.search(r'backbone\.layers\.(\d+)\.', key).group(1)) for key in layer_keys) @@ -103,11 +117,6 @@ def convert(args): old_key = f'backbone.layers.{i}.{attr}' new_state_dict[new_key] = checkpoint_weights[old_key] - # Tokenizer settings - tokenizer_library = 'huggingface' - tokenizer_type = 'EleutherAI/gpt-neox-20b' - tokenizer_model = None - else: layer_keys = [key for key in checkpoint_weights.keys() if re.match(r'decoder\.layers\.\d+\.', key)] @@ -124,11 +133,6 @@ def convert(args): tokenizer_type = 'GPTSentencePieceTokenizer' tokenizer_model = args.tokenizer_model_dir - # Tokenizer settings - tokenizer_library = 'megatron' - tokenizer_type = 'GPTSentencePieceTokenizer' - tokenizer_model = args.tokenizer_model_dir - layers = defaultdict(list) for key in new_state_dict.keys(): From ff7c614ab8226c2038b268d4575015e5871e17ec Mon Sep 17 00:00:00 2001 From: "John St. 
John" Date: Wed, 21 Aug 2024 16:38:49 -0700 Subject: [PATCH 034/664] Optionally disable logging in the data sampler to support predict_step (#10127) * Resolve merge conflicts with consumed sample logging Signed-off-by: John St John * Add test file that captures the predict step error Signed-off-by: John St John * Add fixme comment around proper checkpoint nemo2 handling Signed-off-by: John St John * Skip megatron training test on CPU nodes Signed-off-by: John St John * Move output_log to last arg for compatibility Signed-off-by: John St John * try setting the default root dir in predict to avoid writing artifacts to cwd Signed-off-by: John St John * Handle the new check for batch samplers to enable predict_step Signed-off-by: John St John * Only reset the global microbatch, not entire parallel state Signed-off-by: John St John * Destroy the right sets of state in test of lightning trainer Signed-off-by: John St John * Fix typo and rename state resetting functions Signed-off-by: John St John * Run test in a subprocess to avoid contaminating global state Signed-off-by: John St John --------- Signed-off-by: John St John --- nemo/lightning/megatron_parallel.py | 19 +- .../lightning/pytorch/plugins/data_sampler.py | 30 +- .../collections/llm/test_mnist_model_nemo2.py | 598 ++++++++++++++++++ 3 files changed, 634 insertions(+), 13 deletions(-) create mode 100644 tests/collections/llm/test_mnist_model_nemo2.py diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 56146498b539..dd10a726e67a 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -231,7 +231,24 @@ def forward( pipeline = self.pipeline - use_global_batch_sampler = self.trainer.datamodule.data_sampler.dataloader_type == 'batch' + # FIXME: cleanup the following code block which is here for backwards compatibility with nemo1. The "batch" + # sampler is a nemo1 sampler. It requires some custom code here to use (if use_global_batch_sampler). + # by default we shouldn't use this "batch" sampler probably. + if getattr(self.trainer, "datamodule", None) is not None: + use_global_batch_sampler = self.trainer.datamodule.data_sampler.dataloader_type == 'batch' + elif getattr(self.trainer, "predict_dataloaders", None) is not None: + from nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers import ( # noqa: I001 + MegatronPretrainingBatchSampler, + ) + + # The batch_sampler gets injected into the dataloader by the data_sampler. When doing predict without a + # datamodule we can look inside the dataloader's batch_sampler to see if it is the nemo1 style sampler + # that we need to handle specially below. + use_global_batch_sampler = isinstance( + self.trainer.predict_dataloaders.batch_sampler, MegatronPretrainingBatchSampler + ) + else: + raise ValueError("Unsure how to check for nemo1 global_batch_sampler status. 
TODO maybe default to False?") if use_global_batch_sampler: from nemo.collections.nlp.modules.common.megatron.utils import get_iterator_k_split diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 13a0caa98f0c..bacb7cb0af5c 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -26,8 +26,10 @@ def __init__( dataloader_type: Literal["single", "cyclic", "batch"] = "single", init_consumed_samples: int = 0, init_global_step: int = 0, + output_log: bool = True, ): self.seq_len = seq_len + self.output_log = output_log self.micro_batch_size = micro_batch_size self.global_batch_size = global_batch_size self.rampup_batch_size = rampup_batch_size @@ -95,12 +97,14 @@ def on_megatron_step_end(self, trainer: pl.Trainer, pl_module: pl.LightningModul self.prev_global_batch_size = self.current_global_batch_size consumed_samples = self.compute_consumed_samples(trainer.global_step + 1 - self.init_global_step) - pl_module.log( - 'consumed_samples', - consumed_samples, - prog_bar=True, - batch_size=1, - ) + if self.output_log: + # You may need to turn off logging, for example when doing trainer.predict(model, data) + pl_module.log( + 'consumed_samples', + consumed_samples, + prog_bar=True, + batch_size=1, + ) self.prev_consumed_samples = consumed_samples @@ -108,12 +112,14 @@ def on_megatron_step_end(self, trainer: pl.Trainer, pl_module: pl.LightningModul consumed_samples=consumed_samples, consistency_check=False, ) - pl_module.log( - "global_batch_size", - self.current_global_batch_size, - prog_bar=True, - batch_size=1, - ) + if self.output_log: + # You may need to turn off logging, for example when doing trainer.predict(model, data) + pl_module.log( + "global_batch_size", + self.current_global_batch_size, + prog_bar=True, + batch_size=1, + ) self.if_first_step = 1 @property diff --git a/tests/collections/llm/test_mnist_model_nemo2.py b/tests/collections/llm/test_mnist_model_nemo2.py new file mode 100644 index 000000000000..c78306201751 --- /dev/null +++ b/tests/collections/llm/test_mnist_model_nemo2.py @@ -0,0 +1,598 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
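+# NOTE: This file is intended as a minimal end-to-end NeMo 2.0 example. It trains a small MNIST
+# autoencoder with MegatronStrategy on a single GPU, then reloads the checkpoint and runs
+# trainer.predict() with a MegatronDataSampler constructed with output_log=False, exercising
+# predict_step without the data sampler's logging calls.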
+ + +import os +import subprocess +import sys +import tempfile +from contextlib import contextmanager +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, TypedDict, TypeVar, Union + +import megatron.core.num_microbatches_calculator +import pytest +import pytorch_lightning as pl +import torch +import torch.distributed +from megatron.core import ModelParallelConfig, parallel_state +from megatron.core.optimizer import OptimizerConfig +from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.module import MegatronModule +from pytorch_lightning.loggers import TensorBoardLogger +from torch import Tensor, nn +from torch.utils.data import DataLoader +from torchvision import transforms +from torchvision.datasets import MNIST + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.lightning import NeMoLogger, io, resume +from nemo.lightning.megatron_parallel import DataT, MegatronLossReduction, ReductionT +from nemo.lightning.pytorch import callbacks as nl_callbacks +from nemo.lightning.pytorch.optim import MegatronOptimizerModule +from nemo.lightning.pytorch.plugins import MegatronDataSampler + +TokenizerType = Any + +"""This is intended to be a minimal self-container NeMo2 example.""" + + +T = TypeVar("T") + + +@dataclass +class ExampleConfig(ModelParallelConfig): + """ExampleConfig is a dataclass that is used to configure the model. + + Timers from ModelParallelConfig are required for megatron forward compatibility. + """ + + calculate_per_token_loss: bool = False + + def configure_model(self) -> nn.Module: + """This function is called by the strategy to construct the model. + + Note: Must pass self into Model since model requires having a config object. + + Returns: + The model object. + """ + return ExampleModel(self) + + +class MSELossReduction(MegatronLossReduction): + """A class used for calculating the loss, and for logging the reduced loss across micro batches.""" + + def forward(self, batch: DataT, forward_out: Tensor) -> Tuple[Tensor, ReductionT]: + """Calculates the loss within a micro-batch. A micro-batch is a batch of data on a single GPU. + + Args: + batch: A batch of data that gets passed to the original forward inside LitAutoEncoder. + forward_out: the output of the forward method inside LitAutoEncoder. + + Returns: + A tuple containing [, ReductionT] where the loss tensor will be used for + backpropagation and the ReductionT will be passed to the reduce method + (which currently only works for logging.). + """ + x = batch["data"] + outputs = forward_out + x_hat = outputs["x_hat"] + # you could also put a latent loss on z here. + xview = x.view(x.size(0), -1) + loss = nn.functional.mse_loss(x_hat, xview) + + return loss, {"avg": loss} + + def reduce(self, losses_reduced_per_micro_batch: Sequence[ReductionT]) -> Tensor: + """Works across micro-batches. (data on single gpu). + + Note: This currently only works for logging and this loss will not be used for backpropagation. + + Args: + losses_reduced_per_micro_batch: a list of the outputs of forward + + Returns: + A tensor that is the mean of the losses. (used for logging). 
+ """ + mse_losses = torch.stack([loss["avg"] for loss in losses_reduced_per_micro_batch]) + return mse_losses.mean() + + +def some_first(seq: Iterable[Optional[T]]) -> T: + """Returns the first non-None value from the sequence or fails""" # noqa: D415 + for s in seq: + if s is not None: + return s + raise ValueError("non-None value not found") + + +def get_dtype_device(torch_object) -> Tuple[torch.dtype, torch.device]: # noqa: D103 + match torch_object: + case []: + raise ValueError("Looking up dtype on an empty list") + case {**data} if not data: + raise ValueError("Looking up dtype on an empty dict") + case torch.Tensor(dtype=dtype, device=device): + return dtype, device + case torch.nn.Module() as m: + try: + p = next(m.parameters()) + except StopIteration as e: + raise ValueError("Cannot get dtype on a torch module with no parameters.") from e + return p.dtype, p.device + case dict(keys=_, values=values): + val = some_first(values()) + return get_dtype_device(val) + case list() as l: + val = some_first(l) + return get_dtype_device(val) + case _: + raise TypeError("Got something we didnt expect") + + +# NOTE(SKH): These types are all wrong, but are close. The inner type must always be a torch.Tensor, but the outer container should be generic. +def batch_collator(batches: Optional[Union[Tuple[ReductionT], List[ReductionT]]]) -> Optional[ReductionT]: + """Takes a sequence of batches and collates them into a single batch. + This is distinct from the standard pytorch default_collator since it does + not add the batch dimension, it's assumed the batch + dimension is already present in the input, as would be the case when + parallelizing across minibatches. + + IMPORTANT: The underlying data primitive _must_ be a torch Tensor. The input to this function is a recurisve type, + there can be any amount of nesting between dictionaries, tuples, and lists, as long as the inner type is a n-d torch.Tensor. + + Examples: + Outer container = Dict: + [{'a': torch.tensor([1]), 'b': torch.tensor([2])}, {'a': torch.tensor([2]), 'b': torch.tensor([3])}] -> {'a': torch.tensor([1, 2]), 'b': torch.tensor([2, 3])} + Outer container = List: + [[torch.tensor([1]), torch.tensor([2])], [torch.tensor([2]), torch.tensor([3])]] -> [torch.tensor([1, 2]), torch.tensor([2, 3])] + Outer container = Tuple: + ([torch.tensor([1]), torch.tensor([2])], [torch.tensor([2]), torch.tensor([3])]) -> (torch.tensor([1, 2]), torch.tensor([2, 3])) + + Args: + batches (Optional[Sequence[ReductionT]]): sequence of batches to collate into a single batch. + + Returns: + A single batch of the same type as the elements of your input sequence. + """ # noqa: D205 + match batches: + case [torch.Tensor(), *_]: + return torch.cat(batches, dim=0) + case [dict(), *_]: + return {key: batch_collator([batch[key] for batch in batches]) for key in batches[0]} + case [tuple(), *_]: + return tuple(batch_collator([batch[i] for batch in batches]) for i in range(len(batches[0]))) + case [list(), *_]: + return [batch_collator([batch[i] for batch in batches]) for i in range(len(batches[0]))] + case None: + return None + case []: + raise ValueError("Cannot process an empty sequence") + case _: + raise ValueError("Unsupported input structure in batch_collator") + + +class PassthroughLossReduction(MegatronLossReduction): + """Internally in NeMo2.0 the forward step is always expected to return a loss reduction class, and forward is expected to return a loss. 
+ This class hijacks that mechanism to instead pass through the forward output unperturbed as the loss (to enable inference in the predict step), and then the + reduce method is used to collate the batch of forward outputs into a single batch. This supports the model forward output being a tensor, dict, tuple, + or list of tensors. The inner type _must always be a torch.Tensor_. + """ # noqa: D205 + + def forward(self, batch: DataT, forward_out: DataT) -> Tuple[torch.Tensor, DataT]: + """_summary_ + + Args: + batch (DataT): The batch of data that was passed through the model to generate output. + forward_out (torch.Tensor): The output from your model's forward pass. + + Returns: + Tuple[torch.Tensor, ReductionT]: A tuple containing the loss tensor (dummy in this case) and the forward output (unmodified). + """ # noqa: D415 + dtype, device = get_dtype_device(forward_out) + return torch.zeros(1, device=device, dtype=dtype), forward_out + + def reduce(self, forward_out: List[DataT]) -> DataT: + """This overrides the standard reduce with a simplified version that just takes a list of your model's forward outputs + and collates them togehter into a single output. + + Args: + forward_out (List[ReductionT]): _description_ + + Returns: + ReductionT: _description_ + """ # noqa: D205 + return batch_collator(forward_out) + + +class LitAutoEncoder(pl.LightningModule, io.IOMixin, io.ConnectorMixin): + """A very basic lightning module for testing the megatron strategy and the megatron-nemo2-bionemo contract.""" + + def __init__(self, config): + """Initializes the model. + + Args: + config: a Config object necessary to construct the actual nn.Module (the thing that has the parameters). + """ + super().__init__() + self.config = config + self.optim = MegatronOptimizerModule( + config=OptimizerConfig(lr=1e-4, optimizer="adam", use_distributed_optimizer=True), + ) + # Bind the configure_optimizers method to the model + self.optim.connect(self) + + def forward(self, batch: Dict, batch_idx: Optional[int] = None) -> Any: + """This forward will be called by the megatron scheduler and it will be wrapped. + + !!! note + + The `training_step` defines the training loop and is independent of the `forward` method here. + + Args: + batch: A dictionary of data. + batch_idx: The index of the batch. + + Returns: + The output of the model. + """ + x = batch["data"] + return self.module(x) + + def training_step(self, batch, batch_idx: Optional[int] = None): + """The training step is where the loss is calculated and the backpropagation is done. + + Background: + - NeMo's Strategy overrides this method. + - The strategies' training step will call the forward method of the model. + - That forward method then calls the wrapped forward step of MegatronParallel which wraps the forward method of the model. + - That wrapped forward step is then executed inside the Mcore scheduler, which calls the `_forward_step` method from the + MegatronParallel class. + - Which then calls the training_step function here. + + In this particular use case, we simply call the forward method of this class, the lightning module. + + Args: + batch: A dictionary of data. requires `batch_idx` as default None. + batch_idx: The index of the batch. 
+ """ + return self(batch, batch_idx) + + def training_loss_reduction(self) -> MegatronLossReduction: # noqa: D102 + # This is the function that takes batch['loss_mask'] and the logits output by the model and reduces the loss + return MSELossReduction() + + def validation_loss_reduction(self) -> MegatronLossReduction: # noqa: D102 + return MSELossReduction() + + def test_loss_reduction(self) -> MegatronLossReduction: # noqa: D102 + return MSELossReduction() + + def predict_loss_reduction(self) -> MegatronLossReduction: # noqa: D102 + # This allows us to do inference (not output the loss) + return PassthroughLossReduction() + + def configure_model(self) -> None: # noqa: D102 + self.module = self.config.configure_model() + + +class ExampleModel(MegatronModule): # noqa: D101 + def __init__(self, config: ModelParallelConfig) -> None: + """Constructor of the model. + + Args: + config: The config object is responsible for telling the strategy what model to create. + """ + super().__init__(config) + self.model_type = ModelType.encoder_or_decoder + self.linear1 = nn.Linear(28 * 28, 64) + self.relu = nn.ReLU() + self.linear2 = nn.Linear(64, 3) + self.linear3 = nn.Linear(3, 64) + self.relu2 = nn.ReLU() + self.linear4 = nn.Linear(64, 28 * 28) + + def forward(self, x: Tensor) -> Dict[str, Tensor]: + """Forward pass of the model. + + Args: + x: The input data. + + Returns: + x_hat: The result of the last linear layer of the network. + """ + x = x.view(x.size(0), -1) + z = self.linear1(x) + z = self.relu(z) + z = self.linear2(z) + x_hat = self.linear3(z) + x_hat = self.relu2(x_hat) + x_hat = self.linear4(x_hat) + return {"x_hat": x_hat, "z": z} + + def set_input_tensor(self, input_tensor: Optional[Tensor]) -> None: + """This is needed because it is a megatron convention. Even if it is a no-op for single GPU testing. + + See megatron.model.transformer.set_input_tensor() + + Note: Currently this is a no-op just to get by an mcore function. + + Args: + input_tensor: Input tensor. + """ + pass + + +class MnistItem(TypedDict): + data: Tensor + label: Tensor + idx: int + + +class MNISTCustom(MNIST): # noqa: D101 + def __getitem__(self, index: int) -> MnistItem: + """Wraps the getitem method of the MNIST dataset such that we return a Dict + instead of a Tuple or tensor. + + Args: + index: The index we want to grab, an int. + + Returns: + A dict containing the data ("x"), label ("y"), and index ("idx"). + """ # noqa: D205 + x, y = super().__getitem__(index) + + return { + "data": x, + "label": y, + "idx": index, + } + + +# TODO: remove this callback after `val` loss is logged by default in training in NeMo2 +class LossLoggingCallback(pl.Callback): # noqa: D101 + def __init__(self): + """Log the loss at the end of each batch. 
For training do not reduce across the epoch but do so for validation/test.""" + self.val_losses = [] + self.test_losses = [] + + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): # noqa: D102 + # Assuming the loss is computed internally and stored in pl_module + if torch.distributed.get_rank() == 0 and parallel_state.is_pipeline_last_stage(): + if isinstance(outputs, dict): + outputs = outputs["loss"] + loss = outputs + pl_module.log("train_loss", loss, on_step=True, prog_bar=True, logger=True, rank_zero_only=True) + + def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0): # noqa: D102 + if torch.distributed.get_rank() == 0 and parallel_state.is_pipeline_last_stage(): + if isinstance(outputs, dict): + outputs = outputs["loss"] + loss = outputs + self.test_losses.append(loss) + + def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0): # noqa: D102 + # Assuming the loss is computed internally and stored in pl_module + if torch.distributed.get_rank() == 0 and parallel_state.is_pipeline_last_stage(): + if isinstance(outputs, dict): + outputs = outputs["loss"] + loss = outputs + self.val_losses.append(loss) + + def on_validation_epoch_end(self, trainer, pl_module): # noqa: D102 + if torch.distributed.get_rank() == 0 and parallel_state.is_pipeline_last_stage(): + if len(self.val_losses) > 0: + avg_val_loss = torch.stack(self.val_losses).mean() + pl_module.log("val_loss", avg_val_loss, prog_bar=True, logger=True, rank_zero_only=True) + self.val_losses.clear() + + def on_test_epoch_end(self, trainer, pl_module): # noqa: D102 + if torch.distributed.get_rank() == 0 and parallel_state.is_pipeline_last_stage(): + if len(self.test_losses) > 0: + avg_test_loss = torch.stack(self.test_losses).mean() + pl_module.log("test_loss", avg_test_loss, prog_bar=True, logger=True, rank_zero_only=True) + self.test_losses.clear() + + +class MNISTDataModule(pl.LightningDataModule): # noqa: D101 + def __init__(self, data_dir: str = "./", batch_size: int = 32) -> None: # noqa: D107 + super().__init__() + self.data_dir = data_dir + self.batch_size = batch_size + self.micro_batch_size = 8 + self.global_batch_size = 8 + self.max_len = 100 + self.rampup_batch_size = None + + # Note that this sampler is sequential, meaning it does not do any shuffling. Let's wrap our data in a shuffler. + # Wraps the datasampler with the MegatronDataSampler. The MegatronDataSampler is a wrapper that allows the sampler + # to be used with megatron. It sets up the capability to utilize micro-batching and gradient accumulation. It is also + # the place where the global batch size is constructed. + self.data_sampler = MegatronDataSampler( + seq_len=self.max_len, + micro_batch_size=self.micro_batch_size, + global_batch_size=self.global_batch_size, + rampup_batch_size=self.rampup_batch_size, + ) + + def setup(self, stage: str) -> None: + """Sets up the datasets + + Args: + stage: can be one of train / test / predict. 
+ """ # noqa: D415 + self.mnist_test = MNISTCustom(self.data_dir, download=True, transform=transforms.ToTensor(), train=False) + self.mnist_predict = MNISTCustom(self.data_dir, download=True, transform=transforms.ToTensor(), train=False) + mnist_full = MNISTCustom(self.data_dir, download=True, transform=transforms.ToTensor(), train=True) + self.mnist_train, self.mnist_val = torch.utils.data.random_split( + mnist_full, [55000, 5000], generator=torch.Generator().manual_seed(42) + ) + + def train_dataloader(self) -> DataLoader: # noqa: D102 + return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=0) + + def val_dataloader(self) -> DataLoader: # noqa: D102 + return DataLoader(self.mnist_val, batch_size=self.batch_size, num_workers=0) + + def test_dataloader(self) -> DataLoader: # noqa: D102 + return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=0) + + +### Begin model environment related utilities +def _reset_megatron_parallel_state(): + """Resets _GLOBAL_NUM_MICROBATCHES_CALCULATOR in megatron which is used in NeMo to initialized model parallel in + nemo.collections.nlp.modules.common.megatron.megatron_init.initialize_model_parallel_for_nemo + """ # noqa: D205, D415 + megatron.core.num_microbatches_calculator._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None + # Clean up any process groups created in testing + torch.cuda.empty_cache() + if parallel_state.is_initialized(): + parallel_state.destroy_model_parallel() + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + +@contextmanager +def reset_megatron_parallel_state() -> Iterator[None]: + """Puts you into a clean parallel state, and again tears it down at the end.""" + try: + _reset_megatron_parallel_state() + yield + finally: + _reset_megatron_parallel_state() + + +@pytest.mark.run_only_on("GPU") +@pytest.mark.integration +def test_train_mnist_litautoencoder_with_megatron_strategy_single_gpu(): + path = os.path.abspath(__file__) + call = f"python {path}" + # Raises a CalledProcessError if there is a failure in the subprocess + subprocess.check_call(call, shell=True, stdout=sys.stdout, stderr=sys.stdout) + + +def run_train_mnist_litautoencoder_with_megatron_strategy_single_gpu(): + """This is the actual test that will get run in a subprocess so it does not contaminate the state of other tests.""" + with tempfile.TemporaryDirectory() as tmpdir_str: + tmpdir = Path(tmpdir_str) + assert tmpdir.exists() + assert tmpdir.is_dir() + with reset_megatron_parallel_state(): + # Configure our custom Checkpointer + name = "test_experiment" + checkpoint_callback = nl_callbacks.ModelCheckpoint( + save_best_model=True, + save_last=True, + monitor="val_loss", + save_top_k=1, + every_n_train_steps=5, + # Enables the .nemo file-like checkpointing where all IOMixins are under SerDe + enable_nemo_ckpt_io=True, + ) + root_dir = tmpdir + save_dir = root_dir / name + tb_logger = TensorBoardLogger(save_dir=str(save_dir), name=name) + # Setup the logger and train the model + nemo_logger = NeMoLogger( + dir=str(root_dir), # WARNING: passing a path in here results in mutating the Path class. 
+ name=name, + tensorboard=tb_logger, + ckpt=checkpoint_callback, + ) + # Needed so that the trainer can find an output directory for the profiler + # nemo_logger.save_dir = tmpdir + + model = LitAutoEncoder(config=ExampleConfig()) + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + ddp="megatron", + find_unused_parameters=True, + enable_nemo_ckpt_io=True, + ) + trainer = nl.Trainer( + accelerator="gpu", + devices=1, + strategy=strategy, + limit_val_batches=5, + val_check_interval=5, + max_steps=20, + num_nodes=1, + log_every_n_steps=5, + callbacks=[io.track_io(LossLoggingCallback)()], + ) + data_module = MNISTDataModule(data_dir=tmpdir) + llm.train( + model=model, + data=data_module, + trainer=trainer, + log=nemo_logger, + resume=resume.AutoResume( + path=None, # Overrides the path found by resume_if_exists when set. + resume_if_exists=True, # Looks for the -last checkpoint to continue training. + resume_ignore_no_checkpoint=True, # When false this will throw an error with no existing checkpoint. + ), + ) + trainer._teardown() + with reset_megatron_parallel_state(): + pred_strategy = nl.MegatronStrategy( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + ddp="megatron", + find_unused_parameters=True, + enable_nemo_ckpt_io=True, + data_sampler=MegatronDataSampler( + seq_len=28 * 28, + micro_batch_size=2, + global_batch_size=2, + output_log=False, # Disable logs to support predict_step + ), + ) + predict_trainer = nl.Trainer( + accelerator="gpu", + devices=1, + strategy=pred_strategy, + default_root_dir=str(root_dir), # WARNING: passing a path in here results in mutating the Path class. + ) + ckpt_path = checkpoint_callback.last_model_path.replace( + ".ckpt", "" + ) # strip .ckpt off the end of the last path + + assert Path( + ckpt_path + ).exists(), f"checkpoint {ckpt_path} not found in {os.listdir(Path(ckpt_path).parent)}" + # FIXME: the below checkpoint loading strategy and manual module unwrapping probably only works in single GPU + # and maybe DDP. + unwrapped_trained_model = trainer.model.module # TODO clean this up. Would be good not to have to unwrap. 
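+            # predict() returns a list of per-batch forward outputs; batch_collator (defined above)
+            # concatenates them into a single dict of tensors. Because predict_loss_reduction is
+            # PassthroughLossReduction, the raw {"x_hat", "z"} model outputs are returned rather
+            # than a loss.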
+ forward_output = batch_collator( + predict_trainer.predict( + unwrapped_trained_model, dataloaders=data_module.test_dataloader(), ckpt_path=ckpt_path + ) + ) + assert set(forward_output.keys()) == { + "z", + "x_hat", + }, f"We expect forward output from predit_step, not the loss, got: {forward_output}" + assert forward_output["x_hat"].shape == (len(data_module.mnist_test), 28 * 28) + assert forward_output["z"].shape == (len(data_module.mnist_test), 3) # latent bottleneck in model of dim 3 + predict_trainer._teardown() + + +if __name__ == "__main__": + # Have the test run this one item as a subprocess call + run_train_mnist_litautoencoder_with_megatron_strategy_single_gpu() From 9612d04a4dfef71e4a100aca23e904c27d88eb3f Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Wed, 21 Aug 2024 21:17:52 -0700 Subject: [PATCH 035/664] Set apply_query_key_layer_scaling=False in Llama3Config (#10216) Signed-off-by: Hemil Desai --- nemo/collections/llm/gpt/model/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 425170c07707..ab2f46378a1e 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -72,7 +72,7 @@ class Llama3Config(GPTConfig): add_bias_linear: bool = False activation_func: Callable = F.silu gated_linear_unit: bool = True - apply_query_key_layer_scaling: bool = True + apply_query_key_layer_scaling: bool = False # Fusions bias_activation_fusion: bool = True masked_softmax_fusion: bool = True From 08cddbfa432917de7ba71c7b97412712012ec869 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Thu, 22 Aug 2024 10:11:40 -0400 Subject: [PATCH 036/664] [Draft] Nemotron in Nemo-UX (#10138) * add nemotron * add nemotron exporter. 
make converted model identical * Apply isort and black reformatting Signed-off-by: suiyoubi * add more config * Apply isort and black reformatting Signed-off-by: suiyoubi * add config * Apply isort and black reformatting Signed-off-by: suiyoubi * import refactor * Apply isort and black reformatting Signed-off-by: suiyoubi * refactor config * add 22B config --------- Signed-off-by: suiyoubi Co-authored-by: suiyoubi --- nemo/collections/llm/__init__.py | 14 + nemo/collections/llm/fn/activation.py | 6 + nemo/collections/llm/gpt/model/__init__.py | 16 + nemo/collections/llm/gpt/model/nemotron.py | 336 +++++++++++++++++++++ 4 files changed, 372 insertions(+) create mode 100644 nemo/collections/llm/gpt/model/nemotron.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 7b2b38e50bc3..812daddf02b6 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -48,6 +48,13 @@ MixtralConfig8x7B, MixtralConfig8x22B, MixtralModel, + Nemotron3Config4B, + Nemotron3Config8B, + Nemotron4Config15B, + Nemotron4Config22B, + Nemotron4Config340B, + NemotronConfig, + NemotronModel, gpt_data_step, gpt_forward_step, ) @@ -73,6 +80,13 @@ "MixtralConfig8x7B", "MixtralConfig8x22B", "MixtralModel", + "NemotronModel", + "Nemotron3Config4B", + "Nemotron3Config8B", + "Nemotron4Config15B", + "Nemotron4Config22B", + "Nemotron4Config340B", + "NemotronConfig", "LlamaConfig", "Llama2Config7B", "Llama2Config13B", diff --git a/nemo/collections/llm/fn/activation.py b/nemo/collections/llm/fn/activation.py index 89b5ba93f0f6..fb638ee31f86 100644 --- a/nemo/collections/llm/fn/activation.py +++ b/nemo/collections/llm/fn/activation.py @@ -9,3 +9,9 @@ def gelu_impl(x): def openai_gelu(x): return gelu_impl(x) + + +@torch.jit.script +def squared_relu(x): + """Squared ReLU activation function.""" + return torch.pow(torch.nn.functional.relu(x), 2) diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index d657b63f779a..a0132a34d185 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -37,6 +37,15 @@ MixtralConfig8x22B, MixtralModel, ) +from nemo.collections.llm.gpt.model.nemotron import ( + Nemotron3Config4B, + Nemotron3Config8B, + Nemotron4Config15B, + Nemotron4Config22B, + Nemotron4Config340B, + NemotronConfig, + NemotronModel, +) __all__ = [ "GPTConfig", @@ -53,6 +62,13 @@ "Llama2Config70B", "Llama3Config8B", "Llama3Config70B", + "NemotronConfig", + "Nemotron3Config4B", + "Nemotron3Config8B", + "Nemotron4Config15B", + "Nemotron4Config22B", + "Nemotron4Config340B", + "NemotronModel", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", diff --git a/nemo/collections/llm/gpt/model/nemotron.py b/nemo/collections/llm/gpt/model/nemotron.py new file mode 100644 index 000000000000..dd659f7eedf7 --- /dev/null +++ b/nemo/collections/llm/gpt/model/nemotron.py @@ -0,0 +1,336 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Annotated, Callable, Optional + +import torch +from torch import nn +from transformers import NemotronConfig as HFNemotronConfig +from transformers import NemotronForCausalLM + +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.collections.llm.fn.activation import squared_relu +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.collections.llm.utils import Config +from nemo.lightning import OptimizerModule, io, teardown + +if 
TYPE_CHECKING: + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +@dataclass +class NemotronConfig(GPTConfig): + # configs that are common across model sizes + normalization: str = "LayerNorm" + activation_func: Callable = squared_relu + position_embedding_type: str = "rope" + share_embeddings_and_output_weights: bool = False + add_bias_linear: bool = False + + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + apply_query_key_layer_scaling: bool = True + rotary_percent: float = 0.5 + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_add_fusion: bool = False + layernorm_zero_centered_gamma: bool = True + + # Nemotron3Config4B as default configs + num_layers: int = 32 + seq_length: int = 4096 + hidden_size: int = 3072 + ffn_hidden_size: int = 9216 + num_attention_heads: int = 24 + num_query_groups: Optional[int] = 8 + kv_channels: Optional[int] = 128 + init_method_std: float = 0.0134 + + +@dataclass +class Nemotron3Config4B(NemotronConfig): + num_layers: int = 32 + seq_length: int = 4096 + hidden_size: int = 3072 + ffn_hidden_size: int = 9216 + num_attention_heads: int = 24 + num_query_groups: int = 8 + kv_channels: Optional[int] = 128 + init_method_std: float = 0.0134 + + +@dataclass +class Nemotron3Config8B(NemotronConfig): + num_layers: int = 32 + seq_length: int = 4096 + hidden_size: int = 4096 + ffn_hidden_size: int = 16384 + num_attention_heads: int = 32 + num_query_groups: Optional[int] = None + kv_channels: Optional[int] = None + init_method_std: float = 0.010 + + +@dataclass +class Nemotron4Config15B(NemotronConfig): + num_layers: int = 32 + seq_length: int = 4096 + hidden_size: int = 6144 + ffn_hidden_size: int = 24576 + num_attention_heads: int = 48 + num_query_groups: Optional[int] = 8 + kv_channels: Optional[int] = None + init_method_std: float = 0.0134 + + +@dataclass +class Nemotron4Config22B(NemotronConfig): + num_layers: int = 40 + seq_length: int = 4096 + hidden_size: int = 6144 + ffn_hidden_size: int = 24576 + num_attention_heads: int = 48 + num_query_groups: Optional[int] = None + kv_channels: Optional[int] = None + init_method_std: float = 0.008 + + +@dataclass +class Nemotron4Config340B(NemotronConfig): + num_layers: int = 96 + seq_length: int = 4096 + hidden_size: int = 18432 + ffn_hidden_size: int = 73728 + num_attention_heads: int = 96 + num_query_groups: Optional[int] = 8 + kv_channels: Optional[int] = None + init_method_std: float = 0.0063 + + +class NemotronModel(GPTModel): + def __init__( + self, + config: Annotated[Optional[NemotronConfig], Config[NemotronConfig]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, + ): + super().__init__(config or NemotronConfig(), optim=optim, tokenizer=tokenizer, model_transform=model_transform) + + +@io.model_importer(NemotronModel, "hf") +class HFNemotronImporter(io.ModelConnector["NemotronForCausalLM", NemotronModel]): + def init(self) -> NemotronModel: + return NemotronModel(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + source = NemotronForCausalLM.from_pretrained(str(self)) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + print(f"Converted Nemotron model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, 
source, target): + mapping = { + "model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "model.layers.*.mlp.up_proj.weight": "decoder.layers.*.mlp.linear_fc1.weight", + "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.input_layernorm.bias": "decoder.layers.*.self_attention.linear_qkv.layer_norm_bias", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.layers.*.post_attention_layernorm.bias": "decoder.layers.*.mlp.linear_fc1.layer_norm_bias", + "model.norm.weight": "decoder.final_layernorm.weight", + "model.norm.bias": "decoder.final_layernorm.bias", + "lm_head.weight": "output_layer.weight", + } + + return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv]) + + @property + def tokenizer(self) -> "AutoTokenizer": + return AutoTokenizer(str(self)) + + @property + def config(self) -> NemotronConfig: + source = HFNemotronConfig.from_pretrained(str(self)) + + def make_vocab_size_divisible_by(vocab_size): + base = 128 + while vocab_size % base != 0: + base //= 2 + return base + + output = NemotronConfig( + num_layers=source.num_hidden_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.intermediate_size, + num_attention_heads=source.num_attention_heads, + init_method_std=source.initializer_range, + seq_length=source.max_position_embeddings, + layernorm_epsilon=source.norm_eps, + num_query_groups=source.num_key_value_heads, + rotary_base=source.rope_theta, + rotary_percent=source.partial_rotary_factor, + make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), + share_embeddings_and_output_weights=False, + ) + + return output + + +@io.model_exporter(NemotronModel, "hf") +class HFNemotronExporter(io.ModelConnector[NemotronModel, "NemotronForCausalLM"]): + def init(self) -> "NemotronForCausalLM": + return NemotronForCausalLM.from_config(self.config) + + def apply(self, output_path: Path) -> Path: + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + "decoder.layers.*.mlp.linear_fc1.weight": "model.layers.*.mlp.up_proj.weight", + "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_bias": "model.layers.*.input_layernorm.bias", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_bias": "model.layers.*.post_attention_layernorm.bias", + "decoder.final_layernorm.weight": "model.norm.weight", + "decoder.final_layernorm.bias": "model.norm.bias", + "output_layer.weight": "lm_head.weight", + } + + return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv]) + + @property + def tokenizer(self): + return 
io.load_context(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "HFNemotronConfig": + source: NemotronConfig = io.load_context(str(self)).model.config + + return HFNemotronConfig( + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + head_dim=( + source.kv_channels + if source.kv_channels is not None + else source.hidden_size // source.num_attention_heads + ), + tie_word_embeddings=source.share_embeddings_and_output_weights, + max_position_embeddings=source.seq_length, + initializer_range=source.init_method_std, + norm_eps=source.layernorm_epsilon, + num_key_value_heads=source.num_query_groups, + rope_theta=source.rotary_base, + partial_rotary_factor=source.rotary_percent, + vocab_size=self.tokenizer.vocab_size, + ) + + +@io.state_transform( + source_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.weight", +) +def _import_qkv(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape + assert qkv_weights.shape[1] == head_size, qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + + return qkv_weights + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), +) +def _export_qkv(ctx: io.TransformCTX, linear_qkv): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_proj = linear_qkv[q_slice].reshape(-1, 
hidden_size).cpu() + k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu() + v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu() + + return q_proj, k_proj, v_proj + + +__all__ = [ + "NemotronConfig", + "Nemotron3Config4B", + "Nemotron3Config8B", + "Nemotron4Config15B", + "Nemotron4Config22B", + "Nemotron4Config340B", + "NemotronModel", +] From d323022e8e37f023293cc5403bc47e526b8a6af5 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Thu, 22 Aug 2024 09:53:24 -0700 Subject: [PATCH 037/664] akoumparouli/add_check_param_hashes_across_dp_replicas (#9811) * Riva and k2 ASR WFST decoding (2) (#9391) * upload Signed-off-by: Aleksandr Laptev * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add comments and use case Signed-off-by: Aleksandr Laptev * Apply isort and black reformatting Signed-off-by: GNroy * add initial doc Signed-off-by: Aleksandr Laptev * fix doc and k2+cuda eval Signed-off-by: Aleksandr Laptev * isolate decoder components installation and fix suggestions Signed-off-by: Aleksandr Laptev * Apply isort and black reformatting Signed-off-by: GNroy * fix trailing newline Signed-off-by: Aleksandr Laptev --------- Signed-off-by: Aleksandr Laptev Signed-off-by: GNroy Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: GNroy Co-authored-by: Vladimir Bataev Signed-off-by: Alexandros Koumparoulis * Add DdpParamParityChecker Callback Signed-off-by: Alexandros Koumparoulis * Improve messaging Signed-off-by: Alexandros Koumparoulis * Rename to DdpParityChecker Signed-off-by: Alexandros Koumparoulis * Add ddp test Signed-off-by: Alexandros Koumparoulis * rename to ddp_parity_checker Signed-off-by: Alexandros Koumparoulis * remove red. 
imports Signed-off-by: Alexandros Koumparoulis * test fix Signed-off-by: Alexandros Koumparoulis * missign import Signed-off-by: Alexandros Koumparoulis * ignore test Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa Signed-off-by: Alexandros Koumparoulis * add missing import Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa Signed-off-by: Alexandros Koumparoulis * another missing import Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa Signed-off-by: Alexandros Koumparoulis * make limit_val_batches int Signed-off-by: Alexandros Koumparoulis * remove dup file Signed-off-by: Alexandros Koumparoulis * AG groups decisions on DDP parity Signed-off-by: Alexandros Koumparoulis * fix test Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Exclude from pytest Signed-off-by: Alexandros Koumparoulis * Add L2_NeMo_2_GPT_DDP_Param_Parity_check to NeMo_CICD_Test.needs Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Aleksandr Laptev Signed-off-by: GNroy Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: Aleksandr Laptev Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: GNroy Co-authored-by: Vladimir Bataev Co-authored-by: akoumpa --- .github/workflows/cicd-main.yml | 17 +++ nemo/lightning/pytorch/callbacks/__init__.py | 3 +- .../pytorch/callbacks/ddp_parity_checker.py | 74 ++++++++++ tests/lightning/test_ddp_parity_checker.py | 129 ++++++++++++++++++ 4 files changed, 222 insertions(+), 1 deletion(-) create mode 100644 nemo/lightning/pytorch/callbacks/ddp_parity_checker.py create mode 100644 tests/lightning/test_ddp_parity_checker.py diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 3fc2b1a127e7..a086a493f683 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4753,6 +4753,22 @@ jobs: rm -rf examples/llm/gpt_pretrain_results rm -rf examples/llm/gpt_index_mappings + L2_NeMo_2_GPT_DDP_Param_Parity_check: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/lightning/test_ddp_parity_checker.py \ + --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document + + AFTER_SCRIPT: | + rm -rf examples/llm/gpt_pretrain_results + rm -rf examples/llm/gpt_index_mappings + Nemo_CICD_Test: needs: - gpu-test @@ -4859,6 +4875,7 @@ jobs: - Speech_Checkpoints_tests #- OPTIONAL_L2_Stable_Diffusion_Training - L2_NeMo_2_GPT_Pretraining_no_transformer_engine + - L2_NeMo_2_GPT_DDP_Param_Parity_check if: always() runs-on: ubuntu-latest steps: diff --git a/nemo/lightning/pytorch/callbacks/__init__.py b/nemo/lightning/pytorch/callbacks/__init__.py index 00637c9d57d4..5b3113dea885 100644 --- a/nemo/lightning/pytorch/callbacks/__init__.py +++ b/nemo/lightning/pytorch/callbacks/__init__.py @@ -1,3 +1,4 @@ +from nemo.lightning.pytorch.callbacks.ddp_parity_checker import DdpParityChecker from nemo.lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform from 
nemo.lightning.pytorch.callbacks.nsys import NsysCallback @@ -6,7 +7,6 @@ from nemo.lightning.pytorch.callbacks.progress_bar import MegatronProgressBar from nemo.lightning.pytorch.callbacks.progress_printer import ProgressPrinter - __all__ = [ "ModelCheckpoint", "ModelTransform", @@ -15,4 +15,5 @@ "MegatronProgressBar", "ProgressPrinter", "PreemptionCallback", + "DdpParityChecker", ] diff --git a/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py b/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py new file mode 100644 index 000000000000..b5c2127433d7 --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py @@ -0,0 +1,74 @@ +from functools import cache + +import torch +from megatron.core.utils import check_param_hashes_across_dp_replicas +from pytorch_lightning.callbacks.callback import Callback + +from nemo.lightning import io +from nemo.utils import logging + + +@cache +def pl_has_dist_opt_with_overlap(trainer): + optim_config = getattr(getattr(trainer.strategy.model, 'optim', None), 'config', None) + if not getattr(optim_config, 'use_distributed_optimizer', False): + return False + if not getattr(optim_config, 'overlap_param_gather', False): + return False + return True + + +def pl_check_param_hashes_across_dp_replicas(trainer): + if pl_has_dist_opt_with_overlap(trainer): + for opt in trainer.optimizers: + opt.disable_pre_hook() + import megatron.core.parallel_state as mp + + res = check_param_hashes_across_dp_replicas([trainer.strategy.model]) + torch.distributed.barrier() + + all_res = [False for _ in range(mp.get_data_parallel_world_size())] + + torch.distributed.all_gather_object(all_res, res, group=mp.get_data_parallel_group_gloo()) + + if pl_has_dist_opt_with_overlap(trainer): + for opt in trainer.optimizers: + opt.enable_pre_hook() + return all(all_res) + + +class DdpParityChecker(Callback, io.IOMixin): + """ + This callback enables weight parity checking across DDP replicas with Mcore models. + + Users can specify the desired interval for weight checks via the `interval` parameter. + + Args: + interval (int): How frequently (in training steps) to check parameter parity across DDP replicas. + + Example: + >>> callback = DdpParityChecker(interval=10) + >>> trainer = Trainer(callbacks=[callback]) + """ + + def __init__(self, interval: int = 0): + """ + interval (int): How frequently to check DDP weights for parity. Must be greater than 0. + """ + assert interval > 0, "Expected interval to be > 0. A zero interval makes DdpParityChecker a no-op." 
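+        # NOTE: `interval` must be positive; the default interval=0 would fail the assertion above, so enable the callback as e.g. DdpParityChecker(interval=10).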
+ self.interval = interval + self.step = 0 + + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, unused=0) -> None: + if self.step == self.interval - 1: + if pl_check_param_hashes_across_dp_replicas(trainer): + logging.info(f"DDP Param parity check passed for batch-id= {batch_idx}") + else: + trainer.should_stop = True + trainer.limit_val_batches = 0 + logging.info(f"DDP Param parity check FAILED for batch-id= {batch_idx}") + self.step = (self.step + 1) % self.interval + + def on_train_end(self, trainer, pl_module) -> None: + pl_check_param_hashes_across_dp_replicas(trainer) + logging.info("DDP Param parity check passed at end of training.") diff --git a/tests/lightning/test_ddp_parity_checker.py b/tests/lightning/test_ddp_parity_checker.py new file mode 100644 index 000000000000..7d180ba17dfe --- /dev/null +++ b/tests/lightning/test_ddp_parity_checker.py @@ -0,0 +1,129 @@ +import argparse +import os + +import pytest +import torch +from megatron.core.optimizer import OptimizerConfig + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.gpt.data import PreTrainingDataModule +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.lightning.pytorch.callbacks import DdpParityChecker + + +def make_parser(): + parser = argparse.ArgumentParser(description='Train a small GPT model using NeMo 2.0') + parser.add_argument('--data-path', type=str, help="Path to data file") + parser.add_argument('--vocab-path', type=str, help="Path to vocab file") + parser.add_argument('--merges-path', type=str, help="Path to merges file") + + return parser + + +def wrap_config(config, trainer): + class ConfigWrapper(type(config)): + def configure_model(self, tokenizer) -> "MCoreGPTModel": + return make_byzantine_model_wrapper(super().configure_model(tokenizer), trainer) + + config.__class__ = ConfigWrapper + return config + + +def make_byzantine_model_wrapper(model, trainer): + class ByzantineModel(type(model)): + def forward(self, *ans, **kwargs): + ans = super().forward(*ans, **kwargs) + with torch.no_grad(): + import random + + rank = int(os.environ['LOCAL_RANK']) + if rank != 1: + return ans + for opt in trainer.strategy.model.optim._optimizers: + for g in opt.param_groups: + for param in g['params']: + param.fill_(random.uniform(0, 1)) + return ans + + model.__class__ = ByzantineModel + return model + + +@pytest.mark.skip(reason="tested with GH") +def test_failing(trainer, ddp_parity, optim, data, tokenizer): + config = llm.Llama2Config7B(num_layers=2) + config = wrap_config(config, trainer) + model = llm.LlamaModel(config, tokenizer=tokenizer, optim=optim) + trainer.fit(model, data) + + +@pytest.mark.skip(reason="tested with GH") +def test_working(trainer, ddp_parity, optim, data, tokenizer): + config = llm.Llama2Config7B(num_layers=2) + model = llm.LlamaModel(config, tokenizer=tokenizer, optim=optim) + trainer.fit(model, data) + + +def make_trainer_optim(args): + ddp_parity = DdpParityChecker(1) + trainer = nl.Trainer( + devices=2, + max_steps=4, + accelerator="gpu", + strategy=nl.MegatronStrategy( + ckpt_include_optimizer=False, + ), + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + limit_val_batches=1, + num_sanity_val_steps=0, + log_every_n_steps=1, + logger=None, + callbacks=[ddp_parity], + ) + + optim = nl.MegatronOptimizerModule( + config=OptimizerConfig( + optimizer="adam", + lr=1e-5, + use_distributed_optimizer=False, + fp16=False, + bf16=True, + params_dtype=torch.float32, + ), + ) + + 
tokenizer = get_nmt_tokenizer( + "megatron", + "GPT2BPETokenizer", + vocab_file=args.vocab_path, + merges_file=args.merges_path, + ) + data = PreTrainingDataModule( + paths=args.data_path, + seq_length=2048, + global_batch_size=32, + seed=1234, + tokenizer=tokenizer, + ) + + return trainer, ddp_parity, optim, data, tokenizer + + +@pytest.mark.skip(reason="tested with GH") +def main(): + args = make_parser().parse_args() + trainer, ddp_parity, optim, data, tokenizer = make_trainer_optim(args) + test_failing(trainer, ddp_parity, optim, data, tokenizer) + if trainer.should_stop != True: + raise ValueError("DDP parity checking failed.") + + try: + test_working(*make_trainer_optim(args)) + print("DDP parity checking worked as expected") + except: + raise + + +if __name__ == "__main__": + main() From 78f57fee1e9755c3cf57f264a59ad82cc09a1dd6 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Thu, 22 Aug 2024 10:06:02 -0700 Subject: [PATCH 038/664] force optimizer.param_groups to match mcore_optimizer.param_groups after restoring from a checkpoint (#10225) Signed-off-by: ashors1 --- nemo/core/optim/mcore_optim.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/nemo/core/optim/mcore_optim.py b/nemo/core/optim/mcore_optim.py index 9feb70cc90a1..c058da52a97a 100644 --- a/nemo/core/optim/mcore_optim.py +++ b/nemo/core/optim/mcore_optim.py @@ -35,8 +35,6 @@ class McoreDistributedOptimizer(torch.optim.Optimizer): def __init__(self, optim): self.defaults = {} self.mcore_optimizer = optim - self.param_groups = self.mcore_optimizer.param_groups - self.state = self.mcore_optimizer.state def zero_grad(self, set_to_none: bool = True): """We only need to zero the model related parameters, i.e., @@ -76,12 +74,39 @@ def step(self, closure): return loss + # Promote state so it can be retrieved or set via + # "optimizer_instance.state" + def _get_state(self): + if hasattr(self, 'mcore_optimizer'): + return self.mcore_optimizer.state + else: + return [] + + def _set_state(self, value): + self.mcore_optimizer.state = value + + state = property(_get_state, _set_state) + def save_parameter_state(self, filename: str): self.mcore_optimizer.save_parameter_state(filename) def load_parameter_state(self, filename: str): self.mcore_optimizer.load_parameter_state(filename) + # Promote param_groups so it can be retrieved or set via + # "optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + if hasattr(self, 'mcore_optimizer'): + return self.mcore_optimizer.param_groups + else: + return [] + + def _set_param_groups(self, value): + self.mcore_optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) + def finish_param_sync(self, model_index): self.mcore_optimizer.finish_param_sync(model_index) From 5269caf9c6feb60fd8cc60f20fcf46f291fe1654 Mon Sep 17 00:00:00 2001 From: meatybobby Date: Thu, 22 Aug 2024 10:28:31 -0700 Subject: [PATCH 039/664] Update TRTLLM 0.12 (#10215) * Update TRTLLM 0.12 * Add model config * Change config * Change deploy script * Apply isort and black reformatting Signed-off-by: meatybobby * Remove parameter --------- Signed-off-by: meatybobby Co-authored-by: meatybobby Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> --- nemo/export/tensorrt_llm.py | 8 ++++---- .../trt_llm/converter/model_converter.py | 18 ++++++++++-------- .../trt_llm/qnemo/qnemo_to_tensorrt_llm.py | 2 -- 
nemo/export/trt_llm/tensorrt_llm_build.py | 4 ---- nemo/export/trt_llm/tensorrt_llm_run.py | 9 +++++++++ scripts/deploy/nlp/deploy_triton.py | 5 +++-- 6 files changed, 26 insertions(+), 20 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 3c73da1c0731..2a89b76cc099 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -100,6 +100,7 @@ def __init__( use_python_runtime: bool = True, enable_chunked_context: bool = None, max_tokens_in_paged_kv_cache: int = None, + multi_block_mode: bool = False, ): """ Args: @@ -107,6 +108,7 @@ def __init__( lora_ckpt_list (List[str]): lora checkpoint paths. load_model (bool): load TensorRT-LLM model if the engine files exist in the model_dir. use_python_runtime (bool): whether to use python or c++ runtime. + multi_block_mode (bool): enable faster decoding in multihead attention. Required for long context. Only available when using c++ runtime """ if use_python_runtime: @@ -122,6 +124,7 @@ def __init__( self.use_python_runtime = use_python_runtime self.enable_chunked_context = enable_chunked_context if enable_chunked_context is not None else False self.max_tokens_in_paged_kv_cache = max_tokens_in_paged_kv_cache + self.multi_block_mode = multi_block_mode self.model = None self.tokenizer = None self.n_gpus = None @@ -157,7 +160,6 @@ def export( paged_context_fmha: bool = False, dtype: str = "bfloat16", load_model: bool = True, - enable_multi_block_mode: bool = False, use_lora_plugin: str = None, lora_target_modules: List[str] = None, max_lora_rank: int = 64, @@ -192,7 +194,6 @@ def export( remove_input_padding (bool): enables removing input padding or not. dtype (str): Floating point type for model weights (Supports BFloat16/Float16). load_model (bool): load TensorRT-LLM model after the export. - enable_multi_block_mode (bool): enable faster decoding in multihead attention. Required for long context. use_lora_plugin (str): use dynamic lora or not. lora_target_modules (List[str]): list of the target lora modules. max_lora_rank (int): maximum lora rank. 
@@ -288,7 +289,6 @@ def export( use_parallel_embedding=use_parallel_embedding, paged_kv_cache=paged_kv_cache, remove_input_padding=remove_input_padding, - enable_multi_block_mode=enable_multi_block_mode, use_lora_plugin=use_lora_plugin, lora_target_modules=lora_target_modules, max_lora_rank=max_lora_rank, @@ -340,7 +340,6 @@ def export( max_lora_rank=max_lora_rank, lora_target_modules=lora_target_modules, max_prompt_embedding_table_size=max_prompt_embedding_table_size, - enable_multi_block_mode=enable_multi_block_mode, paged_kv_cache=paged_kv_cache, remove_input_padding=remove_input_padding, paged_context_fmha=paged_context_fmha, @@ -960,6 +959,7 @@ def _load(self): use_python_runtime=self.use_python_runtime, enable_chunked_context=self.enable_chunked_context, max_tokens_in_paged_kv_cache=self.max_tokens_in_paged_kv_cache, + multi_block_mode=self.multi_block_mode, ) self._load_prompt_tables() except Exception as error: diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py index 60d50316e9ed..337a0a4e4e77 100755 --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -22,8 +22,6 @@ from tensorrt_llm._utils import pad_vocab_size from tensorrt_llm.functional import non_gated_version from tensorrt_llm.layers import MoeConfig -from tensorrt_llm.models.gpt.config import GPTConfig -from tensorrt_llm.models.llama.config import LLaMAConfig from tensorrt_llm.models.modeling_utils import PretrainedConfig from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import ( @@ -36,12 +34,16 @@ def get_config(decoder_type, config): - if decoder_type == "llama": - return LLaMAConfig(**config) - elif decoder_type == "gpt" or decoder_type == "gptnext": - return GPTConfig(**config) - else: - return PretrainedConfig(**config) + DECODER_CONFIG = { + "llama": tensorrt_llm.models.llama.config.LLaMAConfig, + "gpt": tensorrt_llm.models.gpt.config.GPTConfig, + "gptnext": tensorrt_llm.models.gpt.config.GPTConfig, + "falcon": tensorrt_llm.models.falcon.config.FalconConfig, + "gemma": tensorrt_llm.models.GemmaConfig, + } + config_cls = DECODER_CONFIG[decoder_type] if decoder_type in DECODER_CONFIG else PretrainedConfig + + return config_cls(**config) def prompt_convert(prompt_config, prompt_weights): diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py index 921c6535a57a..48127a507a58 100644 --- a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py +++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py @@ -36,7 +36,6 @@ def qnemo_to_tensorrt_llm( use_parallel_embedding: bool = False, paged_kv_cache: bool = True, remove_input_padding: bool = True, - enable_multi_block_mode: bool = False, use_lora_plugin: Optional[str] = None, lora_target_modules: Optional[List[str]] = None, max_lora_rank: int = 64, @@ -93,7 +92,6 @@ def qnemo_to_tensorrt_llm( build_cmd += f"--nccl_plugin {config.dtype} " build_cmd += f"--paged_kv_cache {'enable' if paged_kv_cache else 'disable'} " build_cmd += f"--remove_input_padding {'enable' if remove_input_padding else 'disable'} " - build_cmd += f"--multi_block_mode {'enable' if enable_multi_block_mode else 'disable'} " build_cmd += f"--multiple_profiles {'enable' if multiple_profiles else 'disable'} " if use_fused_mlp: diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 1544fdf032d8..e37c3ba1c845 100755 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ 
b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -41,11 +41,9 @@ def build_and_save_engine( max_lora_rank=64, lora_target_modules=None, max_prompt_embedding_table_size=0, - enable_multi_block_mode: bool = False, paged_kv_cache: bool = True, remove_input_padding: bool = True, paged_context_fmha: bool = False, - use_custom_all_reduce: bool = True, use_refit: bool = False, max_num_tokens: int = None, max_seq_len: int = None, @@ -66,8 +64,6 @@ def build_and_save_engine( plugin_config = PluginConfig() plugin_config.gpt_attention_plugin = gpt_attention_plugin plugin_config.gemm_plugin = gemm_plugin - plugin_config.set_nccl_plugin(use_custom_all_reduce=use_custom_all_reduce) - plugin_config.multi_block_mode = enable_multi_block_mode if paged_kv_cache: plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block) else: diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 14ad0be699bb..852eddc6a468 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -144,6 +144,7 @@ def _load( use_python_runtime: bool = True, enable_chunked_context: bool = False, max_tokens_in_paged_kv_cache: int = None, + multi_block_mode: bool = False, ): """The impl of `load` API for on a single GPU worker.""" try: @@ -164,6 +165,11 @@ def _load( runtime_rank = tensorrt_llm.mpi_rank() if use_python_runtime: + if enable_chunked_context: + logging.warning("enable_chunked_context is disabled when using python runtime") + if multi_block_mode: + logging.warning("multi_block_mode is disabled when using python runtime") + decoder = ModelRunner.from_dir( engine_dir=engine_dir, lora_dir=lora_ckpt_list, @@ -183,6 +189,7 @@ def _load( max_beam_width=max_beam_width, enable_chunked_context=enable_chunked_context, max_tokens_in_paged_kv_cache=max_tokens_in_paged_kv_cache, + multi_block_mode=multi_block_mode, debug_mode=False, ) @@ -296,6 +303,7 @@ def load( use_python_runtime: bool = True, enable_chunked_context: bool = False, max_tokens_in_paged_kv_cache: int = None, + multi_block_mode: bool = False, ) -> TensorrtLLMHostContext: """Loaded the compiled LLM model and run it. @@ -315,6 +323,7 @@ def load( use_python_runtime, enable_chunked_context, max_tokens_in_paged_kv_cache, + multi_block_mode, ) executor = None elif tensorrt_llm.mpi_world_size() > 1: diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index c0acd97e1b50..0ec6264d6bf0 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -128,7 +128,8 @@ def get_args(argv): default=False, action='store_true', help='Split long kv sequence into multiple blocks (applied to generation MHA kernels). \ - It is beneifical when batchxnum_heads cannot fully utilize GPU.', + It is beneifical when batchxnum_heads cannot fully utilize GPU. \ + Only available when using c++ runtime.', ) parser.add_argument( "-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences." 
@@ -274,6 +275,7 @@ def get_trtllm_deployable(args): lora_ckpt_list=args.lora_ckpt, load_model=(args.nemo_checkpoint is None), use_python_runtime=(not args.use_cpp_runtime), + multi_block_mode=args.multi_block_mode, ) if args.nemo_checkpoint is not None: @@ -296,7 +298,6 @@ def get_trtllm_deployable(args): paged_kv_cache=(not args.no_paged_kv_cache), remove_input_padding=(not args.disable_remove_input_padding), dtype=args.dtype, - enable_multi_block_mode=args.multi_block_mode, use_lora_plugin=args.use_lora_plugin, lora_target_modules=args.lora_target_modules, max_lora_rank=args.max_lora_rank, From 42c2910f45eff2aa9d667372d0daff24b69422fd Mon Sep 17 00:00:00 2001 From: anteju <108555623+anteju@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:22:17 -0700 Subject: [PATCH 040/664] Tutorial: audio codec inference (#10186) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ante Jukić --- tutorials/tts/Audio_Codec_Inference.ipynb | 478 ++++++++++++++++++++++ 1 file changed, 478 insertions(+) create mode 100644 tutorials/tts/Audio_Codec_Inference.ipynb diff --git a/tutorials/tts/Audio_Codec_Inference.ipynb b/tutorials/tts/Audio_Codec_Inference.ipynb new file mode 100644 index 000000000000..8eff02916737 --- /dev/null +++ b/tutorials/tts/Audio_Codec_Inference.ipynb @@ -0,0 +1,478 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "7X-TwhdTGmlc" + }, + "source": [ + "# License" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fCQUeZRPGnoe" + }, + "source": [ + "> Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n", + ">\n", + "> Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n", + ">\n", + "> http://www.apache.org/licenses/LICENSE-2.0\n", + ">\n", + "> Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rtBDkKqVGZJ8" + }, + "source": [ + "# Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pZ2QSsXuGbMe" + }, + "source": [ + "In this tutorial we show how to use NeMo **neural audio codecs** at inference time. 
To learn more about training and finetuning neural audio codecs in NeMo, check the [Audio Codec Training tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/Audio_Codec_Training.ipynb).\n", + "\n", + "An audio codec typically consists of an encoder, a quantizer and a decoder, with a typical architecture depicted in the figure below.\n", + "An audio codec can be used to encode an input audio signal into a sequence of discrete values.\n", + "In this tutorial, the discrete values will be referred to as **audio tokens**.\n", + "The obtained audio tokens can be decoded into an output audio signal.\n", + "\n", + "Audio tokens can be used to represent the input audio for an automatic speech recognition (ASR) model [[1](https://arxiv.org/abs/2309.10922), [2](https://arxiv.org/pdf/2407.03495)], or to represent the output audio of a text-to-speech (TTS) system [[3](https://arxiv.org/abs/2406.05298), [4](https://arxiv.org/pdf/2406.17957)].\n", + "\n", + "NeMo provides several neural audio codec models, including audio codecs and mel codecs at different sampling rates.\n", + "The list of the available models can be found [here](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/tts/checkpoints.html#codec-models).\n", + "\n", + "
\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3OZassNG5xff" + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WZvQvPkIhRi3" + }, + "outputs": [], + "source": [ + "BRANCH = 'main'\n", + "# Install NeMo library. If you are running locally (rather than on Google Colab), follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", + "\n", + "if 'google.colab' in str(get_ipython()):\n", + " !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "v8NGOM0EzK8W" + }, + "outputs": [], + "source": [ + "import math\n", + "import wget\n", + "import os\n", + "import librosa\n", + "import torch\n", + "import numpy as np\n", + "import IPython.display as ipd\n", + "import matplotlib.pyplot as plt\n", + "from pathlib import Path\n", + "\n", + "\n", + "# Utility for displaying signals and metrics\n", + "def show_signal(signal: np.ndarray, sample_rate: int = 16000, tag: str = 'Signal'):\n", + " \"\"\"Show the time-domain signal and its spectrogram.\n", + " \"\"\"\n", + " fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 2.5))\n", + "\n", + " # show waveform\n", + " t = np.arange(0, len(signal)) / sample_rate\n", + "\n", + " ax[0].plot(t, signal)\n", + " ax[0].set_xlim(0, t.max())\n", + " ax[0].grid()\n", + " ax[0].set_xlabel('time / s')\n", + " ax[0].set_ylabel('amplitude')\n", + " ax[0].set_title(tag)\n", + "\n", + " n_fft = 1024\n", + " hop_length = 256\n", + "\n", + " D = librosa.amplitude_to_db(np.abs(librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)), ref=np.max)\n", + " img = librosa.display.specshow(D, y_axis='linear', x_axis='time', sr=sample_rate, n_fft=n_fft, hop_length=hop_length, ax=ax[1])\n", + " ax[1].set_title(tag)\n", + "\n", + " plt.tight_layout()\n", + " plt.colorbar(img, format=\"%+2.f dB\", ax=ax)\n", + "\n", + "\n", + "# Utility for displaying a latent representation\n", + "def show_latent(latent: np.ndarray, tag: str):\n", + " plt.figure(figsize = (16, 3))\n", + " img = plt.imshow(latent, aspect='equal')\n", + " plt.colorbar(img, ax=plt.gca())\n", + " plt.title(tag)\n", + " plt.xlabel('Time frame')\n", + " plt.ylabel('Latent vector index')\n", + " plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8ZKDMTwsEY1K" + }, + "outputs": [], + "source": [ + "# Working directory\n", + "ROOT_DIR = Path().absolute() / 'codec_tutorial'\n", + "\n", + "# Create dataset directory\n", + "DATA_DIR = ROOT_DIR / 'data'\n", + "DATA_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", + "audio_path = DATA_DIR / 'LJ023-0089.wav'\n", + "audio_url = \"https://multilangaudiosamples.s3.us-east-2.amazonaws.com/LJ023-0089.wav\"\n", + "\n", + "if not os.path.exists(audio_path):\n", + " wget.download(audio_url, audio_path.as_posix())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KAbH7N427FdT" + }, + "source": [ + "# Load a model from NGC" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ODgdGgsAAUku" + }, + "source": [ + "Any of the [pretrained checkpoints](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/tts/checkpoints.html#codec-models) could be used for inference.\n", + "Here, we use `mel_codec_22khz_fullband_medium`, which works for 22.05 kHz audio signals.\n", + "\n", + "The model can be easily restored from NGC:" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "id": "XqAYWR65aKTx" + }, + "outputs": [], + "source": [ + "from nemo.collections.tts.models.audio_codec import AudioCodecModel\n", + "\n", + "# Optionally specify a pretrained model to fine-tune from. To train from scratch, set this to 'None'.\n", + "model_name = 'mel_codec_22khz_fullband_medium'\n", + "codec_model = AudioCodecModel.from_pretrained(model_name)\n", + "codec_model.freeze()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZnnjL28pEY1L" + }, + "source": [ + "Show information about the loaded model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4xsfeHVyEY1L" + }, + "outputs": [], + "source": [ + "print(f'Loaded model from NeMo:')\n", + "print(f'\\tmodel name : {model_name}')\n", + "print(f'\\tsample rate : {codec_model.sample_rate} Hz')\n", + "print(f'\\tlatent dimension : {codec_model.vector_quantizer.codebook_dim}')\n", + "\n", + "print('\\n\\nModel summary:')\n", + "print(codec_model.summarize())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fM4QPsLTnzK7" + }, + "source": [ + "# Inference" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tkZC6Dl7KRl6" + }, + "source": [ + "## Processing audio\n", + "\n", + "Here we use the codec model to process the input audio by applying the complete model. The input signal is encoded, quantized, dequantized and decoded. Finally, a reconstructed signal is obtained." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sYzvAYr2vo1K" + }, + "outputs": [], + "source": [ + "input_audio, sr = librosa.load(audio_path, sr=codec_model.sample_rate)\n", + "\n", + "# Shape (batch, time)\n", + "input_audio_tensor = torch.from_numpy(input_audio).unsqueeze(dim=0).to(codec_model.device)\n", + "\n", + "# Shape (batch,)\n", + "input_audio_len = torch.tensor([input_audio_tensor.size(-1)]).to(codec_model.device)\n", + "\n", + "# Process audio using the codec model\n", + "output_audio_tensor, _ = codec_model(audio=input_audio_tensor, audio_len=input_audio_len)\n", + "\n", + "# Output audio\n", + "output_audio = output_audio_tensor.squeeze().cpu().numpy()\n", + "\n", + "# Show signals\n", + "show_signal(input_audio, tag='Input audio', sample_rate=codec_model.sample_rate)\n", + "show_signal(output_audio, tag='Output audio', sample_rate=codec_model.sample_rate)\n", + "\n", + "# Play audio\n", + "print('Input audio')\n", + "ipd.display(ipd.Audio(input_audio, rate=codec_model.sample_rate))\n", + "\n", + "print('Output audio')\n", + "ipd.display(ipd.Audio(output_audio, rate=codec_model.sample_rate))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rynZYwg2VP5d" + }, + "source": [ + "## Audio tokens\n", + "\n", + "Audio tokens can be easily computed by using the `encode` method of the `AudioCodec` model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ASKM_jKVEY1L" + }, + "outputs": [], + "source": [ + "# Convert audio to tokens\n", + "tokens, tokens_len = codec_model.encode(audio=input_audio_tensor, audio_len=input_audio_len)\n", + "\n", + "print('tokens information:')\n", + "print(f'\\tshape (batch, codebook, time frame) : {tokens.size()}')\n", + "print(f'\\tdtype : {tokens.dtype}')\n", + "print(f'\\tmin : {tokens.min()}')\n", + "print(f'\\tmax : {tokens.max()}')\n", + "\n", + "# Number of codebooks should match the number of codebooks/groups\n", + "if hasattr(codec_model.vector_quantizer, 'num_groups'):\n", + " # Group FSQ\n", + " assert tokens.size(1) == codec_model.vector_quantizer.num_groups\n", + " print(f'\\tnum_groups : {tokens.size(1)}')\n", + "elif hasattr(codec_model.vector_quantizer, 'codebooks'):\n", + " # RVQ\n", + " assert tokens.size(1) == len(codec_model.vector_quantizer.codebooks)\n", + " print(f'\\tnum_codebooks : {tokens.size(1)}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CmliPMnDEY1L" + }, + "source": [ + "Similarly, audio can be easily reconstructed from audio tokens using the `decode` method of the `AudioCodec` models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RTQ1M9PMEY1L" + }, + "outputs": [], + "source": [ + "# Convert tokens back to audio\n", + "output_audio_from_tokens_tensor, _ = codec_model.decode(tokens=tokens, tokens_len=tokens_len)\n", + "output_audio_from_tokens = output_audio_from_tokens_tensor.squeeze().cpu().numpy()\n", + "\n", + "# Show signals\n", + "show_signal(output_audio_from_tokens, tag='Output audio from tokens', sample_rate=codec_model.sample_rate)\n", + "show_signal(output_audio_from_tokens - output_audio, tag='Difference compared to forward pass', sample_rate=codec_model.sample_rate)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kGqotZkqEY1M" + }, + "source": [ + "## Latent representation\n", + "\n", + "Continuous (non-discrete) latent representation at the output of the encoder can be easily computed using the `encode_audio` method of the `AudioCodec` model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "r-89-gG3EY1M" + }, + "outputs": [], + "source": [ + "# Convert audio to the encoded representation\n", + "encoded, encoded_len = codec_model.encode_audio(audio=input_audio_tensor, audio_len=input_audio_len)\n", + "\n", + "print('encoded information:')\n", + "print(f'\\tshape (batch, codebook, time frame) : {encoded.size()}')\n", + "print(f'\\tdtype : {encoded.dtype}')\n", + "print(f'\\tmin : {encoded.min()}')\n", + "print(f'\\tmax : {encoded.max()}')\n", + "\n", + "\n", + "# Show the encoded representation\n", + "show_latent(encoded.squeeze().cpu().numpy(), tag='Encoder output')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3Ory1U1uEY1M" + }, + "source": [ + "The encoded representation can be easily converted to tokens, dequantized into a continuous latent representation and decoded back to audio." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "btmqUWNkEY1M" + }, + "outputs": [], + "source": [ + "# Encoder output to tokens\n", + "tokens = codec_model.quantize(encoded=encoded, encoded_len=encoded_len)\n", + "\n", + "# Tokens back to a continuous vector\n", + "dequantized = codec_model.dequantize(tokens=tokens, tokens_len=encoded_len)\n", + "\n", + "# Reconstruct audio\n", + "output_audio_from_latent_tensor, _ = codec_model.decode_audio(inputs=dequantized, input_len=encoded_len)\n", + "output_audio_from_latent = output_audio_from_latent_tensor.squeeze().cpu().numpy()\n", + "\n", + "# Show dequantized latent representation\n", + "show_latent(dequantized.squeeze().cpu().numpy(), tag='Decoder input')\n", + "\n", + "# Show signals\n", + "show_signal(output_audio_from_latent, tag='Output audio from latent', sample_rate=codec_model.sample_rate)\n", + "show_signal(output_audio_from_latent - output_audio, tag='Difference compared to forward pass', sample_rate=codec_model.sample_rate)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cMvU0WxlEY1M" + }, + "source": [ + "# Related information" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_LtyHHuLkNDv" + }, + "source": [ + "To learn more about audio codec models in NeMo, look at our [documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/tts/models.html#codecs).\n", + "\n", + "For more information on training and finetuning neural audio codecs in NeMo, check the [Audio Codec Training tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/Audio_Codec_Training.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LeqV3VvJVOb-" + }, + "source": [ + "# References" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Rvu4w2x_3RSY" + }, + "source": [ + "1. [Discrete Audio Representation as an Alternative to Mel-Spectrograms for Speaker and Speech Recognition](https://arxiv.org/abs/2309.10922)\n", + "2. [Codec-ASR: Training Performant Automatic Speech Recognition Systems with Discrete Speech Representations](https://arxiv.org/pdf/2407.03495)\n", + "3. [Spectral Codecs: Spectrogram-Based Audio Codecs for High Quality Speech Synthesis](https://arxiv.org/abs/2406.05298)\n", + "4. 
[Improving Robustness of LLM-based Speech Synthesis by Learning Monotonic Alignment](https://arxiv.org/pdf/2406.17957)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "colab": { + "provenance": [], + "toc_visible": true + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From 753c70e5c5cfc6acb7ecfb416374aed59d5d233c Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Thu, 22 Aug 2024 20:11:30 -0700 Subject: [PATCH 041/664] Move trt imports in nemo.collections.llm inside respective functions (#10234) Signed-off-by: Hemil Desai --- nemo/collections/llm/api.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 46d94d26b03b..8bead26e653e 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -8,25 +8,10 @@ from typing_extensions import Annotated from nemo.collections.llm.utils import Config, task -from nemo.deploy import DeployPyTriton from nemo.lightning import AutoResume, NeMoLogger, OptimizerModule, Trainer, io from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform from nemo.utils import logging -trt_llm_supported = True -try: - from nemo.export.tensorrt_llm import TensorRTLLM -except ImportError as error: - logging.warning(f"TensorRTLLM could not be imported from nemo.export: {error}") - trt_llm_supported = False - -uvicorn_supported = True -try: - import uvicorn -except ImportError as error: - logging.warning(f"uvicorn could not be imported: {error}") - uvicorn_supported = False - TokenizerType = Any @@ -253,6 +238,8 @@ def get_trtllm_deployable( max_batch_size, dtype, ): + from nemo.export.tensorrt_llm import TensorRTLLM + if triton_model_repository is None: trt_llm_path = "/tmp/trt_llm_model_dir/" Path(trt_llm_path).mkdir(parents=True, exist_ok=True) @@ -274,8 +261,6 @@ def get_trtllm_deployable( if nemo_checkpoint is not None and model_type is None: raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") - if not trt_llm_supported: - raise ValueError("TensorRT-LLM engine is not supported in this environment.") trt_llm_exporter = TensorRTLLM( model_dir=trt_llm_path, load_model=(nemo_checkpoint is None), @@ -334,6 +319,8 @@ def deploy( rest_service_port: int = 8000, openai_format_response: bool = False, ): + from nemo.deploy import DeployPyTriton + if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") @@ -370,6 +357,13 @@ def deploy( logging.error("Error message has occurred during deploy function. 
Error message: " + str(error)) return + uvicorn_supported = True + try: + import uvicorn + except ImportError as error: + logging.warning(f"uvicorn could not be imported: {error}") + uvicorn_supported = False + try: logging.info("Model serving on Triton is will be started.") if start_rest_service and uvicorn_supported: From d4d6a5b25e94bea7d1e0b7033e4cdf3b92a8ab77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Fri, 23 Aug 2024 09:56:30 -0400 Subject: [PATCH 042/664] Add tests for LazyNeMoIterator and fix case with metadata_only=True and offsets in manifest (#10198) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add tests for LazyNeMoIterator and fix case with manifest_only=True and offsets in manifest Signed-off-by: Piotr Żelasko * Address code review Signed-off-by: Piotr Żelasko * fix tests Signed-off-by: Piotr Żelasko * fix tests Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko --- .../common/data/lhotse/nemo_adapters.py | 55 +++-- .../common/test_lhotse_nemo_adapters.py | 188 ++++++++++++++++++ 2 files changed, 228 insertions(+), 15 deletions(-) create mode 100644 tests/collections/common/test_lhotse_nemo_adapters.py diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index 2a4b71a18880..3c5ced5d4018 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -24,7 +24,7 @@ import lhotse.serialization import soundfile from cytoolz import groupby -from lhotse import AudioSource, Recording, SupervisionSegment +from lhotse import AudioSource, MonoCut, Recording, SupervisionSegment from lhotse.audio.backend import LibsndfileBackend from lhotse.cut import Cut from lhotse.dataset.dataloading import resolve_seed @@ -112,11 +112,9 @@ def __iter__(self) -> Generator[Cut, None, None]: audio_path = get_full_path(str(data.pop("audio_filepath")), str(self.path)) duration = data.pop("duration") offset = data.pop("offset", None) - recording = self._create_recording(audio_path, duration, data.pop("sampling_rate", None)) - cut = recording.to_cut() - if offset is not None: - cut = cut.truncate(offset=offset, duration=duration, preserve_id=True) - cut.id = f"{cut.id}-{round(offset * 1e2):06d}-{round(duration * 1e2):06d}" + cut = self._create_cut( + audio_path=audio_path, offset=offset, duration=duration, sampling_rate=data.pop("sampling_rate", None) + ) # Note that start=0 and not start=offset because supervision's start if relative to the # start of the cut; and cut.start is already set to offset cut.supervisions.append( @@ -140,6 +138,42 @@ def __len__(self) -> int: def __add__(self, other): return LazyIteratorChain(self, other) + def _create_cut( + self, + audio_path: str, + offset: float, + duration: float, + sampling_rate: int | None = None, + ) -> Cut: + if not self.metadata_only: + recording = self._create_recording(audio_path, duration, sampling_rate) + cut = recording.to_cut() + if offset is not None: + cut = cut.truncate(offset=offset, duration=duration, preserve_id=True) + cut.id = f"{cut.id}-{round(offset * 1e2):06d}-{round(duration * 1e2):06d}" + else: + # Only metadata requested. + # We'll provide accurate metadata for Cut but inaccurate metadata for Recording to avoid + # incurring IO penalty (note that Lhotse manifests contain more information than + # NeMo manifests, so for actual dataloading we have to fill it using the audio file). 
+ sr = ifnone(sampling_rate, 16000) # fake sampling rate + offset = ifnone(offset, 0.0) + cut = MonoCut( + id=audio_path, + start=offset, + duration=duration, + channel=0, + supervisions=[], + recording=Recording( + id=audio_path, + sources=[AudioSource(type="dummy", channels=[0], source="")], + sampling_rate=sr, + duration=offset + duration, + num_samples=compute_num_samples(offset + duration, sr), + ), + ) + return cut + def _create_recording( self, audio_path: str, @@ -156,15 +190,6 @@ def _create_recording( duration=duration, channel_ids=[0], ) - elif self.metadata_only: - return Recording( - id=audio_path, - sources=[AudioSource(type="file", channels=[0], source=audio_path)], - sampling_rate=-1, - num_samples=-1, - duration=duration, - channel_ids=[0], - ) else: return Recording.from_file(audio_path) diff --git a/tests/collections/common/test_lhotse_nemo_adapters.py b/tests/collections/common/test_lhotse_nemo_adapters.py new file mode 100644 index 000000000000..a76116b10dd7 --- /dev/null +++ b/tests/collections/common/test_lhotse_nemo_adapters.py @@ -0,0 +1,188 @@ +import numpy as np +import pytest +from lhotse import AudioSource, CutSet, MonoCut, Recording, SupervisionSegment +from lhotse.serialization import save_to_jsonl +from lhotse.testing.dummies import DummyManifest + +from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator + + +@pytest.fixture +def nemo_manifest_path(tmp_path_factory): + """2 utterances of length 1s as a NeMo manifest.""" + tmpdir = tmp_path_factory.mktemp("nemo_data") + cuts = DummyManifest(CutSet, begin_id=0, end_id=2, with_data=True).save_audios(tmpdir, progress_bar=False) + nemo = [] + for c in cuts: + nemo.append( + { + "audio_filepath": c.recording.sources[0].source, + "text": "irrelevant", + "duration": c.duration, + "lang": "en", + } + ) + p = tmpdir / "nemo_manifest.json" + save_to_jsonl(nemo, p) + return p + + +def test_lazy_nemo_iterator(nemo_manifest_path): + cuts = CutSet(LazyNeMoIterator(nemo_manifest_path)) + + assert len(cuts) == 2 + + for c in cuts: + assert isinstance(c, MonoCut) + assert c.start == 0.0 + assert c.duration == 1.0 + assert c.num_channels == 1 + assert c.sampling_rate == 16000 + assert c.num_samples == 16000 + + assert c.has_recording + assert isinstance(c.recording, Recording) + assert c.recording.duration == 1.0 + assert c.recording.num_channels == 1 + assert c.recording.num_samples == 16000 + assert len(c.recording.sources) == 1 + assert isinstance(c.recording.sources[0], AudioSource) + assert c.recording.sources[0].type == "file" + + audio = c.load_audio() + assert isinstance(audio, np.ndarray) + assert audio.shape == (1, 16000) + assert audio.dtype == np.float32 + + assert len(c.supervisions) == 1 + s = c.supervisions[0] + assert isinstance(s, SupervisionSegment) + assert s.start == 0 + assert s.duration == 1 + assert s.channel == 0 + assert s.text == "irrelevant" + assert s.language == "en" + + +@pytest.fixture +def nemo_offset_manifest_path(tmp_path_factory): + """ + 4 utterances of length 0.5s as a NeMo manifest. + They are dervied from two audio files of 1s duration, so + two of them have offset 0 and the other two have offset 0.5. 
+ """ + tmpdir = tmp_path_factory.mktemp("nemo_data_offset") + cuts = ( + DummyManifest(CutSet, begin_id=0, end_id=2, with_data=True) + .save_audios(tmpdir, progress_bar=False) + .cut_into_windows(duration=0.5, hop=0.5) + ) + nemo = [] + for c in cuts: + nemo.append( + { + "audio_filepath": c.recording.sources[0].source, + "text": "irrelevant", + "offset": c.start, + "duration": c.duration, + "lang": "en", + } + ) + p = tmpdir / "nemo_manifest.json" + save_to_jsonl(nemo, p) + return p + + +def test_lazy_nemo_iterator_with_offset(nemo_offset_manifest_path): + cuts = CutSet(LazyNeMoIterator(nemo_offset_manifest_path)) + + assert len(cuts) == 4 + + for idx, c in enumerate(cuts): + # Note we originally had 1 cut per 1s audio file. + # Then we cut them into 0.5s cuts, so we have 4 cuts in total, + # 2 of them start at 0s and the other 2 start at 0.5s. + is_even = idx % 2 == 0 + + assert isinstance(c, MonoCut) + if is_even: + assert c.start == 0.0 + else: + assert c.start == 0.5 + assert c.duration == 0.5 + assert c.num_channels == 1 + assert c.sampling_rate == 16000 + assert c.num_samples == 8000 + + assert c.has_recording + assert isinstance(c.recording, Recording) + assert c.recording.duration == 1.0 + assert c.recording.num_channels == 1 + assert c.recording.num_samples == 16000 + assert len(c.recording.sources) == 1 + assert isinstance(c.recording.sources[0], AudioSource) + assert c.recording.sources[0].type == "file" + + audio = c.load_audio() + assert isinstance(audio, np.ndarray) + assert audio.shape == (1, 8000) + assert audio.dtype == np.float32 + + assert len(c.supervisions) == 1 + s = c.supervisions[0] + assert isinstance(s, SupervisionSegment) + assert s.start == 0 + assert s.duration == 0.5 + assert s.channel == 0 + assert s.text == "irrelevant" + assert s.language == "en" + + +def test_lazy_nemo_iterator_with_offset_metadata_only(nemo_offset_manifest_path): + cuts = CutSet(LazyNeMoIterator(nemo_offset_manifest_path, metadata_only=True)) + + assert len(cuts) == 4 + + for idx, c in enumerate(cuts): + # Note we originally had 1 cut per 1s audio file. + # Then we cut them into 0.5s cuts, so we have 4 cuts in total, + # 2 of them start at 0s and the other 2 start at 0.5s. + is_even = idx % 2 == 0 + + assert isinstance(c, MonoCut) + if is_even: + assert c.start == 0.0 + else: + assert c.start == 0.5 + assert c.duration == 0.5 + assert c.num_channels == 1 + assert c.sampling_rate == 16000 + assert c.num_samples == 8000 + + # With metadata_only=True we can't actually check what's in the Recording. + # The metadata for it may be incorrect (but is correct for the actual Cut), + # but we don't have to perform any I/O to read the file for info. 
+ assert c.has_recording + assert isinstance(c.recording, Recording) + if is_even: + assert c.recording.duration == 0.5 + assert c.recording.num_samples == 8000 + else: + assert c.recording.duration == 1.0 + assert c.recording.num_samples == 16000 + assert c.recording.num_channels == 1 + assert len(c.recording.sources) == 1 + assert isinstance(c.recording.sources[0], AudioSource) + assert c.recording.sources[0].type == "dummy" + + with pytest.raises(AssertionError): + c.load_audio() + + assert len(c.supervisions) == 1 + s = c.supervisions[0] + assert isinstance(s, SupervisionSegment) + assert s.start == 0 + assert s.duration == 0.5 + assert s.channel == 0 + assert s.text == "irrelevant" + assert s.language == "en" From 1c90b5e7b12816c48adef1c1d980ccc7708a1741 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Fri, 23 Aug 2024 07:14:01 -0700 Subject: [PATCH 043/664] [NeMo-UX] Fix a serialization bug that prevents users from moving checkpoints (#9939) * perfor serialization using relative paths to allow users to move checkpoints after they're saved Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * remove unused import Signed-off-by: ashors1 * fix artifact load Signed-off-by: ashors1 * fix path artifact Signed-off-by: ashors1 * remove unused import Signed-off-by: ashors1 --------- Signed-off-by: ashors1 Signed-off-by: ashors1 Co-authored-by: ashors1 --- nemo/lightning/io/api.py | 50 +------------ nemo/lightning/io/artifact/base.py | 2 +- nemo/lightning/io/artifact/file.py | 15 ++-- nemo/lightning/io/artifact/pickle.py | 8 +-- nemo/lightning/io/mixin.py | 103 +++++++++++++++++++++++---- 5 files changed, 103 insertions(+), 75 deletions(-) diff --git a/nemo/lightning/io/api.py b/nemo/lightning/io/api.py index 4d31f020c44a..4315b3211bf7 100644 --- a/nemo/lightning/io/api.py +++ b/nemo/lightning/io/api.py @@ -1,61 +1,13 @@ -import json from pathlib import Path -from pydoc import locate from typing import Any, Callable, Optional, Type, TypeVar import fiddle as fdl import pytorch_lightning as pl from fiddle._src.experimental import serialization -from nemo.lightning.io.mixin import ConnectorMixin, ConnT, ModelConnector, track_io +from nemo.lightning.io.mixin import ConnectorMixin, ConnT, ModelConnector, load from nemo.lightning.io.pl import TrainerContext -CkptType = TypeVar("CkptType") - - -def load(path: Path, output_type: Type[CkptType] = Any) -> CkptType: - """ - Loads a configuration from a pickle file and constructs an object of the specified type. - - Args: - path (Path): The path to the pickle file or directory containing 'io.pkl'. - output_type (Type[CkptType]): The type of the object to be constructed from the loaded data. - - Returns - ------- - CkptType: An instance of the specified type constructed from the loaded configuration. - - Raises - ------ - FileNotFoundError: If the specified file does not exist. 
- - Example: - loaded_model = load("/path/to/model", output_type=MyModel) - """ - del output_type # Just for type-hint - - _path = Path(path) - if hasattr(_path, 'is_dir') and _path.is_dir(): - _path = Path(_path) / "io.json" - elif hasattr(_path, 'isdir') and _path.isdir: - _path = Path(_path) / "io.json" - - if not _path.is_file(): - raise FileNotFoundError(f"No such file: '{_path}'") - - ## add IO functionality to custom objects present in the json file - with open(_path) as f: - j = json.load(f) - for obj, val in j["objects"].items(): - clss = ".".join([val["type"]["module"], val["type"]["name"]]) - if not serialization.find_node_traverser(locate(clss)): - track_io(locate(clss)) - - with open(_path, "rb") as f: - config = serialization.load_json(f.read()) - - return fdl.build(config) - def load_context(path: Path) -> TrainerContext: """ diff --git a/nemo/lightning/io/artifact/base.py b/nemo/lightning/io/artifact/base.py index 9119b2474b17..a997df42f843 100644 --- a/nemo/lightning/io/artifact/base.py +++ b/nemo/lightning/io/artifact/base.py @@ -11,7 +11,7 @@ def __init__(self, attr: str, required: bool = True): self.required = required @abstractmethod - def dump(self, value: ValueT, path: Path) -> ValueT: + def dump(self, value: ValueT, absolute_dir: Path, relative_dir: Path) -> ValueT: pass @abstractmethod diff --git a/nemo/lightning/io/artifact/file.py b/nemo/lightning/io/artifact/file.py index 0bd4f48dc17f..76bd0c6003a6 100644 --- a/nemo/lightning/io/artifact/file.py +++ b/nemo/lightning/io/artifact/file.py @@ -6,8 +6,8 @@ class PathArtifact(Artifact[Path]): - def dump(self, value: Path, path: Path) -> Path: - new_value = copy_file(value, path) + def dump(self, value: Path, absolute_dir: Path, relative_dir: Path) -> Path: + new_value = copy_file(value, absolute_dir, relative_dir) return new_value def load(self, path: Path) -> Path: @@ -15,15 +15,16 @@ def load(self, path: Path) -> Path: class FileArtifact(Artifact[str]): - def dump(self, value: str, path: Path) -> str: - new_value = copy_file(value, path) + def dump(self, value: str, absolute_dir: Path, relative_dir: Path) -> str: + new_value = copy_file(value, absolute_dir, relative_dir) return str(new_value) def load(self, path: str) -> str: return path -def copy_file(src: Union[Path, str], dst: Union[Path, str]): - output = Path(dst) / Path(src).name +def copy_file(src: Union[Path, str], path: Union[Path, str], relative_dst: Union[Path, str]): + relative_path = Path(relative_dst) / Path(src).name + output = Path(path) / relative_path shutil.copy2(src, output) - return output + return relative_path diff --git a/nemo/lightning/io/artifact/pickle.py b/nemo/lightning/io/artifact/pickle.py index 31ed7e36ac93..61a9c82237fc 100644 --- a/nemo/lightning/io/artifact/pickle.py +++ b/nemo/lightning/io/artifact/pickle.py @@ -7,12 +7,12 @@ class PickleArtifact(Artifact[Any]): - def dump(self, value: Any, path: Path) -> Path: - file = self.file_path(path) - with open(file, "wb") as f: + def dump(self, absolute_dir: Path, relative_dir: Path) -> Path: + relative_file = self.file_path(relative_dir) + with open(Path(absolute_dir) / relative_file, "wb") as f: dump(value, f) - return file + return relative_file def load(self, path: Path) -> Any: with open(self.file_path(path), "rb") as f: diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index d0d4d0243ff7..eff4cd9434ce 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -1,5 +1,6 @@ import functools import inspect +import json import shutil import threading 
import types @@ -7,11 +8,13 @@ from copy import deepcopy from dataclasses import is_dataclass from pathlib import Path +from pydoc import locate from typing import Any, Callable, Dict, List, Optional, Type, TypeVar, Union import fiddle as fdl import fiddle._src.experimental.dataclasses as fdl_dc -from cloudpickle import dump, load +from cloudpickle import dump +from cloudpickle import load as pickle_load from fiddle._src.experimental import serialization from typing_extensions import Self @@ -21,6 +24,7 @@ from nemo.lightning.io.fdl_torch import enable as _enable_ext ConnT = TypeVar('ConnT', bound=ModelConnector) +CkptType = TypeVar("CkptType") _enable_ext() @@ -136,21 +140,24 @@ def io_dump(self, output: Path): will be stored. """ output_path = Path(output) - artifacts_dir = output_path / "artifacts" + local_artifacts_dir = "artifacts" + artifacts_dir = output_path / local_artifacts_dir artifacts_dir.mkdir(parents=True, exist_ok=True) # Store artifacts directory in thread-local storage - _thread_local.artifacts_dir = artifacts_dir + _thread_local.local_artifacts_dir = local_artifacts_dir + _thread_local.output_path = output_path config_path = output_path / "io.json" with open(config_path, "w") as f: io = deepcopy(self.__io__) - _artifact_transform(io, artifacts_dir) + _artifact_transform_save(io, output_path, local_artifacts_dir) json = serialization.dump_json(io) f.write(json) # Clear thread-local storage after io_dump is complete - del _thread_local.artifacts_dir + del _thread_local.local_artifacts_dir + del _thread_local.output_path # Check if artifacts directory is empty and delete if so if not any(artifacts_dir.iterdir()): @@ -481,23 +488,28 @@ def _io_flatten_object(instance): try: serialization.dump_json(instance.__io__) except (serialization.UnserializableValueError, AttributeError) as e: - if not hasattr(_thread_local, "artifacts_dir"): + if not hasattr(_thread_local, "local_artifacts_dir") or not hasattr(_thread_local, "output_path"): raise e - artifact_dir = _thread_local.artifacts_dir - artifact_path = artifact_dir / f"{uuid.uuid4()}" + local_artifact_path = Path(_thread_local.local_artifacts_dir) / f"{uuid.uuid4()}" + output_path = _thread_local.output_path + artifact_path = output_path / local_artifact_path with open(artifact_path, "wb") as f: dump(getattr(instance, "__io__", instance), f) - return (str(artifact_path),), None + return (str(local_artifact_path),), None return instance.__io__.__flatten__() def _io_unflatten_object(values, metadata): + + assert hasattr(_thread_local, "output_dir") + output_dir = _thread_local.output_dir + if len(values) == 1: pickle_path = values[0] - with open(pickle_path, "rb") as f: - return load(f) + with open(Path(output_dir) / pickle_path, "rb") as f: + return pickle_load(f) return fdl.Config.__unflatten__(values, metadata) @@ -511,19 +523,82 @@ def _io_path_elements_fn(x): return x.__io__.__path_elements__() -def _artifact_transform(cfg: fdl.Config, output_path: Path): +def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "artifacts"): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): current_val = getattr(cfg, artifact.attr) if current_val is None: if artifact.required: raise ValueError(f"Artifact '{artifact.attr}' is required but not provided") continue - new_val = artifact.dump(current_val, output_path) + ## dump artifact and return the relative path + new_val = artifact.dump(current_val, output_path, relative_dir) setattr(cfg, artifact.attr, new_val) for attr in dir(cfg): try: if 
isinstance(getattr(cfg, attr), fdl.Config): - _artifact_transform(getattr(cfg, attr), output_path=output_path) + _artifact_transform_save(getattr(cfg, attr), output_path=output_path, relative_dir=relative_dir) except ValueError: pass + + +def _artifact_transform_load(cfg: fdl.Config, path: Path): + for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): + current_val = getattr(cfg, artifact.attr) + ## replace local path with absolute one + new_val = str(Path(path) / current_val) + setattr(cfg, artifact.attr, new_val) + + for attr in dir(cfg): + try: + if isinstance(getattr(cfg, attr), fdl.Config): + _artifact_transform_load(getattr(cfg, attr), path=path) + except ValueError: + pass + + +def load(path: Path, output_type: Type[CkptType] = Any) -> CkptType: + """ + Loads a configuration from a pickle file and constructs an object of the specified type. + + Args: + path (Path): The path to the pickle file or directory containing 'io.pkl'. + output_type (Type[CkptType]): The type of the object to be constructed from the loaded data. + + Returns + ------- + CkptType: An instance of the specified type constructed from the loaded configuration. + + Raises + ------ + FileNotFoundError: If the specified file does not exist. + + Example: + loaded_model = load("/path/to/model", output_type=MyModel) + """ + del output_type # Just for type-hint + + _path = Path(path) + _thread_local.output_dir = _path + + if hasattr(_path, 'is_dir') and _path.is_dir(): + _path = Path(_path) / "io.json" + elif hasattr(_path, 'isdir') and _path.isdir: + _path = Path(_path) / "io.json" + + if not _path.is_file(): + raise FileNotFoundError(f"No such file: '{_path}'") + + ## add IO functionality to custom objects present in the json file + with open(_path) as f: + j = json.load(f) + for obj, val in j["objects"].items(): + clss = ".".join([val["type"]["module"], val["type"]["name"]]) + if not serialization.find_node_traverser(locate(clss)): + track_io(locate(clss)) + + with open(_path, "rb") as f: + config = serialization.load_json(f.read()) + _artifact_transform_load(config, path) + + return fdl.build(config) From 6d1be9305b7118b12fdefd77fc1e3376f19df1fb Mon Sep 17 00:00:00 2001 From: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Date: Fri, 23 Aug 2024 11:20:34 -0400 Subject: [PATCH 044/664] Add MemoryProfileCallback (#10166) * Add MemoryProfileCallback Signed-off-by: Shriya Palsamudram * Apply isort and black reformatting Signed-off-by: ShriyaPalsamudram * Remove reference cycles, save snapshot on specific ranks Signed-off-by: Shriya Palsamudram * Remove unnecessary imports Signed-off-by: Shriya Palsamudram * Apply isort and black reformatting Signed-off-by: ShriyaPalsamudram * Update docstring Signed-off-by: Shriya Palsamudram --------- Signed-off-by: Shriya Palsamudram Signed-off-by: ShriyaPalsamudram Signed-off-by: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Co-authored-by: ShriyaPalsamudram --- nemo/lightning/pytorch/callbacks/__init__.py | 2 + .../pytorch/callbacks/memory_profiler.py | 78 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 nemo/lightning/pytorch/callbacks/memory_profiler.py diff --git a/nemo/lightning/pytorch/callbacks/__init__.py b/nemo/lightning/pytorch/callbacks/__init__.py index 5b3113dea885..ef31e1078298 100644 --- a/nemo/lightning/pytorch/callbacks/__init__.py +++ b/nemo/lightning/pytorch/callbacks/__init__.py @@ -1,4 +1,5 @@ from nemo.lightning.pytorch.callbacks.ddp_parity_checker import DdpParityChecker +from 
nemo.lightning.pytorch.callbacks.memory_profiler import MemoryProfileCallback from nemo.lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform from nemo.lightning.pytorch.callbacks.nsys import NsysCallback @@ -8,6 +9,7 @@ from nemo.lightning.pytorch.callbacks.progress_printer import ProgressPrinter __all__ = [ + "MemoryProfileCallback", "ModelCheckpoint", "ModelTransform", "PEFT", diff --git a/nemo/lightning/pytorch/callbacks/memory_profiler.py b/nemo/lightning/pytorch/callbacks/memory_profiler.py new file mode 100644 index 000000000000..089479637f61 --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/memory_profiler.py @@ -0,0 +1,78 @@ +import os + +import torch +from pytorch_lightning.callbacks.callback import Callback +from torch.utils.viz._cycles import warn_tensor_cycles + +from nemo.lightning import io +from nemo.utils import logging +from nemo.utils.get_rank import get_rank + + +class MemoryProfileCallback(Callback, io.IOMixin): + """ + This callback enables recording a timeline of memory allocations during training. + The generated .pickle profiles can be analyzed at https://pytorch.org/memory_viz + + More info about the profiles can be found [here](https://pytorch.org/blog/understanding-gpu-memory-1/). + + Args: + dir (Optional[str]): Directory to store the memory profile dump + warn_cycles (Optional[bool]): Whether to enable [reference cycle detection](https://pytorch.org/blog/understanding-gpu-memory-2/) + rank (Optional[list[int]]): List of ranks to collect snapshot on, defaults to all if list is empty + + Example: + >>> callback = MemoryProfileCallback(dir="/mem_profile", ranks=[0]) + >>> trainer = Trainer(callbacks=[callback]) + """ + + def __init__(self, dir: str = "/mem_profile", warn_cycles=True, ranks=[]): + + self.dir = dir + self.ranks = ranks + + os.makedirs(self.dir, exist_ok=True) + logging.info(f"Torch memory profiles will be written to: {self.dir}") + + if warn_cycles: + logging.info("Enabling reference cycle detector") + warn_tensor_cycles() + + def enable_on_rank(self) -> bool: + if not self.ranks: + return True + return get_rank() in self.ranks + + def setup(self, trainer, pl_module, stage) -> None: + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end + We use it here to start recording the memory profiler. + """ + + if trainer.max_steps > 1000: + logging.warning( + f"Memory profiling creates snapshots during the entire training process, \ + where every iteration increases the size of the snapshot. \ + Try reducing trainer.max_steps to avoid running into issues" + ) + + if torch.distributed.is_initialized() and self.enable_on_rank(): + torch.cuda.memory._record_memory_history(max_entries=100000) + + def on_train_end(self, trainer, pl_module) -> None: + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end + We use it here to finish memory profiling and write the snapshot. 
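The snapshot dumped below is a plain pickle in the format consumed by https://pytorch.org/memory_viz. A rough sketch of inspecting it offline (the top-level keys are an assumption based on the public PyTorch memory-profiling docs, not something this callback guarantees):

import pickle

# hypothetical path produced by this callback on rank 0 with the default dir
snapshot_path = "/mem_profile/memory_snapshot-rank0.pickle"
with open(snapshot_path, "rb") as f:
    snapshot = pickle.load(f)

# assumed layout: "segments" holds allocator state, "device_traces" the recorded
# allocation/free events that memory_viz renders as a timeline
print(list(snapshot.keys()))
print(sum(len(trace) for trace in snapshot.get("device_traces", [])), "recorded events")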
+ """ + + logging.info( + f"on_train_batch_end rank: {get_rank()} mem: {torch.cuda.memory_allocated()/1024/1024/1024} / {torch.cuda.max_memory_reserved()/1024/1024/1024}" + ) + + if torch.distributed.is_initialized() and self.enable_on_rank(): + rank = get_rank() + _snapshot_path = f"{self.dir}/memory_snapshot-rank{rank}.pickle" + logging.info(f"Writing memory profile snapshot to {_snapshot_path}") + torch.cuda.memory._dump_snapshot(f"{_snapshot_path}") + torch.cuda.memory._record_memory_history(enabled=None) + logging.info(f"Finished writing memory profile snapshot: {_snapshot_path}") From d415621e15a22251b41225f6d5ab36b8065fb454 Mon Sep 17 00:00:00 2001 From: Dong Hyuk Chang Date: Fri, 23 Aug 2024 14:47:00 -0400 Subject: [PATCH 045/664] Lower bound transformers to support nemotron (#10240) Signed-off-by: Dong Hyuk Chang Co-authored-by: Dong Hyuk Chang --- requirements/requirements_lightning.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 1b3397f69033..171abce41f37 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -4,6 +4,6 @@ hydra-core>1.3,<=1.3.2 omegaconf<=2.3 pytorch-lightning>2.2.1 torchmetrics>=0.11.0 -transformers +transformers>=4.44.0 wandb webdataset>=0.2.86 From 7cc99e95fa753f46dffffc47a19e3c1fa375159c Mon Sep 17 00:00:00 2001 From: Kuray107 Date: Sat, 24 Aug 2024 00:38:56 -0400 Subject: [PATCH 046/664] [Audio] SSL Pretraining framework for flow-matching model for audio processing (#10052) Flow matching generative model with SSL pretraining framework Signed-off-by: Pin-Jui Ku Co-authored-by: Kuray107 --- examples/audio/audio_to_audio_train.py | 4 + .../audio/conf/flow_matching_generative.yaml | 164 ++++++ .../flow_matching_generative_finetuning.yaml | 167 ++++++ ...w_matching_generative_ssl_pretraining.yaml | 171 ++++++ .../audio/data/audio_to_audio_lhotse.py | 22 +- .../audio/models/audio_to_audio.py | 33 +- nemo/collections/audio/models/enhancement.py | 269 ++++++++++ .../audio/modules/ssl_pretrain_masking.py | 106 ++++ .../audio/parts/submodules/flow.py | 252 +++++++++ .../audio/parts/submodules/transformerunet.py | 507 ++++++++++++++++++ .../audio/parts/utils/callbacks.py | 177 ++++++ 11 files changed, 1865 insertions(+), 7 deletions(-) create mode 100644 examples/audio/conf/flow_matching_generative.yaml create mode 100644 examples/audio/conf/flow_matching_generative_finetuning.yaml create mode 100644 examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml create mode 100644 nemo/collections/audio/modules/ssl_pretrain_masking.py create mode 100644 nemo/collections/audio/parts/submodules/flow.py create mode 100644 nemo/collections/audio/parts/submodules/transformerunet.py create mode 100644 nemo/collections/audio/parts/utils/callbacks.py diff --git a/examples/audio/audio_to_audio_train.py b/examples/audio/audio_to_audio_train.py index b197d2084144..cef46dcf20b6 100644 --- a/examples/audio/audio_to_audio_train.py +++ b/examples/audio/audio_to_audio_train.py @@ -34,6 +34,7 @@ from nemo.collections.audio.models.enhancement import ( EncMaskDecAudioToAudioModel, + FlowMatchingAudioToAudioModel, PredictiveAudioToAudioModel, SchroedingerBridgeAudioToAudioModel, ScoreBasedGenerativeAudioToAudioModel, @@ -50,6 +51,7 @@ class ModelType(str, Enum): Predictive = 'predictive' ScoreBased = 'score_based' SchroedingerBridge = 'schroedinger_bridge' + FlowMatching = 'flow_matching' def get_model_class(model_type: 
ModelType): @@ -62,6 +64,8 @@ def get_model_class(model_type: ModelType): return ScoreBasedGenerativeAudioToAudioModel elif model_type == ModelType.SchroedingerBridge: return SchroedingerBridgeAudioToAudioModel + elif model_type == ModelType.FlowMatching: + return FlowMatchingAudioToAudioModel else: raise ValueError(f'Unknown model type: {model_type}') diff --git a/examples/audio/conf/flow_matching_generative.yaml b/examples/audio/conf/flow_matching_generative.yaml new file mode 100644 index 000000000000..5f644f328e6d --- /dev/null +++ b/examples/audio/conf/flow_matching_generative.yaml @@ -0,0 +1,164 @@ +name: flow_matching_generative + +model: + type: flow_matching + sample_rate: 16000 + skip_nan_grad: false + num_outputs: 1 + p_cond: 0.9 # Proability of feeding the conditional input into the model. + normalize_input: true # normalize the input signal to 0dBFS + max_utts_evaluation_metrics: 500 + + train_ds: + manifest_filepath: ??? + input_key: noisy_filepath + target_key: clean_filepath + audio_duration: 6.14 # Number of STFT time frames = 1 + audio_duration // encoder.hop_length = 768 + random_offset: true + batch_size: 8 # batch size may be increased based on the available memory + shuffle: true + num_workers: 8 + pin_memory: true + + validation_ds: + manifest_filepath: ??? + input_key: noisy_filepath + target_key: clean_filepath + batch_size: 8 + shuffle: false + num_workers: 4 + pin_memory: true + + log_config: + log_tensorboard: true + log_wandb: false + max_utts: 8 + + encoder: + _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram + fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256 + hop_length: 128 + magnitude_power: 0.5 + scale: 0.33 + + decoder: + _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio + fft_length: ${model.encoder.fft_length} + hop_length: ${model.encoder.hop_length} + magnitude_power: ${model.encoder.magnitude_power} + scale: ${model.encoder.scale} + + estimator: + _target_: nemo.collections.audio.parts.submodules.transformerunet.SpectrogramTransformerUNet + in_channels: 2 # concatenation of single-channel perturbed and noisy + out_channels: 1 # single-channel score estimate + depth: 24 + ff_dropout: 0.1 + time_hidden_dim: 1024 + + flow: + _target_: nemo.collections.audio.parts.submodules.flow.OptimalTransportFlow + sigma_start: 1.0 + sigma_end: 1e-4 + + sampler: + _target_: nemo.collections.audio.parts.submodules.flow.ConditionalFlowMatchingEulerSampler + num_steps: 20 + time_min: 1e-8 + time_max: 1.0 + + loss: + _target_: nemo.collections.audio.losses.MSELoss + ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time) + + metrics: + val: + sisdr: # output SI-SDR + _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio + estoi: # output ESTOI + _target_: torchmetrics.audio.ShortTimeObjectiveIntelligibility + fs: ${model.sample_rate} + extended: true + pesq: # output PESQ + _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality + fs: ${model.sample_rate} + mode: wb + + optim: + name: adam + lr: 1e-4 + # optimizer arguments + betas: [0.9, 0.999] + weight_decay: 0.0 + + # scheduler setup + sched: + name: CosineAnnealing + # scheduler config override + warmup_steps: 5000 + warmup_ratio: null + min_lr: 0 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number 
of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 0.2 + precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 25 # Interval of logging. + enable_progress_bar: true + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + + # use exponential moving average for model parameters + ema: + enable: true + decay: 0.999 # decay rate + cpu_offload: false # offload EMA parameters to CPU to save GPU memory + every_n_steps: 1 # how often to update EMA weights + validate_original_weights: false # use original weights for validation calculation? + + # logging + create_tensorboard_logger: true + + # checkpointing + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: val_pesq + mode: max + save_top_k: 3 + always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints + + # early stopping + create_early_stopping_callback: true + early_stopping_callback_params: + monitor: val_sisdr + mode: max + min_delta: 0.0 + patience: 20 # patience in terms of check_val_every_n_epoch + verbose: true + strict: false # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + # you need to set these two to true to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: test + project: gense diff --git a/examples/audio/conf/flow_matching_generative_finetuning.yaml b/examples/audio/conf/flow_matching_generative_finetuning.yaml new file mode 100644 index 000000000000..c7ba19aee466 --- /dev/null +++ b/examples/audio/conf/flow_matching_generative_finetuning.yaml @@ -0,0 +1,167 @@ +name: flow_matching_generative_finetuning + +init_from_nemo_model: null +init_strict: false + +model: + type: flow_matching + sample_rate: 16000 + skip_nan_grad: false + num_outputs: 1 + p_cond: 0.9 # Proability of feeding the conditional input into the model. + normalize_input: true # normalize the input signal to 0dBFS + max_utts_evaluation_metrics: 500 + + train_ds: + manifest_filepath: ??? + input_key: noisy_filepath + target_key: clean_filepath + audio_duration: 6.14 # Number of STFT time frames = 1 + audio_duration // encoder.hop_length = 768 + random_offset: true + batch_size: 8 # batch size may be increased based on the available memory + shuffle: true + num_workers: 8 + pin_memory: true + + validation_ds: + manifest_filepath: ??? 
+ input_key: noisy_filepath + target_key: clean_filepath + batch_size: 8 + shuffle: false + num_workers: 4 + pin_memory: true + + log_config: + log_tensorboard: true + log_wandb: false + max_utts: 8 + + encoder: + _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram + fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256 + hop_length: 128 + magnitude_power: 0.5 + scale: 0.33 + + decoder: + _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio + fft_length: ${model.encoder.fft_length} + hop_length: ${model.encoder.hop_length} + magnitude_power: ${model.encoder.magnitude_power} + scale: ${model.encoder.scale} + + estimator: + _target_: nemo.collections.audio.parts.submodules.transformerunet.SpectrogramTransformerUNet + in_channels: 2 # concatenation of single-channel perturbed and noisy + out_channels: 1 # single-channel score estimate + depth: 24 + ff_dropout: 0.1 + time_hidden_dim: 1024 + + flow: + _target_: nemo.collections.audio.parts.submodules.flow.OptimalTransportFlow + sigma_start: 1.0 + sigma_end: 1e-4 + + sampler: + _target_: nemo.collections.audio.parts.submodules.flow.ConditionalFlowMatchingEulerSampler + num_steps: 20 + time_min: 1e-8 + time_max: 1.0 + + loss: + _target_: nemo.collections.audio.losses.MSELoss + ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time) + + metrics: + val: + sisdr: # output SI-SDR + _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio + estoi: # output ESTOI + _target_: torchmetrics.audio.ShortTimeObjectiveIntelligibility + fs: ${model.sample_rate} + extended: true + pesq: # output PESQ + _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality + fs: ${model.sample_rate} + mode: wb + + optim: + name: adam + lr: 1e-4 + # optimizer arguments + betas: [0.9, 0.999] + weight_decay: 0.0 + + # scheduler setup + sched: + name: CosineAnnealing + # scheduler config override + warmup_steps: 5000 + warmup_ratio: null + min_lr: 0 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 0.2 + precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 25 # Interval of logging. + enable_progress_bar: true + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + + # use exponential moving average for model parameters + ema: + enable: true + decay: 0.999 # decay rate + cpu_offload: false # offload EMA parameters to CPU to save GPU memory + every_n_steps: 1 # how often to update EMA weights + validate_original_weights: false # use original weights for validation calculation? 
+ + # logging + create_tensorboard_logger: true + + # checkpointing + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: val_pesq + mode: max + save_top_k: 3 + always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints + + # early stopping + create_early_stopping_callback: true + early_stopping_callback_params: + monitor: val_sisdr + mode: max + min_delta: 0.0 + patience: 20 # patience in terms of check_val_every_n_epoch + verbose: true + strict: false # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + # you need to set these two to true to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: test + project: gense diff --git a/examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml b/examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml new file mode 100644 index 000000000000..7813a9473644 --- /dev/null +++ b/examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml @@ -0,0 +1,171 @@ +name: flow_matching_generative_ssl_pretraining + +model: + type: flow_matching + sample_rate: 16000 + skip_nan_grad: true + num_outputs: 1 + p_cond: 0.9 # Proability of feeding the conditional input into the model. + normalize_input: true # normalize the input signal to 0dBFS + max_utts_evaluation_metrics: 125 + + train_ds: + shar_path: ??? + use_lhotse: true + truncate_duration: 4.09 # Number of STFT time frames = 1 + audio_duration // encoder.hop_length = 512 + truncate_offset_type: random + batch_size: 8 # batch size may be increased based on the available memory + shuffle: true + num_workers: 8 + pin_memory: true + + validation_ds: + manifest_filepath: ??? 
+ input_key: clean_filepath + target_key: clean_filepath + random_offset: false + batch_size: 8 + shuffle: false + num_workers: 4 + pin_memory: true + + log_config: + log_tensorboard: true + log_wandb: false + max_utts: 8 + + encoder: + _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram + fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256 + hop_length: 128 + magnitude_power: 0.5 + scale: 0.33 + + decoder: + _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio + fft_length: ${model.encoder.fft_length} + hop_length: ${model.encoder.hop_length} + magnitude_power: ${model.encoder.magnitude_power} + scale: ${model.encoder.scale} + + estimator: + _target_: nemo.collections.audio.parts.submodules.transformerunet.SpectrogramTransformerUNet + in_channels: 2 # concatenation of single-channel perturbed and noisy + out_channels: 1 # single-channel score estimate + depth: 24 + ff_dropout: 0.1 + time_hidden_dim: 1024 + + flow: + _target_: nemo.collections.audio.parts.submodules.flow.OptimalTransportFlow + sigma_start: 1.0 + sigma_end: 1e-4 + + sampler: + _target_: nemo.collections.audio.parts.submodules.flow.ConditionalFlowMatchingEulerSampler + num_steps: 20 + time_min: 1e-8 + time_max: 1.0 + + ssl_pretrain_masking: + _target_: nemo.collections.audio.modules.ssl_pretrain_masking.SSLPretrainWithMaskedPatch + patch_size: 10 + mask_fraction: 0.7 + + loss: + _target_: nemo.collections.audio.losses.MSELoss + ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time) + + metrics: + val: + sisdr: # output SI-SDR + _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio + estoi: # output ESTOI + _target_: torchmetrics.audio.ShortTimeObjectiveIntelligibility + fs: ${model.sample_rate} + extended: true + pesq: # output PESQ + _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality + fs: ${model.sample_rate} + mode: wb + + optim: + name: adam + lr: 5e-5 + # optimizer arguments + betas: [0.9, 0.999] + weight_decay: 0.0 + + # scheduler setup + sched: + name: CosineAnnealing + # scheduler config override + warmup_steps: 5000 + warmup_ratio: null + min_lr: 1e-5 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: 10000 # needs to be set for shar datasets + limit_train_batches: 1000 # number of batches to train on in each pseudo-epoch + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + use_distributed_sampler: false # required for lhotse + accumulate_grad_batches: 1 + gradient_clip_val: 0.2 + precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 25 # Interval of logging. 
+ enable_progress_bar: true + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + + # use exponential moving average for model parameters + ema: + enable: true + decay: 0.999 # decay rate + cpu_offload: false # offload EMA parameters to CPU to save GPU memory + every_n_steps: 1 # how often to update EMA weights + validate_original_weights: false # use original weights for validation calculation? + + # logging + create_tensorboard_logger: true + + # checkpointing + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: val_pesq + mode: max + save_top_k: 3 + always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints + + # early stopping + create_early_stopping_callback: true + early_stopping_callback_params: + monitor: val_sisdr + mode: max + min_delta: 0.0 + patience: 20 # patience in terms of check_val_every_n_epoch + verbose: true + strict: false # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + # you need to set these two to true to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/nemo/collections/audio/data/audio_to_audio_lhotse.py b/nemo/collections/audio/data/audio_to_audio_lhotse.py index 27d8a0ed28d7..d8978c19d692 100644 --- a/nemo/collections/audio/data/audio_to_audio_lhotse.py +++ b/nemo/collections/audio/data/audio_to_audio_lhotse.py @@ -44,19 +44,29 @@ class LhotseAudioToTargetDataset(torch.utils.data.Dataset): EMBEDDING_KEY = "embedding_vector" def __getitem__(self, cuts: CutSet) -> dict[str, torch.Tensor]: - src_audio, src_audio_lens = collate_audio(cuts) + # In the rare case, the collate_audio function would raise the FileSeek error when loading .flac (https://github.com/bastibe/python-soundfile/issues/274) + # A workaround is to use fault_tolerant and skip failed data, resulting in a smaller batch size for the few problematic cases. 
+ src_audio, src_audio_lens, retained_padded_cuts = collate_audio(cuts, fault_tolerant=True) ans = { "input_signal": src_audio, "input_length": src_audio_lens, } - if _key_available(cuts, self.TARGET_KEY): - tgt_audio, tgt_audio_lens = collate_audio(cuts, recording_field=self.TARGET_KEY) + # keep only the first non-padding cuts + retained_cuts = [ + cut._first_non_padding_cut if isinstance(cut, MixedCut) else cut for cut in retained_padded_cuts + ] + retained_cuts = CutSet.from_cuts(retained_cuts) + + if _key_available(retained_cuts, self.TARGET_KEY): + # TODO: use fault_tolerant=True for robust loading of target + tgt_audio, tgt_audio_lens = collate_audio(retained_cuts, recording_field=self.TARGET_KEY) ans.update(target_signal=tgt_audio, target_length=tgt_audio_lens) - if _key_available(cuts, self.REFERENCE_KEY): - ref_audio, ref_audio_lens = collate_audio(cuts, recording_field=self.REFERENCE_KEY) + if _key_available(retained_cuts, self.REFERENCE_KEY): + # TODO: use fault_tolerant=True for robust loading of target + ref_audio, ref_audio_lens = collate_audio(retained_cuts, recording_field=self.REFERENCE_KEY) ans.update(reference_signal=ref_audio, reference_length=ref_audio_lens) if _key_available(cuts, self.EMBEDDING_KEY): - emb = collate_custom_field(cuts, field=self.EMBEDDING_KEY) + emb = collate_custom_field(retained_cuts, field=self.EMBEDDING_KEY) ans.update(embedding_signal=emb) return ans diff --git a/nemo/collections/audio/models/audio_to_audio.py b/nemo/collections/audio/models/audio_to_audio.py index ef9ce648f1a2..e1732c1658b7 100644 --- a/nemo/collections/audio/models/audio_to_audio.py +++ b/nemo/collections/audio/models/audio_to_audio.py @@ -483,4 +483,35 @@ def on_after_backward(self): if valid_gradients < 1: logging.warning('detected inf or nan values in gradients! Setting gradients to zero.') - self.zero_grad() + self.zero_grad(set_to_none=False) + + def configure_callbacks(self): + """ + Create an callback to add audio/spectrogram into tensorboard & wandb. + """ + self.log_config = self.cfg.get("log_config", None) + if not self.log_config: + return [] + + log_callbacks = [] + from nemo.collections.audio.parts.utils.callbacks import SpeechEnhancementLoggingCallback + + if isinstance(self._validation_dl, List): + data_loaders = self._validation_dl + else: + data_loaders = [self._validation_dl] + + for data_loader_idx, data_loader in enumerate(data_loaders): + log_callbacks.append( + SpeechEnhancementLoggingCallback( + data_loader=data_loader, + data_loader_idx=data_loader_idx, + loggers=self.trainer.loggers, + log_tensorboard=self.log_config.log_tensorboard, + log_wandb=self.log_config.log_wandb, + sample_rate=self.sample_rate, + max_utts=self.log_config.get("max_utts", None), + ) + ) + + return log_callbacks diff --git a/nemo/collections/audio/models/enhancement.py b/nemo/collections/audio/models/enhancement.py index e7fbc9023117..cd9f47b98096 100644 --- a/nemo/collections/audio/models/enhancement.py +++ b/nemo/collections/audio/models/enhancement.py @@ -30,6 +30,7 @@ 'ScoreBasedGenerativeAudioToAudioModel', 'PredictiveAudioToAudioModel', 'SchroedingerBridgeAudioToAudioModel', + 'FlowMatchingAudioToAudioModel', ] @@ -618,6 +619,274 @@ def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = return {f'{tag}_loss': loss} +class FlowMatchingAudioToAudioModel(AudioToAudioModel): + """This models uses a flow matching process to generate + an encoded representation of the enhanced signal. 
+ + The model consists of the following blocks: + - encoder: transforms input multi-channel audio signal into an encoded representation (analysis transform) + - estimator: neural model, estimates a score for the diffusion process + - flow: ordinary differential equation (ODE) defining a flow and a vector field. + - sampler: sampler for the inference process, estimates coefficients of the target signal + - decoder: transforms sampler output into the time domain (synthesis transform) + - ssl_pretrain_masking: if it is defined, perform the ssl pretrain masking for self reconstruction in the training process + """ + + def __init__(self, cfg: DictConfig, trainer: Trainer = None): + super().__init__(cfg=cfg, trainer=trainer) + self.sample_rate = self._cfg.sample_rate + + # Setup processing modules + self.encoder = self.from_config_dict(self._cfg.encoder) + self.decoder = self.from_config_dict(self._cfg.decoder) + + # Neural estimator + self.estimator = self.from_config_dict(self._cfg.estimator) + + # Flow + self.flow = self.from_config_dict(self._cfg.flow) + + # Sampler + self.sampler = hydra.utils.instantiate(self._cfg.sampler, estimator=self.estimator) + + # probability that the conditional input will be feed into the + # estimator in the training stage + self.p_cond = self._cfg.get('p_cond', 1.0) + + # Self-Supervised Pretraining + if self._cfg.get('ssl_pretrain_masking') is not None: + logging.debug('SSL-pretrain_masking is found and will be initialized') + self.ssl_pretrain_masking = self.from_config_dict(self._cfg.ssl_pretrain_masking) + else: + self.ssl_pretrain_masking = None + + # Normalization + self.normalize_input = self._cfg.get('normalize_input', False) + + # Metric evaluation + self.max_utts_evaluation_metrics = self._cfg.get('max_utts_evaluation_metrics') + + if self.max_utts_evaluation_metrics is not None: + logging.warning( + 'Metrics will be evaluated on first %d examples of the evaluation datasets.', + self.max_utts_evaluation_metrics, + ) + + # Regularization + self.eps = self._cfg.get('eps', 1e-8) + + # Setup optional Optimization flags + self.setup_optimization_flags() + + logging.debug('Initialized %s', self.__class__.__name__) + logging.debug('\tdoing SSL-pretraining: %s', (self.ssl_pretrain_masking is not None)) + logging.debug('\tp_cond: %s', self.p_cond) + logging.debug('\tnormalize_input: %s', self.normalize_input) + logging.debug('\tloss: %s', self.loss) + logging.debug('\teps: %s', self.eps) + + @property + def input_types(self) -> Dict[str, NeuralType]: + return { + "input_signal": NeuralType(('B', 'C', 'T'), AudioSignal(freq=self.sample_rate)), + "input_length": NeuralType(tuple('B'), LengthsType(), optional=True), + } + + @property + def output_types(self) -> Dict[str, NeuralType]: + return { + "output_signal": NeuralType(('B', 'C', 'T'), AudioSignal(freq=self.sample_rate)), + "output_length": NeuralType(tuple('B'), LengthsType(), optional=True), + } + + @typecheck() + @torch.inference_mode() + def forward(self, input_signal, input_length=None): + """Forward pass of the model to generate samples from the target distribution. + + Args: + input_signal: Tensor that represents a batch of raw audio signals, + of shape [B, T] or [B, T, C]. T here represents timesteps, with 1 second of audio represented as + `self.sample_rate` number of floating point values. + input_signal_length: Vector of length B, that contains the individual lengths of the audio + sequences. 
+ + Returns: + Output signal `output` in the time domain and the length of the output signal `output_length`. + """ + batch_length = input_signal.size(-1) + + if self.normalize_input: + # max for each example in the batch + norm_scale = torch.amax(input_signal.abs(), dim=(-1, -2), keepdim=True) + # scale input signal + input_signal = input_signal / (norm_scale + self.eps) + + # Encoder + encoded, encoded_length = self.encoder(input=input_signal, input_length=input_length) + + if self.p_cond == 0: + encoded = torch.zeros_like(encoded) + elif self.ssl_pretrain_masking is not None: + encoded = self.ssl_pretrain_masking(input_spec=encoded, length=encoded_length) + + init_state = torch.randn_like(encoded) * self.flow.sigma_start + + # Sampler + generated, generated_length = self.sampler( + state=init_state, estimator_condition=encoded, state_length=encoded_length + ) + + # Decoder + output, output_length = self.decoder(input=generated, input_length=generated_length) + + if self.normalize_input: + # rescale to the original scale + output = output * norm_scale + + # Trim or pad the estimated signal to match input length + output = self.match_batch_length(input=output, batch_length=batch_length) + + return output, output_length + + @typecheck( + input_types={ + "target_signal": NeuralType(('B', 'C', 'T'), AudioSignal()), + "input_signal": NeuralType(('B', 'C', 'T'), AudioSignal()), + "input_length": NeuralType(tuple('B'), LengthsType()), + }, + output_types={ + "loss": NeuralType(None, LossType()), + }, + ) + def _step(self, target_signal, input_signal, input_length=None): + batch_size = target_signal.size(0) + + if self.normalize_input: + # max for each example in the batch + norm_scale = torch.amax(input_signal.abs(), dim=(-1, -2), keepdim=True) + # scale input signal + input_signal = input_signal / (norm_scale + self.eps) + # scale the target signal + target_signal = target_signal / (norm_scale + self.eps) + + # Apply encoder to both target and the input + input_enc, input_enc_len = self.encoder(input=input_signal, input_length=input_length) + target_enc, _ = self.encoder(input=target_signal, input_length=input_length) + + # Self-Supervised Pretraining + if self.ssl_pretrain_masking is not None: + input_enc = self.ssl_pretrain_masking(input_spec=input_enc, length=input_enc_len) + + # Drop off conditional inputs (input_enc) with (1 - p_cond) probability. 
+ # The dropped conditions will be set to zeros + keep_conditions = einops.rearrange((torch.rand(batch_size) < self.p_cond).float(), 'B -> B 1 1 1') + input_enc = input_enc * keep_conditions.to(input_enc.device) + + x_start = torch.zeros_like(input_enc) + + time = self.flow.generate_time(batch_size=batch_size).to(device=input_enc.device) + sample = self.flow.sample(time=time, x_start=x_start, x_end=target_enc) + + # we want to get a vector field estimate given current state + # at training time, current state is sampled from the conditional path + # the vector field model is also conditioned on input signal + estimator_input = torch.cat([sample, input_enc], dim=-3) + + # Estimate the vector using the neural estimator + estimate, estimate_len = self.estimator(input=estimator_input, input_length=input_enc_len, condition=time) + + conditional_vector_field = self.flow.vector_field(time=time, x_start=x_start, x_end=target_enc, point=sample) + + return self.loss(estimate=estimate, target=conditional_vector_field, input_length=input_enc_len) + + # PTL-specific methods + def training_step(self, batch, batch_idx): + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch.get('target_signal', input_signal.clone()) + else: + input_signal, input_length, target_signal, _ = batch + + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = einops.rearrange(input_signal, "B T -> B 1 T") + if target_signal.ndim == 2: + target_signal = einops.rearrange(target_signal, "B T -> B 1 T") + + # Calculate the loss + loss = self._step(target_signal=target_signal, input_signal=input_signal, input_length=input_length) + + # Logs + self.log('train_loss', loss) + self.log('learning_rate', self._optimizer.param_groups[0]['lr']) + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + return loss + + def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = 'val'): + + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch.get('target_signal', input_signal.clone()) + else: + input_signal, input_length, target_signal, _ = batch + + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = einops.rearrange(input_signal, 'B T -> B 1 T') + if target_signal.ndim == 2: + target_signal = einops.rearrange(target_signal, 'B T -> B 1 T') + + # Calculate loss + loss = self._step( + target_signal=target_signal, + input_signal=input_signal, + input_length=input_length, + ) + + # Update metrics + update_metrics = False + if self.max_utts_evaluation_metrics is None: + # Always update if max is not configured + update_metrics = True + # Number of examples to process + num_examples = input_signal.size(0) # batch size + else: + # Check how many examples have been used for metric calculation + first_metric_name = next(iter(self.metrics[tag][dataloader_idx])) + num_examples_evaluated = self.metrics[tag][dataloader_idx][first_metric_name].num_examples + # Update metrics if some examples were not processed + update_metrics = num_examples_evaluated < self.max_utts_evaluation_metrics + # Number of examples to process + num_examples = min(self.max_utts_evaluation_metrics - num_examples_evaluated, 
input_signal.size(0)) + + if update_metrics: + # Generate output signal + output_signal, _ = self.forward( + input_signal=input_signal[:num_examples, ...], input_length=input_length[:num_examples] + ) + + # Update metrics + if hasattr(self, 'metrics') and tag in self.metrics: + # Update metrics for this (tag, dataloader_idx) + for name, metric in self.metrics[tag][dataloader_idx].items(): + metric.update( + preds=output_signal, + target=target_signal[:num_examples, ...], + input_length=input_length[:num_examples], + ) + + # Log global step + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + return {f'{tag}_loss': loss} + + class SchroedingerBridgeAudioToAudioModel(AudioToAudioModel): """This models is using a Schrödinger Bridge process to generate an encoded representation of the enhanced signal. diff --git a/nemo/collections/audio/modules/ssl_pretrain_masking.py b/nemo/collections/audio/modules/ssl_pretrain_masking.py new file mode 100644 index 000000000000..ba0722f180d8 --- /dev/null +++ b/nemo/collections/audio/modules/ssl_pretrain_masking.py @@ -0,0 +1,106 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +import einops +import torch + +from nemo.core.classes import NeuralModule, typecheck +from nemo.core.neural_types import LengthsType, NeuralType, SpectrogramType + +__all__ = ['SSLPretrainWithMaskedPatch'] + + +class SSLPretrainWithMaskedPatch(NeuralModule): + """ + Zeroes out fixed size time patches of the spectrogram. + All samples in batch are guaranteed to have the same amount of masked time steps. + Note that this may be problematic when we do pretraining on a unbalanced dataset. + + For example, say a batch contains two spectrograms of length 87 and 276. + With mask_fraction=0.7 and patch_size=10, we'll obrain mask_patches=7. + Each of the two data will then have 7 patches of 10-frame mask. + + Args: + patch_size (int): up to how many time steps does one patch consist of. + Defaults to 10. + mask_fraction (float): how much fraction in each sample to be masked (number of patches is rounded up). + Range from 0.0 to 1.0. Defaults to 0.7. + """ + + @property + def input_types(self): + """Returns definitions of module input types""" + return { + "input_spec": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), + "length": NeuralType(tuple('B'), LengthsType()), + } + + @property + def output_types(self): + """Returns definitions of module output types""" + return {"augmented_spec": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType())} + + def __init__( + self, + patch_size: int = 10, + mask_fraction: float = 0.7, + ): + super().__init__() + self.patch_size = patch_size + if mask_fraction > 1.0 or mask_fraction < 0.0: + raise ValueError('mask_patches cannot be negative') + else: + self.mask_fraction = mask_fraction + + @typecheck() + def forward(self, input_spec, length): + """ + Apply Patched masking on the input_spec. 
+ + + During the training stage, the mask is generated randomly, with + approximately `self.mask_fraction` of the time frames being masked out. + + In the validation stage, the masking pattern is fixed to ensure + consistent evaluation of checkpoints and to prevent overfitting. Note + that the same masking pattern is applied to all data, regardless of + their lengths. On average, approximately `self.mask_fraction` of the + time frames will be masked out. + + """ + augmented_spec = input_spec + + min_len = torch.min(length) + if self.training: + len_fraction = int(min_len * self.mask_fraction) + mask_patches = len_fraction // self.patch_size + int(len_fraction % self.patch_size != 0) + + if min_len < self.patch_size * mask_patches: + mask_patches = min_len // self.patch_size + + for idx, cur_len in enumerate(length.tolist()): + patches = range(cur_len // self.patch_size) + masked_patches = random.sample(patches, mask_patches) + for mp in masked_patches: + augmented_spec[idx, :, :, mp * self.patch_size : (mp + 1) * self.patch_size] = 0.0 + else: + chunk_length = self.patch_size // self.mask_fraction + mask = torch.arange(augmented_spec.size(-1), device=augmented_spec.device) + mask = (mask % chunk_length) >= self.patch_size + mask = einops.rearrange(mask, 'T -> 1 1 1 T').float() + augmented_spec = augmented_spec * mask + + return augmented_spec diff --git a/nemo/collections/audio/parts/submodules/flow.py b/nemo/collections/audio/parts/submodules/flow.py new file mode 100644 index 000000000000..748d4c6c6d3b --- /dev/null +++ b/nemo/collections/audio/parts/submodules/flow.py @@ -0,0 +1,252 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
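The classes below define the conditional paths and the training target for flow matching. As a minimal numerical sketch of the quantities they compute (tensor values are arbitrary; x_start is all-zeros, matching how the flow-matching model calls the flow):

import torch

sigma_start, sigma_end = 1.0, 1e-4
x_start = torch.zeros(4)                       # paths start from zeros in this model
x_end = torch.tensor([1.0, -2.0, 0.5, 3.0])    # encoded target (made-up values)
t = torch.tensor(0.3)

mean = (1 - t) * x_start + t * x_end                  # linear interpolation of the endpoints
std = (1 - t) * sigma_start + t * sigma_end           # interpolated noise scale
x_t = mean + std * torch.randn_like(mean)             # sample from p_t(x | x_start, x_end)

# conditional vector field the neural estimator is trained to regress
u_t = (sigma_end * (x_t - x_start) - sigma_start * (x_t - x_end)) / std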
+from abc import ABC, abstractmethod +from typing import Tuple + +import einops +import torch + +from nemo.collections.common.parts.utils import mask_sequence_tensor +from nemo.utils import logging + + +class ConditionalFlow(ABC): + """ + Abstract class for different conditional flow-matching (CFM) classes + + Time horizon is [time_min, time_max (should be 1)] + + every path is "conditioned" on endpoints of the path + endpoints are just our paired data samples + subclasses need to implement mean, std, and vector_field + + """ + + def __init__(self, time_min: float = 1e-8, time_max: float = 1.0): + self.time_min = time_min + self.time_max = time_max + + @abstractmethod + def mean(self, *, time: torch.Tensor, x_start: torch.Tensor, x_end: torch.Tensor) -> torch.Tensor: + """ + Return the mean of p_t(x | x_start, x_end) at time t + """ + pass + + @abstractmethod + def std(self, *, time: torch.Tensor, x_start: torch.Tensor, x_end: torch.Tensor) -> torch.Tensor: + """ + Return the standard deviation of p_t(x | x_start, x_end) at time t + """ + pass + + @abstractmethod + def vector_field( + self, *, time: torch.Tensor, x_start: torch.Tensor, x_end: torch.Tensor, point: torch.Tensor + ) -> torch.Tensor: + """ + Compute the conditional vector field v_t( point | x_start, x_end) + """ + pass + + @staticmethod + def _broadcast_time(time: torch.Tensor, n_dim: int) -> torch.Tensor: + """ + Broadcast time tensor to the desired number of dimensions + """ + if time.ndim == 1: + target_shape = ' '.join(['B'] + ['1'] * (n_dim - 1)) + time = einops.rearrange(time, f'B -> {target_shape}') + + return time + + def generate_time(self, batch_size: int) -> torch.Tensor: + """ + Randomly sample a batchsize of time_steps from U[0~1] + """ + return torch.clamp(torch.rand((batch_size,)), self.time_min, self.time_max) + + def sample(self, *, time: torch.Tensor, x_start: torch.Tensor, x_end: torch.Tensor) -> torch.Tensor: + """ + Generate a sample from p_t(x | x_start, x_end) at time t. + Note that this implementation assumes all path marginals are normally distributed. + """ + time = self._broadcast_time(time, n_dim=x_start.ndim) + + mean = self.mean(time=time, x_start=x_start, x_end=x_end) + std = self.std(time=time, x_start=x_start, x_end=x_end) + return mean + std * torch.randn_like(mean) + + def flow( + self, *, time: torch.Tensor, x_start: torch.Tensor, x_end: torch.Tensor, point: torch.Tensor + ) -> torch.Tensor: + """ + Compute the conditional flow phi_t( point | x_start, x_end). + This is an affine flow. 
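At inference the learned vector field is integrated with a fixed-step Euler scheme (see ConditionalFlowMatchingEulerSampler further below in this file). A minimal sketch of that update rule with a stand-in vector field; the toy_vector_field name is hypothetical and the loop is not the exact NeMo implementation:

import torch

def toy_vector_field(state: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
    # stand-in for the neural estimator: pull the state toward a fixed target
    return torch.ones_like(state) - state

time_min, time_max, num_steps = 1e-8, 1.0, 20
dt = (time_max - time_min) / num_steps
state = torch.randn(4) * 1.0                           # initial state scaled by sigma_start
for t in torch.linspace(time_min, time_max, num_steps + 1)[:-1]:
    state = state + toy_vector_field(state, t) * dt    # one Euler step per grid point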
+ """ + mean = self.mean(time=time, x_start=x_start, x_end=x_end) + std = self.std(time=time, x_start=x_start, x_end=x_end) + return mean + std * (point - x_start) + + +class OptimalTransportFlow(ConditionalFlow): + """The OT-CFM model from [Lipman et at, 2023] + + Every conditional path the following holds: + p_0 = N(x_start, sigma_start) + p_1 = N(x_end, sigma_end), + + mean(x, t) = (time_max - t) * x_start + t * x_end + (linear interpolation between x_start and x_end) + + std(x, t) = (time_max - t) * sigma_start + t * sigma_end + + Every conditional path is optimal transport map from p_0(x_start, x_end) to p_1(x_start, x_end) + Marginal path is not guaranteed to be an optimal transport map from p_0 to p_1 + + To get the OT-CFM model from [Lipman et at, 2023] just pass zeroes for x_start + To get the I-CFM model, set sigma_min=sigma_max + To get the rectified flow model, set sigma_min=sigma_max=0 + + Args: + time_min: minimum time value used in the process + time_max: maximum time value used in the process + sigma_start: the standard deviation of the initial distribution + sigma_end: the standard deviation of the target distribution + """ + + def __init__( + self, time_min: float = 1e-8, time_max: float = 1.0, sigma_start: float = 1.0, sigma_end: float = 1e-4 + ): + super().__init__(time_min=time_min, time_max=time_max) + self.sigma_start = sigma_start + self.sigma_end = sigma_end + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\ttime_min: %s', self.time_min) + logging.debug('\ttime_max: %s', self.time_max) + logging.debug('\tsgima_start: %s', self.sigma_start) + logging.debug('\tsigma_end: %s', self.sigma_end) + + def mean(self, *, x_start: torch.Tensor, x_end: torch.Tensor, time: torch.Tensor) -> torch.Tensor: + return (self.time_max - time) * x_start + time * x_end + + def std(self, *, x_start: torch.Tensor, x_end: torch.Tensor, time: torch.Tensor) -> torch.Tensor: + return (self.time_max - time) * self.sigma_start + time * self.sigma_end + + def vector_field( + self, + *, + x_start: torch.Tensor, + x_end: torch.Tensor, + time: torch.Tensor, + point: torch.Tensor, + eps: float = 1e-6, + ) -> torch.Tensor: + time = self._broadcast_time(time, n_dim=x_start.ndim) + + if self.sigma_start == self.sigma_end: + return x_end - x_start + + num = self.sigma_end * (point - x_start) - self.sigma_start * (point - x_end) + denom = (1 - time) * self.sigma_start + time * self.sigma_end + return num / (denom + eps) + + +class ConditionalFlowMatchingSampler(ABC): + """ + Abstract class for different sampler to solve the ODE in CFM + + Args: + estimator: the NN-based conditional vector field estimator + num_steps: How many time steps to iterate in the process + time_min: minimum time value used in the process + time_max: maximum time value used in the process + + """ + + def __init__( + self, + estimator: torch.nn.Module, + num_steps: int = 5, + time_min: float = 1e-8, + time_max: float = 1.0, + ): + self.estimator = estimator + self.num_steps = num_steps + self.time_min = time_min + self.time_max = time_max + + @property + def time_step(self): + return (self.time_max - self.time_min) / self.num_steps + + @abstractmethod + def forward( + self, state: torch.Tensor, estimator_condition: torch.Tensor, state_length: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + pass + + +class ConditionalFlowMatchingEulerSampler(ConditionalFlowMatchingSampler): + """ + The Euler Sampler for solving the ODE in CFM on a uniform time grid + """ + + def __init__( + self, + 
estimator: torch.nn.Module, + num_steps: int = 5, + time_min: float = 1e-8, + time_max: float = 1.0, + ): + super().__init__( + estimator=estimator, + num_steps=num_steps, + time_min=time_min, + time_max=time_max, + ) + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tnum_steps: %s', self.num_steps) + logging.debug('\ttime_min: %s', self.time_min) + logging.debug('\ttime_max: %s', self.time_max) + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + @torch.inference_mode() + def forward( + self, state: torch.Tensor, estimator_condition: torch.Tensor, state_length: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + time_steps = torch.linspace(self.time_min, self.time_max, self.num_steps + 1) + + if state_length is not None: + state = mask_sequence_tensor(state, state_length) + + for t in time_steps: + time = t * torch.ones(state.shape[0], device=state.device) + + if estimator_condition is None: + estimator_input = state + else: + estimator_input = torch.cat([state, estimator_condition], dim=1) + + vector_field, _ = self.estimator(input=estimator_input, input_length=state_length, condition=time) + + state = state + vector_field * self.time_step + + if state_length is not None: + state = mask_sequence_tensor(state, state_length) + + return state, state_length diff --git a/nemo/collections/audio/parts/submodules/transformerunet.py b/nemo/collections/audio/parts/submodules/transformerunet.py new file mode 100644 index 000000000000..b7c14d513bab --- /dev/null +++ b/nemo/collections/audio/parts/submodules/transformerunet.py @@ -0,0 +1,507 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MIT License +# +# Copyright (c) 2023 Phil Wang +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
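The TransformerUNet defined in this file arranges its layers in a U-Net pattern: the first half of the layers push their inputs onto a stack, the second half pop, scale, concatenate, and project back to the model width. A minimal sketch of just that skip wiring (not the NeMo module; attention and feed-forward blocks are omitted):

import torch
import torch.nn as nn

depth, dim = 4, 8
skip_scale = 2 ** -0.5
combiners = nn.ModuleList(
    [nn.Linear(dim * 2, dim) if layer > depth // 2 else nn.Identity() for layer in range(1, depth + 1)]
)

x, skips = torch.randn(2, 5, dim), []
for layer, combiner in enumerate(combiners, start=1):
    if layer <= depth // 2:
        skips.append(x)                                                   # first half: remember activations
    else:
        x = combiner(torch.cat((x, skips.pop() * skip_scale), dim=-1))    # second half: fuse and project
    # the real module applies pre-norm attention and a feed-forward block to x here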
+ +import math +from functools import partial +from typing import Dict, Optional + +import einops +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import Module + +from nemo.core.classes import NeuralModule, typecheck +from nemo.core.neural_types import BoolType, FloatType, LengthsType, NeuralType, SpectrogramType +from nemo.utils import logging + +__all__ = ['TransformerUNet'] + + +class LearnedSinusoidalPosEmb(Module): + """The sinusoidal Embedding to encode time conditional information""" + + def __init__(self, dim: int): + super().__init__() + if (dim % 2) != 0: + raise ValueError(f"Input dimension {dim} is not divisible by 2!") + half_dim = dim // 2 + self.weights = nn.Parameter(torch.randn(half_dim)) + + def forward(self, t: torch.Tensor) -> torch.Tensor: + """ + Args: + t: input time tensor, shape (B) + + Return: + fouriered: the encoded time conditional embedding, shape (B, D) + """ + t = einops.rearrange(t, 'b -> b 1') + freqs = t * einops.rearrange(self.weights, 'd -> 1 d') * 2 * math.pi + fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1) + return fouriered + + +class ConvPositionEmbed(Module): + """The Convolutional Embedding to encode time information of each frame""" + + def __init__(self, dim: int, kernel_size: int, groups: Optional[int] = None): + super().__init__() + if (kernel_size % 2) == 0: + raise ValueError(f"Kernel size {kernel_size} is divisible by 2!") + + if groups is None: + groups = dim + + self.dw_conv1d = nn.Sequential( + nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2), nn.GELU() + ) + + def forward(self, x, mask=None): + """ + Args: + x: input tensor, shape (B, T, D) + + Return: + out: output tensor with the same shape (B, T, D) + """ + + if mask is not None: + mask = mask[..., None] + x = x.masked_fill(mask, 0.0) + + x = einops.rearrange(x, 'b n c -> b c n') + x = self.dw_conv1d(x) + out = einops.rearrange(x, 'b c n -> b n c') + + if mask is not None: + out = out.masked_fill(mask, 0.0) + + return out + + +class RMSNorm(Module): + """The Root Mean Square Layer Normalization + + References: + - Zhang et al., Root Mean Square Layer Normalization, 2019 + """ + + def __init__(self, dim): + super().__init__() + self.scale = dim**0.5 + self.gamma = nn.Parameter(torch.ones(dim)) + + def forward(self, x: torch.Tensor): + return F.normalize(x, dim=-1) * self.scale * self.gamma + + +class AdaptiveRMSNorm(Module): + """ + Adaptive Root Mean Square Layer Normalization given a conditional embedding. + This enables the model to consider the conditional input during normalization. 
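A minimal functional sketch of what this layer computes; unlike the module below, which initializes the projections so the layer starts out as plain RMSNorm, default Linear initialization is used here:

import torch
import torch.nn.functional as F

B, T, D, cond_dim = 2, 5, 8, 16
x = torch.randn(B, T, D)
cond = torch.randn(B, cond_dim)                      # e.g. the encoded time step
to_gamma = torch.nn.Linear(cond_dim, D)
to_beta = torch.nn.Linear(cond_dim, D)

normed = F.normalize(x, dim=-1) * D ** 0.5           # RMS-style normalization
gamma = to_gamma(cond).unsqueeze(1)                  # (B, 1, D), broadcast over time
beta = to_beta(cond).unsqueeze(1)
out = normed * gamma + beta                          # condition-dependent scale and shift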
+ """ + + def __init__(self, dim: int, cond_dim: Optional[int] = None): + super().__init__() + if cond_dim is None: + cond_dim = dim + self.scale = dim**0.5 + + self.to_gamma = nn.Linear(cond_dim, dim) + self.to_beta = nn.Linear(cond_dim, dim) + + # init adaptive normalization to identity + + nn.init.zeros_(self.to_gamma.weight) + nn.init.ones_(self.to_gamma.bias) + + nn.init.zeros_(self.to_beta.weight) + nn.init.zeros_(self.to_beta.bias) + + def forward(self, x: torch.Tensor, cond: torch.Tensor): + normed = F.normalize(x, dim=-1) * self.scale + + gamma, beta = self.to_gamma(cond), self.to_beta(cond) + gamma = einops.rearrange(gamma, 'B D -> B 1 D') + beta = einops.rearrange(beta, 'B D -> B 1 D') + + return normed * gamma + beta + + +class GEGLU(Module): + """The GeGLU activation implementation""" + + def forward(self, x: torch.Tensor): + x, gate = x.chunk(2, dim=-1) + return F.gelu(gate) * x + + +def get_feedforward_layer(dim: int, mult: int = 4, dropout: float = 0.0): + """ + Return a Feed-Forward layer for the Transformer Layer. + GeGLU activation is used in this FF layer + """ + dim_inner = int(dim * mult * 2 / 3) + return nn.Sequential(nn.Linear(dim, dim_inner * 2), GEGLU(), nn.Dropout(dropout), nn.Linear(dim_inner, dim)) + + +class TransformerUNet(NeuralModule): + """ + Implementation of the transformer Encoder Model with U-Net structure used in + VoiceBox and AudioBox + + References: + Le et al., Voicebox: Text-Guided Multilingual Universal Speech Generation at Scale, 2023 + Vyas et al., Audiobox: Unified Audio Generation with Natural Language Prompts, 2023 + """ + + def __init__( + self, + dim: int, + depth: int, + heads: int = 8, + ff_mult: int = 4, + attn_dropout: float = 0.0, + ff_dropout: float = 0.0, + max_positions: int = 6000, + adaptive_rmsnorm: bool = False, + adaptive_rmsnorm_cond_dim_in: Optional[int] = None, + use_unet_skip_connection: bool = True, + skip_connect_scale: Optional[int] = None, + ): + """ + Args: + dim: Embedding dimension + depth: Number of Transformer Encoder Layers + heads: Number of heads in MHA + ff_mult: The multiplier for the feedforward dimension (ff_dim = ff_mult * dim) + attn_dropout: dropout rate for the MHA layer + ff_dropout: droupout rate for the feedforward layer + max_positions: The maximum time length of the input during training and inference + adaptive_rmsnorm: Whether to use AdaptiveRMS layer. + Set to True if the model has a conditional embedding in forward() + adaptive_rms_cond_dim_in: Dimension of the conditional embedding + use_unet_skip_connection: Whether to use U-Net or not + skip_connect_scale: The scale of the U-Net connection. 
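Given these arguments, a small usage sketch (shapes follow the neural types declared further below; the argument values are arbitrary):

import torch
from nemo.collections.audio.parts.submodules.transformerunet import TransformerUNet

model = TransformerUNet(dim=256, depth=4, heads=8, adaptive_rmsnorm=True, adaptive_rmsnorm_cond_dim_in=1024)
x = torch.randn(2, 100, 256)                        # (B, T, D)
time_emb = torch.randn(2, 1024)                     # conditional embedding, e.g. encoded time
padding = torch.zeros(2, 100, dtype=torch.bool)     # True marks padded frames
out = model(x=x, key_padding_mask=padding, adaptive_rmsnorm_cond=time_emb)  # (B, T, D)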
+        """
+        super().__init__()
+        if (depth % 2) != 0:
+            raise ValueError(f"Number of layers {depth} is not divisible by 2!")
+        self.layers = nn.ModuleList([])
+        self.init_alibi(max_positions=max_positions, heads=heads)
+
+        if adaptive_rmsnorm:
+            rmsnorm_class = partial(AdaptiveRMSNorm, cond_dim=adaptive_rmsnorm_cond_dim_in)
+        else:
+            rmsnorm_class = RMSNorm
+
+        if skip_connect_scale is None:
+            self.skip_connect_scale = 2**-0.5
+        else:
+            self.skip_connect_scale = skip_connect_scale
+
+        for ind in range(depth):
+            layer = ind + 1
+            has_skip = use_unet_skip_connection and layer > (depth // 2)
+
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        nn.Linear(dim * 2, dim) if has_skip else None,
+                        rmsnorm_class(dim=dim),
+                        nn.MultiheadAttention(
+                            embed_dim=dim,
+                            num_heads=heads,
+                            dropout=attn_dropout,
+                            batch_first=True,
+                        ),
+                        rmsnorm_class(dim=dim),
+                        get_feedforward_layer(dim=dim, mult=ff_mult, dropout=ff_dropout),
+                    ]
+                )
+            )
+
+        self.final_norm = RMSNorm(dim)
+
+        logging.debug('Initialized %s with', self.__class__.__name__)
+        logging.debug('\tembedding dim: %s', dim)
+        logging.debug('\tNumber of Layers: %s', depth)
+        logging.debug('\tfeedforward dim: %s', dim * ff_mult)
+        logging.debug('\tnumber of heads: %s', heads)
+        logging.debug('\tDropout rate of MHA: %s', attn_dropout)
+        logging.debug('\tDropout rate of FF: %s', ff_dropout)
+        logging.debug('\tmaximum time length: %s', max_positions)
+        logging.debug('\tuse AdaptiveRMS: %s', adaptive_rmsnorm)
+        logging.debug('\tConditional dim: %s', adaptive_rmsnorm_cond_dim_in)
+        logging.debug('\tUse UNet connection: %s', use_unet_skip_connection)
+        logging.debug('\tskip connect scale: %s', self.skip_connect_scale)
+
+    def init_alibi(
+        self,
+        max_positions: int,
+        heads: int,
+    ):
+        """Initialize the Alibi bias parameters
+
+        References:
+            - Press et al., Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation, 2021
+        """
+
+        def get_slopes(n):
+            ratio = 2 ** (-8 / n)
+            return ratio ** torch.arange(1, n + 1)
+
+        if not math.log2(heads).is_integer():
+            logging.warning(
+                "It is recommended to set the number of attention heads to a power of 2 for the Alibi bias!"
+            )
+            logging.warning(f"Current value of heads: {heads}")
+
+        self.slopes = nn.Parameter(einops.rearrange(get_slopes(heads), "B -> B 1 1"))
+
+        pos_matrix = (
+            -1 * torch.abs(torch.arange(max_positions).unsqueeze(0) - torch.arange(max_positions).unsqueeze(1)).float()
+        )
+        pos_matrix = einops.rearrange(pos_matrix, "T1 T2 -> 1 T1 T2")
+        self.register_buffer('pos_matrix', pos_matrix, persistent=False)
+
+    @property
+    def input_types(self) -> Dict[str, NeuralType]:
+        """Returns definitions of module input ports."""
+        return {
+            "x": NeuralType(('B', 'T', 'D'), FloatType()),
+            "key_padding_mask": NeuralType(('B', 'T'), BoolType(), optional=True),
+            "adaptive_rmsnorm_cond": NeuralType(('B', 'D'), FloatType(), optional=True),
+        }
+
+    @property
+    def output_types(self) -> Dict[str, NeuralType]:
+        """Returns definitions of module output ports."""
+        return {
+            "output": NeuralType(('B', 'T', 'D'), FloatType()),
+        }
+
+    @typecheck()
+    def forward(self, x, key_padding_mask: Optional[torch.Tensor] = None, adaptive_rmsnorm_cond=None):
+        """Forward pass of the model.
+
+        Args:
+            x: input tensor, shape (B, T, D)
+            key_padding_mask: mask tensor indicating the padding parts, shape (B, T)
+            adaptive_rmsnorm_cond: conditional input for the model, shape (B, D)
+        """
+        batch_size, seq_len, *_ = x.shape
+        skip_connects = []
+        alibi_bias = self.get_alibi_bias(batch_size=batch_size, seq_len=seq_len)
+
+        rmsnorm_kwargs = dict()
+        if adaptive_rmsnorm_cond is not None:
+            rmsnorm_kwargs = dict(cond=adaptive_rmsnorm_cond)
+
+        for skip_combiner, attn_prenorm, attn, ff_prenorm, ff in self.layers:
+
+            if skip_combiner is None:
+                skip_connects.append(x)
+            else:
+                skip_connect = skip_connects.pop() * self.skip_connect_scale
+                x = torch.cat((x, skip_connect), dim=-1)
+                x = skip_combiner(x)
+
+            attn_input = attn_prenorm(x, **rmsnorm_kwargs)
+            if key_padding_mask is not None:
+                # Since alibi_bias is a float-type attn_mask, the key_padding_mask needs to be float-type as well.
+                float_key_padding_mask = key_padding_mask.float()
+                float_key_padding_mask = float_key_padding_mask.masked_fill(key_padding_mask, float('-inf'))
+            else:
+                float_key_padding_mask = None
+
+            attn_output, _ = attn(
+                query=attn_input,
+                key=attn_input,
+                value=attn_input,
+                key_padding_mask=float_key_padding_mask,
+                need_weights=False,
+                attn_mask=alibi_bias,
+            )
+            x = x + attn_output
+
+            ff_input = ff_prenorm(x, **rmsnorm_kwargs)
+            x = ff(ff_input) + x
+
+        return self.final_norm(x)
+
+    def get_alibi_bias(self, batch_size: int, seq_len: int):
+        """
+        Return the alibi bias for the given batch size and sequence length
+        """
+        pos_matrix = self.pos_matrix[:, :seq_len, :seq_len]
+        alibi_bias = pos_matrix * self.slopes
+        alibi_bias = alibi_bias.repeat(batch_size, 1, 1)
+
+        return alibi_bias
+
+
+class SpectrogramTransformerUNet(NeuralModule):
+    """This model handles complex-valued inputs by stacking real and imaginary components.
+    Stacked tensor is processed using TransformerUNet and the output is projected to generate real
+    and imaginary components of the output channels.
+ + Convolutional Positional Embedding is applied for the input sequence + """ + + def __init__( + self, + in_channels: int = 1, + out_channels: int = 1, + freq_dim: int = 256, + dim: int = 1024, + depth: int = 24, + heads: int = 16, + ff_mult: int = 4, + ff_dropout: float = 0.0, + attn_dropout: float = 0.0, + max_positions: int = 6000, + time_hidden_dim: Optional[int] = None, + conv_pos_embed_kernel_size: int = 31, + conv_pos_embed_groups: Optional[int] = None, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + dim_in = freq_dim * in_channels * 2 + + if time_hidden_dim is None: + time_hidden_dim = dim * 4 + + self.proj_in = nn.Linear(dim_in, dim) + + self.sinu_pos_emb = nn.Sequential(LearnedSinusoidalPosEmb(dim), nn.Linear(dim, time_hidden_dim), nn.SiLU()) + + self.conv_embed = ConvPositionEmbed( + dim=dim, kernel_size=conv_pos_embed_kernel_size, groups=conv_pos_embed_groups + ) + + self.transformerunet = TransformerUNet( + dim=dim, + depth=depth, + heads=heads, + ff_mult=ff_mult, + ff_dropout=ff_dropout, + attn_dropout=attn_dropout, + max_positions=max_positions, + adaptive_rmsnorm=True, + adaptive_rmsnorm_cond_dim_in=time_hidden_dim, + use_unet_skip_connection=True, + ) + + # 2x the frequency dimension as the model operates in the complex-value domain + dim_out = freq_dim * out_channels * 2 + + self.proj_out = nn.Linear(dim, dim_out) + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tin_channels: %s', self.in_channels) + logging.debug('\tout_channels: %s', self.out_channels) + logging.debug('\tInput frequency dimension: %s', freq_dim) + + @property + def input_types(self) -> Dict[str, NeuralType]: + """Returns definitions of module output ports.""" + return { + "input": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), + "input_length": NeuralType(('B',), LengthsType(), optional=True), + "condition": NeuralType(('B',), FloatType(), optional=True), + } + + @property + def output_types(self) -> Dict[str, NeuralType]: + """Returns definitions of module output ports.""" + return { + "output": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), + "output_length": NeuralType(('B',), LengthsType(), optional=True), + } + + @staticmethod + def _get_key_padding_mask(input_length: torch.Tensor, max_length: int): + """ + Return the self_attention masking according to the input length. + 0 indicates the frame is in the valid range, while 1 indicates the frame is a padding frame. + Args: + input_length: shape (B) + max_length (int): The maximum length of the input sequence + + return: + key_padding_mask: shape (B, T) + """ + key_padding_mask = torch.arange(max_length).expand(len(input_length), max_length).to(input_length.device) + key_padding_mask = key_padding_mask >= input_length.unsqueeze(1) + return key_padding_mask + + @typecheck() + def forward(self, input, input_length=None, condition=None): + """Forward pass of the model. 
+ + Args: + input: input tensor, shape (B, C, D, T) + input_length: length of the valid time steps for each example in the batch, shape (B,) + condition: scalar condition (time) for the model, will be embedded using `self.time_embedding` + """ + # Stack real and imaginary components + B, C_in, D, T = input.shape + if C_in != self.in_channels: + raise RuntimeError(f'Unexpected input channel size {C_in}, expected {self.in_channels}') + + input_real_imag = torch.stack([input.real, input.imag], dim=2) + input = einops.rearrange(input_real_imag, 'B C RI D T -> B T (C RI D)') + + x = self.proj_in(input) + key_padding_mask = self._get_key_padding_mask(input_length, max_length=T) + x = self.conv_embed(x, mask=key_padding_mask) + x + + if condition is None: + raise NotImplementedError + + time_emb = self.sinu_pos_emb(condition) + + x = self.transformerunet(x=x, key_padding_mask=key_padding_mask, adaptive_rmsnorm_cond=time_emb) + + output = self.proj_out(x) + output = einops.rearrange(output, "B T (C RI D) -> B C D T RI", C=self.out_channels, RI=2, D=D) + output = torch.view_as_complex(output.contiguous()) + + return output, input_length diff --git a/nemo/collections/audio/parts/utils/callbacks.py b/nemo/collections/audio/parts/utils/callbacks.py new file mode 100644 index 000000000000..093d5a11f419 --- /dev/null +++ b/nemo/collections/audio/parts/utils/callbacks.py @@ -0,0 +1,177 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Type + +import einops +import torch +from pytorch_lightning import Callback, LightningModule, Trainer +from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.loggers.logger import Logger +from pytorch_lightning.loggers.wandb import WandbLogger + +from nemo.utils import logging +from nemo.utils.decorators import experimental + +HAVE_WANDB = True +try: + import wandb +except ModuleNotFoundError: + HAVE_WANDB = False + + +def _get_logger(loggers: List[Logger], logger_type: Type[Logger]): + for logger in loggers: + if isinstance(logger, logger_type): + if hasattr(logger, "experiment"): + return logger.experiment + else: + return logger + raise ValueError(f"Could not find {logger_type} logger in {loggers}.") + + +@experimental +class SpeechEnhancementLoggingCallback(Callback): + """ + Callback which can log artifacts (eg. model predictions, graphs) to local disk, Tensorboard, and/or WandB. + + Args: + data_loader: Data to log artifacts for. + output_dir: Optional local directory. If provided, artifacts will be saved in output_dir. + loggers: Optional list of loggers to use if logging to tensorboard or wandb. + log_tensorboard: Whether to log artifacts to tensorboard. + log_wandb: Whether to log artifacts to WandB. 
+ """ + + def __init__( + self, + data_loader, + data_loader_idx: int, + loggers: Optional[List[Logger]] = None, + log_tensorboard: bool = False, + log_wandb: bool = False, + sample_rate: int = 16000, + max_utts: Optional[int] = None, + ): + self.data_loader = data_loader + self.data_loader_idx = data_loader_idx + self.loggers = loggers if loggers else [] + self.log_tensorboard = log_tensorboard + self.log_wandb = log_wandb + self.sample_rate = sample_rate + self.max_utts = max_utts + + if log_tensorboard: + logging.info('Creating tensorboard logger') + self.tensorboard_logger = _get_logger(self.loggers, TensorBoardLogger) + else: + logging.debug('Not using tensorbord logger') + self.tensorboard_logger = None + + if log_wandb: + if not HAVE_WANDB: + raise ValueError("Wandb not installed.") + logging.info('Creating wandb logger') + self.wandb_logger = _get_logger(self.loggers, WandbLogger) + else: + logging.debug('Not using wandb logger') + self.wandb_logger = None + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tlog_tensorboard: %s', self.log_tensorboard) + logging.debug('\tlog_wandb: %s', self.log_wandb) + + def _log_audio(self, audios: torch.Tensor, lengths: torch.Tensor, step: int, label: str = "input"): + + num_utts = audios.size(0) + for audio_idx in range(num_utts): + length = lengths[audio_idx] + if self.tensorboard_logger: + self.tensorboard_logger.add_audio( + tag=f"{label}_{audio_idx}", + snd_tensor=audios[audio_idx, :length], + global_step=step, + sample_rate=self.sample_rate, + ) + + if self.wandb_logger: + wandb_audio = ( + wandb.Audio(audios[audio_idx], sample_rate=self.sample_rate, caption=f"{label}_{audio_idx}"), + ) + self.wandb_logger.log({f"{label}_{audio_idx}": wandb_audio}) + + def on_validation_epoch_end(self, trainer: Trainer, model: LightningModule): + """Log artifacts at the end of an epoch.""" + epoch = 1 + model.current_epoch + output_signal_list = [] + output_length_list = [] + num_examples_uploaded = 0 + + logging.info(f"Logging processed speech for validation dataset {self.data_loader_idx}...") + for batch in self.data_loader: + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch.get('target_signal', input_signal.clone()) + else: + input_signal, input_length, target_signal, _ = batch + + if self.max_utts is None: + num_examples = input_signal.size(0) # batch size + do_upload = True + else: + do_upload = num_examples_uploaded < self.max_utts + num_examples = min(self.max_utts - num_examples_uploaded, input_signal.size(0)) + num_examples_uploaded += num_examples + + if do_upload: + # Only pick the required numbers of speech to the logger + input_signal = input_signal[:num_examples, ...] + target_signal = target_signal[:num_examples, ...] 
+ input_length = input_length[:num_examples] + + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = einops.rearrange(input_signal, 'B T -> B 1 T') + if target_signal.ndim == 2: + target_signal = einops.rearrange(target_signal, 'B T -> B 1 T') + + input_signal = input_signal.to(model.device) + input_length = input_length.to(model.device) + + output_signal, output_length = model(input_signal=input_signal, input_length=input_length) + output_signal_list.append(output_signal.to(target_signal.device)) + output_length_list.append(output_length.to(target_signal.device)) + + if len(output_signal_list) == 0: + logging.debug('List are empty, no artifacts to log at epoch %d.', epoch) + return + + output_signals = torch.concat(output_signal_list, dim=0) + output_lengths = torch.concat(output_length_list, dim=0) + if output_signals.size(1) != 1: + logging.error( + f"Currently only supports single-channel audio! Current output shape: {output_signals.shape}" + ) + raise NotImplementedError + + output_signals = einops.rearrange(output_signals, "B 1 T -> B T") + + self._log_audio( + audios=output_signals, + lengths=output_lengths, + step=model.global_step, + label=f"dataloader_{self.data_loader_idx}_processed", + ) From 8d9cfee52bff94459ff8cd6eb88c9c8d0f099f7c Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 26 Aug 2024 00:10:56 -0700 Subject: [PATCH 047/664] Revert torchrun fix for model import (#10251) Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/io/mixin.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index eff4cd9434ce..e249e2e318b6 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -300,13 +300,8 @@ def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Pa """ connector = self._get_connector(path) ckpt_path: Path = connector.local_path(base_path=base_path) - # If already in multiproc environment (e.g. 
due to torchrun invocation) run only on RANK = 0 - from nemo.utils.get_rank import is_global_rank_zero - - if is_global_rank_zero(): - ckpt_path = connector(ckpt_path, overwrite=overwrite) - connector.on_import_ckpt(self) - + ckpt_path = connector(ckpt_path, overwrite=overwrite) + connector.on_import_ckpt(self) return ckpt_path @classmethod From 642c97a1595d83d6bab15646a04a25f54ec20057 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Mon, 26 Aug 2024 18:27:00 +0200 Subject: [PATCH 048/664] [NeMo-UX[ Move nemotron imports inline (#10255) * Move nemotron transformers + tokenizer imports inline to reduce number of required deps Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: marcromeyn --- nemo/collections/llm/gpt/model/nemotron.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/nemo/collections/llm/gpt/model/nemotron.py b/nemo/collections/llm/gpt/model/nemotron.py index dd659f7eedf7..d946e5f48cce 100644 --- a/nemo/collections/llm/gpt/model/nemotron.py +++ b/nemo/collections/llm/gpt/model/nemotron.py @@ -4,16 +4,17 @@ import torch from torch import nn -from transformers import NemotronConfig as HFNemotronConfig -from transformers import NemotronForCausalLM -from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.llm.fn.activation import squared_relu from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config from nemo.lightning import OptimizerModule, io, teardown if TYPE_CHECKING: + from transformers import NemotronConfig as HFNemotronConfig + from transformers import NemotronForCausalLM + + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -123,6 +124,8 @@ def init(self) -> NemotronModel: return NemotronModel(self.config, tokenizer=self.tokenizer) def apply(self, output_path: Path) -> Path: + from transformers import NemotronForCausalLM + source = NemotronForCausalLM.from_pretrained(str(self)) target = self.init() trainer = self.nemo_setup(target) @@ -155,10 +158,14 @@ def convert_state(self, source, target): @property def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + return AutoTokenizer(str(self)) @property def config(self) -> NemotronConfig: + from transformers import NemotronConfig as HFNemotronConfig + source = HFNemotronConfig.from_pretrained(str(self)) def make_vocab_size_divisible_by(vocab_size): @@ -224,6 +231,8 @@ def tokenizer(self): @property def config(self) -> "HFNemotronConfig": + from transformers import NemotronConfig as HFNemotronConfig + source: NemotronConfig = io.load_context(str(self)).model.config return HFNemotronConfig( From 8210e9ce735136e22a64834ca8cbab2b160a519d Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 26 Aug 2024 09:36:33 -0700 Subject: [PATCH 049/664] Wrap CPU model init with megatron_lazy_init_context (#10219) * Wrap CPU model init with megatron_lazy_init_context Signed-off-by: Alexandros Koumparoulis * Cleanup checkpoint-dir if saving fails Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- 
nemo/lightning/io/connector.py | 3 ++- nemo/lightning/io/pl.py | 23 ++++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 69368599682e..512f3bc4f12e 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -145,6 +145,7 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = pl.Trainer: The trainer configured with the model and strategy. """ from nemo.lightning import MegatronStrategy, Trainer + from nemo.lightning._strategy_lib import megatron_lazy_init_context _trainer = trainer or Trainer( devices=1, accelerator="cpu", strategy=MegatronStrategy(store_optimizer_states=False) @@ -155,7 +156,7 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = if not model.state_dict(): _trainer.strategy.lazy_init = True - with _trainer.init_module(): + with _trainer.init_module(), megatron_lazy_init_context(model.config): model.configure_model() return _trainer diff --git a/nemo/lightning/io/pl.py b/nemo/lightning/io/pl.py index d0749fbeead7..f43d24792c1a 100644 --- a/nemo/lightning/io/pl.py +++ b/nemo/lightning/io/pl.py @@ -126,13 +126,22 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio validate_sharding_integrity = not (self.validated_consistency and self.assume_constant_structure) self.validated_consistency = True - return dist_checkpointing.save( - sharded_state_dict=checkpoint, - checkpoint_dir=checkpoint_dir, - sharded_strategy=self.save_sharded_strategy, - validate_access_integrity=validate_sharding_integrity, - async_sharded_save=self.async_save, - ) + + try: + return dist_checkpointing.save( + sharded_state_dict=checkpoint, + checkpoint_dir=checkpoint_dir, + sharded_strategy=self.save_sharded_strategy, + validate_access_integrity=validate_sharding_integrity, + async_sharded_save=self.async_save, + ) + except: + logging.error(f"Failed to save checkpoint to {checkpoint_dir}") + # Do cleanup. + import shutil + + shutil.rmtree(checkpoint_dir) + raise @override def load_checkpoint( From ea8f49b2f48c84517bf30faf9ea3fabd4c55eba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 27 Aug 2024 00:14:13 +0200 Subject: [PATCH 050/664] Bump `Dockerfile.ci` (2024-08-22) (#10227) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 124bcff ! 
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * fix bert flags Signed-off-by: Oliver Koenig --------- Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Oliver Koenig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> --- .github/workflows/cicd-main.yml | 12 ++++++------ Dockerfile.ci | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a086a493f683..396ef03bd661 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -2061,7 +2061,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ @@ -2091,7 +2091,7 @@ jobs: model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ @@ -2128,7 +2128,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ @@ -2159,7 +2159,7 @@ jobs: model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ @@ -2199,7 +2199,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ @@ -2229,7 +2229,7 @@ jobs: model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ diff --git a/Dockerfile.ci b/Dockerfile.ci index 38b82a288a2b..161671bf5a5a 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -34,7 +34,7 @@ WORKDIR /workspace # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG 
MODELOPT_VERSION=0.15.0 -ARG MCORE_TAG=2fd6e2b74efca73a1f2d27b89bb5419384b4d3bf +ARG MCORE_TAG=124bcff2a8153eccea4d7d0e4df5c5562aab50b9 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ From 69973f925c507df2a879e03def9b9e6139809f34 Mon Sep 17 00:00:00 2001 From: Slyne Deng Date: Mon, 26 Aug 2024 16:35:42 -0700 Subject: [PATCH 051/664] salm export trtllm (#10245) Signed-off-by: slyne deng Co-authored-by: slyne deng --- .../multimodal/speech_llm/export/README.md | 83 +++++ .../speech_llm/export/conf/salm_export.yaml | 16 + .../speech_llm/export/export_salm.py | 39 +++ .../speech_llm/export/extract_salm_weights.py | 204 ++++++++++++ nemo/deploy/multimodal/query_multimodal.py | 12 +- nemo/export/multimodal/build.py | 120 +++++++- nemo/export/multimodal/run.py | 291 +++++++++++++++++- nemo/export/tensorrt_mm_exporter.py | 58 +++- scripts/deploy/multimodal/deploy_triton.py | 15 +- 9 files changed, 810 insertions(+), 28 deletions(-) create mode 100644 examples/multimodal/speech_llm/export/README.md create mode 100644 examples/multimodal/speech_llm/export/conf/salm_export.yaml create mode 100644 examples/multimodal/speech_llm/export/export_salm.py create mode 100644 examples/multimodal/speech_llm/export/extract_salm_weights.py diff --git a/examples/multimodal/speech_llm/export/README.md b/examples/multimodal/speech_llm/export/README.md new file mode 100644 index 000000000000..05e44d112cce --- /dev/null +++ b/examples/multimodal/speech_llm/export/README.md @@ -0,0 +1,83 @@ +## Setup +In this part, we are going to export SALM model into TRTLLM. +First, let's download the [SALM nemo model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/speechllm_fc_llama2_7b/) from NVIDIA ngc. + +```bash +wget --content-disposition 'https://api.ngc.nvidia.com/v2/models/org/nvidia/team/nemo/speechllm_fc_llama2_7b/1.23.1/files?redirect=true&path=speechllm_fc_llama2_7b.nemo' -O speechllm_fc_llama2_7b.nemo +``` + +Then, we need to extract the different parts of SALM. +```bash +output=$PWD/output +python3 extract_salm_weights.py --model_file_path=speechllm_fc_llama2_7b.nemo --output_dir=$output +``` +It takes a while to run the above command. + +Under the `output` dir, you'll see: +``` +output + |___speechllm_fc_llama2_7b_lora.nemo + |___speechllm_fc_llama2_7b_perception + | |____model_config.yaml + | |____model_weights.ckpt + |___speechllm_fc_llama2_7b_llm.nemo + |___ xxx.tokenizer.model +``` + +After we get the lora nemo model and llm nemo model, we can merge the lora part into the llm by: +```bash +python /opt/NeMo/scripts/nlp_language_modeling/merge_lora_weights/merge.py \ + trainer.accelerator=gpu \ + tensor_model_parallel_size=1 \ + pipeline_model_parallel_size=1 \ + gpt_model_file=output/speechllm_fc_llama2_7b_llm.nemo \ + lora_model_path=output/speechllm_fc_llama2_7b_lora.nemo \ + merged_model_path=speechllm_fc_llama2_7b_llm_merged.nemo +``` + +Now we are able to export the engine by: +```bash +python3 export_salm.py \ + model.perception_model_path=output/speechllm_fc_llama2_7b_perception \ + model.llm_model_path=output/speechllm_fc_llama2_7b_llm_merged.nemo +``` + +You should be able to get the generated engines under `./salm` folder. 
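+
+If the export succeeds, the engine directory should look roughly like the sketch below (based on the exporter defaults added in this PR; exact file names may differ across versions):
+```
+salm
+ |___llm_engine
+ |___perception_engine
+ |    |____config.json
+ |    |____perception_encoder.engine
+ |    |____feature_extractor.ts
+```
+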
To run the engines, you may run: +```python +from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter + +output_dir = "/ws/salm" # the engine directory +trt_llm_exporter = TensorRTMMExporter(model_dir=output_dir, load_model=True, modality='audio') +input_text = "Q: what's the transcription of the audio? A:" +input_media = '/ws/data/test_audio.wav' +print(trt_llm_exporter.forward(input_text, input_media)) + +``` + +## Deploy +If you want to generate the engines and deploy them with Triton Inference Server, you may also run: + +```bash +python3 NeMo/scripts/deploy/multimodal/deploy_triton.py \ + --modality="audio" \ + --visual_checkpoint=NeMo/examples/multimodal/speech_llm/export/output/speechllm_fc_llama2_7b_perception \ + --llm_checkpoint=NeMo/examples/multimodal/speech_llm/export/output/speechllm_fc_llama2_7b_llm_merged.nemo \ + --llm_model_type="llama" \ + --model_type="salm" \ + --triton_model_name="salm" \ + --max_input_len=4096 \ + --max_output_len=256 \ + --max_multimodal_len=3072 \ + --triton_model_repository=/tmp/trt_model_dir/ +``` + +And on client side, you may run: +```bash +python3 NeMo/scripts/deploy/multimodal/query.py \ + --model_name="salm" \ + --model_type="salm" \ + --input_text="Q: what's the transcription of the audio? A:" \ + --input_media=/ws/data/test_audio.wav +``` + +For more details, please check `NeMo/scripts/deploy/multimodal/deploy_triton.py` and ` NeMo/scripts/deploy/multimodal/query.py`. \ No newline at end of file diff --git a/examples/multimodal/speech_llm/export/conf/salm_export.yaml b/examples/multimodal/speech_llm/export/conf/salm_export.yaml new file mode 100644 index 000000000000..54ab6e9180c5 --- /dev/null +++ b/examples/multimodal/speech_llm/export/conf/salm_export.yaml @@ -0,0 +1,16 @@ +name: speechllm_salm +infer: + output_dir: ./salm + max_batch_size: 1 + tensor_parallelism: 1 + max_input_len: 4096 + max_output_len: 256 + max_multimodal_len: 3072 + perception_max_batch_size: 1 + +model: + type: salm + precision: float16 + perception_model_path: /path/to/speechllm_llama2_7b_perception + llm_model_path: /path/to/speechllm_llama2_7b_llm.nemo + llm_model_type: llama diff --git a/examples/multimodal/speech_llm/export/export_salm.py b/examples/multimodal/speech_llm/export/export_salm.py new file mode 100644 index 000000000000..00500bf46f50 --- /dev/null +++ b/examples/multimodal/speech_llm/export/export_salm.py @@ -0,0 +1,39 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo.core.config import hydra_runner +from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter + + +@hydra_runner(config_path='conf', config_name='salm_export') +def main(cfg): + exporter = TensorRTMMExporter(model_dir=cfg.infer.output_dir, load_model=False, modality='audio') + exporter.export( + visual_checkpoint_path=cfg.model.perception_model_path, + llm_checkpoint_path=cfg.model.llm_model_path, + model_type=cfg.model.type, + llm_model_type=cfg.model.llm_model_type, + tensor_parallel_size=cfg.infer.tensor_parallelism, + max_input_len=cfg.infer.max_input_len, + max_output_len=cfg.infer.max_output_len, + vision_max_batch_size=cfg.infer.perception_max_batch_size, + max_batch_size=cfg.infer.max_batch_size, + max_multimodal_len=cfg.infer.max_multimodal_len, + dtype=cfg.model.precision, + load_model=False, + ) + + +if __name__ == '__main__': + main() diff --git a/examples/multimodal/speech_llm/export/extract_salm_weights.py b/examples/multimodal/speech_llm/export/extract_salm_weights.py new file mode 100644 index 000000000000..0698a411110e --- /dev/null +++ b/examples/multimodal/speech_llm/export/extract_salm_weights.py @@ -0,0 +1,204 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import argparse +import os +import tempfile + +import torch +from megatron.core import dist_checkpointing +from omegaconf import OmegaConf +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.multimodal.speech_llm.modules.perception_modules import AudioPerceptionModule +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper +from nemo.utils import logging +from nemo.utils.model_utils import inject_model_parallel_rank + + +def get_config_and_state_dict_from_nemo(filepath, map_location, output_dir, sharded_state_dict=None): + cwd = os.getcwd() + save_restore_connector = NLPSaveRestoreConnector() + + with tempfile.TemporaryDirectory() as tmpdir: + try: + if os.path.isfile(filepath): + save_restore_connector._unpack_nemo_file(path2file=filepath, out_folder=tmpdir) + else: + tmpdir = filepath + + os.chdir(tmpdir) + config_yaml = "model_config.yaml" + model_weights_ckpt = "model_weights.ckpt" + + # find file in tmpdir that endswith "tokenizer.model" + tokenizer = None + for file in os.listdir(tmpdir): + if file.endswith("tokenizer.model"): + tokenizer = file + break + if tokenizer is None: + raise ValueError(f"Tokenizer not found in {tmpdir}") + tokenizer_path = os.path.join(tmpdir, tokenizer) + # copy tokenizer_path to current directory + os.system(f"cp {tokenizer_path} {output_dir}") + tokenizer_path = os.path.join(output_dir, tokenizer) + + # load conf + with open(config_yaml) as f: + conf = OmegaConf.load(f) + + os.chdir(cwd) + model_weights = os.path.join(tmpdir, model_weights_ckpt) + model_weights = inject_model_parallel_rank(model_weights) + state_dict = save_restore_connector._load_state_dict_from_disk(model_weights, map_location=map_location) + + # distributed checkpointing + if state_dict is None and sharded_state_dict is not None: + checkpoint = dict(state_dict=sharded_state_dict) + tmp_model_weights_ckpt = os.path.join(tmpdir, save_restore_connector.model_weights_ckpt) + tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0] + assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.' 
+ checkpoint = dist_checkpointing.load( + sharded_state_dict=checkpoint, + checkpoint_dir=tmp_model_weights_dir, + ) + state_dict = checkpoint["state_dict"] + + conf.tokenizer.model = tokenizer_path + return conf, state_dict + finally: + os.chdir(cwd) + + +def get_llm_model_state_dict(state_dict, lora_model_state_dict): + llm_model_state_dict = {} + for key, value in state_dict.items(): + if key.startswith("model."): + if key not in lora_model_state_dict and value != None: + llm_model_state_dict[key] = value + return llm_model_state_dict + + +def get_lora_state_dict(state_dict): + lora_model_state_dict = {} + for key, value in state_dict.items(): + if "adapter_layer.lora" in key and value != None: + lora_model_state_dict[key] = value + return lora_model_state_dict + + +def get_perception_state_dict(state_dict): + perception_state_dict = {} + for key, value in state_dict.items(): + if key.startswith("perception."): + key = key.replace("perception.", "", 1) + perception_state_dict[key] = value + return perception_state_dict + + +def save_llm_model(state_dict, nemo_config, output_path): + if nemo_config.get('megatron_amp_O2', False): + keys = list(state_dict.keys()) + for key in keys: + state_dict[key.replace('model.', 'model.module.', 1)] = state_dict['state_dict'].pop(key) + + trainer = Trainer(accelerator='cpu', strategy=NLPDDPStrategy()) + model = load_state_dict_helper(MegatronGPTModel, nemo_config, trainer, state_dict) + model._save_restore_connector = NLPSaveRestoreConnector() + model.cfg.use_cpu_initialization = False + + model.save_to(output_path) + logging.info(f'llm model saved to: {output_path}') + + +def save_nemo_weights(state_dict, output_dir, config, save_nemo_model=True): + if not os.path.exists(output_dir): + os.mkdir(output_dir) + weight_file = os.path.join(output_dir, "model_weights.ckpt") + torch.save(state_dict, weight_file) + # convert config to yaml + config_file = os.path.join(output_dir, "model_config.yaml") + with open(config_file, "w") as f: + f.write(OmegaConf.to_yaml(config)) + + if save_nemo_model: + # create nemo file + nemo_model_name = f"{output_dir}.nemo" + nemo_path = os.path.join(output_dir, nemo_model_name) + # tar model_config.yaml and model_weights.ckpt + os.system(f"tar -C {output_dir} -cvf {nemo_path} model_config.yaml model_weights.ckpt") + # remove model_config.yaml and model_weights.ckpt + os.system(f"rm {config_file} {weight_file}") + # remove the empty directory + os.system(f"rmdir {output_dir}") + + +def separate_speechllm_model(model_file_path, output_dir, map_location="cuda:0"): + if not os.path.exists(output_dir): + os.mkdir(output_dir) + output_dir = os.path.abspath(output_dir) + + logging.info(f"Separating {model_file_path} into perception, lora, and llm model") + filepath = model_file_path + conf, state_dict = get_config_and_state_dict_from_nemo(filepath, map_location, output_dir) + + base_model_name = os.path.basename(filepath).split(".")[0] + + perception_state_dict = get_perception_state_dict(state_dict) + perception_model_dir = None + if perception_state_dict: + perception_model_dir = f"{base_model_name}_perception" + perception_model_dir = os.path.join(output_dir, perception_model_dir) + save_nemo_weights(perception_state_dict, perception_model_dir, conf.perception, save_nemo_model=False) + + # verify if the exported perception model is correct + perception = AudioPerceptionModule(cfg=conf.perception) + perception.load_state_dict(perception_state_dict) + perception.eval() + print(perception) + 
print(perception(input_signal=torch.randn(1, 1000), input_signal_length=torch.tensor([1000]))) + # absolute path of perception model + logging.info(f"Perception model saved to: {perception_model_dir}") + + lora_model_weights = get_lora_state_dict(state_dict) + lora_model_dir = None + if lora_model_weights: + lora_model_dir = f"{base_model_name}_lora" + lora_model_dir = os.path.join(output_dir, lora_model_dir) + save_nemo_weights(lora_model_weights, lora_model_dir, conf) + logging.info(f"Lora model saved to: {lora_model_dir}.nemo") + # hard code the target model for now + llm_model_weights = get_llm_model_state_dict(state_dict, lora_model_weights) + if llm_model_weights: + llm_model = f"{base_model_name}_llm.nemo" + llm_model = os.path.join(output_dir, llm_model) + conf.target = "nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel" + save_llm_model(llm_model_weights, conf, llm_model) + logging.info(f"LLM model saved to: {llm_model}") + + +# filepath = "/ws/speechllm_fc_llama2_7b.nemo" +# output_dir = "/ws/speechllm_fc_llama2_7b_separated" +# perception_model_dir, lora_model, llm_model = separate_speechllm_model(filepath, output_dir) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Separate speechllm model') + parser.add_argument('--model_file_path', type=str, help='Path to the speechllm model') + parser.add_argument('--output_dir', type=str, help='Output directory to save the separated models') + args = parser.parse_args() + separate_speechllm_model(args.model_file_path, args.output_dir) diff --git a/nemo/deploy/multimodal/query_multimodal.py b/nemo/deploy/multimodal/query_multimodal.py index 1c01c6861048..63e6a3e8c3a6 100644 --- a/nemo/deploy/multimodal/query_multimodal.py +++ b/nemo/deploy/multimodal/query_multimodal.py @@ -13,6 +13,7 @@ # limitations under the License. 
import numpy as np +import soundfile as sf from PIL import Image from nemo.deploy.utils import str_list2numpy @@ -71,6 +72,11 @@ def setup_media(self, input_media): elif self.model_type == "neva" or self.model_type == "vila": media = Image.open(input_media).convert('RGB') return np.expand_dims(np.array(media), axis=0) + elif self.model_type == "salm": + waveform, sample_rate = sf.read(input_media, dtype=np.float32) + input_signal = np.array([waveform], dtype=np.float32) + input_signal_length = np.array([[len(waveform)]], dtype=np.int32) + return {"input_signal": input_signal, "input_signal_length": input_signal_length} else: raise RuntimeError(f"Invalid model type {self.model_type}") @@ -105,8 +111,10 @@ def query( inputs = {"input_text": prompts} media = self.setup_media(input_media) - - inputs["input_media"] = np.repeat(media[np.newaxis, :, :, :, :], prompts.shape[0], axis=0) + if isinstance(media, dict): + inputs.update(media) + else: + inputs["input_media"] = np.repeat(media[np.newaxis, :, :, :, :], prompts.shape[0], axis=0) if batch_size is not None: inputs["batch_size"] = np.full(prompts.shape, batch_size, dtype=np.int_) diff --git a/nemo/export/multimodal/build.py b/nemo/export/multimodal/build.py index 8ee3fa1c05e7..53c598be47c6 100644 --- a/nemo/export/multimodal/build.py +++ b/nemo/export/multimodal/build.py @@ -23,9 +23,12 @@ import tensorrt as trt import torch import yaml +from omegaconf import OmegaConf from tensorrt_llm.builder import Builder from transformers import AutoModel +from nemo.collections.multimodal.speech_llm.modules.perception_modules import AudioPerceptionModule +from nemo.core.classes.common import typecheck from nemo.export.tensorrt_llm import TensorRTLLM from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_nemo_model @@ -76,6 +79,32 @@ def export_visual_wrapper_onnx( ) +def export_perception_wrapper_onnx( + perception_wrapper, + input, + output_dir, + input_names=['processed_signal', 'processed_signal_length'], + output_names=['encoded', 'encoded_length'], + dynamic_axes={ + 'processed_signal': {0: 'batch', 2: 'time'}, + 'processed_signal_length': {0: 'batch'}, + 'encoded': {0: 'batch', 1: 'time'}, + 'encoded_length': {0: 'batch'}, + }, +): + logger.log(trt.Logger.INFO, "Exporting onnx") + os.makedirs(f'{output_dir}/onnx', exist_ok=True) + torch.onnx.export( + perception_wrapper, + input, + f'{output_dir}/onnx/perception_encoder.onnx', + opset_version=17, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + ) + + def build_trt_engine( model_type, input_sizes, @@ -85,8 +114,8 @@ def build_trt_engine( image_size=None, num_frames=None, nemo_config=None, + part_name='visual_encoder', ): - part_name = 'visual_encoder' onnx_file = '%s/onnx/%s.onnx' % (output_dir, part_name) engine_file = '%s/%s.engine' % (output_dir, part_name) config_file = '%s/%s' % (output_dir, "config.json") @@ -131,6 +160,10 @@ def build_trt_engine( # input sizes can be a list of ints (e.g., [3, H, W]) when inputs are images, # or a list of three int lists (e.g., [[1, 1, 2700], [1, 500, 2700], [1, 4096, 2700]]). 
+ # or a list of three list of lists + # (e.g., [{input1: min_shape, input2: min_shape, }, \ + # {input1: opt_shape, input2: opt_shape}, \ + # {input1: max_shape, input2: max_shape}] ) assert isinstance(input_sizes, list), "input_sizes must be a list" if isinstance(input_sizes[0], int): logger.log(trt.Logger.INFO, f"Processed input sizes {input_sizes}") @@ -139,10 +172,23 @@ def build_trt_engine( elif len(input_sizes) == 3 and isinstance(input_sizes[0], list): min_size, opt_size, max_size = input_sizes logger.log(trt.Logger.INFO, f"Processed min/opt/max input sizes {min_size}/{opt_size}/{max_size}") + elif len(input_sizes) == 3 and isinstance(input_sizes[0], dict): + logger.log(trt.Logger.INFO, f"Processed min/opt/max input sizes {input_sizes}") else: raise ValueError(f"invalid input sizes: {input_sizes}") - profile.set_shape(inputT.name, [nMinBS, *min_size], [nOptBS, *opt_size], [nMaxBS, *max_size]) + if isinstance(input_sizes[0], dict): + for i in range(network.num_inputs): + inputT = network.get_input(i) + input_name = inputT.name + min_size = input_sizes[0][input_name] + opt_size = input_sizes[1][input_name] + max_size = input_sizes[2][input_name] + logger.log(trt.Logger.INFO, f"{input_name} min/opt/max input sizes {min_size}/{opt_size}/{max_size}") + profile.set_shape(input_name, min_size, opt_size, max_size) + else: + profile.set_shape(inputT.name, [nMinBS, *min_size], [nOptBS, *opt_size], [nMaxBS, *max_size]) + config.add_optimization_profile(profile) t0 = time() @@ -367,6 +413,76 @@ def forward(self, images): ) +def build_perception_engine( + model_dir: str, + perception_checkpoint_path: str, + model_type: str = "salm", + max_batch_size: int = 1, +): + assert model_type == "salm", f"Invalid model type {model_type}" + + def load_perception_model(perception_checkpoint_path): + weights = "model_weights.ckpt" + perception_state_dict = torch.load(os.path.join(perception_checkpoint_path, weights)) + config = "model_config.yaml" + config = OmegaConf.load(os.path.join(perception_checkpoint_path, config)) + perception = AudioPerceptionModule(cfg=config) + perception.load_state_dict(perception_state_dict) + perception.eval() + return perception + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + # load perception model + perception_model = load_perception_model(perception_checkpoint_path) + feature_extractor = perception_model.preprocessor + input_signal = torch.randn(1, 1000, dtype=torch.float32) + input_signal_length = torch.tensor([1000], dtype=torch.int32) + + processed_signal, processed_signal_length = feature_extractor( + input_signal=input_signal, length=input_signal_length + ) + processed_signal_length = processed_signal_length.to(torch.int32) + dump_path = model_dir + "/feature_extractor.ts" # dump the feature extractor as torchscript + feature_extractor.export(dump_path, (input_signal, input_signal_length)) + + class PerceptionWrapper(torch.nn.Module): + def __init__(self, encoder, modality_adapter, proj): + super().__init__() + self.encoder = encoder + self.modality_adapter = modality_adapter + self.proj = proj + + @typecheck.disable_checks() + def forward(self, processed_signal, processed_signal_length): + encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) + encoded, encoded_len = self.modality_adapter(audio_signal=encoded, length=encoded_len) + # b, c, t -> b, t, c + encoded = self.proj(encoded.transpose(1, 2)) + encoded_len = encoded_len.to(torch.int32) + return encoded, encoded_len + + perception = 
PerceptionWrapper(perception_model.encoder, perception_model.modality_adapter, perception_model.proj) + export_perception_wrapper_onnx(perception, (processed_signal, processed_signal_length), model_dir) + # export the onnx perception model to tensorrt engine + # 512 -> 5.12 sec, 3072 -> 30.72 sec + opt_batch_size = max(1, max_batch_size // 2) + shapes = [ + {"processed_signal": [1, 80, 64], "processed_signal_length": [1]}, + {"processed_signal": [opt_batch_size, 80, 512], "processed_signal_length": [opt_batch_size]}, + {"processed_signal": [max_batch_size, 80, 3072], "processed_signal_length": [max_batch_size]}, + ] + build_trt_engine( + model_type, + shapes, + model_dir, + max_batch_size, + dtype=torch.float16, + nemo_config=None, + part_name='perception_encoder', + ) + + def build_visual_engine( model_dir: str, visual_checkpoint_path: str, diff --git a/nemo/export/multimodal/run.py b/nemo/export/multimodal/run.py index 149df995c77a..2cde46ca41fa 100644 --- a/nemo/export/multimodal/run.py +++ b/nemo/export/multimodal/run.py @@ -25,6 +25,7 @@ import einops import numpy as np +import soundfile as sf import tensorrt as trt import tensorrt_llm import tensorrt_llm.profiler as profiler @@ -32,7 +33,7 @@ import yaml from PIL import Image from tensorrt_llm import logger -from tensorrt_llm._utils import str_dtype_to_trt +from tensorrt_llm._utils import str_dtype_to_trt, torch_dtype_to_trt from tensorrt_llm.runtime import ModelRunner, Session, TensorInfo from torch.nn import functional as F from torchvision import transforms @@ -54,7 +55,8 @@ def trt_dtype_to_torch(dtype): class MultimodalModelRunner: - def __init__(self, visual_engine_dir, llm_engine_dir): + def __init__(self, visual_engine_dir, llm_engine_dir, modality='vision'): + self.modality = modality self.runtime_rank = tensorrt_llm.mpi_rank() device_id = self.runtime_rank % torch.cuda.device_count() torch.cuda.set_device(device_id) @@ -68,13 +70,15 @@ def __init__(self, visual_engine_dir, llm_engine_dir): config = json.load(f) self.model_type = config['builder_config']['model_type'] self.vision_precision = config['builder_config']['precision'] + self.modality_precision = config['builder_config']['precision'] self.num_frames = config['builder_config'].get('num_frames', None) self.image_size = config['builder_config'].get('image_size', None) self.profiling_iterations = 20 - self.init_image_encoder(visual_engine_dir) + if modality == 'vision': + self.init_image_encoder(visual_engine_dir) self.init_tokenizer(llm_engine_dir) self.init_llm(llm_engine_dir) if self.model_type == 'lita' or self.model_type == 'vila' or self.model_type == 'vita': @@ -242,10 +246,10 @@ def insert_tokens_by_index(self, input_ids, num_frames): def preprocess(self, warmup, pre_prompt, post_prompt, image, attention_mask, batch_size): if not warmup: - profiler.start("Vision") + profiler.start(self.modality.capitalize()) if not warmup: - profiler.stop("Vision") + profiler.stop(self.modality.capitalize()) if self.model_type == 'vila': visual_features, visual_atts = self.get_visual_features(image, attention_mask) @@ -848,7 +852,7 @@ def print_result(self, input_text, output_text, batch_size, num_beams, run_profi if run_profiling: msec_per_batch = lambda name: 1000 * profiler.elapsed_time_in_sec(name) / self.profiling_iterations logger.info('Latencies per batch (msec)') - logger.info('TRT vision encoder: %.1f' % (msec_per_batch('Vision'))) + logger.info(f'TRT {self.modality} encoder: %.1f' % (msec_per_batch(self.modality.capitalize()))) logger.info('TRTLLM LLM generate: %.1f' 
% (msec_per_batch('LLM'))) logger.info('Multimodal generate: %.1f' % (msec_per_batch('Generate'))) @@ -864,3 +868,278 @@ def load_test_media(self, input_media): raise RuntimeError(f"Invalid model type {self.model_type}") return media + + +class SpeechllmModelRunner(MultimodalModelRunner): + def __init__(self, perception_engine_dir, llm_engine_dir, modality): + """ + perception_engine_dir: path to the perception engine directory + it should contain: + config.json nemo_config.yaml + perception_encoder.engine : tensorrt engine + feature_extractor.ts : torchscript model + llm_engine_dir: path to the LLM engine directory + """ + super().__init__(perception_engine_dir, llm_engine_dir, modality) + assert self.model_type == 'salm' + # init preprocessor + feature_extractor_path = os.path.join(perception_engine_dir, 'feature_extractor.ts') + self.feature_extractor = self.init_speech_preprocessor(feature_extractor_path) + self.init_modality_encoder(perception_engine_dir) + + def init_modality_encoder(self, engine_dir): + """ + Initialize the modality encoder session from the prebuilt engine directory + Args: + engine_dir: str, path to the engine directory + """ + # find file with .engine extension + engine_file = None + for file in os.listdir(engine_dir): + if file.endswith('.engine'): + engine_file = file + break + assert engine_file is not None, f"Engine file not found in {engine_dir}" + encoder_path = os.path.join(engine_dir, engine_file) + logger.info(f'Loading engine from {encoder_path}') + with open(encoder_path, 'rb') as f: + engine_buffer = f.read() + logger.info(f'Creating session from engine {encoder_path}') + self.modality_encoder_session = Session.from_serialized_engine(engine_buffer) + + def init_speech_preprocessor(self, feature_extractor_path): + feature_extractor = torch.jit.load(feature_extractor_path) + feature_extractor.eval() + return feature_extractor + + def process_audio(self, input_signal, input_signal_length): + """ + Args: + input_signal: audio signal in numpy array + input_signal_length: length of the audio signal in numpy array + + Returns: + processed_signal: torch.tensor [B, 80, T] + processed_signal_length [B] + """ + input_signal = torch.tensor(input_signal, dtype=torch.float32) + input_signal_length = torch.tensor(input_signal_length, dtype=torch.int32) + processed_signal, processed_signal_length = self.feature_extractor(input_signal, input_signal_length) + return processed_signal, processed_signal_length + + def setup_inputs(self, input_text, input_media, batch_size): + """ + Args: + input_text: str or List[str] or None + input_media: Tuple[np.array, np.array] + input_signal: audio signal in numpy array [b, -1] + input_signal_length: length of the audio signal in numpy array [b] + batch_size: int + + """ + input_signal, input_signal_length = input_media + processed_signal, processed_signal_length = self.process_audio(input_signal, input_signal_length) + processed_signal = processed_signal.to(self.device) + processed_signal_length = processed_signal_length.to(self.device) + if input_text is None: + input_text = "Q: what's the transcription of the audio? 
A:" + + if isinstance(input_text, str): + input_text = [input_text] * batch_size + + assert len(input_text) == batch_size + pre_prompt = [''] * batch_size + post_prompt = input_text + decoder_input_ids = None + attention_mask = None + return ( + input_text, + pre_prompt, + post_prompt, + processed_signal, + processed_signal_length, + decoder_input_ids, + attention_mask, + ) + + def load_test_media(self, input_media_path): + """ + Args: + input_media_path: str, path to the audio file + Returns: + input_signal: np.array [1, -1] + input_signal_length: np.array [1] + """ + waveform, sample_rate = sf.read(input_media_path, dtype=np.float32) + input_signal = np.array([waveform], dtype=np.float32) + input_signal_length = np.array([len(waveform)], dtype=np.int32) + return input_signal, input_signal_length + + def get_modality_encoder_features(self, modality_features, attention_mask): + """ + Do inference on the modality encoder engine + Args: + modality_features: dict {'input1': torch.tensor, 'input2': torch.tensor, ..} + attention_mask: None + Returns: + """ + + if attention_mask is not None: + modality_features['attention_mask'] = attention_mask + + tensor_info = [] + for key, tensor in modality_features.items(): + tensor_info.append(TensorInfo(key, torch_dtype_to_trt(tensor.dtype), tensor.shape)) + + output_info = self.modality_encoder_session.infer_shapes(tensor_info) + + outputs = { + t.name: torch.empty(tuple(t.shape), dtype=trt_dtype_to_torch(t.dtype), device=self.device) + for t in output_info + } + + ok = self.modality_encoder_session.run(modality_features, outputs, self.stream.cuda_stream) + assert ok, "Runtime execution failed for vision encoder session" + self.stream.synchronize() + + return outputs + + def preprocess(self, warmup, pre_prompt, post_prompt, processed_features, attention_mask, batch_size): + """ + Args: + warmup: bool + pre_prompt: List[str] + post_prompt: List[str] + processed_features: Tuple[torch.tensor, torch.tensor] + processed_signal: torch.tensor [B, 80, T] + processed_signal_length: torch.tensor [B] + attention_mask: None + batch_size: int + Returns: + input_ids: torch.tensor [B, L] + input_lengths: torch.tensor [B] + ptuning_args: List[torch.tensor] + encoded_features: torch.tensor [B, L, D] + """ + if not warmup: + profiler.start(self.modality.capitalize()) + + if not warmup: + profiler.stop(self.modality.capitalize()) + + assert self.model_type == 'salm', f"Invalid model type {self.model_type}" + + processed_features = { + "processed_signal": processed_features[0], + "processed_signal_length": processed_features[1].to(torch.int32), + } + encoded_outputs = self.get_modality_encoder_features(processed_features, attention_mask) + encoded_features, encoded_length = encoded_outputs['encoded'], encoded_outputs['encoded_length'] + pre_input_ids = self.tokenizer(pre_prompt).input_ids + post_input_ids = self.tokenizer(post_prompt).input_ids + input_lengths = [] + input_ids = [] + encoded_length = encoded_length.cpu().numpy() + fake_id_start = self.model.vocab_size + for i in range(batch_size): + feat_len = encoded_length[i] + feat_fake_ids = np.arange(fake_id_start, fake_id_start + feat_len) + cur_input_ids = np.concatenate([pre_input_ids[i], feat_fake_ids, post_input_ids[i]]) + fake_id_start += feat_len + input_lengths.append(len(cur_input_ids)) + input_ids.append(cur_input_ids) + + max_length = max(input_lengths) + # convert input_ids to torch tensor with padding + input_ids = [ + np.pad(ids, (0, max_length - len(ids)), 'constant', 
constant_values=self.tokenizer.pad_token_id) + for ids in input_ids + ] + input_ids = torch.tensor(input_ids, dtype=torch.int32) + input_lengths = torch.tensor(input_lengths, dtype=torch.int32) + ptuning_args = self.ptuning_setup(encoded_features, input_ids, input_lengths) + + return input_ids, input_lengths, ptuning_args, encoded_features + + def run( + self, + input_text, + input_media=None, + max_new_tokens: int = 30, + batch_size: int = 1, + top_k: int = 1, + top_p: float = 0.0, + temperature: float = 1.0, + repetition_penalty: float = 1.0, + num_beams: int = 1, + run_profiling=False, + check_accuracy=False, + input_signal=None, + input_signal_length=None, + ): + """ + Args: + input_text: str or List[str] or None + input_media: Tuple[np.array, np.array] or None + input_signal: audio signal in numpy array [b, -1] + input_signal_length: length of the audio signal in numpy array [b] + max_new_tokens: int + batch_size: int + top_k: int + top_p: float + temperature: float + repetition_penalty: float + num_beams: int + run_profiling: bool + check_accuracy: bool + """ + if input_media is None: + assert input_signal is not None and input_signal_length is not None + input_media = (input_signal, input_signal_length) + + ( + input_text, + pre_prompt, + post_prompt, + processed_signal, + processed_signal_length, + decoder_input_ids, + attention_mask, + ) = self.setup_inputs(input_text, input_media, batch_size) + processed_media = (processed_signal, processed_signal_length) + + self.generate( + pre_prompt, + post_prompt, + processed_media, + decoder_input_ids, + max_new_tokens, + attention_mask=attention_mask, + warmup=True, + batch_size=batch_size, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + num_beams=num_beams, + ) + num_iters = self.profiling_iterations if run_profiling else 1 + for _ in range(num_iters): + output_text = self.generate( + pre_prompt, + post_prompt, + processed_media, + decoder_input_ids, + max_new_tokens, + attention_mask=attention_mask, + warmup=False, + batch_size=batch_size, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + num_beams=num_beams, + ) + if self.runtime_rank == 0: + self.print_result(input_text, output_text, batch_size, num_beams, run_profiling, check_accuracy) + return output_text diff --git a/nemo/export/tensorrt_mm_exporter.py b/nemo/export/tensorrt_mm_exporter.py index b0536a55f95f..d4da0ac34b1c 100644 --- a/nemo/export/tensorrt_mm_exporter.py +++ b/nemo/export/tensorrt_mm_exporter.py @@ -21,8 +21,8 @@ import wrapt from nemo.deploy import ITritonDeployable -from nemo.export.multimodal.build import build_trtllm_engine, build_visual_engine -from nemo.export.multimodal.run import MultimodalModelRunner +from nemo.export.multimodal.build import build_perception_engine, build_trtllm_engine, build_visual_engine +from nemo.export.multimodal.run import MultimodalModelRunner, SpeechllmModelRunner use_deploy = True try: @@ -74,9 +74,13 @@ def __init__( self, model_dir: str, load_model: bool = True, + modality: str = "vision", ): self.model_dir = model_dir self.runner = None + # vision modality is for image and video + assert modality in ["vision", "audio"] + self.modality = modality if load_model: self._load() @@ -128,8 +132,12 @@ def export( dtype=dtype, ) - visual_dir = os.path.join(self.model_dir, "visual_engine") - build_visual_engine(visual_dir, visual_checkpoint_path, model_type, vision_max_batch_size) + if model_type == "salm": + perception_dir = 
os.path.join(self.model_dir, "perception_engine") + build_perception_engine(perception_dir, visual_checkpoint_path, model_type, vision_max_batch_size) + else: + visual_dir = os.path.join(self.model_dir, "visual_engine") + build_visual_engine(visual_dir, visual_checkpoint_path, model_type, vision_max_batch_size) if load_model: self._load() @@ -164,19 +172,32 @@ def forward( num_beams, ) + def get_input_media_tensors(self): + if self.modality == "vision": + return [Tensor(name="input_media", shape=(-1, -1, -1, 3), dtype=np.uint8)] + elif self.modality == "audio": + return [ + Tensor(name="input_signal", shape=(-1,), dtype=np.single), + Tensor(name="input_signal_length", shape=(1,), dtype=np.intc), + ] + return [] + @property def get_triton_input(self): inputs = ( - Tensor(name="input_text", shape=(-1,), dtype=bytes), - Tensor(name="input_media", shape=(-1, -1, -1, 3), dtype=np.uint8), - Tensor(name="batch_size", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True), - Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True), - Tensor(name="repetition_penalty", shape=(-1,), dtype=np.single, optional=True), - Tensor(name="num_beams", shape=(-1,), dtype=np.int_, optional=True), + [Tensor(name="input_text", shape=(-1,), dtype=bytes)] + + self.get_input_media_tensors() + + [ + Tensor(name="batch_size", shape=(-1,), dtype=np.int_, optional=True), + Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True), + Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True), + Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True), + Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True), + Tensor(name="repetition_penalty", shape=(-1,), dtype=np.single, optional=True), + Tensor(name="num_beams", shape=(-1,), dtype=np.int_, optional=True), + ] ) + inputs = tuple(inputs) return inputs @property @@ -198,6 +219,9 @@ def triton_infer_fn(self, **inputs: np.ndarray): infer_input["input_image"] = ndarray2img(inputs.pop("input_media")[0])[0] elif self.runner.model_type in video_model_list: infer_input["input_image"] = inputs.pop("input_media")[0] + elif self.runner.model_type == "salm": + infer_input["input_signal"] = inputs.pop("input_signal") + infer_input["input_signal_length"] = inputs.pop("input_signal_length")[:, 0] if "batch_size" in inputs: infer_input["batch_size"] = inputs.pop("batch_size")[0][0] if "max_output_len" in inputs: @@ -223,5 +247,9 @@ def triton_infer_fn(self, **inputs: np.ndarray): def _load(self): llm_dir = os.path.join(self.model_dir, "llm_engine") - visual_dir = os.path.join(self.model_dir, "visual_engine") - self.runner = MultimodalModelRunner(visual_dir, llm_dir) + if self.modality == "vision": + visual_dir = os.path.join(self.model_dir, "visual_engine") + self.runner = MultimodalModelRunner(visual_dir, llm_dir, self.modality) + elif self.modality == "audio": + perception_dir = os.path.join(self.model_dir, "perception_engine") + self.runner = SpeechllmModelRunner(perception_dir, llm_dir, self.modality) diff --git a/scripts/deploy/multimodal/deploy_triton.py b/scripts/deploy/multimodal/deploy_triton.py index d0bf8f10548a..18463a3fc24a 100755 --- a/scripts/deploy/multimodal/deploy_triton.py +++ b/scripts/deploy/multimodal/deploy_triton.py @@ -35,6 +35,16 @@ def get_args(argv): 
formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=f"Deploy nemo models to Triton", ) + # default modality is vision, can be changed to audio + parser.add_argument( + "-mod", + "--modality", + type=str, + required=False, + default="vision", + choices=["vision", "audio"], + help="Modality of the model", + ) parser.add_argument("-vc", "--visual_checkpoint", type=str, help="Source .nemo file for visual model") parser.add_argument( "-lc", @@ -48,7 +58,7 @@ def get_args(argv): "--model_type", type=str, required=True, - choices=["neva", "video-neva", "lita", "vila", "vita"], + choices=["neva", "video-neva", "lita", "vila", "vita", "salm"], help="Type of the model that is supported.", ) parser.add_argument( @@ -123,8 +133,7 @@ def get_trt_deployable(args): raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") exporter = TensorRTMMExporter( - model_dir=trt_path, - load_model=(args.visual_checkpoint is None), + model_dir=trt_path, load_model=(args.visual_checkpoint is None), modality=args.modality ) if args.visual_checkpoint is not None: From 59a3e961091a39949c80487c72a5c90c46ad8b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 26 Aug 2024 20:45:55 -0700 Subject: [PATCH 052/664] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let'?= =?UTF-8?q?s=20bump=20`Dockerfile.ci`=20to=20ef85bc9=20!=20(#10250)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index 161671bf5a5a..3ef2ca64bee7 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -34,7 +34,7 @@ WORKDIR /workspace # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.15.0 -ARG MCORE_TAG=124bcff2a8153eccea4d7d0e4df5c5562aab50b9 +ARG MCORE_TAG=ef85bc94fc744aa5d398d12140f808023afbf78d ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ From 49f13fba620a9118731a2a7f4a68b7838ebfc6cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 26 Aug 2024 22:45:49 -0700 Subject: [PATCH 053/664] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let'?= =?UTF-8?q?s=20bump=20`Dockerfile.ci`=20to=2001ca03f=20!=20(#10266)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index 3ef2ca64bee7..e687c385cce8 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -34,7 +34,7 @@ WORKDIR /workspace # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.15.0 -ARG MCORE_TAG=ef85bc94fc744aa5d398d12140f808023afbf78d +ARG MCORE_TAG=01ca03f11e89f4f85682dcac647c2b913b25fcee ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ From 2f422dd572fb5e8b06049502afdea814d01557cd Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 27 Aug 2024 16:31:18 +0200 Subject: [PATCH 054/664] Load model in the target export 
precision by default in PTQ (#10267) * Load model in the target export precision by default Signed-off-by: Jan Lasek * Enable megatron_amp_O2=true to actually use half-precision Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek Signed-off-by: Jan Lasek --- examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml index f603ebb58eb7..62f0e452d3b5 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml @@ -17,13 +17,15 @@ trainer: num_nodes: 1 accelerator: gpu logger: false # logger provided by exp_manager - precision: bf16 # 16, 32, or bf16 + precision: ${export.dtype} # 16, bf16, or 32 enable_checkpointing: false model: tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 restore_from_path: llama2-7b-fp16.nemo # Nemo file path + precision: ${export.dtype} # Model weights data type + megatron_amp_O2: true # Enable Megatron O2-style half-precision ## Activation Checkpoint activations_checkpoint_granularity: null # 'selective' or 'full' @@ -42,7 +44,7 @@ export: decoder_type: llama # gptnext, gpt2, llama inference_tensor_parallel: 1 # Default using 1 TP for inference inference_pipeline_parallel: 1 # Default using 1 PP for inference - dtype: ${trainer.precision} # Default precision data type + dtype: 16 # Default precision data type for non-quantized layers: 16 or bf16 save_path: llama2-7b-${quantization.algorithm}.qnemo # Path where the quantized model will be saved compress: false # Whether save_path should be a tarball or a directory sample_output: true # Whether to run a sample prompt before saving From fd751621096647a46c12036acc24c0946797751d Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Tue, 27 Aug 2024 09:53:08 -0700 Subject: [PATCH 055/664] Add WandbPlugin, NsysPlugin and PreemptionPlugin to nemo.lightning.run.plugins (#10223) * Add WandbPlugin, NsysPlugin and PreemptionPlugin to nemo.lightning.run.plugins Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Remove duplicate Signed-off-by: Hemil Desai * Add entity to wandb logger Signed-off-by: Hemil Desai * Add documentation Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add warning Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * PR feedback Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add comments Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai --- nemo/collections/llm/recipes/log/default.py | 9 +- nemo/lightning/run/__init__.py | 0 nemo/lightning/run/plugins.py | 165 ++++++++++++++++++++ 3 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 nemo/lightning/run/__init__.py create mode 100644 nemo/lightning/run/plugins.py diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py index dc18565a0e06..4d5e9223b535 100644 --- a/nemo/collections/llm/recipes/log/default.py +++ b/nemo/collections/llm/recipes/log/default.py @@ -10,14 +10,19 @@ def tensorboard_logger(name: str, save_dir: str = "tb_logs") -> Config[TensorBoa return Config(TensorBoardLogger, save_dir=save_dir, name=name) -def 
wandb_logger(project: str, name: str) -> Config[WandbLogger]: - return Config( +def wandb_logger(project: str, name: str, entity: Optional[str] = None) -> Config[WandbLogger]: + cfg = Config( WandbLogger, project=project, name=name, config={}, ) + if entity: + cfg.entity = entity + + return cfg + def default_log( ckpt_dir: str, diff --git a/nemo/lightning/run/__init__.py b/nemo/lightning/run/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py new file mode 100644 index 000000000000..0f6a76d4799f --- /dev/null +++ b/nemo/lightning/run/plugins.py @@ -0,0 +1,165 @@ +import copy +import logging +import os +from dataclasses import dataclass, field +from pathlib import Path +from typing import Callable, Optional + +import nemo_run as run +import yaml +from nemo_run.core.serialization.yaml import YamlSerializer +from pytorch_lightning import Callback +from pytorch_lightning.loggers import WandbLogger + +from nemo.lightning.pytorch.callbacks import NsysCallback, PreemptionCallback +from nemo.utils import logging + +# This file contains plugins based on NeMo-Run's run.Plugin API. +# Plugins operate both on a configured task and an executor at the same time, and are specific to NeMo-Run. +# If you are adding functionality that goes directly into the Pytorch Lightning trainer, you may consider adding a callback instead of a plugin. + + +def _merge_callbacks(partial: run.Partial, callbacks: list[run.Config[Callback]]): + if hasattr(partial, "trainer"): + if hasattr(partial.trainer, "callbacks"): + for callback in callbacks: + if callback not in partial.trainer.callbacks: + partial.trainer.callbacks.append(callback) + else: + partial.trainer.callbacks = copy.deepcopy(callbacks) + + +@dataclass(kw_only=True) +class PreemptionPlugin(run.Plugin): + """ + A plugin for setting up Preemption callback and preemption signals. + + Args: + preempt_time (int): The time, in seconds, before the task's time limit at which the executor + will send a SIGTERM preemption signal. This allows tasks to be gracefully + stopped before reaching their time limit, reducing waste and + promoting fair resource usage. The default value is 300 seconds (5 minutes). + This is only supported for ``run.SlurmExecutor``. + callbacks (list[run.Config[Callback]]): A list of callback configurations that the plugin + will merge with the task's existing callbacks. + By default, the list includes NeMo's preemption callback. + """ + + preempt_time: int = 300 + callbacks: list[run.Config[Callback]] = field(default_factory=lambda: [run.Config(PreemptionCallback)]) + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + if isinstance(task, run.Script): + logging.warning( + f"The {self.__class__.__name__} will have no effect on the task as it's an instance of run.Script" + ) + return + + if isinstance(executor, run.SlurmExecutor): + # Sends a SIGTERM self.preempt_time seconds before hitting time limit + logging.info( + f"{self.__class__.__name__} will send a SIGTERM {self.preempt_time} seconds before the job's time limit for your Slurm executor." + ) + executor.signal = f"TERM@{self.preempt_time}" + + _merge_callbacks(task, callbacks=self.callbacks) + + +@dataclass(kw_only=True) +class NsysPlugin(run.Plugin): + """ + A plugin for nsys profiling. + + The NsysPlugin allows you to profile your run using nsys. + You can specify when to start and end the profiling, on which ranks to run the profiling, + and what to trace during profiling. 
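A minimal usage sketch for the three plugins in this patch (illustrative only, not part of the diff). It assumes a NeMo-Run entry point that accepts a `plugins` list, such as `run.run(task, executor, plugins=...)`; check the nemo_run documentation for the exact call. The project and run names below are placeholders.

    from functools import partial

    import nemo_run as run

    from nemo.collections.llm.recipes.log.default import wandb_logger
    from nemo.lightning.run.plugins import NsysPlugin, PreemptionPlugin, WandbPlugin

    plugins = [
        # Sends SIGTERM 5 minutes before the Slurm time limit and adds NeMo's PreemptionCallback.
        PreemptionPlugin(preempt_time=300),
        # Profiles steps 10-12 on rank 0 with the default "nvtx" and "cuda" traces.
        NsysPlugin(start_step=10, end_step=12, ranks=[0]),
        # Only takes effect when WANDB_API_KEY is set; `project` is pre-bound because
        # the plugin invokes logger_fn(name=...) with the run name only.
        WandbPlugin(name="my-run", logger_fn=partial(wandb_logger, project="my-project")),
    ]

    # task = a run.Partial carrying `trainer` and `log` configs; executor = a run.Executor
    # run.run(task, executor=executor, plugins=plugins)  # exact signature per nemo_run docs
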
+ + Args: + start_step (int): The step at which to start the nsys profiling. + end_step (int): The step at which to end the nsys profiling. + ranks (Optional[list[int]]): The ranks on which to run the nsys profiling. If not specified, + profiling will be run on rank 0. + nsys_trace (Optional[list[str]]): The events to trace during profiling. If not specified, + 'nvtx' and 'cuda' events will be traced. + """ + + start_step: int + end_step: int + ranks: Optional[list[int]] = None + nsys_trace: Optional[list[str]] = None + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + if isinstance(task, run.Partial): + nsys_callback = run.Config( + NsysCallback, + start_step=self.start_step, + end_step=self.end_step, + ranks=self.ranks or [0], + ) + callbacks: list[run.Config[Callback]] = [nsys_callback] # type: ignore + _merge_callbacks(task, callbacks=callbacks) + + launcher = executor.get_launcher() + launcher.nsys_profile = True + launcher.nsys_trace = self.nsys_trace or ["nvtx", "cuda"] + + +@dataclass(kw_only=True) +class WandbPlugin(run.Plugin): + """ + A plugin for setting up Weights & Biases. + + This plugin sets a ``WandbLogger`` to ``NeMoLogger``'s ``wandb`` arg, + which in turn initializes the Pytorch Lightning `WandbLogger `_. + + This plugin is only activated if the ``WANDB_API_KEY`` environment variable is set. + The ``WANDB_API_KEY`` environment variables will also be set in the executor's environment variables. + Follow https://docs.wandb.ai/quickstart to retrieve your ``WANDB_API_KEY``. + + If `log_task_config` is True, the plugin will log the task configuration as a config dictionary + to the Weights and Biases logger. + + Args: + name (str): The name for the Weights & Biases run. + logger_fn (Callable[..., run.Config[WandbLogger]]): A callable that returns a Config of ``WandbLogger`` + log_task_config (bool, optional): Whether to log the task configuration to the logger. + Defaults to True. + + Raises: + logging.warning: If the task is an instance of `run.Script`, as the plugin has no effect on such tasks. + """ + + name: str + logger_fn: Callable[..., run.Config[WandbLogger]] + log_task_config: bool = True + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + if isinstance(task, run.Script): + logging.warning( + f"The {self.__class__.__name__} will have no effect on the task as it's an instance of run.Script" + ) + return + + if "WANDB_API_KEY" in os.environ: + executor.env_vars["WANDB_API_KEY"] = os.environ["WANDB_API_KEY"] + + if hasattr(task, "log") and hasattr(task.log, "wandb"): + task.log.wandb = self.logger_fn(name=self.name) + if self.log_task_config: + partial_config = yaml.safe_load(YamlSerializer().serialize(task)) + partial_config["experiment"] = { + "id": self.experiment_id, + "task_name": self.name, + "executor": executor.info(), + "remote_directory": ( + os.path.join(executor.tunnel.job_dir, Path(executor.job_dir).name) + if isinstance(executor, run.SlurmExecutor) + else None + ), + "local_directory": executor.job_dir, + } + task.log.wandb.config = partial_config + else: + logging.warning( + f"The {self.__class__.__name__} will have no effect as WANDB_API_KEY environment variable is not set." 
+ ) From 38800cdad5d46565a156bc62ebcd2847a4f4d043 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Tue, 27 Aug 2024 14:51:20 -0700 Subject: [PATCH 056/664] [NeMo-UX] Handle absolute logger directories in nemo_logger (#10259) * handle absolute and relative logger directories Signed-off-by: Anna Shors * merge lines Signed-off-by: ashors1 --------- Signed-off-by: Anna Shors Signed-off-by: ashors1 --- nemo/lightning/nemo_logger.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index 6509c384f8cf..bae62f09593b 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -30,11 +30,10 @@ class NeMoLogger(IOMixin): log_global_rank_0_only (bool): Log only on global rank 0. files_to_copy (Optional[List[str]]): List of files to copy to log directory. update_logger_directory (bool): Whether to update logger directory to write to `exp_dir`. - If True, the `save_dir` passed to the logger will be treated as a relative path and - the logger will be reconfigured to write to `exp_dir / save_dir`. This ensures that - all output from an experiment is written to a common directory. If False, the logger's - save_dir will not be overwritten. This argument applies only to TensorBoardLogger and - WandbLogger instances. + If True, the `save_dir` passed to the logger will be reconfigured to write to `exp_dir / save_dir`. + This ensures that all output from an experiment is written to a common directory. + If False, the logger's save_dir will not be overwritten. + This argument applies only to TensorBoardLogger and WandbLogger instances. ckpt (Optional[ModelCheckpoint]): Model checkpoint callback. tensorboard: (Optional[TensorBoardLogger]): A PyTorch Lightning TensorBoardLogger instance to add to the trainer. @@ -158,7 +157,7 @@ def _setup_trainer_loggers(self, trainer, dir, version): for logger in trainer.loggers: if isinstance(logger, TensorBoardLogger): logger._version = version or "" - logger._root_dir = Path(dir) / logger.save_dir + logger._root_dir = Path(dir) / os.path.relpath(logger.save_dir) trainer.logger._name = self.name logging.warning( f'"update_logger_directory" is True. Overwriting tensorboard logger "save_dir" to {logger._root_dir}' From 57aa305ef60c67e72c10725402fabe267f1470bb Mon Sep 17 00:00:00 2001 From: Ming <111467530+Victor49152@users.noreply.github.com> Date: Tue, 27 Aug 2024 15:20:22 -0700 Subject: [PATCH 057/664] Add sdxl notebook (#10139) * Add sdxl notebook Signed-off-by: mingyuanm * Rename Signed-off-by: mingyuanm * final Update SDXL notebook Signed-off-by: mingyuanm --------- Signed-off-by: mingyuanm --- docs/source/multimodal/text2img/sd.rst | 2 +- tutorials/multimodal/SDXL Tutorial.ipynb | 253 +++++++++++++++++++++++ 2 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 tutorials/multimodal/SDXL Tutorial.ipynb diff --git a/docs/source/multimodal/text2img/sd.rst b/docs/source/multimodal/text2img/sd.rst index 6f5092f93f5f..549f13bbabf6 100644 --- a/docs/source/multimodal/text2img/sd.rst +++ b/docs/source/multimodal/text2img/sd.rst @@ -163,7 +163,7 @@ Optimization related configurations Training with precached latents ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Since the VAE and text encoder remain frozed during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. 
For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``. +Since the VAE and text encoder remain frozen during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``. Reference ----------- diff --git a/tutorials/multimodal/SDXL Tutorial.ipynb b/tutorials/multimodal/SDXL Tutorial.ipynb new file mode 100644 index 000000000000..92667100b405 --- /dev/null +++ b/tutorials/multimodal/SDXL Tutorial.ipynb @@ -0,0 +1,253 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "d874e23f-9631-48e0-b635-84e7280bf07b", + "metadata": {}, + "source": [ + "# SDXL Training / Inference Tutorial\n", + "\n", + "### Note:\n", + "Currently, this notebook must be run in a NeMo container (> 24.09) and open_clip_torch<=2.24.0. An example command to launch the container:\n", + "\n", + "```\n", + "docker run --gpus all -it --rm -v :/opt/NeMo -v :/datasets --shm-size=8g \\\n", + " -p 8888:8888 --ulimit memlock=-1 --ulimit \\\n", + " stack=67108864 \n", + "```\n", + "\n", + "\n", + "## Introduction\n", + "\n", + "This notebook illustrates how to train and perform inference using Stable Diffusion XL with the NeMo Toolkit. Despite differences in model configs, the training and inference procedure is similar as Stable Diffusion.\n", + "\n", + "The implementation of Stable Diffusion XL is based on [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://arxiv.org/abs/2307.01952).\n", + "\n", + "This tutorial will guide you through the following topics:\n", + "\n", + "1. Training a Stable Diffusion XL model.\n", + "2. Performing inference with the trained model.\n", + "\n", + "## Datasets\n", + "\n", + "Please refer to [Dataset Tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/multimodal/Multimodal%20Data%20Preparation.ipynb) for how to prepare a training dataset for Stable diffusion XL.\n", + "\n", + "For a pre-cached Stable Diffusion dataset, each webdataset tar file should, at a minimum, include the pickle files that store the pre-cached image and text features:\n", + "\n", + "```\n", + "t0_r0_0.tar\n", + "|---- 0000.pickle\n", + "|---- 0001.pickle\n", + "...\n", + "```\n", + "\n", + "For non-precached Stable Diffusion dataset, each webdataset tar file should contain the raw texts and corresponding images:\n", + "\n", + "```\n", + "t0_r0_0.tar\n", + "|---- 0000.jpg\n", + "|---- 0000.txt\n", + "|---- 0001.jpg\n", + "|---- 0001.txt\n", + "...\n", + "```\n", + "\n", + "## Encoders Preparation\n", + "\n", + "Depending on whether you precache the dataset, you might also need to first download the image and/or text encoders.\n", + "\n", + "### Option 1: Training on Non-Precached Dataset (Use Encoders During Training)\n", + "\n", + "#### A. Prepare VAE\n", + "To download the default VAE for Stable Diffusion:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "730cd137-0fce-4bab-8ac7-219e5c55faf2", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "! wget https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/vae/diffusion_pytorch_model.safetensors\n", + "! mkdir -p /sdxl_ckpts\n", + "! 
mv diffusion_pytorch_model.safetensors /sdxl_ckpts/vae.safetensors"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "fef8b245-7cee-4048-a9ec-3ada90432a89",
+   "metadata": {},
+   "source": [
+    "The above command will download the default VAE weights from HuggingFace and save it to `/sdxl_ckpts/vae.safetensors`.\n",
+    "\n",
+    "**Note**: if you want to customize the saved location, make sure it is also reflected in your training config.\n",
+    "#### B. Prepare Text Encoder\n",
+    "The text encoders used in Stable Diffusion XL will be automatically downloaded by the training script we provide.\n",
+    "\n",
+    "The type of text encoder used in the SDXL model conditioner can be found in `conditioner_config` in the predefined training configs:\n",
+    "\n",
+    "```\n",
+    " conditioner_config:\n",
+    "   _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner\n",
+    "   emb_models:\n",
+    "   - is_trainable: false\n",
+    "     input_key: captions\n",
+    "     ucg_rate: 0.1\n",
+    "     emb_model:\n",
+    "       _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder\n",
+    "       layer: hidden\n",
+    "       layer_idx: 11\n",
+    "   - is_trainable: false\n",
+    "     ucg_rate: 0.1\n",
+    "     input_key: captions\n",
+    "     emb_model:\n",
+    "       _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2\n",
+    "       arch: ViT-bigG-14\n",
+    "       version: laion2b_s39b_b160k\n",
+    "       freeze: true\n",
+    "       layer: penultimate\n",
+    "       always_return_pooled: true\n",
+    "       legacy: false\n",
+    "```"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "8854eb7a-e822-43f6-a1d5-12357049485a",
+   "metadata": {},
+   "source": [
+    "\n",
+    "### Option 2: Training on Precached Dataset (Training UNet Only)\n",
+    "\n",
+    "When using a precached dataset (please refer to the [Dataset Tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/multimodal/Multimodal%20Data%20Preparation.ipynb) for details), every text feature and image feature is stored as a key-value pair in a `.pickle` file:\n",
+    "\n",
+    "```\n",
+    "{\n",
+    "    image_key: torch.Tensor(),\n",
+    "    text_key: torch.Tensor(),\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "Make sure that in the training config, `cond_stage_key` is associated with `text_key` and `first_stage_key` is associated with `image_key`.\n",
+    "\n",
+    "We offer an example script to convert a dataset from `parquet` file to webdataset `tar` files at [parquet_conversion](https://github.com/NVIDIA/NeMo/blob/main/scripts/multimodal_dataset_conversion/parquet_conversion.py). Three different modes of precached training are provided:\n",
+    "\n",
+    "1. No Caching: VAE and Text encoders are loaded during training\n",
+    "2. Text only: Only text features are loaded from dataset during training\n",
+    "3. Both: Both image and text features are loaded from dataset during training\n",
+    "\n",
+    "In each mode, the non-cached components should be saved in their raw format in the tar files, while the cached components should be saved as torch.Tensor()."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "5762427b-f60c-4dfd-8318-e55771b25354",
+   "metadata": {},
+   "source": [
+    "## Model Config Setup\n",
+    "\n",
+    "Now we will begin setting up the config file needed for Stable Diffusion training. We will use [sd_train.yaml](https://github.com/NVIDIA/NeMo/blob/main/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base_train.yaml) as the template.\n",
+    "\n",
+    "1. 
Modify `model.data.train.dataset_path` so that it has all the webdataset info files you want to train on\n", + "2. Modify `model.data.webdataset.local_root_path` to point to your dataset path\n", + "3. Make sure VAE path `model.first_stage_config.from_pretrained` is adjusted if using non-precached dataset\n", + "4. Make sure the `model.precache mode` is set properly with the dataset you prepared, as detailed above.\n", + "5. Configure `exp_manager.exp_dir` for experiment save directory\n", + "6. Configure `exp_manager.wandb_logger_kwargs` and/or `exp_manager.create_tensorboard_logger` if needed" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "70f858b3-f7d5-4678-b380-80582337bc23", + "metadata": {}, + "source": [ + "**Note**: Please refer to NeMo Toolkit Developer Guide's Stable Diffusion page for more details on in-depth customizations, including all available optimizations.\n", + "\n", + "## Training\n", + "\n", + "Once everything is set up, training stable diffusion is as simple as running:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "589e3a14-c881-4a56-b2bd-370653059dfc", + "metadata": {}, + "outputs": [], + "source": "! torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_train.py trainer.max_steps=100 model.data.train.dataset_path=/path/to/wdinfo.pkl model.data.webdataset.local_root_path=/path/to/dataset trainer.devices=1 trainer.num_nodes=1 model.micro_batch_size=1 model.global_batch_size=1 model.first_stage_config.from_pretrained=/sdxl_ckpts/vae.safetensors model.fsdp=False" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "892d72dd-c4d7-4ca4-a948-168e187af65c", + "metadata": {}, + "source": [ + "Intermediate checkpoints (during training) and final checkpoint will be saved to `exp_manager.exp_dir` folder. Note that here we use synthetic data for demo purpose." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "087c8b9a-92c3-43d3-86a3-bf7e848dfbd2", + "metadata": {}, + "source": [ + "## Inference\n", + "\n", + "Stable Diffusion XL inference needs a trained NeMo Stable Diffusion checkpoint, along with both the image encoder (VAE) and text encoder (CLIP). The checkpoint can be either a fully trained `.nemo` checkpoint or an intermediate checkpoint from training (typically in `.ckpt` format). \n", + "\n", + "### Inference Config Setup\n", + "\n", + "Now we will begin setting up the config file needed for Stable Diffusion inference. We will use [sd_xl_infer_v2.yaml](https://github.com/NVIDIA/NeMo/blob/main/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_infer_v2.yaml) as the template.\n", + "\n", + "We generally use [Classifier Free Guidance](https://arxiv.org/abs/2207.12598) for better visual quality, which can be set at `sampling.base.scale`.\n", + "\n", + "NeMo Stable Diffusion supports multiple samplers. Please refer to the developer guide for more details. Samplers can be set at `sampling.base.sampler`.\n", + "\n", + "Inference supports a batch of text prompts, which can be set at `infer.prompt`. One can also generate a configurable number of images per prompt by setting `infer.num_samples`. 
Generated images will be saved to `out_path`.\n", + "\n", + "You will also need to set the model checkpoint path at `model.restore_from_path` if you are loading from `.nemo` checkpoint, otherwise, mannually set `unet` checkpoints and `vae` checkpoint at `model.unet_config.from_pretrained` and `model.first_stage_config.from_pretrained`, respectively.\n", + "\n", + "### Running the Inference\n", + "\n", + "Once everything is set up, Stable Diffusion inference is as simple as running:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e676c5d-d711-489e-8ab7-3ee20046d88d", + "metadata": {}, + "outputs": [], + "source": "! torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py model.restore_from_path=/path/to/stable-diffusion-xl-train.nemo out_path=/sdxl_infer_out" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 19668e5320a2e2af0199b6d5e0b841993be3a634 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Tue, 27 Aug 2024 18:41:35 -0400 Subject: [PATCH 058/664] Add Llama31 Config (#10260) * add llama31 config * Apply isort and black reformatting Signed-off-by: suiyoubi * fix init method * typo * revert llama3-70b init method std --------- Signed-off-by: suiyoubi Co-authored-by: suiyoubi Co-authored-by: Chen Cui --- nemo/collections/llm/__init__.py | 6 ++ nemo/collections/llm/gpt/model/__init__.py | 6 ++ nemo/collections/llm/gpt/model/llama.py | 89 +++++++++++++++++++++- 3 files changed, 99 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 812daddf02b6..86373135adb5 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -39,6 +39,9 @@ Llama2Config70B, Llama3Config8B, Llama3Config70B, + Llama31Config8B, + Llama31Config70B, + Llama31Config405B, LlamaConfig, LlamaModel, MaskedTokenLossReduction, @@ -93,6 +96,9 @@ "Llama2Config70B", "Llama3Config8B", "Llama3Config70B", + "Llama31Config8B", + "Llama31Config70B", + "Llama31Config405B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index a0132a34d185..0452c8dc6f89 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -27,6 +27,9 @@ Llama2Config70B, Llama3Config8B, Llama3Config70B, + Llama31Config8B, + Llama31Config70B, + Llama31Config405B, LlamaConfig, LlamaModel, ) @@ -62,6 +65,9 @@ "Llama2Config70B", "Llama3Config8B", "Llama3Config70B", + "Llama31Config8B", + "Llama31Config70B", + "Llama31Config405B", "NemotronConfig", "Nemotron3Config4B", "Nemotron3Config8B", diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index ab2f46378a1e..4f7dd4d37a90 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -1,3 +1,4 @@ +import math from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Annotated, Callable, Optional @@ -9,6 +10,7 @@ from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import 
Config from nemo.lightning import OptimizerModule, io, teardown +from nemo.utils import logging if TYPE_CHECKING: from transformers import LlamaConfig as HFLlamaConfig @@ -66,7 +68,7 @@ class Llama3Config(GPTConfig): num_query_groups: int = 8 hidden_dropout: float = 0.0 attention_dropout: float = 0.0 - normalization = "RMSNorm" + normalization: str = "RMSNorm" init_method_std: float = 0.01 layernorm_epsilon: float = 1.0e-05 add_bias_linear: bool = False @@ -80,10 +82,31 @@ class Llama3Config(GPTConfig): bias_dropout_fusion: bool = True apply_rope_fusion: bool = True share_embeddings_and_output_weights: bool = False - position_embedding_type = "rope" + position_embedding_type: str = "rope" rotary_percent: float = 1.0 +@dataclass +class Llama31Config(Llama3Config): + scale_factor: int = 8 + low_freq_factor: int = 1 + high_freq_factor: int = 4 + old_context_len: int = 8192 + init_method_std: float = 0.02 + + def configure_model(self, tokenizer) -> "MCoreGPTModel": + model = super().configure_model(tokenizer) + # Apply rope scaling for Llama3.1 model + model.rotary_pos_emb.inv_freq = apply_rope_scaling( + model.rotary_pos_emb.inv_freq, + factor=self.scale_factor, + low_freq_factor=self.low_freq_factor, + high_freq_factor=self.high_freq_factor, + old_context_len=self.old_context_len, + ) + return model + + @dataclass class Llama3Config8B(Llama3Config): rotary_base: int = 500_000 @@ -106,6 +129,38 @@ class Llama3Config70B(Llama3Config): make_vocab_size_divisible_by: int = 128 +@dataclass +class Llama31Config8B(Llama31Config): + rotary_base: int = 500_000 + seq_length: int = 131072 + num_layers: int = 32 + hidden_size: int = 4096 + ffn_hidden_size: int = 14336 + num_attention_heads: int = 32 + + +@dataclass +class Llama31Config70B(Llama31Config): + rotary_base: int = 500_000 + seq_length: int = 131072 + num_layers: int = 80 + hidden_size: int = 8192 + ffn_hidden_size: int = 28672 + num_attention_heads: int = 64 + make_vocab_size_divisible_by: int = 128 + + +@dataclass +class Llama31Config405B(Llama31Config): + rotary_base: int = 500_000 + seq_length: int = 131072 + num_layers: int = 126 + hidden_size: int = 16384 + ffn_hidden_size: int = 53248 + num_attention_heads: int = 128 + make_vocab_size_divisible_by: int = 128 + + @dataclass class CodeLlamaConfig7B(Llama2Config7B): rotary_base: int = 1_000_000 @@ -365,6 +420,33 @@ def _export_linear_fc1(linear_fc1): return gate_proj, up_proj +def apply_rope_scaling( + inv_freq, + factor: int = 8, + low_freq_factor: int = 1, + high_freq_factor: int = 4, + old_context_len: int = 8192, +): + logging.info( + f"Apply rope scaling with factor={factor}, low_freq_factor={low_freq_factor}, high_freq_factor={high_freq_factor}, old_context_len={old_context_len}." 
+ ) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + wavelen = 2 * math.pi / inv_freq + # wavelen < high_freq_wavelen: do nothing + # wavelen > low_freq_wavelen: divide by factor + inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq) + # otherwise: interpolate between the two, using a smooth factor + smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama + is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen) + inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama) + + return inv_freq_llama + + __all__ = [ "LlamaConfig", "Llama2Config7B", @@ -372,6 +454,9 @@ def _export_linear_fc1(linear_fc1): "Llama2Config70B", "Llama3Config8B", "Llama3Config70B", + "Llama31Config8B", + "Llama31Config70B", + "Llama31Config405B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", From c7c3eae455be3cda28210e11625f31633e13abe2 Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Tue, 27 Aug 2024 16:30:09 -0700 Subject: [PATCH 059/664] Added offloading support for LoRA adapters (#10237) Signed-off-by: Selvaraj Anandaraj Co-authored-by: Selvaraj Anandaraj Co-authored-by: Chen Cui --- .../modules/common/megatron/adapters/parallel_adapters.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 4f9f04527038..29eea2d54664 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -177,6 +177,7 @@ def __init__( model_parallel_config = ModelParallelConfig() self._sequence_parallel = model_parallel_config.sequence_parallel model_parallel_config.sequence_parallel = False # SP is irrelevant for the lora linear layer + self.config = model_parallel_config if input_is_parallel: self.linear_in = RowParallelLinear( @@ -298,8 +299,14 @@ def forward(self, x): # this function also handles the backward pass correctly x = gather_from_sequence_parallel_region(x) + if self.config.cpu_offloading and self.config.cpu_offloading_activations: + x.activation_offloading = True x, _ = self.linear_in(x) # (@adithyare) ColumnLinear returns output and bias, we are ignoring the bias term. 
+ x = self.activation(x) + + if self.config.cpu_offloading and self.config.cpu_offloading_activations: + x.activation_offloading = True x, _ = self.linear_out(x) if self._sequence_parallel and self.input_is_parallel: From f53600a3e3b85621985f26b9c1f2b9261b2cac96 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Tue, 27 Aug 2024 19:35:13 -0400 Subject: [PATCH 060/664] Add Qwen2 to Nemo 2 (#10258) * add qwen2 * typo * Apply isort and black reformatting Signed-off-by: suiyoubi * qwen without pip install issue * Apply isort and black reformatting Signed-off-by: suiyoubi * remove calculate vocab size divisible --------- Signed-off-by: suiyoubi Co-authored-by: suiyoubi --- nemo/collections/llm/__init__.py | 12 + nemo/collections/llm/gpt/model/__init__.py | 14 + nemo/collections/llm/gpt/model/base.py | 12 +- nemo/collections/llm/gpt/model/qwen2.py | 392 +++++++++++++++++++++ 4 files changed, 429 insertions(+), 1 deletion(-) create mode 100644 nemo/collections/llm/gpt/model/qwen2.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 86373135adb5..168f05d2e56e 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -58,6 +58,12 @@ Nemotron4Config340B, NemotronConfig, NemotronModel, + Qwen2Config, + Qwen2Config1P5B, + Qwen2Config7B, + Qwen2Config72B, + Qwen2Config500M, + Qwen2Model, gpt_data_step, gpt_forward_step, ) @@ -117,6 +123,12 @@ "ChatGLM2Config6B", "ChatGLM3Config6B", "ChatGLMModel", + "Qwen2Model", + "Qwen2Config7B", + "Qwen2Config", + "Qwen2Config500M", + "Qwen2Config1P5B", + "Qwen2Config72B", "PreTrainingDataModule", "FineTuningDataModule", "SquadDataModule", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 0452c8dc6f89..0bf2fc6f1e7b 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -49,6 +49,14 @@ NemotronConfig, NemotronModel, ) +from nemo.collections.llm.gpt.model.qwen2 import ( + Qwen2Config, + Qwen2Config1P5B, + Qwen2Config7B, + Qwen2Config72B, + Qwen2Config500M, + Qwen2Model, +) __all__ = [ "GPTConfig", @@ -93,6 +101,12 @@ "ChatGLM2Config6B", "ChatGLM3Config6B", "ChatGLMModel", + "Qwen2Config", + "Qwen2Config500M", + "Qwen2Config1P5B", + "Qwen2Config7B", + "Qwen2Config72B", + "Qwen2Model", "MaskedTokenLossReduction", "gpt_data_step", "gpt_forward_step", diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 2badfa2b1915..c108415a085e 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -13,6 +13,7 @@ from nemo.lightning import get_vocab_size, io from nemo.lightning.megatron_parallel import MaskedTokenLossReduction from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule +from nemo.utils import logging HAVE_TE = True try: @@ -131,10 +132,19 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel": if not isinstance(transformer_layer_spec, ModuleSpec): transformer_layer_spec = transformer_layer_spec(self) + if hasattr(self, 'vocab_size'): + vocab_size = self.vocab_size + logging.info( + f"Use preset vocab_size: {vocab_size}, original vocab_size: {tokenizer.vocab_size}, dummy tokens:" + f" {vocab_size - tokenizer.vocab_size}." 
+ ) + else: + vocab_size = get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by) + return MCoreGPTModel( self, transformer_layer_spec=transformer_layer_spec, - vocab_size=get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by), + vocab_size=vocab_size, max_sequence_length=self.seq_length, fp16_lm_cross_entropy=self.fp16_lm_cross_entropy, parallel_output=self.parallel_output, diff --git a/nemo/collections/llm/gpt/model/qwen2.py b/nemo/collections/llm/gpt/model/qwen2.py new file mode 100644 index 000000000000..eb67dd9d4f0d --- /dev/null +++ b/nemo/collections/llm/gpt/model/qwen2.py @@ -0,0 +1,392 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Annotated, Callable, Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.collections.llm.utils import Config +from nemo.lightning import OptimizerModule, io, teardown + +if TYPE_CHECKING: + from transformers import AutoModelForCausalLM + from transformers import Qwen2Config as HFQwen2Config + + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +@dataclass +class Qwen2Config(GPTConfig): + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + add_bias_linear: bool = False + add_qkv_bias: bool = True + seq_length: int = 4096 + init_method_std: int = 0.02 + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + vocab_size: int = 151936 + share_embeddings_and_output_weights: Optional[bool] = False + layernorm_epsilon: float = 1e-6 + rotary_base: float = 1000000.0 + position_embedding_type: str = "rope" + apply_query_key_layer_scaling: bool = True + + +@dataclass +class Qwen2Config500M(Qwen2Config): + num_layers: int = 24 + hidden_size: int = 896 + num_attention_heads: int = 14 + num_query_groups: int = 2 + ffn_hidden_size: int = 4864 + + +@dataclass +class Qwen2Config1P5B(Qwen2Config): + num_layers: int = 28 + hidden_size: int = 1536 + num_attention_heads: int = 12 + num_query_groups: int = 2 + ffn_hidden_size: int = 8960 + + +@dataclass +class Qwen2Config7B(Qwen2Config): + num_layers: int = 28 + hidden_size: int = 3584 + num_attention_heads: int = 28 + num_query_groups: int = 4 + ffn_hidden_size: int = 18944 + vocab_size: int = 152064 + + +@dataclass +class Qwen2Config72B(Qwen2Config): + num_layers: int = 80 + hidden_size: int = 8192 + num_attention_heads: int = 64 + num_query_groups: int = 8 + ffn_hidden_size: int = 29568 + vocab_size: int = 152064 + layernorm_epsilon: float = 1e-5 + vocab_size: int = 152064 + + +class Qwen2Model(GPTModel): + def __init__( + self, + config: Annotated[Optional[Qwen2Config], Config[Qwen2Config]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, + ): + super().__init__(config or Qwen2Config(), optim=optim, tokenizer=tokenizer, model_transform=model_transform) + + +@io.model_importer(Qwen2Model, "hf") +class HFQwen2Importer(io.ModelConnector["AutoModelForCausalLM", Qwen2Model]): + def init(self) -> Qwen2Model: + return Qwen2Model(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + from transformers import AutoModelForCausalLM + + source = AutoModelForCausalLM.from_pretrained(str(self), 
trust_remote_code=True) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + print(f"Converted Qwen model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, source, target): + mapping = { + "model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.norm.weight": "decoder.final_layernorm.weight", + "lm_head.weight": "output_layer.weight", + } + + return io.apply_transforms( + source, target, mapping=mapping, transforms=[_import_qkv, _import_qkv_bias, _import_linear_fc1] + ) + + @property + def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + return AutoTokenizer(str(self), trust_remote_code=True) + + @property + def config(self) -> Qwen2Config: + from transformers import AutoConfig as HFAutoConfig + + source = HFAutoConfig.from_pretrained(str(self), trust_remote_code=True) + + output = Qwen2Config( + num_layers=source.num_hidden_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.intermediate_size, + num_attention_heads=source.num_attention_heads, + num_query_groups=source.num_key_value_heads, + init_method_std=source.initializer_range, + layernorm_epsilon=source.rms_norm_eps, + gated_linear_unit=True, + make_vocab_size_divisible_by=128, + rotary_base=source.rope_theta, + share_embeddings_and_output_weights=False, + ) + + return output + + +@io.model_exporter(Qwen2Model, "hf") +class HFQwen2Exporter(io.ModelConnector[Qwen2Model, "AutoModelForCausalLM"]): + def init(self) -> "AutoModelForCausalLM": + from transformers import AutoModelForCausalLM + + return AutoModelForCausalLM.from_config(self.config, trust_remote_code=True) + + def apply(self, output_path: Path) -> Path: + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + "output_layer.weight": "lm_head.weight", + } + + return io.apply_transforms( + source, target, mapping=mapping, transforms=[_export_qkv, _export_qkv_bias, _export_linear_fc1] + ) + + @property + def tokenizer(self): + return io.load_context(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "HFQwen2Config": + from transformers import Qwen2Config as HFQwen2Config + + source: Qwen2Config = io.load_context(str(self)).model.config + 
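A short sketch (not part of the patch) of how these Qwen2 connectors are typically driven. It assumes the `llm.import_ckpt`/`llm.export_ckpt` helpers and the "hf://" source scheme used by NeMo 2.0 checkpoint conversion; the Hugging Face model id and paths are placeholders.

    from pathlib import Path

    from nemo.collections import llm

    # Hugging Face -> NeMo 2.0, dispatched to the HFQwen2Importer registered above via the "hf" scheme.
    llm.import_ckpt(model=llm.Qwen2Model(llm.Qwen2Config7B()), source="hf://Qwen/Qwen2-7B")

    # NeMo 2.0 -> Hugging Face goes through HFQwen2Exporter; signature assumed from the NeMo 2.0 docs.
    # llm.export_ckpt(path=Path("/ckpts/qwen2_7b_nemo"), target="hf", output_path=Path("/ckpts/qwen2_7b_hf"))
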
+ return HFQwen2Config( + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + max_position_embeddings=source.seq_length, + initializer_range=source.init_method_std, + rms_norm_eps=source.layernorm_epsilon, + num_key_value_heads=source.num_query_groups, + rope_theta=source.rotary_base, + vocab_size=getattr(source, 'vocab_size', self.tokenizer.vocab_size), + sliding_window=source.seq_length, + tie_word_embeddings=False, + ) + + +@io.state_transform( + source_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.weight", +) +def _import_qkv(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape + assert qkv_weights.shape[1] == head_size, qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + + return qkv_weights + + +@io.state_transform( + source_key=( + "model.layers.*.self_attn.q_proj.bias", + "model.layers.*.self_attn.k_proj.bias", + "model.layers.*.self_attn.v_proj.bias", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.bias", +) +def _import_qkv_bias(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + new_q_tensor_shape = (head_num, head_size) + new_kv_tensor_shape = (num_query_groups, head_size) + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_bias = torch.empty((0, head_size)) + for i in range(num_query_groups): + qkv_bias = torch.cat((qkv_bias, q[i * heads_per_group : (i + 1) * heads_per_group, :])) + qkv_bias = torch.cat((qkv_bias, k[i : i + 1, :])) + qkv_bias = torch.cat((qkv_bias, v[i : i + 1, :])) + qkv_bias = qkv_bias.reshape( + [ + head_size * (head_num + 2 * num_query_groups), + ] + ) + return qkv_bias + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", 
+ ), +) +def _export_qkv(ctx: io.TransformCTX, linear_qkv): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu() + k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu() + v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu() + + return q_proj, k_proj, v_proj + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.bias", + target_key=( + "model.layers.*.self_attn.q_proj.bias", + "model.layers.*.self_attn.k_proj.bias", + "model.layers.*.self_attn.v_proj.bias", + ), +) +def _export_qkv_bias(ctx: io.TransformCTX, qkv_bias): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + qkv_bias = qkv_bias.reshape([qkv_total_dim, head_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_bias = qkv_bias[q_slice].reshape(-1).cpu() + k_bias = qkv_bias[k_slice].reshape(-1).cpu() + v_bias = qkv_bias[v_slice].reshape(-1).cpu() + + return q_bias, k_bias, v_bias + + +@io.state_transform( + source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), + target_key="decoder.layers.*.mlp.linear_fc1.weight", +) +def _import_linear_fc1(down, gate): + return torch.cat((down, gate), axis=0).float() + + +@io.state_transform( + source_key="decoder.layers.*.mlp.linear_fc1.weight", + target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), +) +def _export_linear_fc1(linear_fc1): + gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) + + return gate_proj, up_proj + + +__all__ = [ + "Qwen2Config", + "Qwen2Config500M", + "Qwen2Config1P5B", + "Qwen2Config7B", + "Qwen2Config72B", + "Qwen2Model", +] From e68f981c393441165548e45c65c49ed5283fc0d5 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 27 Aug 2024 18:24:31 -0700 Subject: [PATCH 061/664] Lazy import tokenizers (#10213) * Move inflect to lazy import Signed-off-by: Alexandros Koumparoulis * Use lazy imports for tokenizer libraries Signed-off-by: Alexandros Koumparoulis * sacremoses lazy import Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix cyclic import Signed-off-by: Alexandros Koumparoulis * Apply isort and 
black reformatting Signed-off-by: akoumpa * import fix Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * move pangu Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- .../data/en/data_preprocessing.py | 23 ++++-- .../common/parts/preprocessing/cleaners.py | 16 +++- .../common/tokenizers/en_ja_tokenizers.py | 7 +- .../common/tokenizers/indic_tokenizers.py | 4 +- .../common/tokenizers/moses_tokenizers.py | 4 +- .../nlp/modules/common/tokenizer_utils.py | 73 ++++++++++--------- .../niv2/preprocess_niv2.py | 13 +++- .../t0/t0_dataset_preproc.py | 7 +- 8 files changed, 91 insertions(+), 56 deletions(-) diff --git a/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py b/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py index 9523d0974db8..f902e771cde4 100644 --- a/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py +++ b/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py @@ -46,8 +46,8 @@ import os from argparse import ArgumentParser +from functools import cache -import inflect import regex as re from tqdm import tqdm @@ -60,12 +60,21 @@ ) from nemo.utils import logging -engine = inflect.engine() + +@cache +def inflect_engine(): + import inflect + + return inflect.engine() + # these are all words that can appear in a verbalized number, this list will be used later as a filter to detect numbers in verbalizations number_verbalizations = list(range(0, 20)) + list(range(20, 100, 10)) number_verbalizations = ( - [engine.number_to_words(x, zero="zero").replace("-", " ").replace(",", "") for x in number_verbalizations] + [ + inflect_engine().number_to_words(x, zero="zero").replace("-", " ").replace(",", "") + for x in number_verbalizations + ] + ["hundred", "thousand", "million", "billion", "trillion"] + ["point"] ) @@ -85,7 +94,7 @@ def process_url(o): """ def flatten(l): - """ flatten a list of lists """ + """flatten a list of lists""" return [item for sublist in l for item in sublist] if o != '' and '_letter' in o: @@ -129,6 +138,7 @@ def convert2digits(digits: str): Return: res: number verbalization of the integer prefix of the input """ + engine = inflect_engine() res = [] for i, x in enumerate(digits): if x in digit: @@ -145,6 +155,7 @@ def convert2digits(digits: str): def convert(example): + engine = inflect_engine() cls, written, spoken = example written = convert_fraction(written) @@ -288,7 +299,7 @@ def convert(example): def ignore(example): """ This function makes sure specific class types like 'PLAIN', 'ELECTRONIC' etc. are left unchanged. - + Args: example: data example """ @@ -300,7 +311,7 @@ def ignore(example): def process_file(fp): - """ Reading the raw data from a file of NeMo format and preprocesses it. Write is out to the output directory. + """Reading the raw data from a file of NeMo format and preprocesses it. Write is out to the output directory. For more info about the data format, refer to the `text_normalization doc `. 
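The change above replaces a module-level inflect.engine() with a cached factory, so the heavy import only happens on first use. A minimal, self-contained sketch of that deferred-import pattern (illustrative only; it assumes the optional inflect package is installed and is not part of the patch itself):

    from functools import cache

    @cache
    def inflect_engine():
        # Deferred import: the dependency is only loaded on the first call.
        import inflect

        return inflect.engine()

    # The first call pays the import cost; later calls reuse the cached engine.
    print(inflect_engine().number_to_words(42))  # "forty-two"
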
diff --git a/nemo/collections/common/parts/preprocessing/cleaners.py b/nemo/collections/common/parts/preprocessing/cleaners.py index 40c80115786a..0697abe8792e 100644 --- a/nemo/collections/common/parts/preprocessing/cleaners.py +++ b/nemo/collections/common/parts/preprocessing/cleaners.py @@ -14,7 +14,6 @@ import re -import inflect from text_unidecode import unidecode from nemo.utils import logging @@ -139,7 +138,14 @@ ] -inflect = inflect.engine() +from functools import cache + + +@cache +def inflect_engine(): + import inflect + + return inflect.engine() def clean_text(string, table, punctuation_to_replace, abbreviation_version=None): @@ -194,11 +200,12 @@ def reset(self): self.currency = None def format_final_number(self, whole_num, decimal): + inflect = inflect_engine() if self.currency: return_string = inflect.number_to_words(whole_num) return_string += " dollar" if whole_num == 1 else " dollars" if decimal: - return_string += " and " + inflect.number_to_words(decimal) + return_string += " and " + inflect_engine().number_to_words(decimal) return_string += " cent" if whole_num == decimal else " cents" self.reset() return return_string @@ -210,11 +217,12 @@ def format_final_number(self, whole_num, decimal): else: # Check if there are non-numbers def convert_to_word(match): - return " " + inflect.number_to_words(match.group(0)) + " " + return " " + inflect_engine().number_to_words(match.group(0)) + " " return re.sub(r'[0-9,]+', convert_to_word, whole_num) def clean(self, match): + inflect = inflect_engine() ws = match.group(2) number = match.group(3) _proceeding_symbol = match.group(7) diff --git a/nemo/collections/common/tokenizers/en_ja_tokenizers.py b/nemo/collections/common/tokenizers/en_ja_tokenizers.py index cf58130834e9..c72ae1853deb 100644 --- a/nemo/collections/common/tokenizers/en_ja_tokenizers.py +++ b/nemo/collections/common/tokenizers/en_ja_tokenizers.py @@ -14,9 +14,6 @@ import re from typing import List -from pangu import spacing -from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer - try: import ipadic import MeCab @@ -36,6 +33,8 @@ class EnJaProcessor: """ def __init__(self, lang_id: str): + from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer + self.lang_id = lang_id self.moses_tokenizer = MosesTokenizer(lang=lang_id) self.moses_detokenizer = MosesDetokenizer(lang=lang_id) @@ -81,6 +80,8 @@ def __init__(self): self.mecab_tokenizer = MeCab.Tagger(ipadic.MECAB_ARGS + " -Owakati") def detokenize(self, text: List[str]) -> str: + from pangu import spacing + RE_WS_IN_FW = re.compile( r'([\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])\s+(?=[\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])' ) diff --git a/nemo/collections/common/tokenizers/indic_tokenizers.py b/nemo/collections/common/tokenizers/indic_tokenizers.py index 3b9192c8885b..eaf3aa5c7b64 100644 --- a/nemo/collections/common/tokenizers/indic_tokenizers.py +++ b/nemo/collections/common/tokenizers/indic_tokenizers.py @@ -14,8 +14,6 @@ from typing import List -from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer - class IndicProcessor: """ @@ -26,6 +24,8 @@ class IndicProcessor: def __init__(self, lang_id: str): if lang_id != 'hi': raise NotImplementedError + from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer + self.moses_tokenizer = MosesTokenizer(lang=lang_id) self.moses_detokenizer = 
MosesDetokenizer(lang=lang_id) self.normalizer = MosesPunctNormalizer(lang=lang_id) diff --git a/nemo/collections/common/tokenizers/moses_tokenizers.py b/nemo/collections/common/tokenizers/moses_tokenizers.py index 27e91e6c5262..717427090dd2 100644 --- a/nemo/collections/common/tokenizers/moses_tokenizers.py +++ b/nemo/collections/common/tokenizers/moses_tokenizers.py @@ -14,8 +14,6 @@ from typing import List -from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer - class MosesProcessor: """ @@ -23,6 +21,8 @@ class MosesProcessor: """ def __init__(self, lang_id: str): + from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer + self.moses_tokenizer = MosesTokenizer(lang=lang_id) self.moses_detokenizer = MosesDetokenizer(lang=lang_id) self.normalizer = MosesPunctNormalizer(lang=lang_id) diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py index 4cbadd87fe52..56496d56bc07 100644 --- a/nemo/collections/nlp/modules/common/tokenizer_utils.py +++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py @@ -16,28 +16,8 @@ from dataclasses import MISSING, dataclass from typing import Dict, List, Optional -import nemo -from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelTokenizer -from nemo.collections.common.tokenizers.char_tokenizer import CharTokenizer -from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer -from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer -from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer -from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer -from nemo.collections.common.tokenizers.word_tokenizer import WordTokenizer -from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import get_huggingface_pretrained_lm_models_list -from nemo.collections.nlp.modules.common.lm_utils import get_pretrained_lm_models_list -from nemo.collections.nlp.parts.nlp_overrides import HAVE_MEGATRON_CORE from nemo.utils import logging -try: - from nemo.collections.nlp.modules.common.megatron.megatron_utils import get_megatron_tokenizer - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - HAVE_MEGATRON_CORE = False - - __all__ = ['get_tokenizer', 'get_tokenizer_list'] @@ -96,46 +76,61 @@ def get_tokenizer( model better learn word compositionality and become robust to segmentation errors. It has emperically been shown to improve inference time BLEU scores. """ + if special_tokens is None: special_tokens_dict = {} else: special_tokens_dict = special_tokens if 'megatron' in tokenizer_name: - if not HAVE_MEGATRON_CORE: + try: + from nemo.collections.nlp.modules.common.megatron.megatron_utils import ( + get_megatron_merges_file, + get_megatron_tokenizer, + get_megatron_vocab_file, + ) + except (ImportError, ModuleNotFoundError): raise ImportError( "Megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." 
) if vocab_file is None: - vocab_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_vocab_file( - tokenizer_name - ) - merges_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_merges_file( - tokenizer_name - ) + vocab_file = get_megatron_vocab_file(tokenizer_name) + merges_file = get_megatron_merges_file(tokenizer_name) tokenizer_name = get_megatron_tokenizer(tokenizer_name) if tokenizer_name == 'sentencepiece': + from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer + logging.info("tokenizer_model: " + str(tokenizer_model)) - return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer( + return SentencePieceTokenizer( model_path=tokenizer_model, special_tokens=special_tokens, legacy=True, chat_template=chat_template, ) elif tokenizer_name == 'tiktoken': - return nemo.collections.common.tokenizers.tiktoken_tokenizer.TiktokenTokenizer(vocab_file=vocab_file) + from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer + + return TiktokenTokenizer(vocab_file=vocab_file) elif tokenizer_name == 'word': + from nemo.collections.common.tokenizers.word_tokenizer import WordTokenizer + return WordTokenizer(vocab_file=vocab_file, **special_tokens_dict) elif tokenizer_name == 'char': + from nemo.collections.common.tokenizers.char_tokenizer import CharTokenizer + return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict) elif tokenizer_name == 'regex': + from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer + return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file) logging.info( f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_name}, vocab_file: {vocab_file}, merges_files: {merges_file}, " f"special_tokens_dict: {special_tokens_dict}, and use_fast: {use_fast}" ) + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + return AutoTokenizer( pretrained_model_name=tokenizer_name, vocab_file=vocab_file, @@ -183,6 +178,8 @@ def get_nmt_tokenizer( raise ValueError("No Tokenizer path provided or file does not exist!") if library == 'huggingface': + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + logging.info(f'Getting HuggingFace AutoTokenizer with pretrained_model_name: {model_name}') return AutoTokenizer( pretrained_model_name=model_name, @@ -193,26 +190,32 @@ def get_nmt_tokenizer( trust_remote_code=trust_remote_code, ) elif library == 'sentencepiece': + from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer + logging.info(f'Getting SentencePiece with model: {tokenizer_model}') - return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer( + return SentencePieceTokenizer( model_path=tokenizer_model, legacy=legacy, chat_template=chat_template, ) elif library == 'byte-level': + from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelTokenizer + logging.info(f'Using byte-level tokenization') return ByteLevelTokenizer(special_tokens_dict) elif library == 'regex': + from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer + logging.info(f'Using regex tokenization') return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file) elif library == 'megatron': if model_name == 'GPTSentencePieceTokenizer': + from 
nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer + logging.info("tokenizer_model: ") logging.info(tokenizer_model) - return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer( - model_path=tokenizer_model, legacy=legacy - ) + return SentencePieceTokenizer(model_path=tokenizer_model, legacy=legacy) if model_name in megatron_tokenizer_model_map: model_name = megatron_tokenizer_model_map[model_name] @@ -223,8 +226,12 @@ def get_nmt_tokenizer( tokenizer_name=model_name, vocab_file=vocab_file, merges_file=merges_file, chat_template=chat_template ) elif library == 'tabular': + from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer + return TabularTokenizer(vocab_file, delimiter=delimiter) elif library == 'tiktoken': + from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer + return TiktokenTokenizer(vocab_file=vocab_file) else: raise NotImplementedError( diff --git a/scripts/nlp_language_modeling/niv2/preprocess_niv2.py b/scripts/nlp_language_modeling/niv2/preprocess_niv2.py index 073d6da8f32c..6119768e66f2 100644 --- a/scripts/nlp_language_modeling/niv2/preprocess_niv2.py +++ b/scripts/nlp_language_modeling/niv2/preprocess_niv2.py @@ -18,8 +18,6 @@ from argparse import ArgumentParser from multiprocessing import Pool -from sacremoses import MosesDetokenizer - from nemo.collections.common.tokenizers import AutoTokenizer @@ -99,6 +97,8 @@ def write_dataset_to_file(file_name, output_file_name, detokenizer, tokenizer, i def process_folder(data_folder, output_folder, splits_file, remove_newline): + from sacremoses import MosesDetokenizer + detokenizer = MosesDetokenizer('en') tokenizer = AutoTokenizer("gpt2") assert os.path.isdir(data_folder) @@ -162,10 +162,15 @@ def process_folder(data_folder, output_folder, splits_file, remove_newline): help="Path to output folder where JSONL files will be written.", ) parser.add_argument( - "--splits_file_path", type=str, default="default", help="Path to the file that contains splits. ex: ", + "--splits_file_path", + type=str, + default="default", + help="Path to the file that contains splits. 
ex: ", ) parser.add_argument( - "--remove_newline", action="store_true", help="Whether to remove newlines from the input and output.", + "--remove_newline", + action="store_true", + help="Whether to remove newlines from the input and output.", ) args = parser.parse_args() process_folder(args.niv2_dataset_path, args.jsonl_output_path, args.splits_file_path, args.remove_newline) diff --git a/scripts/nlp_language_modeling/t0/t0_dataset_preproc.py b/scripts/nlp_language_modeling/t0/t0_dataset_preproc.py index 618c02c0cc13..53bed36ff8d0 100644 --- a/scripts/nlp_language_modeling/t0/t0_dataset_preproc.py +++ b/scripts/nlp_language_modeling/t0/t0_dataset_preproc.py @@ -19,7 +19,6 @@ from multiprocessing import Pool import tensorflow as tf -from sacremoses import MosesDetokenizer from tasks_splits_and_features import _TASK_SPLITS_AND_FEATURES_DICT @@ -136,6 +135,8 @@ def process_folder(data_folder, folder_name, output_folder, detokenizer, remove_ def process_all_folders(data_folder, output_folder, remove_newlines): + from sacremoses import MosesDetokenizer + detokenizer = MosesDetokenizer('en') assert os.path.isdir(data_folder) if not os.path.exists(output_folder): @@ -170,7 +171,9 @@ def process_all_folders(data_folder, output_folder, remove_newlines): help="Path to output folder where JSONL files will be written.", ) parser.add_argument( - "--remove_newlines", action="store_true", help="Whether to remove newlines from the input and output.", + "--remove_newlines", + action="store_true", + help="Whether to remove newlines from the input and output.", ) args = parser.parse_args() process_all_folders(args.p3_dataset_path, args.jsonl_output_path, args.remove_newlines) From 5ff7f2278050dc646cfbbb2aa148c276eeababc2 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Wed, 28 Aug 2024 19:53:38 +0300 Subject: [PATCH 062/664] add rampup bs documentation (#9884) (#10289) * create documentation for rampup bs * fix format * fix format * fix config format * move config stage * add example * fix table * fix table * fix grammar * fix grammar --------- Signed-off-by: dimapihtar --- docs/source/nlp/nemo_megatron/intro.rst | 3 +- .../nlp/nemo_megatron/rampup_batch_size.rst | 62 +++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 docs/source/nlp/nemo_megatron/rampup_batch_size.rst diff --git a/docs/source/nlp/nemo_megatron/intro.rst b/docs/source/nlp/nemo_megatron/intro.rst index fab448f3d4f2..65aaee2add6a 100644 --- a/docs/source/nlp/nemo_megatron/intro.rst +++ b/docs/source/nlp/nemo_megatron/intro.rst @@ -20,6 +20,7 @@ To learn more about using NeMo to train Large Language Models at scale, please r peft/landing_page positional_embeddings mcore_customization + rampup_batch_size References @@ -28,4 +29,4 @@ References .. bibliography:: ../nlp_all.bib :style: plain :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- \ No newline at end of file + :keyprefix: nlp-megatron- diff --git a/docs/source/nlp/nemo_megatron/rampup_batch_size.rst b/docs/source/nlp/nemo_megatron/rampup_batch_size.rst new file mode 100644 index 000000000000..1e396cbc7630 --- /dev/null +++ b/docs/source/nlp/nemo_megatron/rampup_batch_size.rst @@ -0,0 +1,62 @@ +.. _rampup_batch_size: + +Ramp Up Batch Size +------------------ + +Ramp up batch size is a feature that allows training to start with a smaller global batch size and linearly increase to a target global batch size over a given number of training samples with specified incremental steps. 
+
+Usage
+-----
+
+To enable global batch size rampup during training, set the ``rampup_batch_size`` parameter under the model section of the training configuration. This parameter should be a list of three values:
+
+* ``start_batch_size``: The initial batch size.
+* ``batch_size_increment``: The amount by which the batch size will increase at each step.
+* ``rampup_samples``: The number of training samples over which the batch size will be ramped up.
+
+``model.global_batch_size=1024 model.rampup_batch_size=[256, 128, 50000000]``
+
+In this example, the training will start with a batch size of 256, increment by 128, and reach the target global batch size of 1024 over 50,000,000 training samples.
+
+Ramp Up Stages and Training Interruption
+----------------------------------------
+
+Once the next rampup stage is reached (the point in training when the global batch size increases), NeMo will stop the training. This allows you to rerun the training job with a larger number of GPUs or nodes for the next ramp up batch size stage.
+
+Automatic Node Scheduling
+-------------------------
+
+In the `NeMo-Framework-Launcher `_, when using rampup batch size, a node scheduler is created automatically. This scheduler allows the use of a smaller number of nodes for smaller batch size stages and scales up according to the ``training.trainer.num_nodes`` parameter. This parameter corresponds to the maximum number of nodes you want to use for the maximum global batch size.
+
+Example
+-------
+
+Below is a detailed example of ramp up batch size feature usage with the GPT3 5B model and the `NeMo-Framework-Launcher `_. In this example, the training started with a global batch size of 256, increased by 256 at each ramp up stage, and reached the target global batch size of 2048 over 10,000,000 training samples.
+
+The node schedule looks as follows:
+
++--------------------+--------------------+
+| global_batch_size  | num_nodes          |
++====================+====================+
+| 256                | 8                  |
++--------------------+--------------------+
+| 512                | 8                  |
++--------------------+--------------------+
+| 768                | 8                  |
++--------------------+--------------------+
+| 1024               | 8                  |
++--------------------+--------------------+
+| 1280               | 10                 |
++--------------------+--------------------+
+| 1536               | 12                 |
++--------------------+--------------------+
+| 1792               | 14                 |
++--------------------+--------------------+
+| 2048               | 16                 |
++--------------------+--------------------+
+
+Plot of ``global_batch_size`` increase during training:
+
+.. 
image:: https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/asset-post-rampup-batch-size-example.png + :alt: + :width: 1080px From 4805fe912b94fdf41363d2b6ad364564fc99c283 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Wed, 28 Aug 2024 13:04:29 -0400 Subject: [PATCH 063/664] Add Starcoder to Nemo 2 (#10230) * Add sc1/sc2 to nemo-ux * Apply isort and black reformatting Signed-off-by: suiyoubi * typo * Apply isort and black reformatting Signed-off-by: suiyoubi * fix import * remove pip install dependency for sc * Apply isort and black reformatting Signed-off-by: suiyoubi * remove pip install dependency for sc2 * Apply isort and black reformatting Signed-off-by: suiyoubi * typo Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: suiyoubi Signed-off-by: Ao Tang Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: suiyoubi Co-authored-by: Chen Cui Co-authored-by: cuichenx --- nemo/collections/llm/__init__.py | 11 + nemo/collections/llm/gpt/model/__init__.py | 16 + nemo/collections/llm/gpt/model/starcoder.py | 206 ++++++++++ nemo/collections/llm/gpt/model/starcoder2.py | 383 +++++++++++++++++++ nemo/lightning/io/state.py | 1 - 5 files changed, 616 insertions(+), 1 deletion(-) create mode 100644 nemo/collections/llm/gpt/model/starcoder.py create mode 100644 nemo/collections/llm/gpt/model/starcoder2.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 168f05d2e56e..52c353ba16d7 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -64,6 +64,14 @@ Qwen2Config72B, Qwen2Config500M, Qwen2Model, + Starcoder2Config, + Starcoder2Config3B, + Starcoder2Config7B, + Starcoder2Config15B, + Starcoder2Model, + StarcoderConfig, + StarcoderConfig15B, + StarcoderModel, gpt_data_step, gpt_forward_step, ) @@ -89,6 +97,9 @@ "MixtralConfig8x7B", "MixtralConfig8x22B", "MixtralModel", + "Starcoder2Config15B", + "Starcoder2Config", + "Starcoder2Model", "NemotronModel", "Nemotron3Config4B", "Nemotron3Config8B", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 0bf2fc6f1e7b..7de5d5b5b5f4 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -57,6 +57,14 @@ Qwen2Config500M, Qwen2Model, ) +from nemo.collections.llm.gpt.model.starcoder import StarcoderConfig, StarcoderConfig15B, StarcoderModel +from nemo.collections.llm.gpt.model.starcoder2 import ( + Starcoder2Config, + Starcoder2Config3B, + Starcoder2Config7B, + Starcoder2Config15B, + Starcoder2Model, +) __all__ = [ "GPTConfig", @@ -67,6 +75,14 @@ "MixtralConfig8x7B", "MixtralConfig8x22B", "MixtralModel", + "Starcoder2Config", + "Starcoder2Model", + "Starcoder2Config15B", + "Starcoder2Config7B", + "Starcoder2Config3B", + "StarcoderConfig", + "StarcoderConfig15B", + "StarcoderModel", "LlamaConfig", "Llama2Config7B", "Llama2Config13B", diff --git a/nemo/collections/llm/gpt/model/starcoder.py b/nemo/collections/llm/gpt/model/starcoder.py new file mode 100644 index 000000000000..e99b707964fe --- /dev/null +++ b/nemo/collections/llm/gpt/model/starcoder.py @@ -0,0 +1,206 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Annotated, Callable, Optional + +import torch.nn.functional as F +from torch import nn + +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from 
nemo.collections.llm.utils import Config +from nemo.lightning import OptimizerModule, io, teardown + +if TYPE_CHECKING: + from transformers import GPTBigCodeConfig as HFStarcoderConfig + from transformers import GPTBigCodeForCausalLM + + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +@dataclass +class StarcoderConfig(GPTConfig): + # configs that are common across model sizes + normalization: str = "LayerNorm" + activation_func: Callable = F.gelu + add_bias_linear: bool = True + seq_length: int = 8192 + position_embedding_type: str = "learned_absolute" + hidden_dropout: float = 0.2 + attention_dropout: float = 0.2 + init_method_std: float = 0.01 + layernorm_epsilon: float = 1e-5 + share_embeddings_and_output_weights: bool = False + kv_channels: int = None + num_query_groups: int = 1 + attention_softmax_in_fp32: bool = True + bias_activation_fusion: bool = True + bias_dropout_fusion: bool = True + + +@dataclass +class StarcoderConfig15B(StarcoderConfig): + num_layers: int = 40 + hidden_size: int = 6144 + ffn_hidden_size: int = 24576 + num_attention_heads: int = 48 + init_method_std: float = 0.02 + + +class StarcoderModel(GPTModel): + def __init__( + self, + config: Annotated[Optional[StarcoderConfig], Config[StarcoderConfig]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, + ): + super().__init__( + config or StarcoderConfig(), optim=optim, tokenizer=tokenizer, model_transform=model_transform + ) + + +@io.model_importer(StarcoderModel, "hf") +class HFStarcoderImporter(io.ModelConnector["GPTBigCodeForCausalLM", StarcoderModel]): + def init(self) -> StarcoderModel: + return StarcoderModel(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + from transformers import GPTBigCodeForCausalLM + + source = GPTBigCodeForCausalLM.from_pretrained(str(self)) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + print(f"Converted Starcoder model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, source, target): + mapping = { + "transformer.wte.weight": "embedding.word_embeddings.weight", + "transformer.wpe.weight": "embedding.position_embeddings.weight", + "transformer.h.*.attn.c_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "transformer.h.*.attn.c_proj.bias": "decoder.layers.*.self_attention.linear_proj.bias", + "transformer.h.*.attn.c_attn.weight": "decoder.layers.*.self_attention.linear_qkv.weight", + "transformer.h.*.attn.c_attn.bias": "decoder.layers.*.self_attention.linear_qkv.bias", + "transformer.h.*.mlp.c_fc.weight": "decoder.layers.*.mlp.linear_fc1.weight", + "transformer.h.*.mlp.c_fc.bias": "decoder.layers.*.mlp.linear_fc1.bias", + "transformer.h.*.mlp.c_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "transformer.h.*.mlp.c_proj.bias": "decoder.layers.*.mlp.linear_fc2.bias", + "transformer.h.*.ln_1.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "transformer.h.*.ln_1.bias": "decoder.layers.*.self_attention.linear_qkv.layer_norm_bias", + "transformer.h.*.ln_2.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "transformer.h.*.ln_2.bias": 
"decoder.layers.*.mlp.linear_fc1.layer_norm_bias", + "transformer.ln_f.weight": "decoder.final_layernorm.weight", + "transformer.ln_f.bias": "decoder.final_layernorm.bias", + "lm_head.weight": "output_layer.weight", + } + + return io.apply_transforms(source, target, mapping=mapping) + + @property + def tokenizer(self) -> "AutoTokenizer": + return AutoTokenizer(str(self)) + + @property + def config(self) -> StarcoderConfig: + from transformers import GPTBigCodeConfig as HFStarcoderConfig + + source = HFStarcoderConfig.from_pretrained(str(self)) + + def make_vocab_size_divisible_by(vocab_size): + base = 128 + while vocab_size % base != 0: + base //= 2 + return base + + output = StarcoderConfig( + num_layers=source.n_layer, + hidden_size=source.n_embd, + ffn_hidden_size=source.n_inner, + num_attention_heads=source.n_head, + init_method_std=source.initializer_range, + seq_length=source.n_positions, + layernorm_epsilon=source.layer_norm_epsilon, + num_query_groups=1, + make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), + share_embeddings_and_output_weights=False, + ) + + return output + + +@io.model_exporter(StarcoderModel, "hf") +class HFStarcoderExporter(io.ModelConnector[StarcoderModel, "GPTBigCodeForCausalLM"]): + def init(self) -> "GPTBigCodeForCausalLM": + from transformers import GPTBigCodeForCausalLM + + return GPTBigCodeForCausalLM._from_config(self.config) + + def apply(self, output_path: Path) -> Path: + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + mapping = { + "embedding.word_embeddings.weight": "transformer.wte.weight", + "embedding.position_embeddings.weight": "transformer.wpe.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "transformer.h.*.attn.c_proj.weight", + "decoder.layers.*.self_attention.linear_proj.bias": "transformer.h.*.attn.c_proj.bias", + "decoder.layers.*.self_attention.linear_qkv.weight": "transformer.h.*.attn.c_attn.weight", + "decoder.layers.*.self_attention.linear_qkv.bias": "transformer.h.*.attn.c_attn.bias", + "decoder.layers.*.mlp.linear_fc1.weight": "transformer.h.*.mlp.c_fc.weight", + "decoder.layers.*.mlp.linear_fc1.bias": "transformer.h.*.mlp.c_fc.bias", + "decoder.layers.*.mlp.linear_fc2.weight": "transformer.h.*.mlp.c_proj.weight", + "decoder.layers.*.mlp.linear_fc2.bias": "transformer.h.*.mlp.c_proj.bias", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "transformer.h.*.ln_1.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_bias": "transformer.h.*.ln_1.bias", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "transformer.h.*.ln_2.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_bias": "transformer.h.*.ln_2.bias", + "decoder.final_layernorm.weight": "transformer.ln_f.weight", + "decoder.final_layernorm.bias": "transformer.ln_f.bias", + "output_layer.weight": "lm_head.weight", + } + + return io.apply_transforms(source, target, mapping=mapping) + + @property + def tokenizer(self): + return io.load_context(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "HFStarcoderConfig": + from transformers import sGPTBigCodeConfig as HFStarcoderConfig + + source: StarcoderConfig = io.load_context(str(self)).model.config + + return HFStarcoderConfig( + num_hidden_layers=source.num_layers, + 
hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + head_dim=( + source.kv_channels + if source.kv_channels is not None + else source.hidden_size // source.num_attention_heads + ), + tie_word_embeddings=source.share_embeddings_and_output_weights, + max_position_embeddings=source.seq_length, + initializer_range=source.init_method_std, + norm_eps=source.layernorm_epsilon, + num_key_value_heads=source.num_query_groups, + vocab_size=self.tokenizer.vocab_size, + ) diff --git a/nemo/collections/llm/gpt/model/starcoder2.py b/nemo/collections/llm/gpt/model/starcoder2.py new file mode 100644 index 000000000000..e53f1bde7012 --- /dev/null +++ b/nemo/collections/llm/gpt/model/starcoder2.py @@ -0,0 +1,383 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Annotated, Callable, List, Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.collections.llm.utils import Config +from nemo.lightning import OptimizerModule, io, teardown + +if TYPE_CHECKING: + from transformers import Starcoder2Config as HFStarcoder2Config + from transformers import Starcoder2ForCausalLM + + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +@dataclass +class Starcoder2Config(GPTConfig): + # configs that are common across model sizes + normalization: str = "LayerNorm" + activation_func: Callable = F.gelu + add_bias_linear: bool = True + seq_length: int = 16384 + position_embedding_type: str = "rope" + rotary_percent: float = 1.0 + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + init_method_std: float = 0.01 + share_embeddings_and_output_weights: bool = False + kv_channels: int = None + num_query_groups: int = None + window_size: Optional[List[int]] = None + apply_query_key_layer_scaling: bool = True + attention_softmax_in_fp32: bool = True + bias_activation_fusion: bool = True + bias_dropout_fusion: bool = True + layernorm_epsilon: float = 1e-5 + + +@dataclass +class Starcoder2Config3B(Starcoder2Config): + num_layers: int = 30 + hidden_size: int = 3072 + ffn_hidden_size: int = 12288 + num_query_groups: int = 2 + num_attention_heads: int = 24 + init_method_std: float = 0.018042 + rotary_base: float = 999999.4420358813 + + +@dataclass +class Starcoder2Config7B(Starcoder2Config): + num_layers: int = 32 + hidden_size: int = 4608 + ffn_hidden_size: int = 18432 + num_query_groups: int = 4 + num_attention_heads: int = 36 + init_method_std: float = 0.018042 + rotary_base: float = 1_000_000 + + +@dataclass +class Starcoder2Config15B(Starcoder2Config): + num_layers: int = 40 + hidden_size: int = 6144 + ffn_hidden_size: int = 24576 + num_query_groups: int = 4 + num_attention_heads: int = 48 + init_method_std: float = 0.01275 + rotary_base: float = 100_000 + + +class Starcoder2Model(GPTModel): + def __init__( + self, + config: Annotated[Optional[Starcoder2Config], Config[Starcoder2Config]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, + ): + super().__init__( + config or Starcoder2Config(), optim=optim, tokenizer=tokenizer, model_transform=model_transform + ) + + 
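The importer and exporter registered below are what expose these Starcoder2 classes through NeMo 2.0's checkpoint-conversion API. A rough usage sketch (the model identifier and conversion call are shown for illustration and assume the standard NeMo 2.0 entry points, not something added by this patch):

    from nemo.collections import llm

    # Instantiate the 3B variant with its default configuration ...
    model = llm.Starcoder2Model(llm.Starcoder2Config3B())

    # ... and convert a Hugging Face checkpoint through the "hf" importer registered below.
    llm.import_ckpt(model=model, source="hf://bigcode/starcoder2-3b")
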
+@io.model_importer(Starcoder2Model, "hf") +class HFStarcoder2Importer(io.ModelConnector["Starcoder2ForCausalLM", Starcoder2Model]): + def init(self) -> Starcoder2Model: + return Starcoder2Model(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + from transformers import Starcoder2ForCausalLM + + source = Starcoder2ForCausalLM.from_pretrained(str(self)) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + print(f"Converted Starcoder2 model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, source, target): + mapping = { + "model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "model.layers.*.self_attn.o_proj.bias": "decoder.layers.*.self_attention.linear_proj.bias", + "model.layers.*.mlp.c_fc.weight": "decoder.layers.*.mlp.linear_fc1.weight", + "model.layers.*.mlp.c_fc.bias": "decoder.layers.*.mlp.linear_fc1.bias", + "model.layers.*.mlp.c_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.mlp.c_proj.bias": "decoder.layers.*.mlp.linear_fc2.bias", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.input_layernorm.bias": "decoder.layers.*.self_attention.linear_qkv.layer_norm_bias", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.layers.*.post_attention_layernorm.bias": "decoder.layers.*.mlp.linear_fc1.layer_norm_bias", + "model.norm.weight": "decoder.final_layernorm.weight", + "model.norm.bias": "decoder.final_layernorm.bias", + "lm_head.weight": "output_layer.weight", + } + + return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv_bias, _import_qkv_weight]) + + @property + def tokenizer(self) -> "AutoTokenizer": + return AutoTokenizer(str(self)) + + @property + def config(self) -> Starcoder2Config: + from transformers import Starcoder2Config as HFStarcoder2Config + + source = HFStarcoder2Config.from_pretrained(str(self)) + + def make_vocab_size_divisible_by(vocab_size): + base = 128 + while vocab_size % base != 0: + base //= 2 + return base + + output = Starcoder2Config( + num_layers=source.num_hidden_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.intermediate_size, + num_attention_heads=source.num_attention_heads, + init_method_std=source.initializer_range, + seq_length=source.max_position_embeddings, + layernorm_epsilon=source.norm_epsilon, + num_query_groups=source.num_key_value_heads, + rotary_base=source.rope_theta, + make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), + share_embeddings_and_output_weights=False, + ) + + return output + + +@io.model_exporter(Starcoder2Model, "hf") +class HFStarcoder2Exporter(io.ModelConnector[Starcoder2Model, "Starcoder2ForCausalLM"]): + def init(self) -> "Starcoder2ForCausalLM": + from transformers import Starcoder2ForCausalLM + + return Starcoder2ForCausalLM._from_config(self.config) + + def apply(self, output_path: Path) -> Path: + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, 
source, target): + mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + "decoder.layers.*.self_attention.linear_proj.bias": "model.layers.*.self_attn.o_proj.bias", + "decoder.layers.*.mlp.linear_fc1.weight": "model.layers.*.mlp.c_fc.weight", + "decoder.layers.*.mlp.linear_fc1.bias": "model.layers.*.mlp.c_fc.bias", + "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.c_proj.weight", + "decoder.layers.*.mlp.linear_fc2.bias": "model.layers.*.mlp.c_proj.bias", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_bias": "model.layers.*.input_layernorm.bias", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_bias": "model.layers.*.post_attention_layernorm.bias", + "decoder.final_layernorm.weight": "model.norm.weight", + "decoder.final_layernorm.bias": "model.norm.bias", + "output_layer.weight": "lm_head.weight", + } + + return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv_weight, _export_qkv_bias]) + + @property + def tokenizer(self): + return io.load_context(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "HFStarcoder2Config": + from transformers import Starcoder2Config as HFStarcoder2Config + + source: Starcoder2Config = io.load_context(str(self)).model.config + + return HFStarcoder2Config( + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + head_dim=( + source.kv_channels + if source.kv_channels is not None + else source.hidden_size // source.num_attention_heads + ), + tie_word_embeddings=source.share_embeddings_and_output_weights, + max_position_embeddings=source.seq_length, + initializer_range=source.init_method_std, + norm_eps=source.layernorm_epsilon, + num_key_value_heads=source.num_query_groups, + rope_theta=source.rotary_base, + partial_rotary_factor=source.rotary_percent, + vocab_size=self.tokenizer.vocab_size, + ) + + +@io.state_transform( + source_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.weight", +) +def _import_qkv_weight(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) + + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) 
* num_query_groups, qkv_weights.shape + assert qkv_weights.shape[1] == head_size, qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + + return qkv_weights + + +@io.state_transform( + source_key=( + "model.layers.*.self_attn.q_proj.bias", + "model.layers.*.self_attn.k_proj.bias", + "model.layers.*.self_attn.v_proj.bias", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.bias", +) +def _import_qkv_bias(ctx: io.TransformCTX, qb, kb, vb): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + new_q_bias_tensor_shape = (head_num, head_size) + new_kv_bias_tensor_shape = (num_query_groups, head_size) + + qb = qb.view(*new_q_bias_tensor_shape) + kb = kb.view(*new_kv_bias_tensor_shape) + vb = vb.view(*new_kv_bias_tensor_shape) + + qkv_bias_l = [] + for i in range(num_query_groups): + qkv_bias_l.append(qb[i * heads_per_group : (i + 1) * heads_per_group, :]) + qkv_bias_l.append(kb[i : i + 1, :]) + qkv_bias_l.append(vb[i : i + 1, :]) + + qkv_bias = torch.cat(qkv_bias_l) + qkv_bias = qkv_bias.reshape([head_size * (head_num + 2 * num_query_groups)]) + + return qkv_bias + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), +) +def _export_qkv_weight(ctx: io.TransformCTX, linear_qkv): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu() + k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu() + v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu() + + return q_proj, k_proj, v_proj + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.bias", + target_key=( + "model.layers.*.self_attn.q_proj.bias", + "model.layers.*.self_attn.k_proj.bias", + "model.layers.*.self_attn.v_proj.bias", + ), +) +def _export_qkv_bias(ctx: io.TransformCTX, qkv_bias): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + qkv_bias = qkv_bias.reshape([qkv_total_dim, head_size]) + q_slice = torch.cat( + [ + 
torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_bias = qkv_bias[q_slice].reshape(-1).cpu() + k_bias = qkv_bias[k_slice].reshape(-1).cpu() + v_bias = qkv_bias[v_slice].reshape(-1).cpu() + + return q_bias, k_bias, v_bias diff --git a/nemo/lightning/io/state.py b/nemo/lightning/io/state.py index 9fd81a960358..18e0865171c7 100644 --- a/nemo/lightning/io/state.py +++ b/nemo/lightning/io/state.py @@ -255,7 +255,6 @@ def __call__(self, ctx: TransformCTX) -> TransformCTX: if multiple_sources: for target_index, target_match in np.ndenumerate(target_matches): source_match = source_matches[target_index] - if accepts_var_args: source_values = [source_dict[k] for k in source_match] target_dict[target_match] = self.call_transform(ctx, *source_values) From 2438fa9af19179267a04f385334766e9f60e046a Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Wed, 28 Aug 2024 10:32:55 -0700 Subject: [PATCH 064/664] comment out ASR_dev_run_Speech_To_Text_HF_Finetuning until fixed (#10293) Signed-off-by: Pablo Garay --- .github/workflows/cicd-main.yml | 70 ++++++++++++++++----------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 396ef03bd661..8100d95ae2a3 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -499,41 +499,41 @@ jobs: AFTER_SCRIPT: | rm -rf examples/asr/speech_finetuning_results - OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: |- - python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \ - ~model.train_ds.hf_data_cfg \ - model.train_ds.num_workers=1 \ - model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \ - model.train_ds.streaming=true \ - +model.train_ds.hf_data_cfg.path="librispeech_asr" \ - +model.train_ds.hf_data_cfg.name=null \ - +model.train_ds.hf_data_cfg.split="test.clean" \ - +model.train_ds.hf_data_cfg.streaming=true \ - ~model.validation_ds.hf_data_cfg \ - model.validation_ds.streaming=true \ - +model.validation_ds.hf_data_cfg.path="librispeech_asr" \ - +model.validation_ds.hf_data_cfg.name=null \ - +model.validation_ds.hf_data_cfg.split="test.clean" \ - +model.validation_ds.hf_data_cfg.streaming=true \ - ~model.test_ds \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - model.tokenizer.update_tokenizer=False \ - model.optim.sched.warmup_steps=0 \ - +model.optim.sched.max_steps=3 \ - trainer.max_epochs=null \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_finetuning_results - AFTER_SCRIPT: | - rm -rf examples/asr/speech_finetuning_results - IS_OPTIONAL: true + # OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning: + # needs: [cicd-test-container-setup] + # uses: ./.github/workflows/_test_template.yml + # with: + # RUNNER: self-hosted-azure-gpus-1 + # SCRIPT: |- + # python examples/asr/speech_to_text_finetune.py \ + # --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \ + # ~model.train_ds.hf_data_cfg \ + # model.train_ds.num_workers=1 \ + # 
model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \ + # model.train_ds.streaming=true \ + # +model.train_ds.hf_data_cfg.path="librispeech_asr" \ + # +model.train_ds.hf_data_cfg.name=null \ + # +model.train_ds.hf_data_cfg.split="test.clean" \ + # +model.train_ds.hf_data_cfg.streaming=true \ + # ~model.validation_ds.hf_data_cfg \ + # model.validation_ds.streaming=true \ + # +model.validation_ds.hf_data_cfg.path="librispeech_asr" \ + # +model.validation_ds.hf_data_cfg.name=null \ + # +model.validation_ds.hf_data_cfg.split="test.clean" \ + # +model.validation_ds.hf_data_cfg.streaming=true \ + # ~model.test_ds \ + # init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ + # model.tokenizer.update_tokenizer=False \ + # model.optim.sched.warmup_steps=0 \ + # +model.optim.sched.max_steps=3 \ + # trainer.max_epochs=null \ + # trainer.devices=1 \ + # trainer.accelerator="gpu" \ + # +trainer.fast_dev_run=True \ + # exp_manager.exp_dir=examples/asr/speech_finetuning_results + # AFTER_SCRIPT: | + # rm -rf examples/asr/speech_finetuning_results + # IS_OPTIONAL: true ASR_dev_run_Speech_to_Text_WPE_-_Conformer: needs: [cicd-test-container-setup] From 5040546dffe1061cba942ea9976325e47a44223d Mon Sep 17 00:00:00 2001 From: gautham-kollu Date: Wed, 28 Aug 2024 11:34:28 -0700 Subject: [PATCH 065/664] Adding a Garbage-collection callback for a synchronized garbage-collection across ALL processes (#10261) * Adding Garbage-collection callback for a synchrozned gc across processed Signed-off-by: Gautham Kollu * 1 Signed-off-by: Gautham Kollu * Apply isort and black reformatting Signed-off-by: gautham-kollu Signed-off-by: Gautham Kollu * PR feedback Signed-off-by: Gautham Kollu --------- Signed-off-by: Gautham Kollu Signed-off-by: gautham-kollu Co-authored-by: Gautham Kollu Co-authored-by: gautham-kollu --- nemo/lightning/pytorch/callbacks/__init__.py | 2 + .../pytorch/callbacks/garbage_collection.py | 68 +++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 nemo/lightning/pytorch/callbacks/garbage_collection.py diff --git a/nemo/lightning/pytorch/callbacks/__init__.py b/nemo/lightning/pytorch/callbacks/__init__.py index ef31e1078298..dd2908e6f5e6 100644 --- a/nemo/lightning/pytorch/callbacks/__init__.py +++ b/nemo/lightning/pytorch/callbacks/__init__.py @@ -1,4 +1,5 @@ from nemo.lightning.pytorch.callbacks.ddp_parity_checker import DdpParityChecker +from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.pytorch.callbacks.memory_profiler import MemoryProfileCallback from nemo.lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform @@ -18,4 +19,5 @@ "ProgressPrinter", "PreemptionCallback", "DdpParityChecker", + "GarbageCollectionCallback", ] diff --git a/nemo/lightning/pytorch/callbacks/garbage_collection.py b/nemo/lightning/pytorch/callbacks/garbage_collection.py new file mode 100644 index 000000000000..a2b2bb6498a3 --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/garbage_collection.py @@ -0,0 +1,68 @@ +import gc +from typing import Any + +import pytorch_lightning as pl +from nemo.utils import logging + + +class GarbageCollectionCallback(pl.Callback): + """Callback for synchronized manual Garbage Collection. 
This is required for distributed training
+    as all processes on different ranks need to synchronize to garbage collect at the same time, without which
+    one process might hog or straggle all the rest of the processes.
+
+    Migration from NeMo 1.0:
+        When migrating from NeMo 1.0,
+            - gc_interval = 0 implied no GC; simply do not add this callback to the trainer
+            - gc_interval > 0, this config maps to gc_interval_train
+
+            - env-var:NEMO_MANUAL_GC_IN_VALIDATION=0 or doesn't exist => Set gc_interval_val to a very high value so that it does not practically run.
+            - env-var:NEMO_MANUAL_GC_IN_VALIDATION=1 => Set gc_interval_val to the same value as gc_interval
+
+        Moving from a boolean flag (NEMO_MANUAL_GC_IN_VALIDATION) to an integer allows the user to set a specific value based on the size of the
+        validation datasets.
+
+    Note: This callback does not run gc at the start or the end of training or validation.
+    """
+
+    def __init__(self, gc_interval_train, gc_interval_val) -> None:
+        """Initialize the callback with train and validation garbage-collection intervals.
+
+        Args:
+            gc_interval_train (int, mandatory): Number of global train steps at which garbage collection is done.
+            gc_interval_val (int, mandatory): Number of global validation steps at which garbage collection is done.
+        """
+        assert gc_interval_train > 0, "gc_interval_train should be an integer value larger than 0."
+        assert gc_interval_val > 0, "gc_interval_val should be an integer value larger than 0."
+
+        super().__init__()
+        self.gc_interval_train = gc_interval_train
+        self.gc_interval_val = gc_interval_val
+        # As garbage collection is manually controlled, disable automatic garbage collector.
+        gc.disable()
+        # This counter is required as pl does not have a native way to track the validation step counter.
+        self.validation_global_step = 0
+
+    def on_train_batch_end(
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        outputs: pl.utilities.types.STEP_OUTPUT,
+        batch: Any,
+        batch_idx: int,
+    ) -> None:
+        if trainer.global_step % self.gc_interval_train == 0:
+            logging.info(f"Running garbage collection at train global_step: {trainer.global_step}")
+            gc.collect()
+
+    def on_validation_batch_end(
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        outputs: pl.utilities.types.STEP_OUTPUT,
+        batch: Any,
+        batch_idx: int,
+    ) -> None:
+        self.validation_global_step += 1
+        if self.validation_global_step % self.gc_interval_val == 0:
+            logging.info(f"Running garbage collection at validation step: {self.validation_global_step}")
+            gc.collect()

From 1d2d5072514e27e26d102e8610d50ca1a2a693c8 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Wed, 28 Aug 2024 12:55:12 -0700
Subject: [PATCH 066/664] Do not overwrite wandb name in NeMo Logger (#10265)

* Do not overwrite wandb name in NeMo Logger

Signed-off-by: Hemil Desai

* Do not overwrite tensorboard name

Signed-off-by: Hemil Desai

* Fix tests

Signed-off-by: Hemil Desai

---------

Signed-off-by: Hemil Desai
---
 nemo/lightning/nemo_logger.py       | 20 ++++++++------------
 tests/lightning/test_nemo_logger.py |  4 ++--
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py
index bae62f09593b..e5cd45181cc7 100644
--- a/nemo/lightning/nemo_logger.py
+++ b/nemo/lightning/nemo_logger.py
@@ -59,7 +59,7 @@ class NeMoLogger(IOMixin):
     def __post_init__(self):
         if self.log_local_rank_0_only is True and self.log_global_rank_0_only is True:
             raise ValueError(
-                f"Cannot set both log_local_rank_0_only and log_global_rank_0_only to True. Please set either one or neither."
+ "Cannot set both log_local_rank_0_only and log_global_rank_0_only to True. Please set either one or neither." ) def setup(self, trainer: Union[pl.Trainer, fl.Fabric], resume_if_exists: bool = False, task_config=None): @@ -73,7 +73,6 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], resume_if_exists: bool = AppState: The application state with updated log directory and other settings. """ from nemo.constants import NEMO_ENV_VARNAME_VERSION - from nemo.utils.exp_manager import check_explicit_log_dir from nemo.utils.get_rank import is_global_rank_zero self.local_rank = int(os.environ.get("LOCAL_RANK", 0)) @@ -100,7 +99,7 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], resume_if_exists: bool = # Default dir to ./nemo_experiments if None was passed _dir = self.dir if self.dir is None: - _dir = str(Path.cwd() / 'nemo_experiments') + _dir = str(Path.cwd() / "nemo_experiments") if not self.name: self.name = "default" @@ -114,7 +113,7 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], resume_if_exists: bool = version = None elif is_global_rank_zero(): if self.use_datetime_version: - version = time.strftime('%Y-%m-%d_%H-%M-%S') + version = time.strftime("%Y-%m-%d_%H-%M-%S") if version: if is_global_rank_zero(): os.environ[NEMO_ENV_VARNAME_VERSION] = version @@ -130,7 +129,7 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], resume_if_exists: bool = app_state.cmd_args = sys.argv os.makedirs(log_dir, exist_ok=True) # Cannot limit creation to global zero as all ranks write to own log file - logging.info(f'Experiments will be logged at {log_dir}') + logging.info(f"Experiments will be logged at {log_dir}") if task_config and is_global_rank_zero(): self._handle_task_config(task_config, log_dir) @@ -158,7 +157,6 @@ def _setup_trainer_loggers(self, trainer, dir, version): if isinstance(logger, TensorBoardLogger): logger._version = version or "" logger._root_dir = Path(dir) / os.path.relpath(logger.save_dir) - trainer.logger._name = self.name logging.warning( f'"update_logger_directory" is True. Overwriting tensorboard logger "save_dir" to {logger._root_dir}' ) @@ -166,8 +164,6 @@ def _setup_trainer_loggers(self, trainer, dir, version): logger._id = version or "" logger._save_dir = Path(dir) / logger.save_dir logger._wandb_init["dir"] = Path(dir) / logger.save_dir - logger._wandb_init["name"] = self.name - logger._name = self.name logging.warning( f'"update_logger_directory" is True. Overwriting wandb logger "save_dir" to {logger._save_dir}' ) @@ -211,8 +207,8 @@ def _setup_trainer_model_checkpoint(self, trainer, log_dir, ckpt=None): if callback.dirpath is None: callback.dirpath = Path(log_dir / "checkpoints") if callback.filename is None: - callback.filename = f'{self.name}--{{{callback.monitor}:.4f}}-{{epoch}}' - ModelCheckpoint.CHECKPOINT_NAME_LAST = callback.filename + '-last' + callback.filename = f"{self.name}--{{{callback.monitor}:.4f}}-{{epoch}}" + ModelCheckpoint.CHECKPOINT_NAME_LAST = callback.filename + "-last" def _handle_task_config(self, task_config, log_dir): try: @@ -223,7 +219,7 @@ def _handle_task_config(self, task_config, log_dir): with open(log_dir / "task.json", "w") as f: f.write(task_json) except Exception as e: - logging.warning(f'Saving task config failed: {e}. Skipping saving') + logging.warning(f"Saving task config failed: {e}. 
Skipping saving") def _setup_file_logging(self, log_dir): """Set up file logging based on rank settings.""" @@ -233,7 +229,7 @@ def _setup_file_logging(self, log_dir): # This is set if the env var NEMO_TESTING is set to True. nemo_testing = get_envbool(NEMO_ENV_VARNAME_TESTING, False) - log_file = log_dir / f'nemo_log_globalrank-{self.global_rank}_localrank-{self.local_rank}.txt' + log_file = log_dir / f"nemo_log_globalrank-{self.global_rank}_localrank-{self.local_rank}.txt" if self.log_local_rank_0_only and not nemo_testing and self.local_rank == 0: logging.add_file_handler(log_file) diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index a0a16150c65f..955367cb7581 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -21,7 +21,7 @@ def test_loggers(self): trainer = nl.Trainer(accelerator="cpu") logger = nl.NeMoLogger( update_logger_directory=True, - wandb=WandbLogger(save_dir="wandb_logs", offline=True), + wandb=WandbLogger(name="custom", save_dir="wandb_logs", offline=True), ) logger.setup(trainer) @@ -30,7 +30,7 @@ def test_loggers(self): assert len(trainer.loggers) == 2 assert isinstance(trainer.loggers[1], WandbLogger) assert str(trainer.loggers[1].save_dir).endswith("nemo_experiments/wandb_logs") - assert trainer.loggers[1]._name == "default" + assert trainer.loggers[1]._name == "custom" def test_explicit_log_dir(self, trainer): explicit_dir = "explicit_test_dir" From 5bbfa53e35da58feb3829a61c0421655d175f8ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 28 Aug 2024 14:11:44 -0700 Subject: [PATCH 067/664] Bump `Dockerfile.ci` (2024-08-28) (#10278) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 34e607e ! 
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * refactor: Deprecate `async_grad_allreduce` Signed-off-by: Oliver Koenig --------- Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Oliver Koenig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> --- Dockerfile.ci | 2 +- nemo/collections/nlp/modules/common/megatron/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index e687c385cce8..275aaecb95f0 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -34,7 +34,7 @@ WORKDIR /workspace # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.15.0 -ARG MCORE_TAG=01ca03f11e89f4f85682dcac647c2b913b25fcee +ARG MCORE_TAG=34e607ef41cf1c0ed481a678df9c76952d0ec00c ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ diff --git a/nemo/collections/nlp/modules/common/megatron/utils.py b/nemo/collections/nlp/modules/common/megatron/utils.py index 5aaac6755601..601cb7a4d7e8 100644 --- a/nemo/collections/nlp/modules/common/megatron/utils.py +++ b/nemo/collections/nlp/modules/common/megatron/utils.py @@ -93,7 +93,7 @@ def parallel_lm_logits( tensor_model_parallel = parallel_state.get_tensor_model_parallel_world_size() > 1 # async grad allreduce can only be used when not using sequence parallelism - async_grad_allreduce = async_tensor_model_parallel_allreduce and tensor_model_parallel and not sequence_parallel + allreduce_dgrad = async_tensor_model_parallel_allreduce and tensor_model_parallel and not sequence_parallel # copy input_ to model parallel region if needed if async_tensor_model_parallel_allreduce or sequence_parallel: @@ -108,7 +108,7 @@ def parallel_lm_logits( weight=word_embeddings_weight, bias=bias, gradient_accumulation_fusion=gradient_accumulation_fusion, - async_grad_allreduce=async_grad_allreduce, + allreduce_dgrad=allreduce_dgrad, sequence_parallel=sequence_parallel, ) From 60ac8aa5feda20a8d0cdb107b5a46e2fed6145be Mon Sep 17 00:00:00 2001 From: Slyne Deng Date: Wed, 28 Aug 2024 14:15:06 -0700 Subject: [PATCH 068/664] Multimodal trtllm export and infer script (#10287) * salm export trtllm Signed-off-by: slyne deng * add export script Signed-off-by: slyne deng * fix style Signed-off-by: slyne deng * Apply isort and black reformatting Signed-off-by: Slyne --------- Signed-off-by: slyne deng Signed-off-by: Slyne Co-authored-by: slyne deng Co-authored-by: Slyne --- scripts/export/export_mm_to_trtllm.py | 139 ++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 scripts/export/export_mm_to_trtllm.py diff --git a/scripts/export/export_mm_to_trtllm.py b/scripts/export/export_mm_to_trtllm.py new file mode 100644 index 000000000000..e7389f6e07af --- /dev/null +++ b/scripts/export/export_mm_to_trtllm.py @@ -0,0 +1,139 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This script exports multimodal model to TensorRT and do a local inference test. +For multimodal model, it supports the following models: +- NEVA +- Video-NEVA +- LITA +- VILA +- VITA +- SALM +""" + +import argparse +import os + +from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter + + +def parse_args(): + parser = argparse.ArgumentParser(description='Export multimodal model to TensorRT') + parser.add_argument('--output_dir', required=True, help='Directory to save the exported model') + parser.add_argument( + '--visual_checkpoint_path', + required=True, + help='Path to the visual model checkpoint or perception model checkpoint', + ) + parser.add_argument('--llm_checkpoint_path', required=True, help='Source .nemo file for llm') + parser.add_argument( + '--modality', + default="vision", + choices=["vision", "audio"], + help="Modality of the model", + ) + parser.add_argument( + '--model_type', + type=str, + required=True, + choices=["neva", "video-neva", "lita", "vila", "vita", "salm"], + help="Type of the model that is supported.", + ) + + parser.add_argument( + '--llm_model_type', + type=str, + required=True, + choices=["gptnext", "gpt", "llama", "falcon", "starcoder", "mixtral", "gemma"], + help="Type of LLM. gptnext, gpt, llama, falcon, and starcoder are only supported." + " gptnext and gpt are the same and keeping it for backward compatibility", + ) + + parser.add_argument('--tensor_parallel_size', type=int, default=1, help='tensor parallelism size') + parser.add_argument('--max_input_len', type=int, default=4096, help='Maximum input length') + parser.add_argument('--max_output_len', type=int, default=256, help='Maximum output length') + parser.add_argument('--max_batch_size', type=int, default=1, help='Maximum batch size') + parser.add_argument( + '--vision_max_batch_size', + type=int, + default=1, + help='Max batch size of the visual inputs, for lita/vita model with video inference, this should be set to 256', + ) + parser.add_argument('--max_multimodal_len', type=int, default=3072, help='Maximum multimodal length') + parser.add_argument( + "--dtype", + choices=["bfloat16", "float16"], + default="bfloat16", + type=str, + help="dtype of the model on TensorRT", + ) + parser.add_argument( + '--delete_existing_files', action='store_true', help='Delete existing files in the output directory' + ) + parser.add_argument( + '--test_export_only', action='store_true', help='Only test the export without saving the model' + ) + parser.add_argument('--input_text', help='Input text for inference') + parser.add_argument('--input_media', default=None, help='Input media file for inference') + parser.add_argument('--batch_size', type=int, default=1, help='Batch size for inference') + parser.add_argument('--max_output', type=int, default=128, help='Maximum output length for inference') + parser.add_argument('--top_k', type=int, default=1, help='Top k for sampling') + parser.add_argument('--top_p', type=float, default=0.0, help='Top p for sampling') + parser.add_argument("--temperature", default=1.0, type=float, help="temperature") + parser.add_argument("--repetition_penalty", default=1.0, type=float, help="repetition_penalty") + parser.add_argument("--num_beams", default=1, type=int, help="num_beams") + + args = parser.parse_args() + return args + + +def main(args): + exporter = TensorRTMMExporter(model_dir=args.output_dir, load_model=False, modality=args.modality) + exporter.export( 
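+        # Build TensorRT-LLM engines for both the vision/perception model and the LLM; per the --output_dir help above, they are written under args.output_dir (the exporter's model_dir).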
+ visual_checkpoint_path=args.visual_checkpoint_path, + llm_checkpoint_path=args.llm_checkpoint_path, + model_type=args.model_type, + llm_model_type=args.llm_model_type, + tensor_parallel_size=args.tensor_parallel_size, + max_input_len=args.max_input_len, + max_output_len=args.max_output_len, + max_batch_size=args.max_batch_size, + vision_max_batch_size=args.vision_max_batch_size, + max_multimodal_len=args.max_multimodal_len, + dtype=args.dtype, + delete_existing_files=args.delete_existing_files, + load_model=not args.test_export_only, + ) + test_inference = not args.test_export_only + if test_inference: + assert args.input_media is not None, "Input media file is required for inference" + assert os.path.exists(args.input_media), f"Input media file {args.input_media} does not exist" + output = exporter.forward( + input_text=args.input_text, + input_media=args.input_media, + batch_size=args.batch_size, + max_output_len=args.max_output, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + repetition_penalty=args.repetition_penalty, + num_beams=args.num_beams, + ) + print(output) + + +if __name__ == '__main__': + args = parse_args() + main(args) From a860e6bf4592e278403a975406abc1e246de2052 Mon Sep 17 00:00:00 2001 From: Ryan Langman Date: Wed, 28 Aug 2024 14:20:13 -0700 Subject: [PATCH 069/664] [TTS] Add config and modules for 22khz and 44khz audio codec (#10107) * [TTS] Add config and modules for 22khz and 44khz audio codec Signed-off-by: Ryan * Apply isort and black reformatting Signed-off-by: rlangman * [TTS] Add argument docstring to new modules Signed-off-by: Ryan --------- Signed-off-by: Ryan Signed-off-by: rlangman Co-authored-by: rlangman --- .../conf/audio_codec/audio_codec_22050.yaml | 193 +++++++++++ .../conf/audio_codec/audio_codec_44100.yaml | 193 +++++++++++ nemo/collections/common/parts/utils.py | 54 +++ .../tts/modules/audio_codec_modules.py | 313 +++++++++++++++++- 4 files changed, 748 insertions(+), 5 deletions(-) create mode 100644 examples/tts/conf/audio_codec/audio_codec_22050.yaml create mode 100644 examples/tts/conf/audio_codec/audio_codec_44100.yaml diff --git a/examples/tts/conf/audio_codec/audio_codec_22050.yaml b/examples/tts/conf/audio_codec/audio_codec_22050.yaml new file mode 100644 index 000000000000..c45f2c2a129c --- /dev/null +++ b/examples/tts/conf/audio_codec/audio_codec_22050.yaml @@ -0,0 +1,193 @@ +# This config contains the default values for training 22.05kHz NeMo Audio Codec model. +# If you want to train model on other dataset, you can change config values according to your dataset. +# Most dataset-specific arguments are in the head of the config file, see below. + +name: AudioCodec + +max_epochs: ??? +# Adjust batch size based on GPU memory +batch_size: 16 +# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch. +# If null, then weighted sampling is disabled. +weighted_sampling_steps_per_epoch: null + +# Dataset metadata for each manifest +# https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/data/vocoder_dataset.py#L39-L41 +train_ds_meta: ??? +val_ds_meta: ??? + +log_ds_meta: ??? +log_dir: ??? + +# Modify these values based on your sample rate +sample_rate: 22050 +win_length: 1024 +hop_length: 256 +train_n_samples: 8192 # ~0.37 seconds +# The product of the down_sample_rates and up_sample_rates should match the hop_length. +# For example 2 * 2 * 8 * 8 = 256. 
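+# The resulting codec frame rate is sample_rate / hop_length, i.e. 22050 / 256 ≈ 86 frames per second for this config.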
+down_sample_rates: [2, 2, 8, 8] +up_sample_rates: [8, 8, 2, 2] + +num_codebooks: 8 +encoder_out_dim: 32 + +model: + + max_epochs: ${max_epochs} + steps_per_epoch: ${weighted_sampling_steps_per_epoch} + + sample_rate: ${sample_rate} + samples_per_frame: ${hop_length} + + mel_loss_l1_scale: 10.0 + mel_loss_l2_scale: 0.0 + stft_loss_scale: 10.0 + time_domain_loss_scale: 0.0 + commit_loss_scale: 0.0 + + # Probability of updating the discriminator during each training step + # For example, update the discriminator 1/2 times (1 update for every 2 batches) + disc_updates_per_period: 1 + disc_update_period: 2 + + # All resolutions for mel reconstruction loss, ordered [num_fft, hop_length, window_length] + loss_resolutions: [ + [32, 8, 32], [64, 16, 64], [128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024] + ] + mel_loss_dims: [5, 10, 20, 40, 80, 160] + mel_loss_log_guard: 1.0 + stft_loss_log_guard: 1.0 + feature_loss_type: absolute + + train_ds: + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + dataset_meta: ${train_ds_meta} + weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} + sample_rate: ${sample_rate} + n_samples: ${train_n_samples} + min_duration: 0.4 # seconds + max_duration: null + + dataloader_params: + batch_size: ${batch_size} + drop_last: true + num_workers: 4 + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + sample_rate: ${sample_rate} + n_samples: null + min_duration: null + max_duration: null + trunc_duration: 10.0 # Only use the first 10 seconds of audio for computing validation loss + dataset_meta: ${val_ds_meta} + + dataloader_params: + batch_size: 4 + num_workers: 2 + + # Configures how audio samples are generated and saved during training. + # Remove this section to disable logging. + log_config: + log_dir: ${log_dir} + log_epochs: [10, 50] + epoch_frequency: 100 + log_tensorboard: false + log_wandb: false + + generators: + - _target_: nemo.collections.tts.parts.utils.callbacks.AudioCodecArtifactGenerator + log_audio: true + log_encoding: false + log_dequantized: false + + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + sample_rate: ${sample_rate} + n_samples: null + min_duration: null + max_duration: null + trunc_duration: 10.0 # Only log the first 10 seconds of generated audio. 
+ dataset_meta: ${log_ds_meta} + + dataloader_params: + batch_size: 4 + num_workers: 2 + + audio_encoder: + _target_: nemo.collections.tts.modules.audio_codec_modules.HiFiGANEncoder + down_sample_rates: ${down_sample_rates} + encoded_dim: ${encoder_out_dim} + base_channels: 48 + activation: "lrelu" + + audio_decoder: + _target_: nemo.collections.tts.modules.audio_codec_modules.HiFiGANDecoder + up_sample_rates: ${up_sample_rates} + input_dim: ${encoder_out_dim} + base_channels: 768 + activation: "half_snake" + output_activation: "clamp" + + vector_quantizer: + _target_: nemo.collections.tts.modules.audio_codec_modules.GroupFiniteScalarQuantizer + num_groups: ${num_codebooks} + num_levels_per_group: [8, 5, 5, 5] + + discriminator: + _target_: nemo.collections.tts.modules.audio_codec_modules.Discriminator + discriminators: + - _target_: nemo.collections.tts.modules.audio_codec_modules.MultiPeriodDiscriminator + - _target_: nemo.collections.tts.modules.audio_codec_modules.MultiResolutionDiscriminatorSTFT + resolutions: [[512, 128, 512], [1024, 256, 1024]] + stft_bands: [[0.0, 0.1], [0.1, 0.25], [0.25, 0.5], [0.5, 0.75], [0.75, 1.0]] + + generator_loss: + _target_: nemo.collections.tts.losses.audio_codec_loss.GeneratorSquaredLoss + + discriminator_loss: + _target_: nemo.collections.tts.losses.audio_codec_loss.DiscriminatorSquaredLoss + + optim: + _target_: torch.optim.Adam + lr: 2e-4 + betas: [0.8, 0.99] + + sched: + name: ExponentialLR + gamma: 0.998 + +trainer: + num_nodes: 1 + devices: -1 + accelerator: gpu + strategy: ddp_find_unused_parameters_true + precision: 16 + max_epochs: ${max_epochs} + accumulate_grad_batches: 1 + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 10 + benchmark: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: false + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + mode: min + save_top_k: 5 + save_best_model: true + always_save_nemo: true + resume_if_exists: false + resume_ignore_no_checkpoint: false diff --git a/examples/tts/conf/audio_codec/audio_codec_44100.yaml b/examples/tts/conf/audio_codec/audio_codec_44100.yaml new file mode 100644 index 000000000000..eab13a0e440b --- /dev/null +++ b/examples/tts/conf/audio_codec/audio_codec_44100.yaml @@ -0,0 +1,193 @@ +# This config contains the default values for training 44.1kHz NeMo Audio Codec model. +# If you want to train model on other dataset, you can change config values according to your dataset. +# Most dataset-specific arguments are in the head of the config file, see below. + +name: AudioCodec + +max_epochs: ??? +# Adjust batch size based on GPU memory +batch_size: 16 +# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch. +# If null, then weighted sampling is disabled. +weighted_sampling_steps_per_epoch: null + +# Dataset metadata for each manifest +# https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/data/vocoder_dataset.py#L39-L41 +train_ds_meta: ??? +val_ds_meta: ??? + +log_ds_meta: ??? +log_dir: ??? + +# Modify these values based on your sample rate +sample_rate: 44100 +win_length: 2048 +hop_length: 512 +train_n_samples: 16384 # ~0.37 seconds +# The product of the down_sample_rates and up_sample_rates should match the hop_length. +# For example 2 * 4 * 8 * 8 = 512. 
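+# The resulting codec frame rate is sample_rate / hop_length, i.e. 44100 / 512 ≈ 86 frames per second for this config.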
+down_sample_rates: [2, 4, 8, 8] +up_sample_rates: [8, 8, 4, 2] + +num_codebooks: 8 +encoder_out_dim: 32 + +model: + + max_epochs: ${max_epochs} + steps_per_epoch: ${weighted_sampling_steps_per_epoch} + + sample_rate: ${sample_rate} + samples_per_frame: ${hop_length} + + mel_loss_l1_scale: 10.0 + mel_loss_l2_scale: 0.0 + stft_loss_scale: 10.0 + time_domain_loss_scale: 0.0 + commit_loss_scale: 0.0 + + # Probability of updating the discriminator during each training step + # For example, update the discriminator 1/2 times (1 update for every 2 batches) + disc_updates_per_period: 1 + disc_update_period: 2 + + # All resolutions for mel reconstruction loss, ordered [num_fft, hop_length, window_length] + loss_resolutions: [ + [32, 8, 32], [64, 16, 64], [128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024], [2048, 512, 2048] + ] + mel_loss_dims: [5, 10, 20, 40, 80, 160, 320] + mel_loss_log_guard: 1.0 + stft_loss_log_guard: 1.0 + feature_loss_type: absolute + + train_ds: + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + dataset_meta: ${train_ds_meta} + weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} + sample_rate: ${sample_rate} + n_samples: ${train_n_samples} + min_duration: 0.4 # seconds + max_duration: null + + dataloader_params: + batch_size: ${batch_size} + drop_last: true + num_workers: 4 + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + sample_rate: ${sample_rate} + n_samples: null + min_duration: null + max_duration: null + trunc_duration: 10.0 # Only use the first 10 seconds of audio for computing validation loss + dataset_meta: ${val_ds_meta} + + dataloader_params: + batch_size: 4 + num_workers: 2 + + # Configures how audio samples are generated and saved during training. + # Remove this section to disable logging. + log_config: + log_dir: ${log_dir} + log_epochs: [10, 50] + epoch_frequency: 100 + log_tensorboard: false + log_wandb: false + + generators: + - _target_: nemo.collections.tts.parts.utils.callbacks.AudioCodecArtifactGenerator + log_audio: true + log_encoding: false + log_dequantized: false + + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + sample_rate: ${sample_rate} + n_samples: null + min_duration: null + max_duration: null + trunc_duration: 10.0 # Only log the first 10 seconds of generated audio. 
+ dataset_meta: ${log_ds_meta} + + dataloader_params: + batch_size: 4 + num_workers: 2 + + audio_encoder: + _target_: nemo.collections.tts.modules.audio_codec_modules.HiFiGANEncoder + down_sample_rates: ${down_sample_rates} + encoded_dim: ${encoder_out_dim} + base_channels: 48 + activation: "lrelu" + + audio_decoder: + _target_: nemo.collections.tts.modules.audio_codec_modules.HiFiGANDecoder + up_sample_rates: ${up_sample_rates} + input_dim: ${encoder_out_dim} + base_channels: 768 + activation: "half_snake" + output_activation: "clamp" + + vector_quantizer: + _target_: nemo.collections.tts.modules.audio_codec_modules.GroupFiniteScalarQuantizer + num_groups: ${num_codebooks} + num_levels_per_group: [8, 5, 5, 5] + + discriminator: + _target_: nemo.collections.tts.modules.audio_codec_modules.Discriminator + discriminators: + - _target_: nemo.collections.tts.modules.audio_codec_modules.MultiPeriodDiscriminator + - _target_: nemo.collections.tts.modules.audio_codec_modules.MultiResolutionDiscriminatorSTFT + resolutions: [[512, 128, 512], [1024, 256, 1024], [2048, 512, 2048]] + stft_bands: [[0.0, 0.1], [0.1, 0.25], [0.25, 0.5], [0.5, 0.75], [0.75, 1.0]] + + generator_loss: + _target_: nemo.collections.tts.losses.audio_codec_loss.GeneratorSquaredLoss + + discriminator_loss: + _target_: nemo.collections.tts.losses.audio_codec_loss.DiscriminatorSquaredLoss + + optim: + _target_: torch.optim.Adam + lr: 2e-4 + betas: [0.8, 0.99] + + sched: + name: ExponentialLR + gamma: 0.998 + +trainer: + num_nodes: 1 + devices: -1 + accelerator: gpu + strategy: ddp_find_unused_parameters_true + precision: 16 + max_epochs: ${max_epochs} + accumulate_grad_batches: 1 + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 10 + benchmark: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: false + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + mode: min + save_top_k: 5 + save_best_model: true + always_save_nemo: true + resume_if_exists: false + resume_ignore_no_checkpoint: false diff --git a/nemo/collections/common/parts/utils.py b/nemo/collections/common/parts/utils.py index 75783815548a..e08f7d710183 100644 --- a/nemo/collections/common/parts/utils.py +++ b/nemo/collections/common/parts/utils.py @@ -159,3 +159,57 @@ def mask_sequence_tensor(tensor: torch.Tensor, lengths: torch.Tensor): raise ValueError('Can only mask tensors of shape B x L, B x D x L and B x D1 x D2 x L') return tensor * mask + + +class ClampActivation(nn.Module): + + def __init__(self, min_value: float = -1.0, max_value: float = 1.0): + super().__init__() + self.min_value = min_value + self.max_value = max_value + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return torch.clamp(input, min=self.min_value, max=self.max_value) + + +@torch.jit.script +def snake(x: torch.Tensor, alpha: torch.Tensor, eps: float = 1e-9) -> torch.Tensor: + """ + equation for snake activation function: x + (alpha + eps)^-1 * sin(alpha * x)^2 + """ + shape = x.shape + x = x.reshape(shape[0], shape[1], -1) + x = x + (alpha + eps).reciprocal() * torch.sin(alpha * x).pow(2) + x = x.reshape(shape) + return x + + +class Snake(nn.Module): + """ + Snake activation function introduced in 'https://arxiv.org/abs/2006.08195' + """ + + def __init__(self, channels: int): + super().__init__() + self.alpha = nn.Parameter(torch.ones(1, 
channels, 1)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return snake(x, self.alpha) + + +class HalfSnake(nn.Module): + """ + Activation which applies snake to the first half of input elements and leaky relu to the second half. + """ + + def __init__(self, channels: int): + super().__init__() + self.snake_channels = channels // 2 + self.snake_act = Snake(self.snake_channels) + self.lrelu = torch.nn.LeakyReLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + snake_out = self.snake_act(x[:, : self.snake_channels, :]) + lrelu_out = self.lrelu(x[:, self.snake_channels :, :]) + out = torch.cat([snake_out, lrelu_out], dim=1) + return out diff --git a/nemo/collections/tts/modules/audio_codec_modules.py b/nemo/collections/tts/modules/audio_codec_modules.py index e9ed34732c36..c8070225d25a 100644 --- a/nemo/collections/tts/modules/audio_codec_modules.py +++ b/nemo/collections/tts/modules/audio_codec_modules.py @@ -22,8 +22,7 @@ from einops import rearrange from nemo.collections.asr.modules import AudioToMelSpectrogramPreprocessor -from nemo.collections.asr.parts.utils.activations import Snake -from nemo.collections.common.parts.utils import mask_sequence_tensor +from nemo.collections.common.parts.utils import ClampActivation, HalfSnake, Snake, mask_sequence_tensor from nemo.core.classes.common import typecheck from nemo.core.classes.module import NeuralModule from nemo.core.neural_types.elements import ( @@ -75,6 +74,8 @@ def __init__(self, activation: str = "elu", channels: int = 1): self.activation = torch.nn.LeakyReLU() elif activation == "snake": self.activation = Snake(channels) + elif activation == "half_snake": + self.activation = HalfSnake(channels) else: raise ValueError(f"Unknown activation {activation}") @@ -322,6 +323,185 @@ def forward(self, audio_real, audio_gen): return scores_real, scores_gen, fmaps_real, fmaps_gen +class DiscriminatorSTFT(NeuralModule): + """ + Discriminator network from EnCodec for Complex STFT input, but without dilations. + + Args: + filters: number of filters to use in Conv2d layers + lrelu_slope: Slope to use for activations. 
Leaky relu with slope of 0.1 or 0.2 is recommended for the + stability of the feature matching loss + """ + + def __init__(self, filters: int = 32, lrelu_slope: float = 0.1): + super().__init__() + + self.activation = nn.LeakyReLU(lrelu_slope) + self.conv_layers = nn.ModuleList( + [ + Conv2dNorm(2, filters, kernel_size=(3, 9)), + Conv2dNorm(filters, filters, kernel_size=(3, 9), stride=(1, 2)), + Conv2dNorm(filters, filters, kernel_size=(3, 9), stride=(1, 2)), + Conv2dNorm(filters, filters, kernel_size=(3, 9), stride=(1, 2)), + Conv2dNorm(filters, filters, kernel_size=(3, 3)), + ] + ) + self.conv_post = Conv2dNorm(filters, 1, kernel_size=(3, 3)) + + @property + def input_types(self): + return { + "spec": NeuralType(('B', 'C', 'T_spec', 'D'), VoidType()), + } + + @property + def output_types(self): + return { + "scores": NeuralType(('B', 'C', 'T_spec'), VoidType()), + "fmap": [NeuralType(('B', 'D', 'T_spec', 'C'), VoidType())], + } + + @typecheck() + def forward(self, spec): + fmap = [] + + # [batch, 2, T_spec, fft] + out = spec + for conv in self.conv_layers: + # [batch, filters, T_spec, fft // strides] + out = conv(inputs=out) + out = self.activation(out) + fmap.append(out) + # [batch, 1, T_spec, fft // 8] + scores = self.conv_post(inputs=out) + fmap.append(scores) + scores = rearrange(scores, "B 1 T C -> B C T") + + return scores, fmap + + +class MultiBandDiscriminatorSTFT(NeuralModule): + """ + Multi-band STFT discriminator proposed in DAC (https://arxiv.org/abs/2306.06546). + + Computes the complex STFT for a given resolution and splits it into sub-bands, + which are given to separate discriminator networks. + + Args: + resolution: STFT resolution, provided as a tuple of 3 integers ordered (num_fft, hop_length, window_length) + stft_bands: List of tuples, with each tuple having 2 float values (band_start, band_end). + The floats are in the range [0, 1] representing the fraction of all stft bands. + For example for n_fft=1024, the stft output has 513 dimensions. + For band input [(0, 0.25), (0.25, 1.0)] it would use stft dimensions [0 through 127] and [128 through 512]. 
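+ + Example (illustrative, matching the 22.05 kHz codec config in this PR): + MultiBandDiscriminatorSTFT(resolution=(1024, 256, 1024), + stft_bands=[(0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)])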
+ """ + + def __init__(self, resolution: Tuple[int], stft_bands: Iterable[Tuple[int]]): + super().__init__() + + self.n_fft, self.hop_length, self.win_length = resolution + self.register_buffer("window", torch.hann_window(self.win_length, periodic=False)) + self.discriminators = nn.ModuleList([DiscriminatorSTFT() for _ in stft_bands]) + n_stft = self.n_fft // 2 + 1 + self.stft_bands = [(int(band[0] * n_stft), int(band[1] * n_stft)) for band in stft_bands] + + def compute_stft(self, audio): + # [B, fft, T_spec] + fft = torch.stft( + audio, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + window=self.window, + normalized=True, + center=True, + return_complex=True, + ) + fft = rearrange(fft, "B fft T -> B T fft") + # [batch, 2, T_spec, fft] + out = torch.stack([fft.real, fft.imag], dim=1) + return out + + @property + def input_types(self): + return { + "audio": NeuralType(('B', 'T_audio'), AudioSignal()), + } + + @property + def output_types(self): + return { + "scores_list": [NeuralType(('B', 'C', 'T_spec'), VoidType())], + "fmaps_list": [[NeuralType(('B', 'D', 'T_spec', 'C'), VoidType())]], + } + + @typecheck() + def forward(self, audio): + scores_list = [] + fmap_list = [] + spec = self.compute_stft(audio) + for band, disc in zip(self.stft_bands, self.discriminators): + spec_band = spec[:, :, :, band[0] : band[1]] + score, fmap = disc(spec=spec_band) + scores_list.append(score) + fmap_list.append(fmap) + + return scores_list, fmap_list + + +class MultiResolutionDiscriminatorSTFT(NeuralModule): + """ + Multi-resolution discriminator which creates a multi-band discriminator for each input resolution. + + Args: + resolutions: List of STFT resolutions, each resolution provided as a tuple of 3 integers ordered + (num_fft, hop_length, window_length) + stft_bands: List of tuples, with each tuple having 2 float values (band_start, band_end). + The floats are in the range [0, 1] representing the fraction of all stft bands. + For example for n_fft=1024, the stft output has 513 dimensions. + For band input [(0, 0.25), (0.25, 1.0)] it would use stft dimensions [0 through 127] and [128 through 512]. + """ + + def __init__(self, resolutions: Iterable[Tuple[int]], stft_bands: Iterable[Tuple[int]]): + super().__init__() + self.discriminators = nn.ModuleList( + [MultiBandDiscriminatorSTFT(resolution=resolution, stft_bands=stft_bands) for resolution in resolutions] + ) + + @property + def input_types(self): + return { + "audio_real": NeuralType(('B', 'T_audio'), AudioSignal()), + "audio_gen": NeuralType(('B', 'T_audio'), AudioSignal()), + } + + @property + def output_types(self): + return { + "scores_real": [NeuralType(('B', 'C', 'T_spec'), VoidType())], + "scores_gen": [NeuralType(('B', 'C', 'T_spec'), VoidType())], + "fmaps_real": [[NeuralType(('B', 'D', 'T_spec', 'C'), VoidType())]], + "fmaps_gen": [[NeuralType(('B', 'D', 'T_spec', 'C'), VoidType())]], + } + + @typecheck() + def forward(self, audio_real, audio_gen): + scores_real = [] + scores_gen = [] + fmaps_real = [] + fmaps_gen = [] + + for disc in self.discriminators: + score_real_i, fmap_real_i = disc(audio=audio_real) + scores_real = scores_real + score_real_i + fmaps_real = fmaps_real + fmap_real_i + + score_gen_i, fmap_gen_i = disc(audio=audio_gen) + scores_gen = scores_gen + score_gen_i + fmaps_gen = fmaps_gen + fmap_gen_i + + return scores_real, scores_gen, fmaps_real, fmaps_gen + + class Discriminator(NeuralModule): """ Wrapper class which takes a list of discriminators and aggregates the results across them. 
@@ -868,6 +1048,120 @@ def forward(self, inputs, input_len): return out +class HiFiGANEncoder(NeuralModule): + """ + Audio encoder created by inverting the HiFi-GAN decoder. + + Args: + encoded_dim: Dimension of encoder output. + down_sample_rates: Rate to upsample for each decoder block. The product of the downsample rates will + determine the output token rate. For example 2 * 2 * 8 * 8 = 256 samples per token. + base_channels: Number of filters in the first convolution. The number of channels will be doubled after each + downsample layer. + in_kernel_size: Kernel size of the input convolution. + out_kernel_size: Kernel size of the output convolution. + resblock_kernel_sizes: List of kernel sizes to use in each residual block. + resblock_dilation_sizes: List of dilations to use in each residual block. + activation: Activation to use in residual and downsample layers, defaults to leaky relu. + """ + + def __init__( + self, + encoded_dim: int, + down_sample_rates: Iterable[int] = (2, 2, 8, 8), + base_channels: int = 32, + in_kernel_size: int = 7, + out_kernel_size: int = 7, + resblock_kernel_sizes: Iterable[int] = (3, 7, 11), + resblock_dilation_sizes: Iterable[int] = (1, 3, 5), + activation: str = "lrelu", + ): + assert in_kernel_size > 0 + assert out_kernel_size > 0 + + super().__init__() + + self.down_sample_rates = down_sample_rates + self.pre_conv = Conv1dNorm(in_channels=1, out_channels=base_channels, kernel_size=in_kernel_size) + + in_channels = base_channels + self.activations = nn.ModuleList([]) + self.down_sample_conv_layers = nn.ModuleList([]) + self.res_layers = nn.ModuleList([]) + for i, down_sample_rate in enumerate(self.down_sample_rates): + res_layer = HiFiGANResLayer( + channels=in_channels, + kernel_sizes=resblock_kernel_sizes, + dilations=resblock_dilation_sizes, + activation=activation, + ) + self.res_layers.append(res_layer) + + act = CodecActivation(activation, channels=in_channels) + self.activations.append(act) + + out_channels = 2 * in_channels + kernel_size = 2 * down_sample_rate + + padding = get_down_sample_padding(kernel_size=kernel_size, stride=down_sample_rate) + down_sample_conv = Conv1dNorm( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=down_sample_rate, + padding=padding, + ) + in_channels = out_channels + self.down_sample_conv_layers.append(down_sample_conv) + + self.post_activation = CodecActivation(activation, channels=in_channels) + self.post_conv = Conv1dNorm(in_channels=in_channels, out_channels=encoded_dim, kernel_size=out_kernel_size) + + @property + def input_types(self): + return { + "audio": NeuralType(('B', 'T_audio'), AudioSignal()), + "audio_len": NeuralType(tuple('B'), LengthsType()), + } + + @property + def output_types(self): + return { + "encoded": NeuralType(('B', 'D', 'T_encoded'), EncodedRepresentation()), + "encoded_len": NeuralType(tuple('B'), LengthsType()), + } + + def remove_weight_norm(self): + self.pre_conv.remove_weight_norm() + self.post_conv.remove_weight_norm() + for res_layer in self.res_layers: + res_layer.remove_weight_norm() + for down_sample_conv in self.down_sample_conv_layers: + down_sample_conv.remove_weight_norm() + + @typecheck() + def forward(self, audio, audio_len): + encoded_len = audio_len + audio = rearrange(audio, "B T -> B 1 T") + # [B, C, T_audio] + out = self.pre_conv(inputs=audio, input_len=encoded_len) + for act, res_layer, down_sample_conv, down_sample_rate in zip( + self.activations, self.res_layers, self.down_sample_conv_layers, self.down_sample_rates + ): 
+ # [B, C, T] + out = res_layer(inputs=out, input_len=encoded_len) + out = act(out) + + encoded_len = encoded_len // down_sample_rate + # [B, 2 * C, T / down_sample_rate] + out = down_sample_conv(inputs=out, input_len=encoded_len) + + out = self.post_activation(out) + # [B, encoded_dim, T_encoded] + encoded = self.post_conv(inputs=out, input_len=encoded_len) + return encoded, encoded_len + + class HiFiGANDecoder(NeuralModule): """ Codec decoder using the HiFi-GAN generator architecture. @@ -876,8 +1170,9 @@ class HiFiGANDecoder(NeuralModule): Args: input_dim: Input dimension. - up_sample_rates: Rate to upsample for each decoder block. The product of the upsample rates will - determine the output frame rate. For example 8 * 8 * 2 * 2 = 256 samples per token. + up_sample_rates: Rate to upsample for each decoder block. The product of the upsample rates should be the same + as the overall downsample rate for your encoder. For example, a symmetric encoder/decoder can be created + with encoder downsample rates [2, 2, 8, 8] and decoder upsample rates [8, 8, 2, 2]. base_channels: Number of filters in the first convolution. The number of channels will be cut in half after each upsample layer. in_kernel_size: Kernel size of the input convolution. @@ -885,6 +1180,8 @@ class HiFiGANDecoder(NeuralModule): resblock_kernel_sizes: List of kernel sizes to use in each residual block. resblock_dilation_sizes: List of dilations to use in each residual block. activation: Activation to use in residual and upsample layers, defaults to leaky relu. + output_activation: Activation to apply to output. To produce a valid audio signal, it should output values in + the range [-1.0, 1.0]. Supports "tanh" and "clamp". """ def __init__( @@ -897,6 +1194,7 @@ def __init__( resblock_kernel_sizes: Iterable[int] = (3, 7, 11), resblock_dilation_sizes: Iterable[int] = (1, 3, 5), activation: str = "lrelu", + output_activation: str = "tanh", ): assert in_kernel_size > 0 assert out_kernel_size > 0 @@ -933,7 +1231,12 @@ def __init__( self.post_activation = CodecActivation(activation, channels=in_channels) self.post_conv = Conv1dNorm(in_channels=in_channels, out_channels=1, kernel_size=out_kernel_size) - self.out_activation = nn.Tanh() + if output_activation == "tanh": + self.out_activation = nn.Tanh() + elif output_activation == "clamp": + self.out_activation = ClampActivation() + else: + raise ValueError(f"Invalid audio output activation {output_activation}") @property def input_types(self): From f45422a5bb06592a7f81712c614bc7d774c0117a Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Wed, 28 Aug 2024 16:19:57 -0700 Subject: [PATCH 070/664] Add example script to run NeMo 2.0 llama pretraining with NeMo-Run (#10226) * Add example script to run NeMo 2.0 llama pretraining with NeMo-Run Signed-off-by: Hemil Desai * Fixes Signed-off-by: Hemil Desai * Address PR comments Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai --- examples/llm/run/llama3_pretraining.py | 190 +++++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 examples/llm/run/llama3_pretraining.py diff --git a/examples/llm/run/llama3_pretraining.py b/examples/llm/run/llama3_pretraining.py new file mode 100644 index 000000000000..612b58e2169f --- /dev/null +++ b/examples/llm/run/llama3_pretraining.py @@ -0,0 +1,190 @@ +# This script is used for pretraining a Llama3 model, specifically for the 8b or 70b model variants, on local and slurm executors. 
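+# For example (illustrative): `python llama3_pretraining.py --size 8b --dryrun` configures a local 8b run and exits after the dry run, without launching training.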
+# It uses NeMo 2.0 recipes (https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/llm/recipes/llama3_8b.py#L74) and NeMo-Run (https://github.com/NVIDIA/NeMo-Run) to configure and execute the runs. + +import argparse +from functools import partial +from typing import Any, Optional + +import nemo_run as run + +from nemo.collections import llm + + +def get_parser(): + parser = argparse.ArgumentParser(description="Llama3 Pretraining") + parser.add_argument( + "--size", + type=str, + default="8b", + help="Choose llama3 model size 70b/8b", + ) + parser.add_argument( + "--tag", + type=str, + help="Optional tag for your experiment title which will be appended after the model/exp name.", + required=False, + default="", + ) + parser.add_argument( + "--dryrun", + action="store_true", + help="Do a dryrun and exit", + default=False, + ) + parser.add_argument( + "--slurm", + action="store_true", + help="Run on slurm using run.SlurmExecutor", + default=False, + ) + return parser + + +def slurm_executor( + user: str, + host: str, + remote_job_dir: str, + account: str, + partition: str, + nodes: int, + devices: int, + time: str = "01:00:00", + custom_mounts: Optional[list[str]] = None, + custom_env_vars: Optional[dict[str, str]] = None, + container_image: str = "nvcr.io/nvidia/nemo:dev", + retries: int = 0, +) -> run.SlurmExecutor: + if not (user and host and remote_job_dir and account and partition and nodes and devices): + raise RuntimeError( + "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this function." + ) + + mounts = [] + if custom_mounts: + mounts.extend(custom_mounts) + + env_vars = { + "TRANSFORMERS_OFFLINE": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", + "NVTE_ASYNC_AMAX_REDUCTION": "1", + "NVTE_FUSED_ATTN": "0", + } + if custom_env_vars: + env_vars |= custom_env_vars + + executor = run.SlurmExecutor( + account=account, + partition=partition, + tunnel=run.SSHTunnel( + user=user, + host=host, + job_dir=remote_job_dir, + ), + nodes=nodes, + ntasks_per_node=devices, + gpus_per_node=devices, + mem="0", + exclusive=True, + gres="gpu:8", + packager=run.GitArchivePackager(subpath="examples/llm/run"), + ) + + executor.container_image = container_image + executor.container_mounts = mounts + executor.env_vars = env_vars + executor.retries = retries + executor.time = time + + return executor + + +def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor: + env_vars = { + "TRANSFORMERS_OFFLINE": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", + "NVTE_ASYNC_AMAX_REDUCTION": "1", + "NVTE_FUSED_ATTN": "0", + } + + executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) + + return executor + + +def main(): + args = get_parser().parse_args() + if args.tag and not args.tag.startswith("-"): + args.tag = "-" + args.tag + + MODEL_SIZE_MAPPING: dict[str, dict[str, Any]] = { + "8b": { + "exp_name": "llama3-8b", + "nemo": { + "pretrain": partial(llm.llama3_8b.pretrain_recipe, num_nodes=1, num_gpus_per_node=8), + }, + }, + "70b": { + "exp_name": "llama3-70b", + "nemo": { + "pretrain": partial(llm.llama3_70b.pretrain_recipe, num_nodes=128, num_gpus_per_node=8), + }, + }, + } + + exp_name = MODEL_SIZE_MAPPING[args.size]["exp_name"] + + # Uses configs from NeMo directly + pretrain = MODEL_SIZE_MAPPING[args.size]["nemo"]["pretrain"]( + name=exp_name, + 
ckpt_dir=f"/{exp_name}/checkpoints", + ) + + # Overwrite the dataloader in the recipe to use your custom dataloader. + # dataloader = set_your_custom_dataloader + # pretrain.data = dataloader + + pretrain.trainer.val_check_interval = 400 + pretrain.log.ckpt.save_top_k = -1 + pretrain.log.ckpt.every_n_train_steps = 400 + + pretrain.trainer.max_steps = 1000 + + executor: run.Executor + + if args.slurm: + # TODO: Set your custom parameters for the Slurm Executor. + executor = slurm_executor( + user="", + host="", + remote_job_dir="", + account="", + partition="", + nodes=pretrain.trainer.num_nodes, + devices=pretrain.trainer.devices, + ) + else: + executor = local_executor_torchrun(nodes=pretrain.trainer.num_nodes, devices=pretrain.trainer.devices) + + with run.Experiment(f"{exp_name}{args.tag}") as exp: + pretrain.log.dir = f"/{exp_name}/checkpoints" + + for i in range(1): + exp.add( + pretrain, + executor=executor, + name=exp_name, + tail_logs=True if isinstance(executor, run.LocalExecutor) else False, + ) + + if args.dryrun: + exp.dryrun() + else: + exp.run(sequential=True, detach=True) + + +if __name__ == "__main__": + main() From 22f0bb0179543f84c3be00959ea532bc4da57825 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 28 Aug 2024 20:17:56 -0700 Subject: [PATCH 071/664] Add FSDP for NeMo 2.0 (#9748) * modify code structure and add strategy Signed-off-by: jasonwan * correct doc url Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * oextract common elements and add callback Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * add iomixin Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * update strategies Signed-off-by: jasonwan * update callback Signed-off-by: jasonwan * add training step to strategy Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * remove unused import Signed-off-by: jasonwan * add iomixin to strategy & precision Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: artbataev * add val/test steps to strategy Signed-off-by: jasonwan * add documentations Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * add default sharding for fsdp. 
add setup callback detection Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * extract checkpoint io logic Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * clean up unused imports Signed-off-by: jasonwan * sync new megatron strategy changes Signed-off-by: jasonwan * break down setup callback back into strategy Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * reorder stuff Signed-off-by: jasonwan * fix data logic Signed-off-by: jasonwan * minor fix Signed-off-by: jasonwan * add dtensor ckpt conversion support Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * support hsdp Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * remove iomixin Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * fix import Signed-off-by: jasonwan * refactor loss reduction Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * clean up Signed-off-by: jasonwan * add unittest Signed-off-by: jasonwan * Apply isort and black reformatting Signed-off-by: blahBlahhhJ * clean up Signed-off-by: jasonwan --------- Signed-off-by: jasonwan Signed-off-by: blahBlahhhJ Signed-off-by: artbataev Co-authored-by: blahBlahhhJ Co-authored-by: artbataev --- nemo/lightning/__init__.py | 2 +- nemo/lightning/pytorch/strategies/__init__.py | 8 + .../pytorch/strategies/fsdp_strategy.py | 245 +++++++ .../megatron_strategy.py} | 107 +-- nemo/lightning/pytorch/strategies/utils.py | 308 +++++++++ .../llm/test_mnist_model_nemo2_fsdp.py | 611 ++++++++++++++++++ 6 files changed, 1197 insertions(+), 84 deletions(-) create mode 100644 nemo/lightning/pytorch/strategies/__init__.py create mode 100644 nemo/lightning/pytorch/strategies/fsdp_strategy.py rename nemo/lightning/pytorch/{strategies.py => strategies/megatron_strategy.py} (89%) create mode 100644 nemo/lightning/pytorch/strategies/utils.py create mode 100644 tests/collections/llm/test_mnist_model_nemo2_fsdp.py diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index e9674ed1e212..9d9b0df4da39 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -18,7 +18,7 @@ from nemo.lightning.pytorch.optim import LRSchedulerModule, MegatronOptimizerModule, OptimizerModule, lr_scheduler from nemo.lightning.pytorch.plugins import MegatronDataSampler, MegatronMixedPrecision from nemo.lightning.pytorch.plugins import data_sampler as _data_sampler -from nemo.lightning.pytorch.strategies import MegatronStrategy +from nemo.lightning.pytorch.strategies import FSDPStrategy, MegatronStrategy from nemo.lightning.pytorch.trainer import Trainer from nemo.lightning.resume import AutoResume diff --git a/nemo/lightning/pytorch/strategies/__init__.py b/nemo/lightning/pytorch/strategies/__init__.py new file mode 100644 index 000000000000..d946d8a9c149 --- /dev/null +++ b/nemo/lightning/pytorch/strategies/__init__.py @@ -0,0 +1,8 @@ +from nemo.lightning.pytorch.strategies.fsdp_strategy import FSDPStrategy +from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy + + +__all__ = [ + "FSDPStrategy", + "MegatronStrategy", +] diff --git a/nemo/lightning/pytorch/strategies/fsdp_strategy.py b/nemo/lightning/pytorch/strategies/fsdp_strategy.py new file mode 100644 index 000000000000..9bb08b3cbd7a --- /dev/null +++ 
b/nemo/lightning/pytorch/strategies/fsdp_strategy.py @@ -0,0 +1,245 @@ +import shutil +from collections import OrderedDict +from pathlib import Path +from typing import Any, Dict, Optional, Union + +import pytorch_lightning as pl +import torch +from lightning_fabric.plugins import CheckpointIO +from lightning_fabric.strategies.fsdp import _get_sharded_state_dict_context +from megatron.core.transformer.transformer_layer import TransformerLayer +from pytorch_lightning.strategies.fsdp import FSDPStrategy as PLFSDPStrategy +from pytorch_lightning.trainer.states import TrainerFn +from pytorch_lightning.utilities.types import STEP_OUTPUT +from torch.distributed.checkpoint.state_dict import ( # get_state_dict, + StateDictOptions, + get_optimizer_state_dict, + set_state_dict, +) +from torch.utils.data import DataLoader +from typing_extensions import override + +from nemo.lightning import io +from nemo.lightning.pytorch.strategies.utils import ( + ckpt_to_dir, + fix_progress_bar, + get_checkpoint_io, + init_model_parallel, + mcore_to_pyt_sharded_state_dict, + pyt_to_mcore_state_dict, + setup_data_sampler, + setup_parallel_ranks, +) + + +class FSDPStrategy(PLFSDPStrategy, io.IOMixin): + """FSDP plugin for PyTorch Lightning. + + This strategy implements Fully-Sharded-Data-Parallel using PyTorch's native FSDP methods. + Compared with MegatronStrategy, FSDPStrategy is designed to be more lightweight, making + minimal modifications over Lightning's FSDPStrategy while preserving the features needed to stay + compatible with nemo and mcore. + By default, this strategy wraps FSDP per TransformerLayer. + + Note: + This strategy is designed to work with NVIDIA's Megatron-LM framework and requires + specific model implementations that are compatible with Megatron's parallelism techniques. + Note: + Due to the different optimizer structure (FSDP only uses torch native optimizers), + MegatronStrategy cannot resume training from checkpoints saved by FSDPStrategy, and vice + versa. However, the model weights structure is made compatible, so switching strategy is + possible if users only need the weights, not the optimizer states. (E.g. run pretrain with + megatron 4D parallelism and run SFT with FSDP.)
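+ + Example (sketch, assuming a compatible LightningModule, datamodule and optimizer are already configured): + >>> from nemo import lightning as nl + >>> strategy = nl.FSDPStrategy(ckpt_include_optimizer=True) + >>> trainer = nl.Trainer(devices=8, accelerator="gpu", strategy=strategy)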
+ """ + + def __init__( + self, + auto_wrap_policy={TransformerLayer}, + state_dict_type="sharded", + ckpt_include_optimizer=False, + data_sampler=None, + **kwargs, + ): + super().__init__(auto_wrap_policy=auto_wrap_policy, state_dict_type=state_dict_type, **kwargs) + + self.data_sampler = data_sampler + self.ckpt_include_optimizer = ckpt_include_optimizer + + @override + def setup_environment(self) -> None: + setup_parallel_ranks(self) + super().setup_environment() + init_model_parallel(self.model) + + @override + def setup(self, trainer: pl.Trainer) -> None: + self.trainer = trainer + setup_data_sampler(self.trainer) + fix_progress_bar(trainer) + super().setup(trainer) + + def _get_loss_reduction(self, step_type: str): + for fn_name in [f"{step_type}_loss_reduction", "loss_reduction"]: + if hasattr(self.lightning_module, fn_name): + return getattr(self.lightning_module, fn_name) + return None + + def _step_proxy(self, step_type, batch, batch_idx=None): + method_name = f"{step_type}_step" + if self.model != self.lightning_module: + loss = self._forward_redirection(self.model, self.lightning_module, method_name, batch, batch_idx) + else: + loss = getattr(self.lightning_module, method_name)(batch, batch_idx) + + _loss_reduction = self._get_loss_reduction(step_type) + if _loss_reduction: + return _loss_reduction.forward(batch, loss) + return loss, {'avg': loss} + + @override + def training_step(self, batch, batch_idx=None) -> STEP_OUTPUT: + assert self.lightning_module is not None + assert self.model is not None + with self.precision_plugin.train_step_context(): + loss, reduced = self._step_proxy("training", batch, batch_idx) + + self.lightning_module.log( + 'global_step', + self.trainer.global_step, + prog_bar=True, + rank_zero_only=True, + batch_size=1, + ) + + self.lightning_module.log( + 'step', + self.trainer.global_step, + ) + self.lightning_module.log( + 'reduced_train_loss', reduced['avg'], prog_bar=True, rank_zero_only=True, batch_size=1 + ) + + # returns unreduced loss for backward + return loss + + @override + def validation_step(self, batch, batch_idx=None) -> Any: + assert self.lightning_module is not None + assert self.model is not None + with self.precision_plugin.val_step_context(): + loss, reduced = self._step_proxy("validation", batch, batch_idx) + self.lightning_module.log('val_loss', reduced['avg'], rank_zero_only=True, batch_size=1) + return loss + + @override + def test_step(self, batch, batch_idx=None) -> STEP_OUTPUT: + assert self.lightning_module is not None + assert self.model is not None + with self.precision_plugin.test_step_context(): + loss, reduced = self._step_proxy("test", batch, batch_idx) + self.lightning_module.log('test_loss', reduced['avg'], rank_zero_only=True, batch_size=1) + + return loss + + @override + def predict_step(self, batch, batch_idx=None) -> STEP_OUTPUT: + assert self.lightning_module is not None + assert self.model is not None + with self.precision_plugin.predict_step_context(): + loss, reduced = self._step_proxy("predict", batch, batch_idx) + return reduced + + @override + def process_dataloader(self, dataloader: DataLoader) -> DataLoader: + if self.data_sampler: + return self.data_sampler.transform_dataloader(dataloader) + + return dataloader + + @property + @override + def checkpoint_io(self) -> CheckpointIO: + return get_checkpoint_io(self._checkpoint_io) + + @checkpoint_io.setter + def checkpoint_io(self, io: CheckpointIO) -> None: + self._checkpoint_io = io + + @property + def current_epoch_step(self) -> int: + """ + Get the value of 
step within an epoch.
+        """
+        return max(
+            self.trainer.fit_loop.epoch_loop.automatic_optimization.optim_progress.optimizer.step.current.completed,
+            self.trainer.fit_loop.epoch_loop.manual_optimization.optim_step_progress.current.completed,
+        )
+
+    @override
+    def remove_checkpoint(self, filepath: Union[str, Path]) -> None:
+        # Taken from MegatronStrategy
+        if self.is_global_zero:
+            shutil.rmtree(ckpt_to_dir(filepath))
+
+    @override
+    def save_checkpoint(
+        self, checkpoint: Dict[str, Any], filepath: Union[str, Path], storage_options: Optional[Any] = None
+    ) -> None:
+        """Converts PyT checkpoints to MCore format and saves them using the MCore dist ckpt library."""
+        checkpoint["sharded_state_dict"] = pyt_to_mcore_state_dict(checkpoint.pop("state_dict"))
+        checkpoint["state_dict"] = OrderedDict([])
+
+        # TODO: do we still need to keep this?
+        for optim_state in checkpoint['optimizer_states']:
+            optim_state.pop("state")
+
+        if self.trainer.state.fn == TrainerFn.FITTING and self.ckpt_include_optimizer:
+            checkpoint['optimizer'] = get_optimizer_state_dict(self.model, self.optimizers)
+            pyt_to_mcore_state_dict(checkpoint['optimizer']['state'], prefix="optimizer.state.")
+
+        self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options)
+
+    @override
+    def load_checkpoint(self, checkpoint_path: str | Path) -> Dict[str, Any]:
+        """PTL method which we override to integrate distributed checkpoints for FSDP models.
+        Different from MegatronStrategy, both model and optimizer states are restored within
+        this method.
+
+        The logic here is slightly more complicated:
+        1. Obtain PyT state dicts (sharded & unflattened) for model and optim -> torch::ShardedTensor
+        2. Convert to MCore state dicts -> mcore::ShardedTensor
+        3. Load from checkpoint using MCore dist ckpt API -> torch::Tensor
+        4. Convert to PyT state dicts (sharded & unflattened) -> torch::ShardedTensor
+        5. Load into model and optim using PyT dist ckpt API
+        6. Return the loaded checkpoint for lightning to load other metadata
+        """
+        path = Path(self.broadcast(checkpoint_path))
+        torch.cuda.empty_cache()
+
+        # TODO: the elegant way to load both state dicts. 
Need pytorch 2.3.1 + # msd, osd = get_state_dict(self.model, self.optimizers, options=StateDictOptions(cpu_offload=True)) + sharded_state_dict = {} + with _get_sharded_state_dict_context(self.model): + msd = self.model.state_dict() + pyt_to_mcore_state_dict(msd) + sharded_state_dict["sharded_state_dict"] = msd + + if self.ckpt_include_optimizer and self.trainer.state.fn == TrainerFn.FITTING: + osd = get_optimizer_state_dict(self.model, self.optimizers, options=StateDictOptions(cpu_offload=True)) + pyt_to_mcore_state_dict(osd['state'], prefix="optimizer.state.") + sharded_state_dict["optimizer"] = osd + + checkpoint = self.checkpoint_io.load_checkpoint(path, sharded_state_dict=sharded_state_dict) + mcore_to_pyt_sharded_state_dict(checkpoint['sharded_state_dict'], msd) + + if self.ckpt_include_optimizer and self.trainer.state.fn == TrainerFn.FITTING: + mcore_to_pyt_sharded_state_dict(checkpoint['optimizer']['state'], osd['state']) + + set_state_dict( + self.model, + self.optimizers if self.ckpt_include_optimizer else [], + model_state_dict=checkpoint['sharded_state_dict'], + optim_state_dict=checkpoint['optimizer'] if self.ckpt_include_optimizer else None, + ) + + return checkpoint diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py similarity index 89% rename from nemo/lightning/pytorch/strategies.py rename to nemo/lightning/pytorch/strategies/megatron_strategy.py index d6ef18770fa4..e13b603b127d 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -17,11 +17,9 @@ from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig from pytorch_lightning.accelerators import CPUAccelerator -from pytorch_lightning.callbacks.progress import TQDMProgressBar from pytorch_lightning.loops import _AutomaticOptimization, evaluation_loop, fit_loop, prediction_loop from pytorch_lightning.loops.fetchers import _DataLoaderIterDataFetcher from pytorch_lightning.overrides.distributed import _sync_module_states -from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.utilities.types import STEP_OUTPUT @@ -32,9 +30,16 @@ from typing_extensions import override from nemo.lightning import _strategy_lib, io -from nemo.lightning.io.pl import MegatronCheckpointIO from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction -from nemo.lightning.pytorch.callbacks import MegatronProgressBar, ModelTransform, ProgressPrinter +from nemo.lightning.pytorch.callbacks import ModelTransform +from nemo.lightning.pytorch.strategies.utils import ( + ckpt_to_dir, + fix_progress_bar, + get_checkpoint_io, + init_model_parallel, + setup_data_sampler, + setup_parallel_ranks, +) from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO, AsyncFinalizerCallback if TYPE_CHECKING: @@ -260,17 +265,9 @@ def setup(self, trainer: pl.Trainer) -> None: assert self.model is not None self.model = self._layer_sync.apply(self.model) - datamodule = getattr(trainer, "datamodule", None) - if not self.data_sampler and hasattr(datamodule, "data_sampler"): - self.data_sampler = datamodule.data_sampler - self.data_sampler.setup(self.cluster_environment.global_rank()) - if hasattr(datamodule, "reconfigure_limit_batches"): - datamodule.reconfigure_limit_batches() - - if 
self.data_sampler: - self.data_sampler.connect(trainer) + setup_data_sampler(self.trainer) + fix_progress_bar(trainer, self.replace_progress_bar, self.progress_interval) - self._fix_progress_bar(trainer) self.setup_megatron_parallel(trainer) self.setup_precision_plugin() @@ -323,19 +320,9 @@ def setup(self, trainer: pl.Trainer) -> None: @override def setup_distributed(self) -> None: - self._setup_parallel_ranks() + setup_parallel_ranks(self) super().setup_distributed() - - from megatron.core import parallel_state - - from nemo.utils import AppState - - # init model parallel if needed - if not parallel_state.model_parallel_is_initialized(): - app_state = AppState() - - if app_state.model_parallel_size is not None: - _strategy_lib.init_model_parallel(self.model) + init_model_parallel(self.model) if self.data_sampler: assert isinstance(self.cluster_environment, ClusterEnvironment), "Cluster environment not initialized" @@ -434,12 +421,6 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: _optimizers_to_device(self.optimizers, self.root_device) - def _setup_parallel_ranks(self) -> None: - self.set_world_ranks() - env = cast(ClusterEnvironment, self.cluster_environment) - - _strategy_lib.init_parallel_ranks(env.world_size(), env.global_rank(), env.local_rank(), self.parallelism) - @override def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTPUT: assert self.lightning_module is not None @@ -561,27 +542,6 @@ def _update_step_kwargs(self, dataloader_iter, kwargs, step_name: str): return kwargs - def _fix_progress_bar(self, trainer: pl.Trainer) -> None: - callbacks: List[pl.Callback] = cast(List[pl.Callback], getattr(trainer, "callbacks")) - contains_megatron_progress, contains_progress = False, False - for callback in callbacks: - if isinstance(callback, MegatronProgressBar): - contains_megatron_progress = True - if callback.__class__ == TQDMProgressBar: - contains_progress = True - if not contains_megatron_progress and contains_progress: - for i, callback in enumerate(callbacks): - if isinstance(callback, TQDMProgressBar): - if self.replace_progress_bar: - printer = ProgressPrinter(log_interval=self.progress_interval) - printer._trainer = trainer - if not trainer.is_global_zero: - printer.disable() - callbacks[i] = printer - else: - callback.__class__ = MegatronProgressBar - break - def optimizer_sharded_state_dict(self, is_loading=False): """ Sharded state dictionary for an MainParamsOptimizerWrapper. 
@@ -660,23 +620,17 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr @property @override def checkpoint_io(self) -> CheckpointIO: - if self._checkpoint_io is None: - self._checkpoint_io = MegatronCheckpointIO( - save_ckpt_format=self.save_ckpt_format, - async_save=self.async_save, - torch_dist_multiproc=self.torch_dist_multiproc, - assume_constant_structure=self.assume_constant_structure, - parallel_save=self.parallel_save, - parallel_save_within_dp=self.parallel_save_within_dp, - parallel_load=self.parallel_load, - load_directly_on_device=self.load_directly_on_device, - ) - if self.async_save: - self._checkpoint_io = AsyncFinalizableCheckpointIO(self._checkpoint_io) - elif isinstance(self._checkpoint_io, _WrappingCheckpointIO): - self._checkpoint_io.checkpoint_io = MegatronCheckpointIO() - - return self._checkpoint_io + return get_checkpoint_io( + self._checkpoint_io, + save_ckpt_format=self.save_ckpt_format, + async_save=self.async_save, + torch_dist_multiproc=self.torch_dist_multiproc, + assume_constant_structure=self.assume_constant_structure, + parallel_save=self.parallel_save, + parallel_save_within_dp=self.parallel_save_within_dp, + parallel_load=self.parallel_load, + load_directly_on_device=self.load_directly_on_device, + ) @checkpoint_io.setter def checkpoint_io(self, io: CheckpointIO) -> None: @@ -775,19 +729,6 @@ def tensor_init_context(self, empty_init: Optional[bool] = None): yield -def ckpt_to_dir(filepath: Union[str, Path]) -> Path: - """PTL considers checkpoints as .ckpt files. - This method removes the extension and returns a path - to be used as a directory for distributed checkpoints. - """ - filepath = Path(filepath) - - if filepath.suffix == ".ckpt": - return filepath.with_name(filepath.stem) - - return filepath - - def _data_fetcher_wrapper(fn): @functools.wraps(fn) def wrapped(trainer: pl.Trainer, stage: RunningStage): diff --git a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py new file mode 100644 index 000000000000..86b7d58ae36b --- /dev/null +++ b/nemo/lightning/pytorch/strategies/utils.py @@ -0,0 +1,308 @@ +import io +from pathlib import Path +from typing import Any, Dict, Iterable, List, Tuple, Union, cast + +import pytorch_lightning as pl +import torch +from lightning_fabric.plugins import ClusterEnvironment +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedBase, ShardedObject, ShardedTensor +from megatron.core.dist_checkpointing.strategies.torch import sharded_tensor_to_torch_sharded_tensor +from megatron.core.transformer.utils import _get_extra_state_offsets +from pytorch_lightning.callbacks import TQDMProgressBar +from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO +from torch.distributed._sharded_tensor import ShardedTensor as TorchShardedTensor +from torch.distributed._tensor import DTensor, Replicate, Shard +from torch.distributed.device_mesh import DeviceMesh + +from nemo.lightning import _strategy_lib +from nemo.lightning.io.pl import MegatronCheckpointIO +from nemo.lightning.pytorch.callbacks import MegatronProgressBar, ProgressPrinter +from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO + + +def setup_parallel_ranks(strategy: pl.strategies.Strategy): + from megatron.core.model_parallel_config import ModelParallelConfig + + env = cast(ClusterEnvironment, strategy.cluster_environment) + parallelism = getattr(strategy, "parallelism", ModelParallelConfig()) + 
_strategy_lib.init_parallel_ranks(env.world_size(), env.global_rank(), env.local_rank(), parallelism) + + +def init_model_parallel(pl_module: pl.LightningModule): + from megatron.core import parallel_state + + from nemo.utils import AppState + + if not parallel_state.model_parallel_is_initialized(): + app_state = AppState() + + if app_state.model_parallel_size is not None: + _strategy_lib.init_model_parallel(pl_module) + + +def setup_data_sampler(trainer: pl.Trainer): + datamodule = getattr(trainer, "datamodule", None) + if datamodule is not None: + if hasattr(trainer.strategy, "data_sampler") and trainer.strategy.data_sampler is not None: + datamodule.data_sampler = trainer.strategy.data_sampler + elif hasattr(datamodule, "data_sampler"): + trainer.strategy.data_sampler = datamodule.data_sampler + + if trainer.strategy.data_sampler is not None: + trainer.strategy.data_sampler.setup(trainer.strategy.cluster_environment.global_rank()) + trainer.strategy.data_sampler.connect(trainer) + + if hasattr(datamodule, "reconfigure_limit_batches"): + datamodule.reconfigure_limit_batches() + + +def fix_progress_bar(trainer: pl.Trainer, replace_progress_bar: bool = True, progress_interval: int = 1) -> None: + callbacks: List[pl.Callback] = cast(List[pl.Callback], getattr(trainer, "callbacks")) + contains_megatron_progress, contains_progress = False, False + for callback in callbacks: + if isinstance(callback, MegatronProgressBar): + contains_megatron_progress = True + if callback.__class__ == TQDMProgressBar: + contains_progress = True + if not contains_megatron_progress and contains_progress: + for i, callback in enumerate(callbacks): + if isinstance(callback, TQDMProgressBar): + if replace_progress_bar: + printer = ProgressPrinter(log_interval=progress_interval) + printer._trainer = trainer + if not trainer.is_global_zero: + printer.disable() + callbacks[i] = printer + else: + callback.__class__ = MegatronProgressBar + break + + +def ckpt_to_dir(filepath: Union[str, Path]) -> Path: + """PTL considers checkpoints as .ckpt files. + This method removes the extension and returns a path + to be used as a directory for distributed checkpoints. 
+ """ + filepath = Path(filepath) + + if filepath.suffix == ".ckpt": + return filepath.with_name(filepath.stem) + + return filepath + + +def get_checkpoint_io(checkpoint_io, **kwargs): + if checkpoint_io is None: + checkpoint_io = MegatronCheckpointIO(**kwargs) + if kwargs.get("async_save", False): + checkpoint_io = AsyncFinalizableCheckpointIO(checkpoint_io) + elif isinstance(checkpoint_io, _WrappingCheckpointIO): + checkpoint_io.checkpoint_io = MegatronCheckpointIO() + + return checkpoint_io + + +def mcore_to_pyt_sharded_state_dict( + checkpoint: Dict[str, List[torch.Tensor]], + sharded_state_dict: Dict[str, Union[List[ShardedTensor], ShardedObject]], + dtensor: bool = False, + device_mesh: DeviceMesh = None, +) -> Dict[str, Union[TorchShardedTensor, io.BytesIO]]: + def _mcore_to_pyt_dtensor( + tens: List[torch.Tensor], + sh_tens: List[ShardedTensor], + device_mesh: DeviceMesh, + ) -> DTensor: + assert len(tens) == 1 and len(sh_tens) == 1 + + dten = DTensor.from_local( + tens[0], + device_mesh, + ( + Replicate(), + Shard(dim=0), + ), # hardcoded for HSDP + ) + return dten + + def _mcore_to_pyt_sharded_tensor(tens: List[torch.Tensor], sh_tens: List[ShardedTensor]) -> TorchShardedTensor: + for ten, sh_ten in zip(tens, sh_tens): + # remove prepend axes and put in loaded tensor + sh_ten.global_shape = sh_ten.global_shape[sh_ten.prepend_axis_num :] + sh_ten.global_offset = sh_ten.global_offset[sh_ten.prepend_axis_num :] + sh_ten.axis_fragmentations = sh_ten.axis_fragmentations[sh_ten.prepend_axis_num :] + sh_ten.prepend_axis_num = 0 + sh_ten.data = ten + sh_ten.validate_metadata_integrity() + + return sharded_tensor_to_torch_sharded_tensor(sh_tens) + + def _convert(checkpoint, sharded_state_dict, k, device_mesh=None): + assert k in sharded_state_dict, f"{k} not in sharded_state_dict" + + if isinstance(sharded_state_dict[k], Dict): + for kk in sharded_state_dict[k]: + _convert(checkpoint[k], sharded_state_dict[k], kk, device_mesh=device_mesh) + elif isinstance(sharded_state_dict[k], ShardedObject): + """Do nothing. 
checkpoint[k] contains loaded io.BytesIO already.""" + elif isinstance(sharded_state_dict[k], List): # list of ShardedTensor + if dtensor: + checkpoint[k] = _mcore_to_pyt_dtensor(checkpoint[k], sharded_state_dict[k], device_mesh=device_mesh) + else: + checkpoint[k] = _mcore_to_pyt_sharded_tensor(checkpoint[k], sharded_state_dict[k]) + + for k in checkpoint: + _convert(checkpoint, sharded_state_dict, k, device_mesh=device_mesh) + + return checkpoint + + +def pyt_to_mcore_state_dict( + state_dict: Dict[str, Any], prefix: str = "", device_mesh: DeviceMesh = None +) -> Dict[str, List[ShardedBase]]: + + def _dtensor_to_mcore_sharded_tensor( + key: str, + dten: DTensor, + prepend_offsets: Iterable[Tuple[int, int, int]] = (), + prefix: str = "", + allow_shape_mismatch: bool = False, + device_mesh: DeviceMesh = None, + ) -> List[ShardedTensor]: + prepend_axis_num = len(prepend_offsets) + + assert device_mesh is not None + assert isinstance(dten, DTensor), dten + + ten = dten.to_local() + global_shape = dten.shape + + rank_offsets = [] + replica_id = 0 + axis = list(range(len(global_shape))) + axis_fragm = [global_shape[i] // ten.shape[i] for i in axis] + + for i, placement in enumerate(dten.placements): + if isinstance(placement, Shard): + ax = placement.dim + rank_offsets.append((ax + prepend_axis_num, dten.device_mesh.get_local_rank(i), axis_fragm[ax])) + elif placement.is_replicate(): + replica_id = device_mesh.get_local_rank(i) + + local_shard = ShardedTensor.from_rank_offsets( + f"{prefix}{key}", + ten, + *prepend_offsets, # prepend layer shards + *rank_offsets, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + allow_shape_mismatch=allow_shape_mismatch, + ) + return [local_shard] + + def _torch_to_mcore_sharded_tensor( + key: str, + sh_ten: TorchShardedTensor, + prepend_offsets: Iterable[Tuple[int, int, int]] = (), + prefix: str = "", + allow_shape_mismatch: bool = False, + ) -> List[ShardedTensor]: + prepend_axis_num = len(prepend_offsets) + + assert isinstance(sh_ten, TorchShardedTensor), sh_ten + sharded_meta = sh_ten.metadata() + local_shards = sh_ten.local_shards() + + # DEBUG + assert all([ls.metadata.placement == local_shards[0].metadata.placement for ls in local_shards]), [ + ls.meta.placement for ls in local_shards + ] + + global_shape = sharded_meta.size + + axis = list(range(len(global_shape))) + axis_fragm = [global_shape[i] // local_shards[0].metadata.shard_sizes[i] for i in axis] + rank_offsets = [] + + for i in range(len(local_shards)): + local_shard = local_shards[i] + ten, meta = local_shard.tensor, local_shard.metadata + + for j in range(len(axis)): + axis_rank_offset = meta.shard_offsets[j] // meta.shard_sizes[j] + rank_offsets.append((axis[j] + prepend_axis_num, axis_rank_offset, axis_fragm[j])) + + local_shards[i] = ShardedTensor.from_rank_offsets( + f"{prefix}{key}", + ten, + *prepend_offsets, # prepend layer shards + *rank_offsets, + replica_id=0, + prepend_axis_num=prepend_axis_num, + allow_shape_mismatch=allow_shape_mismatch, + ) + + return local_shards + + def _torch_to_mcore_sharded_object( + key: str, + obj: io.BytesIO, + sharded_offsets: Iterable[Tuple[int, int, int]] = (), + prefix: str = "", + ) -> ShardedObject: + replica_id = ( + 0, + 0, + parallel_state.get_data_parallel_rank(with_context_parallel=True), + ) + + return ShardedObject(f"{prefix}{key}", obj, *_get_extra_state_offsets(sharded_offsets), replica_id) + + def _convert(state_dict, k, sh_key, v, prepend_offsets, prefix="", allow_shape_mismatch=False, device_mesh=None): + if isinstance(v, 
Dict): + for kk, vv in v.items(): + _convert( + v, + kk, + sh_key, + vv, + prepend_offsets, + prefix=f"{prefix}{kk}.", + allow_shape_mismatch=allow_shape_mismatch, + device_mesh=device_mesh, + ) + elif isinstance(v, DTensor): + state_dict[k] = _dtensor_to_mcore_sharded_tensor( + sh_key, + v, + prepend_offsets, + prefix=prefix, + allow_shape_mismatch=allow_shape_mismatch, + device_mesh=device_mesh, + ) + elif isinstance(v, TorchShardedTensor): + state_dict[k] = _torch_to_mcore_sharded_tensor( + sh_key, v, prepend_offsets, prefix=prefix, allow_shape_mismatch=allow_shape_mismatch + ) + elif isinstance(v, io.BytesIO): + state_dict[k] = _torch_to_mcore_sharded_object(sh_key, v, prepend_offsets, prefix) + + num_layers = 0 + for k in state_dict: + if k.startswith("module.decoder.layers."): + num_layers = max(num_layers, int(k.split('.')[3]) + 1) + + for k, v in state_dict.items(): + prepend_offsets = [] + sh_key = k + allow_shape_mismatch = k.endswith(".word_embeddings.weight") # vocab size can be different + if k.startswith("module.decoder.layers."): + sh_key = k.split('.') + global_layer_offset = int(sh_key.pop(3)) + sh_key = '.'.join(sh_key) + prepend_offsets.append((0, global_layer_offset, num_layers)) + + _convert(state_dict, k, sh_key, v, prepend_offsets, prefix, allow_shape_mismatch, device_mesh) + + return state_dict diff --git a/tests/collections/llm/test_mnist_model_nemo2_fsdp.py b/tests/collections/llm/test_mnist_model_nemo2_fsdp.py new file mode 100644 index 000000000000..32fde23bceb9 --- /dev/null +++ b/tests/collections/llm/test_mnist_model_nemo2_fsdp.py @@ -0,0 +1,611 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+import os
+import subprocess
+import sys
+import tempfile
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, TypedDict, TypeVar, Union
+
+import megatron.core.num_microbatches_calculator
+import pytest
+import pytorch_lightning as pl
+import torch
+import torch.distributed
+from megatron.core import ModelParallelConfig, parallel_state
+from megatron.core.optimizer import OptimizerConfig
+from megatron.core.transformer.enums import ModelType
+from megatron.core.transformer.module import MegatronModule
+from pytorch_lightning.loggers import TensorBoardLogger
+from torch import Tensor, nn
+from torch.optim import Adam
+from torch.utils.data import DataLoader
+from torchvision import transforms
+from torchvision.datasets import MNIST
+
+from nemo import lightning as nl
+from nemo.collections import llm
+from nemo.lightning import NeMoLogger, io, resume
+from nemo.lightning.megatron_parallel import DataT, MegatronLossReduction, ReductionT
+from nemo.lightning.pytorch import callbacks as nl_callbacks
+from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule
+from nemo.lightning.pytorch.plugins import MegatronDataSampler
+
+TokenizerType = Any
+
+"""This is intended to be a minimal self-contained NeMo2 example."""
+
+
+T = TypeVar("T")
+
+
+@dataclass
+class ExampleConfig(ModelParallelConfig):
+    """ExampleConfig is a dataclass that is used to configure the model.
+
+    Timers from ModelParallelConfig are required for megatron forward compatibility.
+    """
+
+    calculate_per_token_loss: bool = False
+
+    def configure_model(self) -> nn.Module:
+        """This function is called by the strategy to construct the model.
+
+        Note: Must pass self into Model since model requires having a config object.
+
+        Returns:
+            The model object.
+        """
+        return ExampleModel(self)
+
+
+class MSELossReduction(MegatronLossReduction):
+    """A class used for calculating the loss, and for logging the reduced loss across micro batches."""
+
+    def forward(self, batch: DataT, forward_out: Tensor) -> Tuple[Tensor, ReductionT]:
+        """Calculates the loss within a micro-batch. A micro-batch is a batch of data on a single GPU.
+
+        Args:
+            batch: A batch of data that gets passed to the original forward inside LitAutoEncoder.
+            forward_out: the output of the forward method inside LitAutoEncoder.
+
+        Returns:
+            A tuple containing [, ReductionT] where the loss tensor will be used for
+            backpropagation and the ReductionT will be passed to the reduce method
+            (which currently only works for logging.).
+        """
+        x = batch["data"]
+        outputs = forward_out
+        x_hat = outputs["x_hat"]
+        # you could also put a latent loss on z here.
+        xview = x.view(x.size(0), -1)
+        loss = nn.functional.mse_loss(x_hat, xview)
+
+        return loss, {"avg": loss}
+
+    def reduce(self, losses_reduced_per_micro_batch: Sequence[ReductionT]) -> Tensor:
+        """Works across micro-batches. (data on single gpu).
+
+        Note: This currently only works for logging and this loss will not be used for backpropagation.
+
+        Args:
+            losses_reduced_per_micro_batch: a list of the outputs of forward
+
+        Returns:
+            A tensor that is the mean of the losses. (used for logging). 
+ """ + mse_losses = torch.stack([loss["avg"] for loss in losses_reduced_per_micro_batch]) + return mse_losses.mean() + + +def some_first(seq: Iterable[Optional[T]]) -> T: + """Returns the first non-None value from the sequence or fails""" # noqa: D415 + for s in seq: + if s is not None: + return s + raise ValueError("non-None value not found") + + +def get_dtype_device(torch_object) -> Tuple[torch.dtype, torch.device]: # noqa: D103 + match torch_object: + case []: + raise ValueError("Looking up dtype on an empty list") + case {**data} if not data: + raise ValueError("Looking up dtype on an empty dict") + case torch.Tensor(dtype=dtype, device=device): + return dtype, device + case torch.nn.Module() as m: + try: + p = next(m.parameters()) + except StopIteration as e: + raise ValueError("Cannot get dtype on a torch module with no parameters.") from e + return p.dtype, p.device + case dict(keys=_, values=values): + val = some_first(values()) + return get_dtype_device(val) + case list() as l: + val = some_first(l) + return get_dtype_device(val) + case _: + raise TypeError("Got something we didnt expect") + + +# NOTE(SKH): These types are all wrong, but are close. The inner type must always be a torch.Tensor, but the outer container should be generic. +def batch_collator(batches: Optional[Union[Tuple[ReductionT], List[ReductionT]]]) -> Optional[ReductionT]: + """Takes a sequence of batches and collates them into a single batch. + This is distinct from the standard pytorch default_collator since it does + not add the batch dimension, it's assumed the batch + dimension is already present in the input, as would be the case when + parallelizing across minibatches. + + IMPORTANT: The underlying data primitive _must_ be a torch Tensor. The input to this function is a recurisve type, + there can be any amount of nesting between dictionaries, tuples, and lists, as long as the inner type is a n-d torch.Tensor. + + Examples: + Outer container = Dict: + [{'a': torch.tensor([1]), 'b': torch.tensor([2])}, {'a': torch.tensor([2]), 'b': torch.tensor([3])}] -> {'a': torch.tensor([1, 2]), 'b': torch.tensor([2, 3])} + Outer container = List: + [[torch.tensor([1]), torch.tensor([2])], [torch.tensor([2]), torch.tensor([3])]] -> [torch.tensor([1, 2]), torch.tensor([2, 3])] + Outer container = Tuple: + ([torch.tensor([1]), torch.tensor([2])], [torch.tensor([2]), torch.tensor([3])]) -> (torch.tensor([1, 2]), torch.tensor([2, 3])) + + Args: + batches (Optional[Sequence[ReductionT]]): sequence of batches to collate into a single batch. + + Returns: + A single batch of the same type as the elements of your input sequence. + """ # noqa: D205 + match batches: + case [torch.Tensor(), *_]: + return torch.cat(batches, dim=0) + case [dict(), *_]: + return {key: batch_collator([batch[key] for batch in batches]) for key in batches[0]} + case [tuple(), *_]: + return tuple(batch_collator([batch[i] for batch in batches]) for i in range(len(batches[0]))) + case [list(), *_]: + return [batch_collator([batch[i] for batch in batches]) for i in range(len(batches[0]))] + case None: + return None + case []: + raise ValueError("Cannot process an empty sequence") + case _: + raise ValueError("Unsupported input structure in batch_collator") + + +class PassthroughLossReduction(MegatronLossReduction): + """Internally in NeMo2.0 the forward step is always expected to return a loss reduction class, and forward is expected to return a loss. 
+    This class hijacks that mechanism to instead pass through the forward output unperturbed as the loss (to enable inference in the predict step), and then the
+    reduce method is used to collate the batch of forward outputs into a single batch. This supports the model forward output being a tensor, dict, tuple,
+    or list of tensors. The inner type _must always be a torch.Tensor_.
+    """  # noqa: D205
+
+    def forward(self, batch: DataT, forward_out: DataT) -> Tuple[torch.Tensor, DataT]:
+        """Passes the forward output through unmodified, paired with a dummy loss tensor.
+
+        Args:
+            batch (DataT): The batch of data that was passed through the model to generate output.
+            forward_out (torch.Tensor): The output from your model's forward pass.
+
+        Returns:
+            Tuple[torch.Tensor, ReductionT]: A tuple containing the loss tensor (dummy in this case) and the forward output (unmodified).
+        """  # noqa: D415
+        dtype, device = get_dtype_device(forward_out)
+        return torch.zeros(1, device=device, dtype=dtype), forward_out
+
+    def reduce(self, forward_out: List[DataT]) -> DataT:
+        """This overrides the standard reduce with a simplified version that just takes a list of your model's forward outputs
+        and collates them together into a single output.
+
+        Args:
+            forward_out (List[ReductionT]): list of forward outputs, one per micro-batch.
+
+        Returns:
+            ReductionT: the collated forward outputs.
+        """  # noqa: D205
+        return batch_collator(forward_out)
+
+
+class TorchAdam(OptimizerModule):
+    def __init__(self, config, lr_scheduler=None):
+        self.conf = config
+
+        super().__init__(lr_scheduler)
+
+    def optimizers(self, model):
+        return [
+            Adam(
+                model.parameters(),
+                lr=self.conf.lr,
+                betas=(self.conf.adam_beta1, self.conf.adam_beta2),
+                eps=self.conf.adam_eps,
+                weight_decay=self.conf.weight_decay,
+            )
+        ]
+
+
+class LitAutoEncoder(pl.LightningModule, io.IOMixin, io.ConnectorMixin):
+    """A very basic lightning module for testing the megatron strategy and the megatron-nemo2-bionemo contract."""
+
+    def __init__(self, config):
+        """Initializes the model.
+
+        Args:
+            config: a Config object necessary to construct the actual nn.Module (the thing that has the parameters).
+        """
+        super().__init__()
+        self.config = config
+        self.optim = TorchAdam(
+            config=OptimizerConfig(lr=1e-4, optimizer="adam"),
+        )
+        # Bind the configure_optimizers method to the model
+        self.optim.connect(self)
+
+    def forward(self, batch: Dict, batch_idx: Optional[int] = None) -> Any:
+        """This forward will be called by the megatron scheduler and it will be wrapped.
+
+        !!! note
+
+            The `training_step` defines the training loop and is independent of the `forward` method here.
+
+        Args:
+            batch: A dictionary of data.
+            batch_idx: The index of the batch.
+
+        Returns:
+            The output of the model.
+        """
+        x = batch["data"]
+        return self.module(x)
+
+    def training_step(self, batch, batch_idx: Optional[int] = None):
+        """The training step is where the loss is calculated and the backpropagation is done.
+
+        Background:
+        - NeMo's Strategy overrides this method.
+        - The strategies' training step will call the forward method of the model.
+        - That forward method then calls the wrapped forward step of MegatronParallel which wraps the forward method of the model.
+        - That wrapped forward step is then executed inside the Mcore scheduler, which calls the `_forward_step` method from the
+          MegatronParallel class.
+        - Which then calls the training_step function here.
+
+        In this particular use case, we simply call the forward method of this class, the lightning module.
+
+        Args:
+            batch: A dictionary of data (`batch_idx` defaults to None). 
+ batch_idx: The index of the batch. + """ + return self(batch, batch_idx) + + @property + def training_loss_reduction(self) -> MegatronLossReduction: # noqa: D102 + # This is the function that takes batch['loss_mask'] and the logits output by the model and reduces the loss + return MSELossReduction() + + @property + def validation_loss_reduction(self) -> MegatronLossReduction: # noqa: D102 + return MSELossReduction() + + @property + def test_loss_reduction(self) -> MegatronLossReduction: # noqa: D102 + return MSELossReduction() + + @property + def predict_loss_reduction(self) -> MegatronLossReduction: # noqa: D102 + # This allows us to do inference (not output the loss) + return PassthroughLossReduction() + + def configure_model(self) -> None: # noqa: D102 + self.module = self.config.configure_model() + + +class ExampleModel(MegatronModule): # noqa: D101 + def __init__(self, config: ModelParallelConfig) -> None: + """Constructor of the model. + + Args: + config: The config object is responsible for telling the strategy what model to create. + """ + super().__init__(config) + self.model_type = ModelType.encoder_or_decoder + self.linear1 = nn.Linear(28 * 28, 64) + self.relu = nn.ReLU() + self.linear2 = nn.Linear(64, 3) + self.linear3 = nn.Linear(3, 64) + self.relu2 = nn.ReLU() + self.linear4 = nn.Linear(64, 28 * 28) + + def forward(self, x: Tensor) -> Dict[str, Tensor]: + """Forward pass of the model. + + Args: + x: The input data. + + Returns: + x_hat: The result of the last linear layer of the network. + """ + x = x.view(x.size(0), -1) + z = self.linear1(x) + z = self.relu(z) + z = self.linear2(z) + x_hat = self.linear3(z) + x_hat = self.relu2(x_hat) + x_hat = self.linear4(x_hat) + return {"x_hat": x_hat, "z": z} + + def set_input_tensor(self, input_tensor: Optional[Tensor]) -> None: + """This is needed because it is a megatron convention. Even if it is a no-op for single GPU testing. + + See megatron.model.transformer.set_input_tensor() + + Note: Currently this is a no-op just to get by an mcore function. + + Args: + input_tensor: Input tensor. + """ + pass + + +class MnistItem(TypedDict): + data: Tensor + label: Tensor + idx: int + + +class MNISTCustom(MNIST): # noqa: D101 + def __getitem__(self, index: int) -> MnistItem: + """Wraps the getitem method of the MNIST dataset such that we return a Dict + instead of a Tuple or tensor. + + Args: + index: The index we want to grab, an int. + + Returns: + A dict containing the data ("x"), label ("y"), and index ("idx"). + """ # noqa: D205 + x, y = super().__getitem__(index) + + return { + "data": x, + "label": y, + "idx": index, + } + + +# TODO: remove this callback after `val` loss is logged by default in training in NeMo2 +class LossLoggingCallback(pl.Callback): # noqa: D101 + def __init__(self): + """Log the loss at the end of each batch. 
For training do not reduce across the epoch but do so for validation/test.""" + self.val_losses = [] + self.test_losses = [] + + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): # noqa: D102 + # Assuming the loss is computed internally and stored in pl_module + if torch.distributed.get_rank() == 0 and parallel_state.is_pipeline_last_stage(): + if isinstance(outputs, dict): + outputs = outputs["loss"] + loss = outputs + pl_module.log("train_loss", loss, on_step=True, prog_bar=True, logger=True, rank_zero_only=True) + + def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0): # noqa: D102 + if torch.distributed.get_rank() == 0 and parallel_state.is_pipeline_last_stage(): + if isinstance(outputs, dict): + outputs = outputs["loss"] + loss = outputs + self.test_losses.append(loss) + + def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0): # noqa: D102 + # Assuming the loss is computed internally and stored in pl_module + if torch.distributed.get_rank() == 0 and parallel_state.is_pipeline_last_stage(): + if isinstance(outputs, dict): + outputs = outputs["loss"] + loss = outputs + self.val_losses.append(loss) + + def on_validation_epoch_end(self, trainer, pl_module): # noqa: D102 + if torch.distributed.get_rank() == 0 and parallel_state.is_pipeline_last_stage(): + if len(self.val_losses) > 0: + avg_val_loss = torch.stack(self.val_losses).mean() + pl_module.log("val_loss", avg_val_loss, prog_bar=True, logger=True, rank_zero_only=True) + self.val_losses.clear() + + def on_test_epoch_end(self, trainer, pl_module): # noqa: D102 + if torch.distributed.get_rank() == 0 and parallel_state.is_pipeline_last_stage(): + if len(self.test_losses) > 0: + avg_test_loss = torch.stack(self.test_losses).mean() + pl_module.log("test_loss", avg_test_loss, prog_bar=True, logger=True, rank_zero_only=True) + self.test_losses.clear() + + +class MNISTDataModule(pl.LightningDataModule): # noqa: D101 + def __init__(self, data_dir: str = "./", batch_size: int = 32) -> None: # noqa: D107 + super().__init__() + self.data_dir = data_dir + self.batch_size = batch_size + self.micro_batch_size = 8 + self.global_batch_size = 8 + self.max_len = 100 + self.rampup_batch_size = None + + # Note that this sampler is sequential, meaning it does not do any shuffling. Let's wrap our data in a shuffler. + # Wraps the datasampler with the MegatronDataSampler. The MegatronDataSampler is a wrapper that allows the sampler + # to be used with megatron. It sets up the capability to utilize micro-batching and gradient accumulation. It is also + # the place where the global batch size is constructed. + self.data_sampler = MegatronDataSampler( + seq_len=self.max_len, + micro_batch_size=self.micro_batch_size, + global_batch_size=self.global_batch_size, + rampup_batch_size=self.rampup_batch_size, + ) + + def setup(self, stage: str) -> None: + """Sets up the datasets + + Args: + stage: can be one of train / test / predict. 
+ """ # noqa: D415 + self.mnist_test = MNISTCustom(self.data_dir, download=True, transform=transforms.ToTensor(), train=False) + self.mnist_predict = MNISTCustom(self.data_dir, download=True, transform=transforms.ToTensor(), train=False) + mnist_full = MNISTCustom(self.data_dir, download=True, transform=transforms.ToTensor(), train=True) + self.mnist_train, self.mnist_val = torch.utils.data.random_split( + mnist_full, [55000, 5000], generator=torch.Generator().manual_seed(42) + ) + + def train_dataloader(self) -> DataLoader: # noqa: D102 + return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=0) + + def val_dataloader(self) -> DataLoader: # noqa: D102 + return DataLoader(self.mnist_val, batch_size=self.batch_size, num_workers=0) + + def test_dataloader(self) -> DataLoader: # noqa: D102 + return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=0) + + +### Begin model environment related utilities +def _reset_megatron_parallel_state(): + """Resets _GLOBAL_NUM_MICROBATCHES_CALCULATOR in megatron which is used in NeMo to initialized model parallel in + nemo.collections.nlp.modules.common.megatron.megatron_init.initialize_model_parallel_for_nemo + """ # noqa: D205, D415 + megatron.core.num_microbatches_calculator._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None + # Clean up any process groups created in testing + torch.cuda.empty_cache() + if parallel_state.is_initialized(): + parallel_state.destroy_model_parallel() + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + +@contextmanager +def reset_megatron_parallel_state() -> Iterator[None]: + """Puts you into a clean parallel state, and again tears it down at the end.""" + try: + _reset_megatron_parallel_state() + yield + finally: + _reset_megatron_parallel_state() + + +@pytest.mark.run_only_on("GPU") +@pytest.mark.integration +def test_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu(): + path = os.path.abspath(__file__) + call = f"python {path}" + # Raises a CalledProcessError if there is a failure in the subprocess + subprocess.check_call(call, shell=True, stdout=sys.stdout, stderr=sys.stdout) + + +def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu(): + """This is the actual test that will get run in a subprocess so it does not contaminate the state of other tests.""" + with tempfile.TemporaryDirectory() as tmpdir_str: + tmpdir = Path(tmpdir_str) + assert tmpdir.exists() + assert tmpdir.is_dir() + with reset_megatron_parallel_state(): + # Configure our custom Checkpointer + name = "test_experiment" + checkpoint_callback = nl_callbacks.ModelCheckpoint( + save_best_model=True, + save_last=True, + monitor="val_loss", + save_top_k=1, + every_n_train_steps=5, + # Enables the .nemo file-like checkpointing where all IOMixins are under SerDe + enable_nemo_ckpt_io=True, + ) + root_dir = tmpdir + save_dir = root_dir / name + tb_logger = TensorBoardLogger(save_dir=str(save_dir), name=name) + # Setup the logger and train the model + nemo_logger = NeMoLogger( + dir=str(root_dir), # WARNING: passing a path in here results in mutating the Path class. 
+ name=name, + tensorboard=tb_logger, + ckpt=checkpoint_callback, + ) + # Needed so that the trainer can find an output directory for the profiler + # nemo_logger.save_dir = tmpdir + + model = LitAutoEncoder(config=ExampleConfig()) + strategy = nl.FSDPStrategy() + trainer = nl.Trainer( + accelerator="gpu", + devices=1, + strategy=strategy, + limit_val_batches=5, + val_check_interval=5, + max_steps=20, + num_nodes=1, + log_every_n_steps=5, + callbacks=[io.track_io(LossLoggingCallback)()], + ) + data_module = MNISTDataModule(data_dir=tmpdir) + llm.train( + model=model, + data=data_module, + trainer=trainer, + log=nemo_logger, + resume=resume.AutoResume( + path=None, # Overrides the path found by resume_if_exists when set. + resume_if_exists=True, # Looks for the -last checkpoint to continue training. + resume_ignore_no_checkpoint=True, # When false this will throw an error with no existing checkpoint. + ), + ) + trainer._teardown() + with reset_megatron_parallel_state(): + pred_strategy = nl.FSDPStrategy( + data_sampler=MegatronDataSampler( + seq_len=28 * 28, + micro_batch_size=2, + global_batch_size=2, + output_log=False, # Disable logs to support predict_step + ), + ) + predict_trainer = nl.Trainer( + accelerator="gpu", + devices=1, + strategy=pred_strategy, + default_root_dir=str(root_dir), # WARNING: passing a path in here results in mutating the Path class. + ) + ckpt_path = checkpoint_callback.last_model_path.replace( + ".ckpt", "" + ) # strip .ckpt off the end of the last path + + assert Path( + ckpt_path + ).exists(), f"checkpoint {ckpt_path} not found in {os.listdir(Path(ckpt_path).parent)}" + # FIXME: the below checkpoint loading strategy and manual module unwrapping probably only works in single GPU + # and maybe DDP. + unwrapped_trained_model = trainer.model.module # TODO clean this up. Would be good not to have to unwrap. 
+ forward_output = batch_collator( + predict_trainer.predict( + unwrapped_trained_model, dataloaders=data_module.test_dataloader(), ckpt_path=ckpt_path + ) + ) + + assert set(forward_output.keys()) == { + "z", + "x_hat", + }, f"We expect forward output from predit_step, not the loss, got: {forward_output}" + assert forward_output["x_hat"].shape == (len(data_module.mnist_test), 28 * 28) + assert forward_output["z"].shape == (len(data_module.mnist_test), 3) # latent bottleneck in model of dim 3 + predict_trainer._teardown() + + +if __name__ == "__main__": + # Have the test run this one item as a subprocess call + run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu() From 9796b6956555d43b48819a983f582728624e4996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Kami=C5=84ski?= <67481570+Laplasjan107@users.noreply.github.com> Date: Thu, 29 Aug 2024 09:56:26 +0200 Subject: [PATCH 072/664] Export fp8 te nemo to trt-llm (#10096) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * initial commit Signed-off-by: Piotr Kaminski * PR draft Signed-off-by: Piotr Kaminski * fixed scaling weights Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Piotr Kaminski * fixed zarr loading, added flags, refactor Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * fix expert key mapping Signed-off-by: Piotr Kaminski * refactor Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * fix: failed test was finishing with exit code 0 Signed-off-by: Piotr Kaminski * test commit -- rerun github checks Signed-off-by: Piotr Kaminski * bugfix: naming Signed-off-by: Piotr Kaminski * bugfix v2: naming Signed-off-by: Piotr Kaminski * apply code review changes Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * fix TensorRTLLM build (fp8 still not supported) Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * undo refactor Signed-off-by: Piotr Kaminski * bugfix: arguments to dist_convert Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Co-authored-by: Piotr Kaminski Co-authored-by: Laplasjan107 --- nemo/export/tensorrt_llm.py | 6 + .../trt_llm/converter/model_converter.py | 35 +- .../converter/model_to_trt_llm_ckpt.py | 41 ++- nemo/export/trt_llm/converter/utils.py | 328 +++++++++++------- .../trt_llm/nemo_ckpt_loader/nemo_file.py | 135 ++++++- scripts/export/export_to_trt_llm.py | 36 ++ tests/export/nemo_export.py | 39 ++- 7 files changed, 470 insertions(+), 150 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 2a89b76cc099..0e851ffec2af 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -169,6 +169,8 @@ def export( multiple_profiles: bool = False, gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", + fp8_quantized: Optional[bool] = None, + fp8_kvcache: Optional[bool] = None, ): """ Exports nemo checkpoints to TensorRT-LLM. @@ -203,6 +205,8 @@ def export( multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False gpt_attention_plugin (str): enable the gpt attention plugin. 
Default = "auto" gemm_plugin (str): enable the gpt plugin. Default = "auto" + fp8_quantized (Optional[bool]): enables exporting to FP8 TRT-LLM checkpoints. If not set, autodetects the type. + fp8_kvcache (Optional[bool]): enables FP8 KV-cache quantization. If not set, autodetects the type. """ if n_gpus is not None: @@ -324,6 +328,8 @@ def export( gpus_per_node=gpus_per_node, use_parallel_embedding=use_parallel_embedding, use_embedding_sharing=use_embedding_sharing, + fp8_quantized=fp8_quantized, + fp8_kvcache=fp8_kvcache, ) for weight_dict, model_config in zip(weights_dicts, model_configs): diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py index 337a0a4e4e77..627096168d7b 100755 --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -15,10 +15,11 @@ import csv import logging -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np import tensorrt_llm +import torch from tensorrt_llm._utils import pad_vocab_size from tensorrt_llm.functional import non_gated_version from tensorrt_llm.layers import MoeConfig @@ -80,6 +81,18 @@ def prompt_convert(prompt_config, prompt_weights): return vtokens_embeddings +def determine_quantization_settings( + nemo_model_config, fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None +) -> Tuple[bool, bool]: + is_nemo_quantized = nemo_model_config.get('fp8', False) + if fp8_quantized is None: + fp8_quantized = is_nemo_quantized + if fp8_kvcache is None: + fp8_kvcache = is_nemo_quantized + + return fp8_quantized, fp8_kvcache + + def model_to_trtllm_ckpt( model, nemo_model_config, @@ -93,15 +106,17 @@ def model_to_trtllm_ckpt( use_embedding_sharing: bool = False, use_distributed_convert: bool = False, model_parallel_rank: int = None, - vocab_size: int = None, + vocab_size: Optional[int] = None, + fp8_quantized: Optional[bool] = None, + fp8_kvcache: Optional[bool] = None, ) -> Tuple[List[Dict], List[PretrainedConfig]]: - if nemo_model_config.get("share_embeddings_and_output_weights", False) and not use_embedding_sharing: LOGGER.info( "Found share_embeddings_and_output_weights is True in NeMo config, set use_embedding_sharing = True" ) use_embedding_sharing = True + fp8_quantized, fp8_kvcache = determine_quantization_settings(nemo_model_config, fp8_quantized, fp8_kvcache) # If the model has been sharded with model parallelism, convert the model in a gpu-distributed manner if use_distributed_convert: weights_dict = dist_model_to_trt_llm_ckpt( @@ -110,6 +125,8 @@ def model_to_trtllm_ckpt( inference_tp_size=tensor_parallel_size, inference_pp_size=pipeline_parallel_size, tokenizer_vocab_size=vocab_size, + fp8_quantized=fp8_quantized, + fp8_kvcache=fp8_kvcache, ) vocab_size_padded = vocab_size else: @@ -122,6 +139,8 @@ def model_to_trtllm_ckpt( storage_type=dtype, use_parallel_embedding=use_parallel_embedding, decoder_type=decoder_type, + fp8_quantized=fp8_quantized, + fp8_kvcache=fp8_kvcache, ) has_lm_head = "lm_head.weight" in weights_dict @@ -161,8 +180,8 @@ def model_to_trtllm_ckpt( 'embedding_sharding_dim': 0, 'share_embedding_table': use_embedding_sharing, 'quantization': { - 'quant_algo': None, - 'kv_cache_quant_algo': None, + 'quant_algo': "FP8" if fp8_quantized else None, + 'kv_cache_quant_algo': "FP8" if fp8_kvcache else None, }, 'bias': nemo_model_config.get('bias'), 'apply_query_key_layer_scaling': False, @@ -263,9 +282,9 @@ def model_to_trtllm_ckpt( if 
mapping.is_last_pp_rank(): if has_lm_head: - weights_dict_local["lm_head.weight"] = np.ascontiguousarray( - split(lm_head_weight, mapping.tp_size, mapping.tp_rank) - ) + weights_dict_local["lm_head.weight"] = split( + lm_head_weight, mapping.tp_size, mapping.tp_rank + ).contiguous() weights_dict_local["transformer.ln_f.weight"] = weights_dict["transformer.ln_f.weight"] ln_f_bias = weights_dict.get("transformer.ln_f.bias") diff --git a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py index db8a66308047..07ac7b334f8e 100644 --- a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py +++ b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py @@ -25,7 +25,7 @@ from tqdm import tqdm from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision -from nemo.export.trt_llm.converter.utils import save_val, split_and_save_weight, weights_dict +from nemo.export.trt_llm.converter.utils import save_scaling_factor, save_val, split_and_save_weight, weights_dict LOGGER = logging.getLogger("NeMo") @@ -94,6 +94,24 @@ def rename_key_dist_ckpt(old_key: str, layer: int): return rename_key(new_key) +def is_scaling_factor(key: str) -> bool: + return "extra_state" in key + + +def load_scaling_factors(model: dict, num_layers: int, export_config: dict) -> dict: + if not export_config.get('fp8_quantized', False): + return {} + + scaling_factors = {} + for key, val in model.items(): + if is_scaling_factor(key): + for layer in range(num_layers): + renamed_key = rename_key_dist_ckpt(key, layer) + scaling_factors = save_scaling_factor(scaling_factors, renamed_key, val[layer], export_config) + + return scaling_factors + + @torch.no_grad() def convert_model_to_trt_llm_ckpt( nemo_model_config, @@ -104,6 +122,8 @@ def convert_model_to_trt_llm_ckpt( decoder_type, use_parallel_embedding, processes, + fp8_quantized=False, + fp8_kvcache=False, ): # if checkpoints files could be found - start preparing output dir @@ -148,6 +168,8 @@ def convert_model_to_trt_llm_ckpt( "use_attention_nemo_shape": True, "transpose_weights": True, "use_parallel_embedding": use_parallel_embedding, + "fp8_quantized": fp8_quantized, + "fp8_kvcache": fp8_kvcache, } # split_factor: in how many parts a TP training node is split @@ -158,7 +180,7 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): if tp_idx == 0 and pp_idx == 0: if has_position_embedding: val = model[get_layer_name("position_embedding", prefix)] - val = torch_to_numpy(val.to(storage_type).cpu()) + val = val.to(storage_type).cpu() model_level_weights["transformer.position_embedding.weight"].append(val) if pp_idx == 0: val = model.get("state_dict", model)[get_layer_name("word_embedding", prefix)] @@ -171,19 +193,19 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): pad_width = vocab_size_padded - vocab_size val = torch.nn.functional.pad(val, (0, 0, 0, pad_width), value=0) - val = torch_to_numpy(val.to(storage_type).cpu()) + val = val.to(storage_type).cpu() model_level_weights["transformer.vocab_embedding.weight"].append(val) if has_lm_head and pp_idx == training_pp_size - 1: val = model.get("state_dict", model)[get_layer_name("output_layer", prefix)] - val = torch_to_numpy(val.to(storage_type).cpu()) + val = val.to(storage_type).cpu() model_level_weights["lm_head.weight"].append(val) weights_dict = {} - tp_rank = 0 handle_model_level_weights(model, 0, 0) model = extract_layers_with_prefix(model, transformer_layer_prefix) + scaling_factors = load_scaling_factors(model, num_layers, 
export_config) starmap_args = [] for key, val in model.items(): @@ -202,6 +224,7 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): storage_type, None, export_config, + scaling_factors, ) ) else: @@ -219,6 +242,7 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): storage_type, None, export_config, + scaling_factors, ) ) @@ -236,9 +260,10 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): weights_dict.update(weights_dict_local) for key, values in model_level_weights.items(): - model_level_weights[key] = np.concatenate(values, axis=0) + model_level_weights[key] = torch.concatenate(values, axis=0) weights_dict[key] = model_level_weights[key] + weights_dict.update(scaling_factors) return weights_dict @@ -269,6 +294,8 @@ def dist_model_to_trt_llm_ckpt( inference_tp_size, inference_pp_size, tokenizer_vocab_size, + fp8_quantized=False, + fp8_kvcache=False, ): from megatron.core import parallel_state from megatron.core.tensor_parallel.utils import VocabUtility @@ -314,6 +341,8 @@ def dist_model_to_trt_llm_ckpt( "convert_on_device": True, "use_attention_nemo_shape": True, "transpose_weights": True, + "fp8_quantized": fp8_quantized, + "fp8_kvcache": fp8_kvcache, } starmap_config = { diff --git a/nemo/export/trt_llm/converter/utils.py b/nemo/export/trt_llm/converter/utils.py index eab17167cbd5..3f9f2a31a307 100755 --- a/nemo/export/trt_llm/converter/utils.py +++ b/nemo/export/trt_llm/converter/utils.py @@ -13,6 +13,7 @@ # limitations under the License. +from typing import List, Optional, Tuple, Union import numpy as np import tensorrt_llm import torch @@ -31,6 +32,35 @@ "falcon": 'FalconForCausalLM', } +post_layernorm_keys = [ + "post_attention_layernorm.weight", + "post_attention_layernorm.bias", + "post_self_attn_layernorm.weight", +] +mlp_proj_bias_keys = ["mlp.linear_fc2.bias", "mlp.dense_4h_to_h.bias"] +attention_dense_bias_keys = ["attention.linear_proj.bias", "attention.dense.bias"] +input_layernorm_keys = ["input_layernorm.weight", "input_layernorm.bias"] +pre_layernorm_keys = ["pre_mlp_layernorm.weight", "pre_mlp_layernorm.bias"] +attention_dense_weight_keys = ["attention.linear_proj.weight", "attention.dense.weight"] +mlp_proj_weight_keys = ["mlp.linear_fc2.weight", "mlp.dense_4h_to_h.weight"] +mlp_fc_keys = ["mlp.dense_h_to_4h.weight", "mlp.dense_h_to_4h.bias", "mlp.linear_fc1.weight", "mlp.linear_fc1.bias"] +attention_qkv_bias_keys = ["attention.query_key_value.bias", "attention.linear_qkv.bias"] +attention_qkv_weight_keys = ["attention.query_key_value.weight", "attention.linear_qkv.weight"] +mlp_router_keys = ["mlp.router.weight"] +mlp_fc_expert_keys = ["experts.linear_fc1.weight"] +mlp_proj_experts_keys = ["experts.linear_fc2.weight"] +final_layernorm_keys = ["final_layernorm.weight", "final_layernorm.bias"] +mlp_dense_2_keys = ["mlp.dense_h_to_4h_2.weight", "mlp.dense_h_to_4h_2.bias"] +attention_not_mapped_keys = [ + "attention.query.weight", + "attention.query.bias", + "attention.key_value.weight", + "attention.key_value.bias", +] + +weight_scaling_suffix = '.weights_scaling_factor' +activation_scaling_suffix = '.activation_scaling_factor' + def save_val(val, dir, key, tp_num=None): suffix = "" if tp_num is None else f".{tp_num}.bin" @@ -174,10 +204,130 @@ def write_int8(vals, dir, base_key, split_dim, tp_rank, split_factor, kv_cache_o save_val(vals[save_key], dir, f"{base_key}.{save_key}") +def get_suffix(key: str) -> str: + return '.' 
+ key.split('.')[-1] + + +def get_trt_llm_prefix(key: str) -> str: + layer_num = key.split(".")[1] + return f'transformer.layers.{layer_num}' + + +def any_word_in_key(key: str, words: List[str]) -> bool: + return any([word in key for word in words]) + + +def sequential_key_map(key: str, mapping: List[Tuple[List[str], str]]) -> Optional[str]: + for keywords, mapped in mapping: + if any_word_in_key(key, keywords): + return mapped + + return None + + +def get_trt_llm_infix(key: str) -> Optional[str]: + mapping = [ + (post_layernorm_keys, '.post_layernorm'), + (mlp_proj_bias_keys, '.mlp.proj'), + (attention_dense_bias_keys, '.attention.dense'), + (input_layernorm_keys, '.input_layernorm'), + (pre_layernorm_keys, '.post_layernorm'), + (attention_dense_weight_keys, '.attention.dense'), + (mlp_proj_weight_keys, '.mlp.proj'), + (mlp_fc_keys, '.mlp.fc'), + (attention_qkv_bias_keys + attention_qkv_weight_keys, '.attention.qkv'), + (mlp_router_keys, '.mlp.router'), + (mlp_fc_expert_keys, '.mlp.fc'), + (mlp_proj_experts_keys, '.mlp.proj'), + ] + return sequential_key_map(key, mapping) + + +def get_trt_llm_keyname(key: str) -> str: + if any_word_in_key(key, final_layernorm_keys): + return key.replace("final_layernorm", "transformer.ln_f") + + if infix := get_trt_llm_infix(key): + return get_trt_llm_prefix(key) + infix + get_suffix(key) + + return key + + +def is_scaling_factor(key: str) -> bool: + return "scale_fwd" in key + + +def get_scaling_factor_keys(key: str) -> Tuple[Tuple[str, str], Tuple[str, str]]: + # Reuses existing mapping of NeMo -> TRT LLM weights key via swapping suffixes + corresponding_weight_key = '.'.join(key.split('.')[:-2]) + '.weight' + corresponding_trt_llm_weight_key = get_trt_llm_keyname(corresponding_weight_key) + base_key = '.'.join(corresponding_trt_llm_weight_key.split('.')[:-1]) + + weight_scale = base_key + weight_scaling_suffix + activation_scale = base_key + activation_scaling_suffix + keys = (weight_scale, activation_scale) + + layer_prefix = get_trt_llm_prefix(key) + mapped_key = layer_prefix + '.mlp.gate' + gate_activation = mapped_key + activation_scaling_suffix + gate_weight = mapped_key + weight_scaling_suffix + gate_keys = (gate_activation, gate_weight) + + return keys, gate_keys + + +def save_scaling_factor(scaling_factors: dict, key: str, val: torch.Tensor, config: dict): + if not is_scaling_factor(key): + return scaling_factors + + activation_factor = torch_to_numpy(1 / val[0].view(1)) + weights_factor = torch_to_numpy(1 / val[1].view(1)) + + (weights_key, activation_key), gate_keys = get_scaling_factor_keys(key) + scaling_factors[activation_key] = activation_factor + scaling_factors[weights_key] = weights_factor + + split_gated_activation = config.get("split_gated_activation", False) + if split_gated_activation and any_word_in_key(key, ["mlp.dense_h_to_4h", "mlp.linear_fc1"]): + (gate_activation_key, gate_weight_key) = gate_keys + scaling_factors[gate_activation_key] = activation_factor + scaling_factors[gate_weight_key] = weights_factor + + return scaling_factors + + +def cast_val_datatype(vals, trt_llm_key, storage_type, is_fp8_model, scaling_factors): + if not is_fp8_model: + return [val.to(storage_type) for val in vals] + + fp8_storage_type = torch.float8_e4m3fn + quantized_keys = [ + k.split(weight_scaling_suffix)[0] for k in scaling_factors.keys() if k.endswith(weight_scaling_suffix) + ] + for k in quantized_keys: + if k in trt_llm_key: + storage_type = fp8_storage_type + scale = scaling_factors[k + weight_scaling_suffix] + vals = 
[val.to(torch.float32) / scale for val in vals] + break + + return [val.to(storage_type) for val in vals] + + +def split_val_gate(vals: List[np.ndarray], convert_on_device: bool): + if convert_on_device: + return [[n] for n in torch.chunk(vals[0], 2, axis=-1)] + + splits = [np.split(val, 2, axis=-1) for val in vals] + return list(zip(*splits)) + + # Note: in multi_query_mode, only query heads are split between multiple GPUs, while key/value head # are not split as there is only one head per key/value. @torch.no_grad() -def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_type, act_range, config): +def split_and_save_weight( + tp_rank, saved_dir, split_factor, key, vals, storage_type, act_range, config, scaling_factors={} +): use_attention_nemo_shape = config.get("use_attention_nemo_shape", False) split_gated_activation = config.get("split_gated_activation", False) num_attention_heads = config.get("num_attention_heads", 0) @@ -187,12 +337,11 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t num_kv_heads = config.get("num_kv_heads", num_attention_heads) size_per_head = config.get("kv_channels", None) convert_on_device = config.get("convert_on_device", False) - + is_fp8_model = config.get("fp8_quantized", False) + use_fp8_kv_cache = config.get("fp8_kvcache", False) save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only" - layer_num = key.split(".")[1] - layer_prefix = f'transformer.layers.{layer_num}' - + trt_llm_key = get_trt_llm_keyname(key) if not isinstance(vals, list): vals = [vals] @@ -201,138 +350,82 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t if "layernorm.weight" in key and config.get("apply_layernorm_1p", False): vals = [val.float() + 1.0 for val in vals] - vals = [val.to(storage_type) for val in vals] + vals = cast_val_datatype(vals, trt_llm_key, storage_type, is_fp8_model, scaling_factors) if convert_on_device: assert len(vals) == 1 # Should only convert a single device param per call assert torch.is_tensor(vals[0]) elif torch.is_tensor(vals[0]): vals = [torch_to_numpy(val.cpu()) for val in vals] - if ( - "input_layernorm.weight" in key - or "input_layernorm.bias" in key - or "pre_mlp_layernorm.weight" in key - or "pre_mlp_layernorm.bias" in key - or "attention.dense.bias" in key - or "attention.linear_proj.bias" in key - or "post_attention_layernorm.weight" in key - or "post_attention_layernorm.bias" in key - or "post_self_attn_layernorm.weight" in key - or "mlp.dense_4h_to_h.bias" in key - or "mlp.linear_fc2.bias" in key - or "final_layernorm.weight" in key - or "final_layernorm.bias" in key - ): + if any_word_in_key( + key, + input_layernorm_keys + + pre_layernorm_keys + + attention_dense_bias_keys + + post_layernorm_keys + + mlp_proj_bias_keys + + final_layernorm_keys, + ) and (tp_rank == 0 or convert_on_device): # shared weights, only need to convert the weights of rank 0 - if "post_self_attn_layernorm" in key or "post_attention_layernorm" in key: - if key.endswith('weight'): - key = f'{layer_prefix}.post_layernorm.weight' - else: - key = f'{layer_prefix}.post_layernorm.bias' - elif "mlp.linear_fc2.bias" in key or "mlp.dense_4h_to_h.bias" in key: - key = f'{layer_prefix}.mlp.proj.bias' - elif "attention.linear_proj.bias" in key or "attention.dense.bias" in key: - key = f'{layer_prefix}.attention.dense.bias' - elif "final_layernorm" in key: - key = key.replace("final_layernorm", "transformer.ln_f") - elif "input_layernorm" in key: - if key.endswith('weight'): - key 
= f'{layer_prefix}.input_layernorm.weight' - else: - key = f'{layer_prefix}.input_layernorm.bias' - elif "pre_mlp_layernorm" in key: - if key.endswith('weight'): - key = f'{layer_prefix}.post_layernorm.weight' - else: - key = f'{layer_prefix}.post_layernorm.bias' - if tp_rank == 0 or convert_on_device: - save_val(vals[0], saved_dir, key) - - elif ( - "attention.dense.weight" in key - or "mlp.dense_4h_to_h.weight" in key - or "attention.linear_proj.weight" in key - or "mlp.linear_fc2.weight" in key - ): - if "attention.linear_proj.weight" in key or "attention.dense.weight" in key: - key = f'{layer_prefix}.attention.dense.weight' - elif "mlp.linear_fc2.weight" in key or "mlp.dense_4h_to_h.weight" in key: - key = f'{layer_prefix}.mlp.proj.weight' + save_val(vals[0], saved_dir, trt_llm_key) + elif any_word_in_key(key, attention_dense_weight_keys + mlp_proj_weight_keys): if convert_on_device: - save_val(vals[0], saved_dir, key) + save_val(vals[0], saved_dir, trt_llm_key) else: cat_dim = 0 val = np.concatenate(vals, axis=cat_dim) split_vals = np.split(val, split_factor, axis=cat_dim) - save_split(split_vals, saved_dir, key, tp_rank, split_factor) + save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor) if act_range is not None and int8_outputs == "all": - base_key = key.replace(".weight", "") + base_key = trt_llm_key.replace(".weight", "") vals_i8 = generate_int8(val, act_range, multi_query_mode=multi_query_mode) write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, split_factor) - elif ( - "mlp.dense_h_to_4h.weight" in key - or "mlp.dense_h_to_4h.bias" in key - or "mlp.linear_fc1.weight" in key - or "mlp.linear_fc1.bias" in key - ): - if key.endswith("weight"): - key = f'{layer_prefix}.mlp.fc.weight' - else: - key = f'{layer_prefix}.mlp.fc.bias' - + elif any_word_in_key(key, mlp_fc_keys): if split_gated_activation: - if convert_on_device: - vals, gates = [[n] for n in torch.chunk(vals[0], 2, axis=-1)] - else: - splits = [np.split(val, 2, axis=-1) for val in vals] - vals, gates = list(zip(*splits)) + vals, gates = split_val_gate(vals, convert_on_device) if convert_on_device: - save_val(vals[0], saved_dir, key) + save_val(vals[0], saved_dir, trt_llm_key) else: cat_dim = -1 val = np.concatenate(vals, axis=cat_dim) split_vals = np.split(val, split_factor, axis=cat_dim) - save_split(split_vals, saved_dir, key, tp_rank, split_factor) + save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor) if act_range is not None and int8_outputs == "all": - base_key = key.replace(".weight", "") + base_key = trt_llm_key.replace(".weight", "") vals_i8 = generate_int8(val, act_range, multi_query_mode=multi_query_mode) write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, split_factor) if split_gated_activation: assert not save_int8 - if key.endswith("weight"): - key = f'{layer_prefix}.mlp.gate.weight' - else: - key = f'{layer_prefix}.mlp.gate.bias' - + layer_prefix = get_trt_llm_prefix(key) + gate_key = layer_prefix + '.mlp.gate' + get_suffix(trt_llm_key) if convert_on_device: - save_val(gates[0], saved_dir, key) + save_val(gates[0], saved_dir, gate_key) else: gate = np.concatenate(gates, axis=cat_dim) split_vals = np.split(gate, split_factor, axis=cat_dim) - save_split(split_vals, saved_dir, key, tp_rank, split_factor) + save_split(split_vals, saved_dir, gate_key, tp_rank, split_factor) - elif "mlp.dense_h_to_4h_2.weight" in key or "mlp.dense_h_to_4h_2.bias" in key: + elif any_word_in_key(key, mlp_dense_2_keys): if convert_on_device: - save_val(vals[0], saved_dir, key) + 
save_val(vals[0], saved_dir, trt_llm_key) else: cat_dim = -1 val = np.concatenate(vals, axis=cat_dim) split_vals = np.split(val, split_factor, axis=cat_dim) - save_split(split_vals, saved_dir, key, tp_rank, split_factor) + save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor) if act_range is not None and int8_outputs == "all": - base_key = key.replace(".weight", "") + base_key = trt_llm_key.replace(".weight", "") vals_i8 = generate_int8(val, act_range, multi_query_mode=multi_query_mode) write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, split_factor) - elif "attention.query_key_value.bias" in key or "attention.linear_qkv.bias" in key: - key = f'{layer_prefix}.attention.qkv.bias' + elif any_word_in_key(key, attention_qkv_bias_keys): qkv_hidden_dim = vals[0].shape[0] size_per_head = qkv_hidden_dim // (num_attention_heads + 2 * num_kv_heads) q_num = num_attention_heads // num_kv_heads @@ -349,7 +442,7 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t if convert_on_device: qkv = torch.split(val, [q_num, 1, 1], dim=1) split_vals = torch.concatenate([qkv[0].reshape(-1), qkv[1].reshape(-1), qkv[2].reshape(-1)], dim=1) - save_val(split_vals, saved_dir, key) + save_val(split_vals, saved_dir, trt_llm_key) else: qkv = np.split(val, [q_num, q_num + 1], axis=1) q_split = np.split(qkv[0], split_factor, axis=0) @@ -361,10 +454,9 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t np.concatenate([q_split[i].reshape(-1), k_split[i].reshape(-1), v_split[i].reshape(-1)], axis=0) for i in range(split_factor) ] - save_split(split_vals, saved_dir, key, tp_rank, split_factor) + save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor) - elif "attention.query_key_value.weight" in key or "attention.linear_qkv.weight" in key: - key = f'{layer_prefix}.attention.qkv.weight' + elif any_word_in_key(key, attention_qkv_weight_keys): assert use_attention_nemo_shape, "Only support NEMO shape for QKV weights" hidden_dim = vals[0].shape[0] if size_per_head is None: @@ -380,7 +472,7 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t split_vals = torch.concatenate( [qkv[0].reshape(hidden_dim, -1), qkv[1].reshape(hidden_dim, -1), qkv[2].reshape(hidden_dim, -1)], dim=1 ) - save_val(split_vals, saved_dir, key) + save_val(split_vals, saved_dir, trt_llm_key) else: len_vals = len(vals) val = np.concatenate(vals, axis=1) @@ -414,10 +506,10 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t ) for i in range(split_factor) ] - save_split(split_vals, saved_dir, key, tp_rank, split_factor) + save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor) if save_int8: - base_key = key.replace(".weight", "") + base_key = trt_llm_key.replace(".weight", "") vals_i8 = generate_int8(val, act_range, is_qkv=True, multi_query_mode=multi_query_mode) write_int8( vals_i8, @@ -428,18 +520,20 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t split_factor, kv_cache_only=int8_outputs == "kv_cache_only", ) - elif ( - "attention.query.weight" in key - or "attention.query.bias" in key - or "attention.key_value.weight" in key - or "attention.key_value.bias" in key - ): + + if use_fp8_kv_cache: + base_key = trt_llm_key.replace('.qkv.weight', '') + scaling_factor = np.array([1.0], dtype=np.float32) + save_val(scaling_factor, dir, base_key + '.kv_cache_scaling_factor') + + elif any_word_in_key(key, attention_not_mapped_keys): pass - elif 
"mlp.router.weight" in key: + + elif any_word_in_key(key, mlp_router_keys): val = np.concatenate(vals, axis=1) - key = f'{layer_prefix}.mlp.router.weight' - save_val(val, saved_dir, key) - elif "experts.linear_fc1.weight" in key: + save_val(val, saved_dir, trt_llm_key) + + elif any_word_in_key(key, mlp_fc_expert_keys): cat_dim = -1 val = np.concatenate(vals, axis=cat_dim) w1, w3 = np.split(val, 2, axis=1) @@ -449,15 +543,13 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t split_w3s = np.split(w3, split_factor, axis=1) split_vals = [np.concatenate(item, axis=1) for item in zip(split_w3s, split_w1s)] - key = f'{layer_prefix}.mlp.fc.weight' - save_expert_split(split_vals, saved_dir, key, tp_rank, split_factor) + save_expert_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor) - elif "experts.linear_fc2.weight" in key: + elif any_word_in_key(key, mlp_proj_experts_keys): cat_dim = -1 val = np.concatenate(vals, axis=cat_dim) split_vals = np.split(val, split_factor, axis=cat_dim) - key = f'{layer_prefix}.mlp.proj.weight' - save_expert_split(split_vals, saved_dir, key, tp_rank, split_factor) + save_expert_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor) else: print(f"[WARNING] {key} not handled by converter") @@ -465,14 +557,16 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t return weights_dict -def split(v, tp_size, idx, dim=0): +def split(v: Union[np.ndarray, torch.Tensor], tp_size: int, idx: int, dim: int = 0): """Splits the np tensor v on dim and return the idx's slice.""" if tp_size == 1: return v - if len(v.shape) == 1: - return np.ascontiguousarray(np.split(v, tp_size)[idx]) - else: - return np.ascontiguousarray(np.split(v, tp_size, axis=dim)[idx]) + + dim = dim if len(v.shape) != 1 else 0 + if torch.is_tensor(v): + return torch.split(v, v.size(dim) // tp_size, dim=dim)[idx].contiguous() + + return np.ascontiguousarray(np.split(v, tp_size, axis=dim)[idx]) def init_model_parallel_from_nemo(reshard_model): diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py index 1b711b5edbf3..14f02b06b71b 100644 --- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py +++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py @@ -17,16 +17,18 @@ import json import logging import os +from io import BytesIO from pathlib import Path -from typing import Dict, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import tensorstore # This is important even though not used. Otherwise zarr raises error. 
import torch import yaml import zarr -from tensorrt_llm._utils import np_bfloat16 -from torch.distributed.checkpoint import FileSystemReader, TensorStorageMetadata +from tensorrt_llm._utils import np_bfloat16, str_dtype_to_torch +from torch.distributed.checkpoint import FileSystemReader +from torch.distributed.checkpoint.metadata import BytesStorageMetadata, TensorStorageMetadata from torch.distributed.checkpoint.state_dict_loader import load_state_dict from transformers import AutoTokenizer, PreTrainedTokenizer @@ -65,7 +67,65 @@ def __init__(self, path: Union[Path, TarPath]) -> None: self.path = path # overwrites path set in super().__init__ call -def load_sharded_metadata_torch_dist(checkpoint_dir: Union[Path, TarPath], torch_tensor=True): +def get_extra_state_key(state_dict: dict) -> Optional[str]: + for key in state_dict.keys(): + if '_extra_state/' in key: + return key + return None + + +def unpack_extra_state_key(key: str) -> Tuple[str, int]: + basename = key.split('/')[0] + size = int(key.split('/')[1].split('_')[-1]) + return basename, size + + +def clear_loaded_extra_states(state_dict: dict, basename: str) -> dict: + """The scaling factors are originally saved to state_dict under the keynames 'basename/*' + The standardized representation is saved to 'basename.*'. This function clears the former from the state. + """ + to_remove = [k for k in state_dict.keys() if basename + '/' in k] + for key in to_remove: + state_dict.pop(key) + return state_dict + + +def retrieve_scale(bytes: BytesIO) -> Optional[torch.Tensor]: + bytes.seek(0) + extra_state = torch.load(bytes) + if not extra_state or 'scale_fwd' not in extra_state: + return None + return extra_state['scale_fwd'].cpu() + + +def load_scales_from_bytes(bytes_list: List[BytesIO]) -> Optional[torch.Tensor]: + scales = [] + for bytes in bytes_list: + scale = retrieve_scale(bytes) + if scale is None: + return None + scales.append(scale) + return torch.stack(scales) + + +def load_scaling_factors(state_dict: dict, basename: str, size: int) -> Optional[torch.Tensor]: + keynames = [f'{basename}/shard_{layer}_{size}' for layer in range(size)] + bytes_list = [state_dict[keyname][0] for keyname in keynames] + return load_scales_from_bytes(bytes_list) + + +def standarize_distributed_scaling_factors(state_dict: dict) -> dict: + while key := get_extra_state_key(state_dict): + basename, size = unpack_extra_state_key(key) + scaling_factors = load_scaling_factors(state_dict, basename, size) + if scaling_factors is not None: + state_dict[basename + '.scale_fwd'] = scaling_factors + state_dict = clear_loaded_extra_states(state_dict, basename) + + return state_dict + + +def load_sharded_metadata_torch_dist(checkpoint_dir: Union[Path, TarPath], torch_tensor: bool = True): fs_reader = TarFileSystemReader(checkpoint_dir) metadata = fs_reader.read_metadata() @@ -74,11 +134,17 @@ def load_sharded_metadata_torch_dist(checkpoint_dir: Union[Path, TarPath], torch for k, tp in metadata.state_dict_metadata.items() if isinstance(tp, TensorStorageMetadata) } + + state_dict.update( + {k: {} for k, tp in metadata.state_dict_metadata.items() if isinstance(tp, BytesStorageMetadata)} + ) + load_state_dict( state_dict, storage_reader=fs_reader, no_dist=True, ) + state_dict = standarize_distributed_scaling_factors(state_dict) if not torch_tensor: for k, v in state_dict.items(): @@ -89,24 +155,61 @@ def load_sharded_metadata_torch_dist(checkpoint_dir: Union[Path, TarPath], torch return state_dict +def get_sharded_file(dir: dict, layer_number: int) -> 
Optional[os.PathLike]: + pt_file_list = list(dir.glob(f'shard_{layer_number}_*.pt')) + if pt_file_list == []: + return None + return pt_file_list[0] + + +def load_sharded_pickle_extra_state_scale(dir: Union[Path, TarPath]): + def _get_layer_number(file): + basename = os.path.basename(str(file)) + return int(basename.split('_')[1]) + + pt_files = list(dir.glob('shard_*_*.pt')) + bytes_list = [] + for file in sorted(pt_files, key=_get_layer_number): + with file.open('rb') as opened_file: + bytes_list.append(torch.load(opened_file)) + + return load_scales_from_bytes(bytes_list) + + +def contains_extra_states(subdir: Union[Path, TarPath]): + return list(subdir.glob('shard_0_*.pt')) != [] + + +def load_extra_state_from_pickle(sharded_state_dict: dict, subdir: Union[Path, TarPath]): + scales = load_sharded_pickle_extra_state_scale(subdir) + if scales is not None: + key = subdir.name + '.scale_fwd' + sharded_state_dict[key] = scales + + return sharded_state_dict + + def load_sharded_metadata_zarr(checkpoint_dir: Union[Path, TarPath], torch_tensor=True): sharded_state_dict = {} for subdir in checkpoint_dir.iterdir(): - if not subdir.is_dir() or not (subdir / '.zarray').exists(): + if not subdir.is_dir(): continue - key = subdir.name - - zstore = ZarrPathStore(subdir) - arr = zarr.open(zstore, 'r') - if torch_tensor: - # sharded_state_dict[key] = torch.from_numpy(arr[:].astype("float32")).to(dtype=torch.bfloat16) - if arr.dtype.name == "bfloat16": - sharded_state_dict[key] = torch.from_numpy(arr[:].view(np.int16)).view(torch.bfloat16) + if contains_extra_states(subdir): + sharded_state_dict = load_extra_state_from_pickle(sharded_state_dict, subdir) + elif (subdir / '.zarray').exists(): + key = subdir.name + zstore = ZarrPathStore(subdir) + arr = zarr.open(zstore, 'r') + + if torch_tensor: + # sharded_state_dict[key] = torch.from_numpy(arr[:].astype("float32")).to(dtype=torch.bfloat16) + if arr.dtype.name == "bfloat16": + sharded_state_dict[key] = torch.from_numpy(arr[:].view(np.int16)).view(torch.bfloat16) + else: + sharded_state_dict[key] = torch.from_numpy(arr[:]).view(str_dtype_to_torch(arr.dtype.name)) else: - sharded_state_dict[key] = torch.from_numpy(arr[:]) - else: - sharded_state_dict[key] = arr[:] + sharded_state_dict[key] = arr[:] return sharded_state_dict diff --git a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py index a9b9d92c172b..3f5924fde80c 100644 --- a/scripts/export/export_to_trt_llm.py +++ b/scripts/export/export_to_trt_llm.py @@ -15,12 +15,17 @@ import argparse import logging import sys +from typing import Optional from nemo.export.tensorrt_llm import TensorRTLLM LOGGER = logging.getLogger("NeMo") +class UsageError(Exception): + pass + + def get_args(argv): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, @@ -107,8 +112,37 @@ def get_args(argv): 'It is used to compute the workspace size of lora plugin.', ) parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode") + parser.add_argument( + "-fp8", + "--export_fp8_quantized", + default="auto", + type=str, + help="Enables exporting to a FP8-quantized TRT LLM checkpoint", + ) + parser.add_argument( + "-kv_fp8", + "--use_fp8_kv_cache", + default="auto", + type=str, + help="Enables exporting with FP8-quantizatized KV-cache", + ) args = parser.parse_args(argv) + + def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]: + s = s.lower() + true_strings = ["true", "1"] + false_strings = ["false", "0"] + 
if s in true_strings: + return True + if s in false_strings: + return False + if optional and s == 'auto': + return None + raise UsageError(f"Invalid boolean value for argument --{name}: '{s}'") + + args.export_fp8_quantized = str_to_bool("export_fp8_quantized", args.export_fp8_quantized, optional=True) + args.use_fp8_kv_cache = str_to_bool("use_fp8_kv_cache", args.use_fp8_kv_cache, optional=True) return args @@ -153,6 +187,8 @@ def nemo_export_trt_llm(argv): use_lora_plugin=args.use_lora_plugin, lora_target_modules=args.lora_target_modules, max_lora_rank=args.max_lora_rank, + fp8_quantized=args.export_fp8_quantized, + fp8_kvcache=args.use_fp8_kv_cache, ) LOGGER.info("Export is successful.") diff --git a/tests/export/nemo_export.py b/tests/export/nemo_export.py index 557d6c07613d..ecaf198a0c07 100644 --- a/tests/export/nemo_export.py +++ b/tests/export/nemo_export.py @@ -242,6 +242,8 @@ def run_inference( test_deployment=False, test_data_path=None, save_trt_engine=False, + fp8_quantized=False, + fp8_kvcache=False, ) -> Tuple[Optional[FunctionalResult], Optional[AccuracyResult]]: if Path(checkpoint_path).exists(): if tp_size > torch.cuda.device_count(): @@ -325,6 +327,8 @@ def run_inference( lora_target_modules=lora_target_modules, max_num_tokens=max_num_tokens, use_embedding_sharing=use_embedding_sharing, + fp8_quantized=fp8_quantized, + fp8_kvcache=fp8_kvcache, ) if ptuning: @@ -452,6 +456,8 @@ def run_existing_checkpoints( test_data_path=None, save_trt_engine=False, in_framework=False, + fp8_quantized=False, + fp8_kvcache=False, ) -> Tuple[Optional[FunctionalResult], Optional[AccuracyResult]]: if tp_size > torch.cuda.device_count(): print("Skipping the test due to not enough number of GPUs") @@ -530,6 +536,8 @@ def run_existing_checkpoints( test_deployment=test_deployment, test_data_path=test_data_path, save_trt_engine=save_trt_engine, + fp8_quantized=fp8_quantized, + fp8_kvcache=fp8_kvcache, ) @@ -748,16 +756,33 @@ def get_args(): type=float, help="GPU memory utilization percentage for vLLM.", ) + parser.add_argument( + "-fp8", + "--export_fp8_quantized", + default="auto", + type=str, + help="Enables exporting to a FP8-quantized TRT LLM checkpoint", + ) + parser.add_argument( + "-kv_fp8", + "--use_fp8_kv_cache", + default="auto", + type=str, + help="Enables exporting with FP8-quantizatized KV-cache", + ) args = parser.parse_args() - def str_to_bool(name: str, s: str) -> bool: + def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]: + s = s.lower() true_strings = ["true", "1"] false_strings = ["false", "0"] - if s.lower() in true_strings: + if s in true_strings: return True - if s.lower() in false_strings: + if s in false_strings: return False + if optional and s == 'auto': + return None raise UsageError(f"Invalid boolean value for argument --{name}: '{s}'") args.test_cpp_runtime = str_to_bool("test_cpp_runtime", args.test_cpp_runtime) @@ -768,6 +793,8 @@ def str_to_bool(name: str, s: str) -> bool: args.use_vllm = str_to_bool("use_vllm", args.use_vllm) args.use_parallel_embedding = str_to_bool("use_parallel_embedding", args.use_parallel_embedding) args.in_framework = str_to_bool("in_framework", args.in_framework) + args.export_fp8_quantized = str_to_bool("export_fp8_quantized", args.export_fp8_quantized, optional=True) + args.use_fp8_kv_cache = str_to_bool("use_fp8_kv_cache", args.use_fp8_kv_cache, optional=True) return args @@ -821,6 +848,8 @@ def run_inference_tests(args): test_data_path=args.test_data_path, save_trt_engine=args.save_trt_engine, 
in_framework=args.in_framework, + fp8_quantized=args.export_fp8_quantized, + fp8_kvcache=args.use_fp8_kv_cache, ) tps = tps * 2 @@ -877,6 +906,8 @@ def run_inference_tests(args): test_cpp_runtime=args.test_cpp_runtime, test_data_path=args.test_data_path, save_trt_engine=args.save_trt_engine, + fp8_quantized=args.export_fp8_quantized, + fp8_kvcache=args.use_fp8_kv_cache, ) tps = tps * 2 @@ -940,5 +971,7 @@ def optional_bool_to_pass_fail(b: Optional[bool]): run_inference_tests(args) except UsageError as e: LOGGER.error(f"{e}") + raise e except argparse.ArgumentError as e: LOGGER.error(f"{e}") + raise e From 3ed93c1c696841a5f22a000174bdd71876a66a74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Kami=C5=84ski?= <67481570+Laplasjan107@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:48:19 +0200 Subject: [PATCH 073/664] Bugfix: loading scaling factors for pyt 24.07 (#10297) * bugfix: loading scaling factors Signed-off-by: Piotr Kaminski * list instead of set Signed-off-by: Piotr Kaminski --------- Signed-off-by: Piotr Kaminski Co-authored-by: Piotr Kaminski --- nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py index 14f02b06b71b..74e0aac758da 100644 --- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py +++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py @@ -136,7 +136,7 @@ def load_sharded_metadata_torch_dist(checkpoint_dir: Union[Path, TarPath], torch } state_dict.update( - {k: {} for k, tp in metadata.state_dict_metadata.items() if isinstance(tp, BytesStorageMetadata)} + {k: [] for k, tp in metadata.state_dict_metadata.items() if isinstance(tp, BytesStorageMetadata)} ) load_state_dict( From 006d65fb83ece4766e3d1e9d25c536e86172de6f Mon Sep 17 00:00:00 2001 From: jbieniusiewi <152396322+jbieniusiewi@users.noreply.github.com> Date: Thu, 29 Aug 2024 19:58:38 +0200 Subject: [PATCH 074/664] Sanity checks for unfinished checkpoints removal (#10228) * Added sanity checks Signed-off-by: Jacek Bieniusiewicz * Apply isort and black reformatting Signed-off-by: jbieniusiewi * Updated error msg Signed-off-by: Jacek Bieniusiewicz * Added checks for NeMo 2.0 Signed-off-by: Jacek Bieniusiewicz * Updated NeMo 2.0 test Signed-off-by: Jacek Bieniusiewicz * Apply isort and black reformatting Signed-off-by: jbieniusiewi * fixed test_nemo_logger.py Signed-off-by: Jacek Bieniusiewicz --------- Signed-off-by: Jacek Bieniusiewicz Signed-off-by: jbieniusiewi Co-authored-by: jbieniusiewi --- nemo/lightning/resume.py | 18 +++++++++++ nemo/utils/exp_manager.py | 18 +++++++++++ tests/core/test_exp_manager.py | 47 +++++++++++++++++++++++++++++ tests/lightning/test_nemo_logger.py | 32 +++++++++++++++++++- 4 files changed, 114 insertions(+), 1 deletion(-) diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py index ca87628d699e..a2de3ce6f690 100644 --- a/nemo/lightning/resume.py +++ b/nemo/lightning/resume.py @@ -109,8 +109,26 @@ def nemo_path(self, model=None) -> Optional[Path]: end_dist_checkpoints = [d for d in dist_checkpoints if d.match("*end")] last_dist_checkpoints = [d for d in dist_checkpoints if d.match("*last")] + end_chkpt_cnt = len(end_dist_checkpoints) end_checkpoints = _filter_out_unfinished_checkpoints(end_dist_checkpoints) + finished_end_chkpt_cnt = len(end_checkpoints) + if end_chkpt_cnt > 0 and finished_end_chkpt_cnt == 0: + raise ValueError( + "End checkpoint is unfinished and cannot be used to resume the training." 
+ " Please remove the checkpoint manually to avoid unexpected cosequences, such as" + " restarting from scratch." + ) + + last_chkpt_cnt = len(last_dist_checkpoints) last_checkpoints = _filter_out_unfinished_checkpoints(last_dist_checkpoints) + finished_last_chkpt_cnt = len(last_checkpoints) + if last_chkpt_cnt > 0 and finished_last_chkpt_cnt == 0: + raise ValueError( + "Last checkpoint is unfinished and cannot be used to resume the training." + " Please remove the checkpoint manually to avoid unexpected cosequences, such as" + " restarting from scratch. Hint: Iteration number can be added to the checkpoint name pattern" + " to maximize chance that there is at least one finished last checkpoint to resume from." + ) if not checkpoint_dir.exists() or (not len(end_checkpoints) > 0 and not len(last_checkpoints) > 0): if self.resume_ignore_no_checkpoint: diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index ca18b22c00bc..201ae0a37bd7 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -748,11 +748,29 @@ def check_resume( end_checkpoints = ( end_dist_checkpoints if end_dist_checkpoints else list(checkpoint_dir.rglob("*end.ckpt")) ) + end_chkpt_cnt = len(end_checkpoints) end_checkpoints = _filter_out_unfinished_checkpoints(end_checkpoints) + finished_end_chkpt_cnt = len(end_checkpoints) + if end_chkpt_cnt > 0 and finished_end_chkpt_cnt == 0: + raise ValueError( + "End checkpoint is unfinished and cannot be used to resume the training." + " Please remove the checkpoint manually to avoid unexpected cosequences, such as" + " restarting from scratch." + ) + last_checkpoints = ( last_dist_checkpoints if last_dist_checkpoints else list(checkpoint_dir.rglob("*last.ckpt")) ) + last_chkpt_cnt = len(last_checkpoints) last_checkpoints = _filter_out_unfinished_checkpoints(last_checkpoints) + finished_last_chkpt_cnt = len(last_checkpoints) + if last_chkpt_cnt > 0 and finished_last_chkpt_cnt == 0: + raise ValueError( + "Last checkpoint is unfinished and cannot be used to resume the training." + " Please remove the checkpoint manually to avoid unexpected cosequences, such as" + " restarting from scratch. Hint: Iteration number can be added to the checkpoint name pattern" + " to maximize chance that there is at least one finished last checkpoint to resume from." + ) if not checkpoint_dir_exists or (not len(end_checkpoints) > 0 and not len(last_checkpoints) > 0): if resume_ignore_no_checkpoint: diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index 2d9bd03f0203..a0b69927ecc0 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -1050,3 +1050,50 @@ def test_invalid_checkpoints_removed_from_topk(self, tmp_path): assert 'epoch=8.ckpt' in ckpt_filenames assert 'epoch=7.ckpt' in ckpt_filenames assert 'epoch=4.ckpt' in ckpt_filenames + + @pytest.mark.unit + def test_doesnt_silently_start_from_scratch(self, tmp_path): + """ + Ensure that if the last checkpoint is unfinished it wont silently start from scratch. + This is to avoid a training that is not actually making any progress. 
+ """ + test_dir = tmp_path / "test" + checkpoints_dir = test_dir / "checkpoints" + + self._write_fake_checkpoint( + checkpoints_dir / "megatron_gpt--val_loss=5.01-step=900-consumed_samples=1000.0-last.ckpt", + isdir=False, + add_unfinished_marker=True, + ) # incomplete last + + restored_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) + + with pytest.raises(Exception): + exp_manager( + restored_trainer, + {"resume_if_exists": True, "resume_ignore_no_checkpoint": True, "explicit_log_dir": str(test_dir)}, + ) + + @pytest.mark.unit + def test_doesnt_silently_start_from_scratch_dist(self, tmp_path): + """ + Ensure that if the last distributed checkpoint is unfinished it wont silently start from scratch. + This is to avoid a training that is not actually making any progress. + """ + + test_dir = tmp_path / "test" + checkpoints_dir = test_dir / "checkpoints" + + self._write_fake_checkpoint( + checkpoints_dir / "megatron_gpt--val_loss=5.01-step=1100-consumed_samples=17600.0-last", + isdir=True, + add_unfinished_marker=True, + ) # incomplete last + + restored_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) + + with pytest.raises(Exception): + exp_manager( + restored_trainer, + {"resume_if_exists": True, "resume_ignore_no_checkpoint": True, "explicit_log_dir": str(test_dir)}, + ) diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 955367cb7581..3476f1361809 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -116,12 +116,42 @@ def test_resume(self, trainer, tmp_path): dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), resume_if_exists=True, ).setup(trainer) + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end").rmdir() - ## if there are multiple "-last" checkpoints, choose the most recent one + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end").mkdir() + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end-unfinished").touch() + # Error because *end.ckpt is unfinished, should raise an error despite resume_ignore_no_checkpoint=True + with pytest.raises(ValueError): + nl.AutoResume( + dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), + resume_if_exists=True, + resume_past_end=True, + resume_ignore_no_checkpoint=True, + ).setup(trainer) Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end").rmdir() + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end-unfinished").unlink() + + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last").mkdir() + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last-unfinished").touch() + # Error because *last.ckpt is unfinished, should raise an error despite resume_ignore_no_checkpoint=True + with pytest.raises(ValueError): + nl.AutoResume( + dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + ).setup(trainer) + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last").rmdir() + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last-unfinished").unlink() + + ## if there are multiple "-last" checkpoints, choose the most recent one Path(tmp_path / 
"test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last").mkdir() time.sleep(1) ## sleep for a second so the checkpoints are created at different times Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last").mkdir() + time.sleep(1) + # unfinished last, that should be ignored + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel3--last").mkdir() + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel3--last-unfinished").touch() + nl.AutoResume( dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), resume_if_exists=True, From cdf61f902267d03954f846313bbb19b5db628967 Mon Sep 17 00:00:00 2001 From: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Date: Thu, 29 Aug 2024 12:32:51 -0700 Subject: [PATCH 075/664] allow disabling validation (#10273) Signed-off-by: Maanu Grover --- nemo/collections/llm/gpt/data/pre_training.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py index b2d9b5ba8cca..6b0f6c63195e 100644 --- a/nemo/collections/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -139,7 +139,11 @@ def setup(self, stage: str = "") -> None: num_val_samples = int(eval_iters * self.data_sampler.global_batch_size) num_test_samples = int(test_iters * self.data_sampler.global_batch_size) - if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): + if ( + self.trainer.limit_val_batches > 0.0 + and self.trainer.limit_val_batches <= 1.0 + and isinstance(self.trainer.limit_val_batches, float) + ): assert "blend" not in self.build_kwargs, ( "When using a single data distribution, limit_val_batches <= 1.0 is not supported. If you'd " "like to run with a fractional value of limit_val_batches, please pass in separate datasets for " From 736a6fc381abf85f09bf8a8941fd9935dca45bf5 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Thu, 29 Aug 2024 22:36:40 +0300 Subject: [PATCH 076/664] make torch_dist ckpt strategy as default (#9852) (#10291) copy of #9852 Signed-off-by: dimapihtar Signed-off-by: dimapihtar Co-authored-by: dimapihtar --- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +- nemo/lightning/io/pl.py | 7 +++++++ nemo/utils/callbacks/dist_ckpt_io.py | 9 ++++++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 95cb1dcf48ec..388c95d7d3d7 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -175,7 +175,7 @@ model: fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. # Distributed checkpoint setup - dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. 
dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU dist_ckpt_parallel_save: True # if true, each worker will write its own part of the dist checkpoint dist_ckpt_parallel_save_within_dp: False # if true, save will be parallelized only within a DP group (whole world otherwise), which might slightly reduce the save overhead diff --git a/nemo/lightning/io/pl.py b/nemo/lightning/io/pl.py index f43d24792c1a..f87f15223720 100644 --- a/nemo/lightning/io/pl.py +++ b/nemo/lightning/io/pl.py @@ -217,6 +217,13 @@ def _determine_dist_ckpt_save_strategy(self): are passed in config or in case of a fully parallel save in which case a parallelization wrapper is applied. """ + if self.save_ckpt_format == 'zarr': + logging.warning( + f'`zarr` distributed checkpoint backend is deprecated.' + f' Distributed optimizer checkpoint saving might be extremely slow.' + f' Please switch to PyTorch Distributed format (model.dist_ckpt_format=torch_dist).' + ) + if self.async_save and self.save_ckpt_format != 'torch_dist': raise ValueError('Async dist-ckpt save supported only for torch_dist format') diff --git a/nemo/utils/callbacks/dist_ckpt_io.py b/nemo/utils/callbacks/dist_ckpt_io.py index 437c8b0c5887..091075488878 100644 --- a/nemo/utils/callbacks/dist_ckpt_io.py +++ b/nemo/utils/callbacks/dist_ckpt_io.py @@ -242,7 +242,7 @@ def from_config(cls, model_cfg: dict, async_save: bool = False): it should be provided separately. Defaults to False. """ return cls( - save_ckpt_format=model_cfg.get('dist_ckpt_format', 'zarr'), + save_ckpt_format=model_cfg.get('dist_ckpt_format', 'torch_dist'), load_directly_on_device=model_cfg.get('dist_ckpt_load_on_device', True), load_strictness=model_cfg.get('dist_ckpt_load_strictness', None), async_save=async_save, @@ -390,6 +390,13 @@ def _determine_dist_ckpt_save_strategy(self): are passed in config or in case of a fully parallel save in which case a parallelization wrapper is applied. """ + if self.save_ckpt_format == 'zarr': + logging.warning( + f'`zarr` distributed checkpoint backend is deprecated.' + f' Distributed optimizer checkpoint saving might be extremely slow.' + f' Please switch to PyTorch Distributed format (model.dist_ckpt_format=torch_dist).' 
+ ) + if self.async_save and self.save_ckpt_format != 'torch_dist': raise ValueError('Async dist-ckpt save supported only for torch_dist format') From ea0f69f7dce15540311060e7e9f0b99881542c98 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Thu, 29 Aug 2024 21:47:07 +0200 Subject: [PATCH 077/664] TRT-LLM 0.12 + ModelOpt 0.17.0 updates (#10301) * Update trtllm-build options Signed-off-by: Jan Lasek * Pull QUANT_CFG_CHOICES into try/catch for HAVE_MODELOPT consistency Signed-off-by: Jan Lasek * Remove deprecated parallel group setup Signed-off-by: Jan Lasek * Remove deprecated size settings Signed-off-by: Jan Lasek * Use max_seq_len instead of max_output_len [part I] Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek --- nemo/export/quantize/quantizer.py | 21 ++++++++----------- nemo/export/tensorrt_llm.py | 19 ++++++++++++----- .../trt_llm/qnemo/qnemo_to_tensorrt_llm.py | 10 ++++----- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 590cf50c804c..de5b07787a1f 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -31,7 +31,15 @@ try: import modelopt.torch.quantization as mtq from modelopt.torch.export import export_tensorrt_llm_checkpoint - from modelopt.torch.utils.distributed import set_data_parallel_group, set_tensor_parallel_group + + QUANT_CFG_CHOICES = { + "int8": mtq.INT8_DEFAULT_CFG, + "int8_sq": mtq.INT8_SMOOTHQUANT_CFG, + "fp8": mtq.FP8_DEFAULT_CFG, + "int4_awq": mtq.INT4_AWQ_CFG, + "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG, + "int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, + } HAVE_MODELOPT = True @@ -41,14 +49,6 @@ SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers -QUANT_CFG_CHOICES = { - "int8": mtq.INT8_DEFAULT_CFG, - "int8_sq": mtq.INT8_SMOOTHQUANT_CFG, - "fp8": mtq.FP8_DEFAULT_CFG, - "int4_awq": mtq.INT4_AWQ_CFG, - "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG, - "int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, -} class Quantizer: @@ -157,9 +157,6 @@ def dummy(): model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) model.trainer.strategy.setup_environment() - set_data_parallel_group(mpu.get_data_parallel_group()) - set_tensor_parallel_group(mpu.get_tensor_model_parallel_group()) - @staticmethod def modify_model_config(model_cfg: DictConfig) -> DictConfig: """Modify model config for quantization.""" diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 0e851ffec2af..1e06d0fdb8b9 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -148,7 +148,7 @@ def export( pipeline_parallelism_size: int = 1, gpus_per_node: Optional[int] = None, max_input_len: int = 256, - max_output_len: int = 256, + max_output_len: Optional[int] = 256, max_input_token: Optional[int] = None, max_output_token: Optional[int] = None, max_batch_size: int = 8, @@ -169,6 +169,7 @@ def export( multiple_profiles: bool = False, gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", + reduce_fusion: bool = True, fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None, ): @@ -201,10 +202,11 @@ def export( max_lora_rank (int): maximum lora rank. max_num_tokens (int): opt_num_tokens (int): - max_seq_len (int): + max_seq_len (int): the maximum sequence length of a single request. multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto" gemm_plugin (str): enable the gpt plugin. 
Default = "auto" + reduce_fusion (bool): enables fusing extra kernels after custom TRT-LLM allReduce fp8_quantized (Optional[bool]): enables exporting to FP8 TRT-LLM checkpoints. If not set, autodetects the type. fp8_kvcache (Optional[bool]): enables FP8 KV-cache quantization. If not set, autodetects the type. """ @@ -257,8 +259,14 @@ def export( ) max_output_len = max_output_token - if max_seq_len is None: - max_seq_len = max_input_len + max_output_len + if max_output_len is not None: + warnings.warn( + "Parameter max_output_len is deprecated and will be removed. Please use max_seq_len instead.", + DeprecationWarning, + stacklevel=2, + ) + if max_seq_len is None: + max_seq_len = max_input_len + max_output_len if max_batch_size < 4: warnings.warn( @@ -284,7 +292,6 @@ def export( nemo_checkpoint_path=nemo_checkpoint_path, engine_dir=self.model_dir, max_input_len=max_input_len, - max_output_len=max_output_len, max_seq_len=max_seq_len, max_batch_size=max_batch_size, max_prompt_embedding_table_size=max_prompt_embedding_table_size, @@ -292,6 +299,7 @@ def export( pipeline_parallel_size=pipeline_parallelism_size, use_parallel_embedding=use_parallel_embedding, paged_kv_cache=paged_kv_cache, + paged_context_fmha=paged_context_fmha, remove_input_padding=remove_input_padding, use_lora_plugin=use_lora_plugin, lora_target_modules=lora_target_modules, @@ -299,6 +307,7 @@ def export( max_num_tokens=max_num_tokens, opt_num_tokens=opt_num_tokens, multiple_profiles=multiple_profiles, + reduce_fusion=reduce_fusion, ) else: if model_type is None: diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py index 48127a507a58..47d6b635c14d 100644 --- a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py +++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py @@ -27,7 +27,6 @@ def qnemo_to_tensorrt_llm( nemo_checkpoint_path: str, engine_dir: str, max_input_len: int, - max_output_len: int, max_seq_len: Optional[int], max_batch_size: int, max_prompt_embedding_table_size: int, @@ -35,6 +34,7 @@ def qnemo_to_tensorrt_llm( pipeline_parallel_size: Optional[int] = None, use_parallel_embedding: bool = False, paged_kv_cache: bool = True, + paged_context_fmha: bool = False, remove_input_padding: bool = True, use_lora_plugin: Optional[str] = None, lora_target_modules: Optional[List[str]] = None, @@ -43,6 +43,7 @@ def qnemo_to_tensorrt_llm( opt_num_tokens: Optional[int] = None, max_beam_width: int = 1, multiple_profiles: bool = False, + reduce_fusion: bool = True, ): """Build TensorRT-LLM engine with trtllm-build command in a subprocess.""" assert not lora_target_modules, f"LoRA is not supported for quantized checkpoints, got {lora_target_modules}" @@ -82,17 +83,16 @@ def qnemo_to_tensorrt_llm( build_cmd += f"--workers {num_build_workers} " build_cmd += f"--max_batch_size {max_batch_size} " build_cmd += f"--max_input_len {max_input_len} " - build_cmd += f"--max_output_len {max_output_len} " build_cmd += f"--max_beam_width {max_beam_width} " - build_cmd += f"--tp_size {config.mapping.tp_size} " - build_cmd += f"--pp_size {config.mapping.pp_size} " build_cmd += f"--max_prompt_embedding_table_size {max_prompt_embedding_table_size} " build_cmd += f"--builder_opt {builder_opt} " build_cmd += f"--gpt_attention_plugin {config.dtype} " build_cmd += f"--nccl_plugin {config.dtype} " build_cmd += f"--paged_kv_cache {'enable' if paged_kv_cache else 'disable'} " + build_cmd += f"--use_paged_context_fmha {'enable' if paged_context_fmha else 'disable'} " build_cmd += 
f"--remove_input_padding {'enable' if remove_input_padding else 'disable'} " build_cmd += f"--multiple_profiles {'enable' if multiple_profiles else 'disable'} " + build_cmd += f"--reduce_fusion {'enable' if reduce_fusion else 'disable'} " if use_fused_mlp: build_cmd += "--use_fused_mlp " if "RecurrentGemma" not in config.architecture else "" @@ -100,7 +100,7 @@ def qnemo_to_tensorrt_llm( if not use_qdq: build_cmd += f"--gemm_plugin {config.dtype} " - if max_seq_len: + if max_seq_len is not None: build_cmd += f"--max_seq_len {max_seq_len} " if max_num_tokens is not None: From eff7ddd03c78c60fbc42dfcb53fd761507098965 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Thu, 29 Aug 2024 23:09:45 +0300 Subject: [PATCH 078/664] add documentation for reset_lr feature (#9639) (#10290) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Change default parallel_save to False (#9633) * Unwrap ckpt_io for model opt (async save) (#9622) (#9634) * add reset_lr documentation * fix style * fix style * fix style * add image * fix typo * fix plot * fix plot * change plot size * fix style * move image * add reset_lr to intro page --------- Signed-off-by: Mikołaj Błaż Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: mikolajblaz --- docs/source/nlp/nemo_megatron/intro.rst | 1 + .../nlp/nemo_megatron/reset_learning_rate.rst | 30 +++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 docs/source/nlp/nemo_megatron/reset_learning_rate.rst diff --git a/docs/source/nlp/nemo_megatron/intro.rst b/docs/source/nlp/nemo_megatron/intro.rst index 65aaee2add6a..831edc4bbd42 100644 --- a/docs/source/nlp/nemo_megatron/intro.rst +++ b/docs/source/nlp/nemo_megatron/intro.rst @@ -20,6 +20,7 @@ To learn more about using NeMo to train Large Language Models at scale, please r peft/landing_page positional_embeddings mcore_customization + reset_learning_rate rampup_batch_size diff --git a/docs/source/nlp/nemo_megatron/reset_learning_rate.rst b/docs/source/nlp/nemo_megatron/reset_learning_rate.rst new file mode 100644 index 000000000000..f89daeeb3907 --- /dev/null +++ b/docs/source/nlp/nemo_megatron/reset_learning_rate.rst @@ -0,0 +1,30 @@ +.. _reset_learning_rate: + +Reset Learning Rate +------------------- + +The reset learning rate feature provides the ability to reset the learning rate for an existing checkpoint to its initial value (either 0 or ``optim.min_lr`` depending on the warmup steps) when performing continual pretraining. + +Parameters +---------- + +* ``reset_lr`` (boolean): Enables resetting the learning rate to the initial value. This feature is only supported with the distributed optimizer and megatron_amp_O2. +* ``reset_lr_steps`` (boolean): Enables adjusting the learning rate's max_steps and decay_steps by subtracting the number of steps already completed at the checkpoint. + +Use Cases +--------- + +1. ``reset_lr=True, reset_lr_steps=False`` +When pretraining an existing checkpoint "from scratch" on a different dataset. The learning rate will be reset to its initial value. This allows the model to start training on a new dataset with the same learning rate dynamics as if it were starting from scratch. + +2. ``reset_lr=True, reset_lr_steps=True`` +When continuing training from an existing checkpoint with the same configuration. 
The learning rate will be reset to its initial value, and the ``max_steps`` and ``decay_steps`` for learning rate schedule will be recalculated by subtracting the number of steps already completed at the checkpoint. Specifically: + * ``max_steps`` will be recalculated as ``max_steps -= completed_steps``. + * ``decay_steps`` will be recalculated as ``decay_steps -= completed_steps``. +This ensures that the learning rate reaches the ``min_lr`` value by the end of training without changing the ``trainer.max_steps``: + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/asset-post-reset-learning-rate-example.png + :alt: + :width: 1080px + + From 3ebe56787c628bc777eb390af0a548586b5468e6 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Thu, 29 Aug 2024 13:13:39 -0700 Subject: [PATCH 079/664] [NeMo UX] expose `num_dataset_builder_threads` argument (#10281) * expose num_dataset_builder_threads arg Signed-off-by: ashors1 * upate docstring Signed-off-by: ashors1 --------- Signed-off-by: ashors1 --- nemo/collections/llm/gpt/data/pre_training.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py index 6b0f6c63195e..919a9b52b4bb 100644 --- a/nemo/collections/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -54,6 +54,7 @@ class PreTrainingDataModule(pl.LightningDataModule, IOMixin): split (str): A string of 3 comma-separated integers denoting how much of the distribution to allocate to train, validation, and test sets, respectively. Unused if ``paths`` is a dict. index_mapping_dir (Optional[str]): Path to a directory to write index mapping files. + num_dataset_builder_threads (int): The number of threads to use for dataset building. 
""" def __init__( @@ -73,6 +74,7 @@ def __init__( seed: int = 1234, split: str = "900,50,50", index_mapping_dir: Optional[str] = None, + num_dataset_builder_threads: int = 1, ) -> None: super().__init__() if not isinstance(paths, (list, tuple, dict)): @@ -110,6 +112,7 @@ def __init__( self.seed = seed self.split = split self.index_mapping_dir = index_mapping_dir + self.num_dataset_builder_threads = num_dataset_builder_threads self.init_global_step = 0 from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer @@ -218,6 +221,7 @@ def gpt_dataset_config(self) -> "GPTDatasetConfig": reset_position_ids=self.reset_position_ids, reset_attention_mask=self.reset_attention_mask, eod_mask_loss=self.eod_mask_loss, + num_dataset_builder_threads=self.num_dataset_builder_threads, **self.build_kwargs, ) From d0128dab39e7f43b81ae103d7616beb84da83dd4 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Thu, 29 Aug 2024 14:22:14 -0700 Subject: [PATCH 080/664] Disable SP (#10282) Signed-off-by: Alexandros Koumparoulis --- scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py index c50267ef6b42..ba9012de01a8 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py @@ -79,6 +79,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None: model_config = MegatronGPTModel.restore_from(in_file, trainer=dummy_trainer, return_config=True) model_config.tensor_model_parallel_size = 1 model_config.pipeline_model_parallel_size = 1 + model_config.sequence_parallel = False if cpu_only: map_location = torch.device('cpu') model_config.use_cpu_initialization = True From 81f18f6f386a8bf3f0091fcaacd50f0f99d34f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 29 Aug 2024 16:10:23 -0700 Subject: [PATCH 081/664] ci: Selective triggering (#10195) * ci: Selective triggering Signed-off-by: Oliver Koenig * simplify pass-through Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * refactor tests that dont use template Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 430 +++++++++++++++++++------------- 1 file changed, 261 insertions(+), 169 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 8100d95ae2a3..1f49f891b85e 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -35,39 +35,47 @@ jobs: pre-flight: runs-on: ubuntu-latest outputs: - test_to_run: ${{ steps.main.outputs.test_to_run }} + test_to_run: ${{ steps.test_to_run.outputs.main }} + all: ${{ steps.all.outputs.main }} steps: - name: Parse test_to_run - id: main + id: test_to_run run: | - parsed_string=$(echo ${{ inputs.test_to_run }} | jq -c --raw-input 'split(",")') - echo "test_to_run=${parsed_string}" >> "$GITHUB_ENV" + parsed_string=$(echo ${{ inputs.test_to_run || 'all' }} | jq -c --raw-input 'split(",")') + echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT" + - name: Parse all + id: all + run: | + echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT" gpu-test: needs: [pre-flight] runs-on: self-hosted-azure - if: ${{ github.event.label.name == 'Run CICD' }} + if: 
${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }} steps: - name: Run nvidia-smi test run: | whoami nvidia-smi + cicd-cluster-clean: runs-on: self-hosted-azure-builder needs: [pre-flight] - if: ${{ github.event.label.name == 'Run CICD' }} + if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }} steps: - name: Clean server from old files run: | docker container prune --filter "until=24h" --force docker image prune -a --filter "until=24h" --force - cicd-test-container-setup: - needs: [cicd-cluster-clean] + needs: [cicd-cluster-clean, pre-flight] runs-on: self-hosted-azure-builder - if: ${{ github.event.label.name == 'Run CICD' }} + if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }} + outputs: + test_to_run: ${{ needs.pre-flight.outputs.test_to_run }} + all: ${{ needs.pre-flight.outputs.all }} steps: - name: Checkout repository uses: actions/checkout@v4 @@ -117,6 +125,7 @@ jobs: L0_Unit_Tests_GPU: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure TIMEOUT: 60 @@ -138,17 +147,19 @@ jobs: L0_Setup_Test_Data_And_Models: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Setup_Test_Data_And_Models') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | python -m tests.setup --save_dir /home/TestData/nlp - ## - name: L2: Multimodal Imagen Train + # - name: L2: Multimodal Imagen Train # L2: Community LLM Checkpoints tests L2_Community_LLM_Checkpoints_tests_Bert: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Bert') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -162,6 +173,7 @@ jobs: L2_Community_LLM_Checkpoints_tests_Mamba2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Mamba2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -177,6 +189,7 @@ jobs: L2_Community_LLM_Checkpoints_tests_Llama: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Llama') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -190,6 +203,7 @@ jobs: L2_Community_LLM_Checkpoints_tests_Llama3: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Llama3') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -204,6 +218,7 @@ jobs: L2_Community_LLM_Checkpoints_tests_StarCoder: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 
'L2_Community_LLM_Checkpoints_tests_StarCoder') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -219,6 +234,7 @@ jobs: L2_Community_LLM_Checkpoints_tests_Falcon: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Falcon') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -233,6 +249,7 @@ jobs: L2_Community_vita_Checkpoints_tests_Llama3: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_vita_Checkpoints_tests_Llama3') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -280,6 +297,7 @@ jobs: L2_PTQ_Llama2_Export_Only: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_PTQ_Llama2_Export_Only') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -293,6 +311,7 @@ jobs: L2_PTQ_Llama2_FP8: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_PTQ_Llama2_FP8') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -313,6 +332,7 @@ jobs: OPTIONAL_L2_PTQ_Llama2_INT8_SQ: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_PTQ_Llama2_INT8_SQ') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure TIMEOUT: 15 @@ -402,6 +422,7 @@ jobs: L2_Distill_Llama2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Distill_Llama2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -431,6 +452,7 @@ jobs: ASR_dev_run_Speech_to_Text: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -447,6 +469,7 @@ jobs: ASR_dev_run_Speech_to_Text_WPE_-_CitriNet: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text_WPE_-_CitriNet') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -466,6 +489,7 @@ jobs: ASR_dev_run_Speech_Pre-training_-_CitriNet: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_Pre-training_-_CitriNet') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -483,6 +507,7 @@ jobs: ASR_dev_run_Speech_To_Text_Finetuning: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_To_Text_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -538,6 +563,7 @@ jobs: ASR_dev_run_Speech_to_Text_WPE_-_Conformer: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text_WPE_-_Conformer') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -560,6 +586,7 @@ jobs: ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -582,6 +609,7 @@ jobs: L2_Speech_to_Text_EMA: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_to_Text_EMA') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -599,6 +627,7 @@ jobs: L2_Speech_to_Text_AED: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_to_Text_AED') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -640,6 +669,7 @@ jobs: L2_Speaker_dev_run_Speaker_Recognition: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Recognition') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -660,6 +690,7 @@ jobs: L2_Speaker_dev_run_Speaker_Diarization: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Diarization') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -681,6 +712,7 @@ jobs: L2_Speaker_dev_run_Speech_to_Label: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Speech_to_Label') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -705,6 +737,7 @@ jobs: L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -724,6 +757,7 @@ jobs: L2_Speaker_dev_run_Clustering_Diarizer_Inference: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Clustering_Diarizer_Inference') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: 
self-hosted-azure SCRIPT: | @@ -742,6 +776,7 @@ jobs: L2_Speaker_dev_run_Neural_Diarizer_Inference: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Neural_Diarizer_Inference') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -757,6 +792,7 @@ jobs: L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -774,6 +810,7 @@ jobs: L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -792,6 +829,7 @@ jobs: L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -818,6 +856,7 @@ jobs: L2_ASR_Adapters_Linear_Adapters: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_ASR_Adapters_Linear_Adapters') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -838,6 +877,7 @@ jobs: L2_ASR_Adapters_RelPos_MHA_Adapters: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_ASR_Adapters_RelPos_MHA_Adapters') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -860,6 +900,7 @@ jobs: L2_Speech_Estimate_Duration_Bins: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Estimate_Duration_Bins') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -891,6 +932,7 @@ jobs: L2_Speech_Batch_Size_OOMptimizer: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Batch_Size_OOMptimizer') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -909,6 +951,7 @@ jobs: L2_Speech_Batch_Size_OOMptimizer_Canary: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Batch_Size_OOMptimizer_Canary') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -921,6 +964,7 @@ jobs: L2_Speech_Transcription_Speech_to_Text_Transcribe: needs: [cicd-test-container-setup] 
uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Transcription_Speech_to_Text_Transcribe') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -936,6 +980,7 @@ jobs: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_Full_Manifest') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -954,6 +999,7 @@ jobs: L2_Speech_Transcription_Canary_Transcribe_With_Prompt: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_With_Prompt') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -976,6 +1022,7 @@ jobs: L2_Speech_Transcription_Canary_Transcribe_Audio_Dir: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_Audio_Dir') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -996,6 +1043,7 @@ jobs: L2_Transducer_alignment_Running_pytest: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Transducer_alignment_Running_pytest') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -1005,6 +1053,7 @@ jobs: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -1025,6 +1074,7 @@ jobs: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -1046,6 +1096,7 @@ jobs: L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -1099,6 +1150,7 @@ jobs: L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | 
@@ -1125,6 +1177,7 @@ jobs: L2_Duplex_Text_Normalization_with_Tarred_dataset: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Duplex_Text_Normalization_with_Tarred_dataset') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -1154,6 +1207,7 @@ jobs: L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1172,6 +1226,7 @@ jobs: L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1296,6 +1351,7 @@ jobs: L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1314,6 +1370,7 @@ jobs: L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1338,6 +1395,7 @@ jobs: L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1354,6 +1412,7 @@ jobs: L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -1365,6 +1424,7 @@ jobs: L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -1386,6 +1446,7 @@ jobs: L2_Pretraining_BERT_pretraining_from_Text: needs: 
[cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Pretraining_BERT_pretraining_from_Text') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1415,6 +1476,7 @@ jobs: L2_Pretraining_BERT_from_Preprocessed: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Pretraining_BERT_from_Preprocessed') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1447,6 +1509,7 @@ jobs: L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1511,6 +1574,7 @@ jobs: L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1538,6 +1602,7 @@ jobs: L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1564,6 +1629,7 @@ jobs: L2_NMT_Attention_is_All_You_Need_Inference: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Inference') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -1579,6 +1645,7 @@ jobs: L2_NMT_Attention_is_All_You_Need_Finetuning: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1609,6 +1676,7 @@ jobs: L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1637,6 +1705,7 @@ jobs: L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -1655,6 +1724,7 @@ jobs: 
L2_Megatron_NMT_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_NMT_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -1752,6 +1822,7 @@ jobs: L2_Megatron_BART_Perceiver_MIM_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Perceiver_MIM_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2058,6 +2129,7 @@ jobs: L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2125,6 +2197,7 @@ jobs: L2_Megatron_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2196,6 +2269,7 @@ jobs: L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2265,6 +2339,7 @@ jobs: L2_Megatron_RETRO_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2321,6 +2396,7 @@ jobs: L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2485,6 +2561,7 @@ jobs: L2_RAG_Pipeline_Indexing: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_RAG_Pipeline_Indexing') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2502,6 +2579,7 @@ jobs: L2_RAG_Pipeline_Generating: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_RAG_Pipeline_Generating') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2519,6 +2597,7 @@ jobs: L2_BioMegatron_Bert_NER_Task: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_BioMegatron_Bert_NER_Task') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2533,132 +2612,120 @@ jobs: L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-2-h100 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - env: - # This is to improve p2p overlap on H100 - NVTE_FWD_LAYERNORM_SM_MARGIN: 8 - NVTE_BWD_LAYERNORM_SM_MARGIN: 8 - TORCH_NCCL_AVOID_RECORD_STREAMS: 1 - NCCL_MIN_NCHANNELS: 4 - # TP overlap is not supported in docker environment - #NVTE_UB_SPLIT_RS: 0 - #NVTE_UB_ATOMIC_GEMM_RS: 1 - #NVTE_RS_STRIDED_ATOMIC: 1 - #NVTE_UB_FP8_RS: 1 - # Increase p2p chunksize to 2MB - NCCL_P2P_NET_CHUNKSIZE: 2097152 - # Disable gc when switching to/from validation steps - NEMO_MANUAL_GC_IN_VALIDATION: 0 - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - ++model.transformer_engine=True \ - ++model.fp8=True \ - ++model.fp8_hybrid=True \ - ++model.fp8_amax_history_len=1024 \ - ++model.fp8_amax_compute_algo=max \ - ++model.reduce_amax=True \ - ++model.use_te_rng_tracker=True \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.ub_tp_comm_overlap=False \ - model.tensor_model_parallel_size=2 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.validation_drop_last=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - ++model.transformer_engine=True \ - ++model.fp8=True \ - 
++model.fp8_hybrid=True \ - ++model.fp8_amax_history_len=1024 \ - ++model.fp8_amax_compute_algo=max \ - ++model.reduce_amax=True \ - ++model.use_te_rng_tracker=True \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.ub_tp_comm_overlap=False \ - model.tensor_model_parallel_size=2 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.validation_drop_last=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure-gpus-2-h100 + SCRIPT: | + # This is to improve p2p overlap on H100 + export NVTE_FWD_LAYERNORM_SM_MARGIN=8 + export NVTE_BWD_LAYERNORM_SM_MARGIN=8 + export TORCH_NCCL_AVOID_RECORD_STREAMS=1 + export NCCL_MIN_NCHANNELS=4 + # TP overlap is not supported in docker environment + #NVTE_UB_SPLIT_RS: 0 + #NVTE_UB_ATOMIC_GEMM_RS: 1 + #NVTE_RS_STRIDED_ATOMIC: 1 + #NVTE_UB_FP8_RS: 1 + # Increase p2p chunksize to 2MB + export NCCL_P2P_NET_CHUNKSIZE=2097152 + # Disable gc when switching to/from validation steps + export NEMO_MANUAL_GC_IN_VALIDATION=0 - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + ++model.transformer_engine=True \ + ++model.fp8=True \ + ++model.fp8_hybrid=True \ + ++model.fp8_amax_history_len=1024 \ + ++model.fp8_amax_compute_algo=max \ + ++model.reduce_amax=True \ + ++model.use_te_rng_tracker=True \ + ++model.name=megatron_gpt_full_te_layer_autocast \ + model.ub_tp_comm_overlap=False \ + model.tensor_model_parallel_size=2 \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + 
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + ++model.transformer_engine=True \ + ++model.fp8=True \ + ++model.fp8_hybrid=True \ + ++model.fp8_amax_history_len=1024 \ + ++model.fp8_amax_compute_algo=max \ + ++model.reduce_amax=True \ + ++model.use_te_rng_tracker=True \ + ++model.name=megatron_gpt_full_te_layer_autocast \ + model.ub_tp_comm_overlap=False \ + model.tensor_model_parallel_size=2 \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2833,6 +2900,7 @@ jobs: L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2902,6 +2970,7 @@ jobs: 
L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2982,6 +3051,7 @@ jobs: L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3063,6 +3133,7 @@ jobs: L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | @@ -3173,6 +3244,7 @@ jobs: L2_Megatron_GPT_Finetuning_PP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Finetuning_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3241,6 +3313,7 @@ jobs: L2_Megatron_GPT_Finetuning_StarCoder_PP1: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Finetuning_StarCoder_PP1') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3272,48 +3345,35 @@ jobs: L2_Megatron_GPT_Reranker: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - rm -rf /home/TestData/nlp/megatron_ir/working_dir - - python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \ - exp_manager.exp_dir="/home/TestData/nlp/megatron_ir/working_dir" \ - model.global_batch_size=4 \ - model.micro_batch_size=4 \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.max_epochs=null \ - trainer.max_steps=20 \ - trainer.val_check_interval=10 \ - model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ - model.peft.lora_tuning.adapter_dim=8 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \ - model.data.validation_ds.write_embeddings_to_file=True \ - model.data.validation_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/val_embs" \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] - - - rm -rf /home/TestData/nlp/megatron_ir/working_dir - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 
'L2_Megatron_GPT_Reranker') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir + + python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \ + exp_manager.exp_dir="/home/TestData/nlp/megatron_ir/working_dir" \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.max_epochs=null \ + trainer.max_steps=20 \ + trainer.val_check_interval=10 \ + model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \ + model.data.validation_ds.write_embeddings_to_file=True \ + model.data.validation_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/val_embs" \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Embedding') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3355,6 +3415,7 @@ jobs: L2_Megatron_GPT_PEFT_Lora_PP2_O2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_PP2_O2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3407,6 +3468,7 @@ jobs: L2_Megatron_GPT_PEFT_Lora_TP2_O1: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_TP2_O1') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3456,6 +3518,7 @@ jobs: L2_Megatron_GPT_PEFT_Lora_TP2SP1: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_TP2SP1') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | @@ -3512,6 +3575,7 @@ jobs: L2_Megatron_GPT_Eval: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3525,6 +3589,7 @@ jobs: L2_Megatron_GPT_Eval_PP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Eval_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3540,6 +3605,7 @@ jobs: L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | 
@@ -3591,6 +3657,7 @@ jobs: L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3607,6 +3674,7 @@ jobs: L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3623,6 +3691,7 @@ jobs: L2_Megatron_T5_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3719,6 +3788,7 @@ jobs: L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3793,6 +3863,7 @@ jobs: L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3887,6 +3958,7 @@ jobs: L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3981,6 +4053,7 @@ jobs: L2_Megatron_T5_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4049,6 +4122,7 @@ jobs: L2_Megatron_T5_w_Mixture_of_Expert_Pretraining: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_w_Mixture_of_Expert_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4090,6 +4164,7 @@ jobs: L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4168,6 +4243,7 @@ jobs: L2_Megatron_T5_Eval: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4179,6 +4255,7 @@ jobs: L2_Megatron_Core_T5_Eval: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_T5_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4190,6 +4267,7 @@ jobs: L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4257,6 +4335,7 @@ jobs: L2_Megatron_BART_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4329,6 +4408,7 @@ jobs: L2_Megatron_T5_PEFT_Lora_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_PEFT_Lora_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4380,6 +4460,7 @@ jobs: L2_Megatron_Core_T5_PEFT_Lora_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_T5_PEFT_Lora_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4432,6 +4513,7 @@ jobs: L2_Megatron_Mock_Data_Generation_MockGPTDataset: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Mock_Data_Generation_MockGPTDataset') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4447,6 +4529,7 @@ jobs: L2_Megatron_Mock_Data_Generation_MockT5Dataset: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Mock_Data_Generation_MockT5Dataset') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4464,6 +4547,7 @@ jobs: L2_TTS_Fast_dev_runs_1_Tacotron_2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Tacotron_2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -4489,6 +4573,7 @@ jobs: 
L2_TTS_Fast_dev_runs_1_WaveGlow: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_WaveGlow') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4510,6 +4595,7 @@ jobs: L2_TTS_Fast_dev_runs_1_FastPitch: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_FastPitch') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4583,6 +4669,7 @@ jobs: L2_TTS_Fast_dev_runs_1_Mixer-TTS: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Mixer-TTS') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4608,6 +4695,7 @@ jobs: L2_TTS_Fast_dev_runs_1_Hifigan: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Hifigan') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4659,6 +4747,7 @@ jobs: Speech_Checkpoints_tests: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Speech_Checkpoints_tests') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure TIMEOUT: 20 @@ -4674,6 +4763,7 @@ jobs: OPTIONAL_L2_Stable_Diffusion_Training: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_Stable_Diffusion_Training') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -4724,6 +4814,7 @@ jobs: L2_NeMo_2_GPT_Pretraining_no_transformer_engine: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_Pretraining_no_transformer_engine') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -4756,6 +4847,7 @@ jobs: L2_NeMo_2_GPT_DDP_Param_Parity_check: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_DDP_Param_Parity_check') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | From 4d5f1aa9ca1ddfc196b50a1fefaba398c50637a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 29 Aug 2024 22:33:23 -0700 Subject: [PATCH 082/664] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let'?= =?UTF-8?q?s=20bump=20`Dockerfile.ci`=20to=209ab31cb=20!=20(#10311)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index 275aaecb95f0..43f137bf0c89 100644 --- 
a/Dockerfile.ci
+++ b/Dockerfile.ci
@@ -34,7 +34,7 @@ WORKDIR /workspace
 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
 ARG MODELOPT_VERSION=0.15.0
-ARG MCORE_TAG=34e607ef41cf1c0ed481a678df9c76952d0ec00c
+ARG MCORE_TAG=9ab31cbd6265f83640008801e1c3efbf80892cea
 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
 RUN \
   --mount=type=bind,source=requirements,target=requirements \

From b5d1d5f662b09d719cd2d17d366339409abb84eb Mon Sep 17 00:00:00 2001
From: Maanu Grover <109391026+maanug-nv@users.noreply.github.com>
Date: Fri, 30 Aug 2024 00:07:16 -0700
Subject: [PATCH 083/664] Log Gradient Norms (#10244)

* override optimizer step to log

Signed-off-by: Maanu Grover

* import fix

Signed-off-by: Maanu Grover

* remove norm from output

Signed-off-by: Maanu Grover

* also log num zeros in grad

Signed-off-by: Maanu Grover

* bugfix

Signed-off-by: Maanu Grover

---------

Signed-off-by: Maanu Grover
---
 nemo/core/optim/mcore_optim.py               |  4 +--
 .../pytorch/strategies/megatron_strategy.py | 34 ++++++++++++++++++-
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/nemo/core/optim/mcore_optim.py b/nemo/core/optim/mcore_optim.py
index c058da52a97a..09b6a290d558 100644
--- a/nemo/core/optim/mcore_optim.py
+++ b/nemo/core/optim/mcore_optim.py
@@ -70,9 +70,9 @@ def step(self, closure):
             loss = closure()
 
         # return unused update_successful, grad_norm, num_zeros_in_grad
-        self.mcore_optimizer.step()
+        _, grad_norm, num_zeros_in_grad = self.mcore_optimizer.step()
 
-        return loss
+        return loss, grad_norm, num_zeros_in_grad
 
     # Promote state so it can be retrieved or set via
     # "optimizer_instance.state"
diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
index e13b603b127d..e719c50d8a50 100644
--- a/nemo/lightning/pytorch/strategies/megatron_strategy.py
+++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -7,7 +7,20 @@
 from contextlib import ExitStack, contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Dict, List, Literal, Mapping, Optional, TypeVar, Union, cast
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ContextManager,
+    Dict,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    TypeVar,
+    Union,
+    cast,
+)
 
 import pytorch_lightning as pl
 import torch
@@ -29,6 +42,7 @@
 from torch.utils.data import DataLoader
 from typing_extensions import override
 
+from nemo.core.optim.mcore_optim import McoreDistributedOptimizer
 from nemo.lightning import _strategy_lib, io
 from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction
 from nemo.lightning.pytorch.callbacks import ModelTransform
@@ -472,6 +486,24 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP
 
             return out
 
+    @override
+    def optimizer_step(
+        self,
+        optimizer: torch.optim.Optimizer,
+        closure: Callable[[], Any],
+        model: Optional[Union["pl.LightningModule", nn.Module]] = None,
+        **kwargs: Any,
+    ) -> Any:
+        optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
+
+        if isinstance(optimizer, McoreDistributedOptimizer):
+            optimizer_output, grad_norm, num_zeros_in_grad = optimizer_output
+            self.lightning_module.log('grad_norm', grad_norm, batch_size=1)
+            if num_zeros_in_grad is not None:
+                self.lightning_module.log('num_zeros_in_grad', num_zeros_in_grad, batch_size=1)
+
+        return optimizer_output
+
     @override
     def validation_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
         assert self.lightning_module is not None

From d886151a1cf805758ba248db921c05437998e4ca Mon Sep 17 00:00:00 2001
From: Alexey Panteleev
Date: Fri, 30 Aug 2024 03:02:35 -0700
Subject: [PATCH 084/664] Add support for LoRA on vLLM (#10009)

* Added basic support for adding LoRA checkpoints in HF format when running deploy_vllm_triton.py

Signed-off-by: Alexey Panteleev

* Moved the conversion logic from the convert_nemo_to_canonical.py script to a reusable module, removed the tar unpacking, removed the dependencies on OmegaConf and NLPSaveRestoreConnector.

Signed-off-by: Alexey Panteleev

* Implemented on-load conversion of Nemo format LoRA checkpoints into HF format for vLLM.

Signed-off-by: Alexey Panteleev

* Added logger initialization, improved some messages.

Signed-off-by: Alexey Panteleev

* Moved the LoRA converter script to nemo.export.utils.

Signed-off-by: Alexey Panteleev

* Fixed the description of the query.py script.

Signed-off-by: Alexey Panteleev

* Apply isort and black reformatting

Signed-off-by: apanteleev

* Fixed the missing file close.

Signed-off-by: Alexey Panteleev

---------

Signed-off-by: Alexey Panteleev
Signed-off-by: apanteleev
Co-authored-by: apanteleev
Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>
Co-authored-by: Eric Harper
---
 nemo/export/utils/__init__.py                |  13 +
 nemo/export/utils/lora_converter.py          | 223 ++++++++++++++++++
 nemo/export/vllm_exporter.py                 |  81 ++++++-
 .../convert_nemo_to_canonical.py             | 204 +---------------
 scripts/deploy/nlp/deploy_vllm_triton.py     |  63 ++---
 scripts/deploy/nlp/query.py                  |   2 +-
 6 files changed, 349 insertions(+), 237 deletions(-)
 create mode 100644 nemo/export/utils/__init__.py
 create mode 100644 nemo/export/utils/lora_converter.py

diff --git a/nemo/export/utils/__init__.py b/nemo/export/utils/__init__.py
new file mode 100644
index 000000000000..d9155f923f18
--- /dev/null
+++ b/nemo/export/utils/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo/export/utils/lora_converter.py b/nemo/export/utils/lora_converter.py
new file mode 100644
index 000000000000..530dea55370b
--- /dev/null
+++ b/nemo/export/utils/lora_converter.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import re
+import tarfile
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+import torch
+import yaml
+
+from nemo.export.tarutils import TarPath
+
+
+def replace_number_add_offset(key, offset_value):
+    # This function finds the layer number in the state dict key and adds a numeric offset to that number
+
+    if offset_value == 0:
+        return key
+
+    pattern = r'layers.(\d+)'
+
+    def add_offset(match):
+        return "layers." + str(int(match.group(1)) + offset_value)
+
+    return re.sub(pattern, add_offset, key)
+
+
+def rename_qkv_keys(key):
+    new_keys = []
+    new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.q_adapter."))
+    new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.k_adapter."))
+    new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.v_adapter."))
+    return new_keys
+
+
+def reformat_module_names_to_hf(tensors: Dict[str, torch.Tensor]) -> Tuple[Dict[str, torch.Tensor], List[str]]:
+    new_tensors = dict()
+    module_names = set()
+    known_module_names = ["q_proj", "k_proj", "v_proj", "o_proj", "down_proj", "gate_proj", "up_proj"]
+    for module_name, module_weight in tensors.items():
+        # map linear_in and linear_out to lora_a/lora_b counterparts
+        new_module_name = "base_model." + module_name.replace("linear_in", "lora_A").replace("linear_out", "lora_B")
+
+        # map target modules to their vLLM/HF counterparts
+        new_module_name = new_module_name.replace("q_adapter", "q_proj")
+        new_module_name = new_module_name.replace("k_adapter", "k_proj")
+        new_module_name = new_module_name.replace("v_adapter", "v_proj")
+        new_module_name = new_module_name.replace("lora_dense_attention_adapter", "o_proj")
+        new_module_name = new_module_name.replace("lora_4htoh_adapter", "down_proj")
+        new_module_name = new_module_name.replace("gate_adapter", "gate_proj")
+        new_module_name = new_module_name.replace("up_adapter", "up_proj")
+
+        # map other parts of the module names to fit vLLM/huggingface
+        new_module_name = new_module_name.replace(".adapter_layer", "")
+        new_module_name = new_module_name.replace(".lora_unfused_kqv_proj", "")
+        new_module_name = new_module_name.replace(".lora_unfused_hto4h_adapter", "")
+        new_module_name = new_module_name.replace("self_attention", "self_attn")
+        new_module_name = new_module_name.replace("decoder", "model")
+
+        new_tensors[new_module_name] = module_weight
+
+        # keep track of the modules that we've added to store them in the config file
+        for kmn in known_module_names:
+            if f'.{kmn}' in new_module_name:
+                module_names.add(kmn)
+
+    return (new_tensors, list(module_names))
+
+
+def convert_lora_weights_to_canonical(
+    config: Dict[str, Any], lora_weights: Dict[str, torch.Tensor]
+) -> Dict[str, torch.Tensor]:
+    """This function converts nemo style (fused) lora weights to canonical (unfused)
+    LoRA weights. Namely, it unfuses the QKV adapter layers and the H-to-4H adapter layers.
+
+    Returns:
+        Dict[str, torch.Tensor]: The new LoRA weights with unfused layers.
+ """ + + hidden_size = int(config["hidden_size"]) + num_heads = int(config["num_attention_heads"]) + head_size = hidden_size // num_heads + num_query_groups = int(config.get("num_query_groups", num_heads)) # num_kv_heads + + heads_per_group = num_heads // num_query_groups + qkv_total_dim = num_heads + 2 * num_query_groups + + adapter_size = config['peft']['lora_tuning']['adapter_dim'] + + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * group_idx, (heads_per_group + 2) * group_idx + heads_per_group) + for group_idx in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, heads_per_group + 2) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, heads_per_group + 2) + + qkv_keys_to_update = [] + hto4h_keys_to_update = [] + for key in lora_weights.keys(): + if "lora_kqv_adapter" in key: + qkv_keys_to_update.append(key) + if "lora_hto4h_adapter" in key: + hto4h_keys_to_update.append(key) + + # unfuse QKV layer + for key in qkv_keys_to_update: + if "linear_in" in key: + assert lora_weights[key].size(0) == adapter_size + for new_key in rename_qkv_keys(key): + lora_weights[new_key] = lora_weights[key] + assert len(lora_weights[new_key].size()) == 2 + elif "linear_out" in key: + assert lora_weights[key].size(1) == adapter_size + for new_key, size in zip(rename_qkv_keys(key), [q_slice, k_slice, v_slice]): + lora_weights[new_key] = ( + lora_weights[key] + .reshape((qkv_total_dim, head_size, adapter_size))[size] + .reshape((-1, adapter_size)) + ) + assert len(lora_weights[new_key].size()) == 2 + lora_weights.pop(key) + + # This maps to gate_up_proj in HF, but we need to split it up into gate_proj and up_proj + for key in hto4h_keys_to_update: + gate_proj_key = key.replace(".lora_hto4h_adapter.", ".lora_unfused_hto4h_adapter.gate_adapter.") + up_proj_key = key.replace(".lora_hto4h_adapter.", ".lora_unfused_hto4h_adapter.up_adapter.") + + module_weight = lora_weights[key] + if "linear_in" in key: + # lora_a gets duplicated + lora_weights[gate_proj_key] = module_weight + lora_weights[up_proj_key] = module_weight + elif "linear_out" in key: + # lora_b gets split + split_size = module_weight.shape[0] + gate_up_split = module_weight.split(split_size // 2) + lora_weights[gate_proj_key] = gate_up_split[0] + lora_weights[up_proj_key] = gate_up_split[1] + lora_weights.pop(key) + return lora_weights + + +def convert_lora_nemo_to_canonical(lora_nemo, save_path, hf_format=False, donor_hf_config=None): + with TarPath(lora_nemo) as archive: + with (archive / "model_config.yaml").open("r") as config_file: + lora_config = yaml.load(config_file, Loader=yaml.SafeLoader) + + tp_size = lora_config.get('tensor_model_parallel_size', 1) + pp_size = lora_config.get('pipeline_model_parallel_size', 1) + + lora_state_dict = [{}] * tp_size + + for pp in range(pp_size): + for tp in range(tp_size): + if tp_size == 1: + ckpt_file = archive / "model_weights.ckpt" + elif pp_size == 1: + ckpt_file = archive / f"mp_rank_{tp:02d}/model_weights.ckpt" + else: + ckpt_file = archive / f"tp_rank_{tp:02d}_pp_rank_{pp:03d}/model_weights.ckpt" + + with ckpt_file.open("rb") as f: + l = torch.load(f, map_location=torch.device('cpu')) + + if pp == 0: + lora_state_dict[tp] = l + else: + # calculate layer offset + layer_offset = lora_config['num_layers'] // pp_size * pp + for key, value in l.items(): + new_key = replace_number_add_offset(key, layer_offset) + lora_state_dict[tp][new_key] = value + + # TODO: currently suport tp=1 + lora_state_dict = lora_state_dict[0] + if 
lora_config['peft']['lora_tuning'].get('variant', 'nemo') == "nemo": + lora_config['peft']['lora_tuning']['variant'] = "canonical" + lora_state_dict = convert_lora_weights_to_canonical(lora_config, lora_state_dict) + + if hf_format: + lora_state_dict, target_modules = reformat_module_names_to_hf(lora_state_dict) + Path(save_path).mkdir(parents=True, exist_ok=True) + torch.save(lora_state_dict, f"{save_path}/adapter_model.bin") + if donor_hf_config is not None: + with open(donor_hf_config) as hf_config_file: + adapter_config = json.load(hf_config_file) + else: + adapter_config = {} + adapter_config['peft_type'] = "LORA" + adapter_config['r'] = lora_config['peft']['lora_tuning']['adapter_dim'] + adapter_config['lora_alpha'] = lora_config['peft']['lora_tuning']['alpha'] + adapter_config['target_modules'] = target_modules + with open(f"{save_path}/adapter_config.json", "w") as f: + json.dump(adapter_config, f, indent=4) + else: + with tempfile.TemporaryDirectory() as tmpdir: + with open(f"{tmpdir}/model_config.yaml", "w") as f: + yaml.dump(lora_config, f) + torch.save(lora_state_dict, f"{tmpdir}/model_weights.ckpt") + + dirname = os.path.dirname(save_path) + os.makedirs(dirname, exist_ok=True) + with tarfile.open(save_path, "w:") as tar: + tar.add(tmpdir, arcname=".") + + return lora_state_dict, lora_config diff --git a/nemo/export/vllm_exporter.py b/nemo/export/vllm_exporter.py index de06ea830e07..b37fccb3385d 100644 --- a/nemo/export/vllm_exporter.py +++ b/nemo/export/vllm_exporter.py @@ -19,11 +19,13 @@ import numpy import wrapt from vllm import RequestOutput, SamplingParams -from vllm.config import CacheConfig, DeviceConfig, LoadConfig, LoadFormat, ParallelConfig, SchedulerConfig +from vllm.config import CacheConfig, DeviceConfig, LoadConfig, LoadFormat, LoRAConfig, ParallelConfig, SchedulerConfig from vllm.executor.ray_utils import initialize_ray_cluster +from vllm.lora.request import LoRARequest from nemo.deploy import ITritonDeployable from nemo.deploy.utils import cast_output +from nemo.export.utils.lora_converter import convert_lora_nemo_to_canonical from nemo.export.vllm.engine import NemoLLMEngine from nemo.export.vllm.model_config import NemoModelConfig from nemo.export.vllm.model_loader import NemoModelLoader @@ -83,6 +85,7 @@ def export( tensor_parallel_size: int = 1, pipeline_parallel_size: int = 1, max_model_len: int = None, + lora_checkpoints: List[str] = [], dtype: str = 'auto', seed: int = 0, log_stats: bool = True, @@ -207,6 +210,11 @@ def export( model_loader_extra_config=None, ) + # Convert the LoRA checkpoints to vLLM compatible format and derive the configuration structure + lora_config = self._prepare_lora_checkpoints( + model_dir=model_dir, lora_checkpoints=lora_checkpoints, dtype=model_config.dtype + ) + # Initialize the cluster and specify the executor class. 
if device_config.device_type == "neuron": from vllm.executor.neuron_executor import NeuronExecutor @@ -239,7 +247,7 @@ def export( scheduler_config=scheduler_config, device_config=device_config, load_config=load_config, - lora_config=None, + lora_config=lora_config, multimodal_config=None, speculative_config=None, decoding_config=None, @@ -249,18 +257,57 @@ def export( log_stats=log_stats, ) + def _prepare_lora_checkpoints(self, model_dir: str, lora_checkpoints: List[str], dtype) -> LoRAConfig: + self.lora_checkpoints = [] + + if lora_checkpoints is None or len(lora_checkpoints) == 0: + return None + + index = 0 + max_lora_rank = 0 + for nemo_file in lora_checkpoints: + if not os.path.isfile(nemo_file): + raise FileNotFoundError(f"LoRA checkpoint file '{nemo_file} does not exist'") + + hf_lora_dir = os.path.join(model_dir, f'lora_{index}') + + LOGGER.info(f"Converting LoRA checkpoint '{nemo_file}' into '{hf_lora_dir}'...") + + _, lora_config = convert_lora_nemo_to_canonical(nemo_file, hf_lora_dir, hf_format=True) + self.lora_checkpoints.append(hf_lora_dir) + + rank = lora_config['peft']['lora_tuning']['adapter_dim'] + max_lora_rank = max(max_lora_rank, rank) + + index += 1 + + return LoRAConfig(max_lora_rank=max_lora_rank, max_loras=len(self.lora_checkpoints), lora_dtype=dtype) + def _add_request_to_engine( - self, prompt: str, max_output_len: int, temperature: float = 1.0, top_k: int = 1, top_p: float = 0.0 + self, + prompt: str, + max_output_len: int, + temperature: float = 1.0, + top_k: int = 1, + top_p: float = 0.0, + lora_uid: Optional[int] = None, ) -> str: if top_p <= 0.0: top_p = 1.0 sampling_params = SamplingParams(max_tokens=max_output_len, temperature=temperature, top_k=top_k, top_p=top_p) + if lora_uid is not None and lora_uid >= 0 and lora_uid < len(self.lora_checkpoints): + lora_request = LoRARequest( + lora_name=f'LoRA_{lora_uid}', lora_int_id=lora_uid + 1, lora_local_path=self.lora_checkpoints[lora_uid] + ) + else: + lora_request = None + request_id = str(self.request_id) self.request_id += 1 - self.engine.add_request(request_id, prompt, sampling_params) + self.engine.add_request(request_id, prompt, sampling_params, lora_request=lora_request) return request_id @@ -306,12 +353,18 @@ def _forward_streaming(self, request_ids: List[str]): yield [[response] for response in responses] def _add_triton_request_to_engine(self, inputs: numpy.ndarray, index: int) -> str: + if 'lora_uids' in inputs: + lora_uid = int(numpy.char.decode(inputs['lora_uids'][index][0], encoding="utf-8")) + else: + lora_uid = None + return self._add_request_to_engine( prompt=inputs['prompts'][index][0].decode('UTF-8'), max_output_len=inputs['max_output_len'][index][0], temperature=inputs['temperature'][index][0], top_k=inputs['top_k'][index][0], top_p=inputs['top_p'][index][0], + lora_uid=lora_uid, ) @property @@ -322,6 +375,7 @@ def get_triton_input(self): Tensor(name="top_k", shape=(-1,), dtype=numpy.int_, optional=True), Tensor(name="top_p", shape=(-1,), dtype=numpy.single, optional=True), Tensor(name="temperature", shape=(-1,), dtype=numpy.single, optional=True), + Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), ) return inputs @@ -394,9 +448,6 @@ def forward( if task_ids is not None and task_ids != []: raise NotImplementedError("task_ids is not supported") - if lora_uids is not None and lora_uids != []: - raise NotImplementedError("lora_uids is not supported") - if prompt_embeddings_table is not None: raise NotImplementedError("prompt_embeddings_table is not supported") @@ -407,9 
+458,21 @@ def forward( raise NotImplementedError("output_log_probs is not supported") request_ids = [] - for prompt in input_texts: + for index in range(len(input_texts)): + prompt = input_texts[index] + + if lora_uids is not None and index < len(lora_uids): + lora_uid = lora_uids[index] + else: + lora_uid = None + request_id = self._add_request_to_engine( - prompt=prompt, max_output_len=max_output_len, temperature=temperature, top_k=top_k, top_p=top_p + prompt=prompt, + max_output_len=max_output_len, + temperature=temperature, + top_k=top_k, + top_p=top_p, + lora_uid=lora_uid, ) request_ids.append(request_id) diff --git a/scripts/checkpoint_converters/lora_converters/convert_nemo_to_canonical.py b/scripts/checkpoint_converters/lora_converters/convert_nemo_to_canonical.py index 65a00fd56d22..36c16c21d0f7 100644 --- a/scripts/checkpoint_converters/lora_converters/convert_nemo_to_canonical.py +++ b/scripts/checkpoint_converters/lora_converters/convert_nemo_to_canonical.py @@ -28,206 +28,10 @@ --output_path ./canonical_style_lora_model.nemo \ --hf_format --hf_config checkpoints/bin/adapter_config.json """ -import json -import tempfile -from argparse import ArgumentParser -from pathlib import Path -from typing import Any, Dict - -import torch -from omegaconf import OmegaConf, open_dict -from scripts.nlp_language_modeling.merge_lora_weights.merge import replace_number_add_offset - -from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector - -target_map = { - "all": ["gate_proj", "o_proj", "up_proj", "down_proj", "k_proj", "q_proj", "v_proj"], - "attention_qkv": ["k_proj", "q_proj", "v_proj"], - "attention_dense": ["gate_proj", "o_proj", "up_proj"], -} - - -def rename_keys(key): - new_keys = [] - if "lora_kqv_adapter" in key: - new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.q_adapter.")) - new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.k_adapter.")) - new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.v_adapter.")) - elif "lora_hto4h_adapter" in key: - new_keys.append(key.replace(".lora_hto4h_adapter.", ".lora_unfused_hto4h_adapter.gate_adapter.")) - new_keys.append(key.replace(".lora_hto4h_adapter.", ".lora_unfused_hto4h_adapter.up_adapter.")) - return new_keys - - -def rename_qkv_keys(key): - new_keys = [] - new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.q_adapter.")) - new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.k_adapter.")) - new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.v_adapter.")) - return new_keys - - -def reformat_module_names_to_hf(tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: - new_tensors = dict() - for module_name, module_weight in tensors.items(): - # map linear_in and linear_out to lora_a/lora_b counterparts - new_module_name = "base_model." 
+ module_name.replace("linear_in", "lora_A").replace("linear_out", "lora_B") - - # map target modules to their vLLM/HF counterparts - new_module_name = new_module_name.replace("q_adapter", "q_proj") - new_module_name = new_module_name.replace("k_adapter", "k_proj") - new_module_name = new_module_name.replace("v_adapter", "v_proj") - new_module_name = new_module_name.replace("lora_dense_attention_adapter", "o_proj") - new_module_name = new_module_name.replace("lora_4htoh_adapter", "down_proj") - new_module_name = new_module_name.replace("gate_adapter", "gate_proj") - new_module_name = new_module_name.replace("up_adapter", "up_proj") - - # map other parts of the module names to fit vLLM/huggingface - new_module_name = new_module_name.replace(".adapter_layer", "") - new_module_name = new_module_name.replace(".lora_unfused_kqv_proj", "") - new_module_name = new_module_name.replace(".lora_unfused_hto4h_adapter", "") - new_module_name = new_module_name.replace("self_attention", "self_attn") - new_module_name = new_module_name.replace("decoder", "model") - - new_tensors[new_module_name] = module_weight - return new_tensors - - -def convert_lora_weights_to_canonical( - config: Dict[str, Any], lora_weights: Dict[str, torch.Tensor] -) -> Dict[str, torch.Tensor]: - """This function converts nemo style (fused) lora weights to canonical (unfused) - LoRA weights. Namely, it unfuses the QKV adapter layers and the H-to-4H adapter layers. - - Returns: - Dict[str, torch.Tensor]: The new LoRA weights with unfused layers. - """ - - hidden_size = int(config["hidden_size"]) - num_heads = int(config["num_attention_heads"]) - head_size = hidden_size // num_heads - num_query_groups = int(config.get("num_query_groups", num_heads)) # num_kv_heads - - heads_per_group = num_heads // num_query_groups - qkv_total_dim = num_heads + 2 * num_query_groups - - adapter_size = config['peft']['lora_tuning']['adapter_dim'] - - q_slice = torch.cat( - [ - torch.arange((heads_per_group + 2) * group_idx, (heads_per_group + 2) * group_idx + heads_per_group) - for group_idx in range(num_query_groups) - ] - ) - k_slice = torch.arange(heads_per_group, qkv_total_dim, heads_per_group + 2) - v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, heads_per_group + 2) - - qkv_keys_to_update = [] - hto4h_keys_to_update = [] - for key in lora_weights.keys(): - if "lora_kqv_adapter" in key: - qkv_keys_to_update.append(key) - if "lora_hto4h_adapter" in key: - hto4h_keys_to_update.append(key) - - # unfuse QKV layer - for key in qkv_keys_to_update: - if "linear_in" in key: - assert lora_weights[key].size(0) == adapter_size - for new_key in rename_qkv_keys(key): - lora_weights[new_key] = lora_weights[key] - assert len(lora_weights[new_key].size()) == 2 - elif "linear_out" in key: - assert lora_weights[key].size(1) == adapter_size - for new_key, size in zip(rename_qkv_keys(key), [q_slice, k_slice, v_slice]): - lora_weights[new_key] = ( - lora_weights[key] - .reshape((qkv_total_dim, head_size, adapter_size))[size] - .reshape((-1, adapter_size)) - ) - assert len(lora_weights[new_key].size()) == 2 - lora_weights.pop(key) - - # This maps to gate_up_proj in HF, but we need to split it up into gate_proj and up_proj - for key in hto4h_keys_to_update: - gate_proj_key = key.replace(".lora_hto4h_adapter.", ".lora_unfused_hto4h_adapter.gate_adapter.") - up_proj_key = key.replace(".lora_hto4h_adapter.", ".lora_unfused_hto4h_adapter.up_adapter.") - - module_weight = lora_weights[key] - if "linear_in" in key: - # lora_a gets duplicated - 
lora_weights[gate_proj_key] = module_weight - lora_weights[up_proj_key] = module_weight - elif "linear_out" in key: - # lora_b gets split - split_size = module_weight.shape[0] - gate_up_split = module_weight.split(split_size // 2) - lora_weights[gate_proj_key] = gate_up_split[0] - lora_weights[up_proj_key] = gate_up_split[1] - lora_weights.pop(key) - return lora_weights - - -def convert_lora(lora_nemo, save_path, hf_format=False): - with tempfile.TemporaryDirectory() as tmpdir: - NLPSaveRestoreConnector._unpack_nemo_file(lora_nemo, tmpdir) - config_file = f"{tmpdir}/model_config.yaml" - lora_config = OmegaConf.load(config_file) - tp_size = lora_config.tensor_model_parallel_size - pp_size = lora_config.pipeline_model_parallel_size - - lora_state_dict = [{}] * tp_size - - for pp in range(pp_size): - for tp in range(tp_size): - if tp_size == 1: - ckpt_file = f"{tmpdir}/model_weights.ckpt" - elif pp_size == 1: - ckpt_file = f"{tmpdir}/mp_rank_{tp:02d}/model_weights.ckpt" - else: - ckpt_file = f"{tmpdir}/tp_rank_{tp:02d}_pp_rank_{pp:03d}/model_weights.ckpt" - - l = torch.load(ckpt_file, map_location=torch.device('cpu')) - if pp == 0: - lora_state_dict[tp] = l - else: - # calculate layer offset - layer_offset = lora_config.num_layers // pp_size * pp - for key, value in l.items(): - new_key = replace_number_add_offset(key, layer_offset) - lora_state_dict[tp][new_key] = value - - # TODO: currently suport tp=1 - lora_state_dict = lora_state_dict[0] - if lora_config.peft.lora_tuning.variant == "nemo": - with open_dict(lora_config): - lora_config.peft.lora_tuning.variant = "canonical" - with open(f"{tmpdir}/model_config.yaml", "w") as f: - OmegaConf.save(lora_config, f) - lora_state_dict = convert_lora_weights_to_canonical(lora_config, lora_state_dict) - if hf_format: - lora_state_dict = reformat_module_names_to_hf(lora_state_dict) - Path(save_path).mkdir(parents=True, exist_ok=True) - torch.save(lora_state_dict, f"{save_path}/adapter_model.bin") - adapter_config = json.load(open(args.hf_config)) - adapter_config['peft_type'] = "LORA" - adapter_config['r'] = lora_config.peft.lora_tuning.adapter_dim - adapter_config['lora_alpha'] = lora_config.peft.lora_tuning.alpha - with open(f"{save_path}/adapter_config.json", "w") as f: - json.dump(adapter_config, f, indent=4) - else: - torch.save(lora_state_dict, f"{tmpdir}/model_weights.ckpt") - NLPSaveRestoreConnector._make_nemo_file_from_folder(save_path, tmpdir) - - return lora_state_dict, lora_config +from argparse import ArgumentParser -def fix_for_O2(state_dict): - new_state_dict = {} - for k, v in state_dict.items(): - if "model.module." 
not in k: - new_state_dict[k.replace('model.', 'model.module.')] = v - return new_state_dict +from nemo.export.utils.lora_converter import convert_lora_nemo_to_canonical def get_args(): @@ -255,4 +59,6 @@ def get_args(): if __name__ == '__main__': args = get_args() - convert_lora(args.nemo_lora_path, args.output_path, args.hf_format) + convert_lora_nemo_to_canonical( + args.nemo_lora_path, args.output_path, args.hf_format, donor_hf_config=args.hf_config + ) diff --git a/scripts/deploy/nlp/deploy_vllm_triton.py b/scripts/deploy/nlp/deploy_vllm_triton.py index a6a861575f69..d125b85ac772 100755 --- a/scripts/deploy/nlp/deploy_vllm_triton.py +++ b/scripts/deploy/nlp/deploy_vllm_triton.py @@ -20,6 +20,8 @@ from nemo.deploy import DeployPyTriton +# Configure the NeMo logger to look the same as vLLM +logging.basicConfig(format="%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s", datefmt="%m-%d %H:%M:%S") LOGGER = logging.getLogger("NeMo") try: @@ -61,12 +63,15 @@ def get_args(argv): choices=["bfloat16", "float16", "fp8", "int8"], default="bfloat16", type=str, - help="dtype of the model on TensorRT-LLM or vLLM", + help="dtype of the model on vLLM", ) parser.add_argument( "-mml", "--max_model_len", default=512, type=int, help="Max input + ouptut length of the model" ) parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model") + parser.add_argument( + "-lc", "--lora_ckpt", default=[], type=str, nargs="+", help="List of LoRA checkpoints in HF format" + ) parser.add_argument( "-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences." ) @@ -92,19 +97,7 @@ def get_args(argv): return args -def get_vllm_deployable(args): - tempdir = None - model_dir = args.triton_model_repository - if model_dir is None: - tempdir = tempfile.TemporaryDirectory() - model_dir = tempdir.name - LOGGER.info( - f"{model_dir} path will be used as the vLLM intermediate folder. " - + "Please set the --triton_model_repository parameter if you'd like to use a path that already " - + "includes the vLLM model files." - ) - elif not os.path.exists(model_dir): - os.makedirs(model_dir) +def get_vllm_deployable(args, model_dir): try: exporter = vLLMExporter() @@ -114,6 +107,7 @@ def get_vllm_deployable(args): model_type=args.model_type, tensor_parallel_size=args.tensor_parallelism_size, max_model_len=args.max_model_len, + lora_checkpoints=args.lora_ckpt, dtype=args.dtype, weight_storage=args.weight_storage, gpu_memory_utilization=args.gpu_memory_utilization, @@ -121,9 +115,6 @@ def get_vllm_deployable(args): return exporter except Exception as error: raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) - finally: - if tempdir is not None: - tempdir.cleanup() def nemo_deploy(argv): @@ -138,9 +129,25 @@ def nemo_deploy(argv): LOGGER.info("Logging level set to {}".format(loglevel)) LOGGER.info(args) - triton_deployable = get_vllm_deployable(args) + # If no model_dir was supplied, create a temporary directory. + # This directory should persist while the model is being served, becaue it may contain + # converted LoRA checkpoints, and those are accessed by vLLM at request time. + tempdir = None + model_dir = args.triton_model_repository + if model_dir is None: + tempdir = tempfile.TemporaryDirectory() + model_dir = tempdir.name + LOGGER.info( + f"{model_dir} will be used for the vLLM intermediate folder. 
" + + "Please set the --triton_model_repository parameter if you'd like to use a path that already " + + "includes the vLLM model files." + ) + elif not os.path.exists(model_dir): + os.makedirs(model_dir) try: + triton_deployable = get_vllm_deployable(args, model_dir=model_dir) + nm = DeployPyTriton( model=triton_deployable, triton_model_name=args.triton_model_name, @@ -151,21 +158,21 @@ def nemo_deploy(argv): streaming=args.enable_streaming, ) - LOGGER.info("Triton deploy function will be called.") + LOGGER.info("Starting the Triton server...") nm.deploy() - except Exception as error: - LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error)) - return - - try: - LOGGER.info("Model serving on Triton is will be started.") nm.serve() + + LOGGER.info("Stopping the Triton server...") + nm.stop() + except Exception as error: - LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error)) + LOGGER.error("An error has occurred while setting up or serving the model. Error message: " + str(error)) return - LOGGER.info("Model serving will be stopped.") - nm.stop() + # Clean up the temporary directory + finally: + if tempdir is not None: + tempdir.cleanup() if __name__ == '__main__': diff --git a/scripts/deploy/nlp/query.py b/scripts/deploy/nlp/query.py index 5b36c2616326..5d70102c8295 100644 --- a/scripts/deploy/nlp/query.py +++ b/scripts/deploy/nlp/query.py @@ -23,7 +23,7 @@ def get_args(argv): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description=f"Exports nemo models stored in nemo checkpoints to TensorRT-LLM", + description=f"Sends a single query to an LLM hosted on a Triton server.", ) parser.add_argument("-u", "--url", default="0.0.0.0", type=str, help="url for the triton server") parser.add_argument("-mn", "--model_name", required=True, type=str, help="Name of the triton model") From 1ce9089143b0136523cb08bb37941a35c9b08307 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Fri, 30 Aug 2024 14:05:47 +0200 Subject: [PATCH 085/664] Flexible passing args to TensorRTLLM in nemo_export.py (#10315) Signed-off-by: Jan Lasek --- tests/export/nemo_export.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/export/nemo_export.py b/tests/export/nemo_export.py index ecaf198a0c07..c9af5e9d1af1 100644 --- a/tests/export/nemo_export.py +++ b/tests/export/nemo_export.py @@ -244,7 +244,10 @@ def run_inference( save_trt_engine=False, fp8_quantized=False, fp8_kvcache=False, + trt_llm_export_kwargs=None, ) -> Tuple[Optional[FunctionalResult], Optional[AccuracyResult]]: + if trt_llm_export_kwargs is None: + trt_llm_export_kwargs = {} if Path(checkpoint_path).exists(): if tp_size > torch.cuda.device_count(): print( @@ -329,6 +332,7 @@ def run_inference( use_embedding_sharing=use_embedding_sharing, fp8_quantized=fp8_quantized, fp8_kvcache=fp8_kvcache, + **trt_llm_export_kwargs, ) if ptuning: @@ -458,6 +462,7 @@ def run_existing_checkpoints( in_framework=False, fp8_quantized=False, fp8_kvcache=False, + trt_llm_export_kwargs=None, ) -> Tuple[Optional[FunctionalResult], Optional[AccuracyResult]]: if tp_size > torch.cuda.device_count(): print("Skipping the test due to not enough number of GPUs") @@ -493,6 +498,9 @@ def run_existing_checkpoints( else: use_embedding_sharing = False + if trt_llm_export_kwargs is None: + trt_llm_export_kwargs = {} + if in_framework: return run_in_framework_inference( model_name=model_name, @@ -538,6 +546,7 @@ def run_existing_checkpoints( 
save_trt_engine=save_trt_engine, fp8_quantized=fp8_quantized, fp8_kvcache=fp8_kvcache, + **trt_llm_export_kwargs, ) @@ -770,6 +779,12 @@ def get_args(): type=str, help="Enables exporting with FP8-quantizatized KV-cache", ) + parser.add_argument( + "--trt_llm_export_kwargs", + default={}, + type=json.loads, + help="Extra keyword arguments passed to TensorRTLLM.export", + ) args = parser.parse_args() @@ -850,6 +865,7 @@ def run_inference_tests(args): in_framework=args.in_framework, fp8_quantized=args.export_fp8_quantized, fp8_kvcache=args.use_fp8_kv_cache, + trt_llm_export_kwargs=args.trt_llm_export_kwargs, ) tps = tps * 2 @@ -908,6 +924,7 @@ def run_inference_tests(args): save_trt_engine=args.save_trt_engine, fp8_quantized=args.export_fp8_quantized, fp8_kvcache=args.use_fp8_kv_cache, + trt_llm_export_kwargs=args.trt_llm_export_kwargs, ) tps = tps * 2 From e5f22a8feac53f001065b256cf5b0a389c82c50e Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Fri, 30 Aug 2024 12:46:46 -0400 Subject: [PATCH 086/664] add back HF Finetune script to CI (#10308) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- .github/workflows/cicd-main.yml | 72 ++++++++++--------- .../speech_to_text_hf_finetune.yaml | 6 ++ 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 1f49f891b85e..7375f81c4b09 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -524,41 +524,43 @@ jobs: AFTER_SCRIPT: | rm -rf examples/asr/speech_finetuning_results - # OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning: - # needs: [cicd-test-container-setup] - # uses: ./.github/workflows/_test_template.yml - # with: - # RUNNER: self-hosted-azure-gpus-1 - # SCRIPT: |- - # python examples/asr/speech_to_text_finetune.py \ - # --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \ - # ~model.train_ds.hf_data_cfg \ - # model.train_ds.num_workers=1 \ - # model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \ - # model.train_ds.streaming=true \ - # +model.train_ds.hf_data_cfg.path="librispeech_asr" \ - # +model.train_ds.hf_data_cfg.name=null \ - # +model.train_ds.hf_data_cfg.split="test.clean" \ - # +model.train_ds.hf_data_cfg.streaming=true \ - # ~model.validation_ds.hf_data_cfg \ - # model.validation_ds.streaming=true \ - # +model.validation_ds.hf_data_cfg.path="librispeech_asr" \ - # +model.validation_ds.hf_data_cfg.name=null \ - # +model.validation_ds.hf_data_cfg.split="test.clean" \ - # +model.validation_ds.hf_data_cfg.streaming=true \ - # ~model.test_ds \ - # init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - # model.tokenizer.update_tokenizer=False \ - # model.optim.sched.warmup_steps=0 \ - # +model.optim.sched.max_steps=3 \ - # trainer.max_epochs=null \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=True \ - # exp_manager.exp_dir=examples/asr/speech_finetuning_results - # AFTER_SCRIPT: | - # rm -rf examples/asr/speech_finetuning_results - # IS_OPTIONAL: true + OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: |- + python examples/asr/speech_to_text_finetune.py \ + --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \ + ~model.train_ds.hf_data_cfg \ + model.train_ds.num_workers=1 \ + model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \ + 
model.train_ds.streaming=true \ + +model.train_ds.hf_data_cfg.path="librispeech_asr" \ + +model.train_ds.hf_data_cfg.name=null \ + +model.train_ds.hf_data_cfg.split="test.clean" \ + +model.train_ds.hf_data_cfg.streaming=true \ + +model.train_ds.hf_data_cfg.trust_remote_code=True \ + ~model.validation_ds.hf_data_cfg \ + model.validation_ds.streaming=true \ + +model.validation_ds.hf_data_cfg.path="librispeech_asr" \ + +model.validation_ds.hf_data_cfg.name=null \ + +model.validation_ds.hf_data_cfg.split="test.clean" \ + +model.validation_ds.hf_data_cfg.streaming=true \ + +model.validation_ds.hf_data_cfg.trust_remote_code=True \ + ~model.test_ds \ + init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ + model.tokenizer.update_tokenizer=False \ + model.optim.sched.warmup_steps=0 \ + +model.optim.sched.max_steps=3 \ + trainer.max_epochs=null \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_finetuning_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_finetuning_results + IS_OPTIONAL: true ASR_dev_run_Speech_to_Text_WPE_-_Conformer: needs: [cicd-test-container-setup] diff --git a/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml b/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml index e6d9b0b49c65..0c0e40562506 100644 --- a/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml +++ b/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml @@ -35,14 +35,17 @@ model: name: ${model.data_name} split: 'train.clean.360' streaming: ${model.streaming} + trust_remote_code: true - path: ${model.data_path} name: ${model.data_name} split: 'train.clean.100' streaming: ${model.streaming} + trust_remote_code: true - path: ${model.data_path} name: ${model.data_name} split: 'train.other.500' streaming: ${model.streaming} + trust_remote_code: true sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows @@ -65,6 +68,7 @@ model: name: ${model.data_name} split: 'validation.other' streaming: ${model.streaming} + trust_remote_code: true sample_rate: ${model.sample_rate} batch_size: 8 @@ -87,10 +91,12 @@ model: name: ${model.data_name} split: 'test.other' streaming: ${model.streaming} + trust_remote_code: true - path: ${model.data_path} name: ${model.data_name} split: 'test.clean' streaming: ${model.streaming} + trust_remote_code: true sample_rate: ${model.sample_rate} batch_size: 8 From a777a442c43cd2092684c8b8f06701ed62134a9f Mon Sep 17 00:00:00 2001 From: Huiying Date: Fri, 30 Aug 2024 09:57:35 -0700 Subject: [PATCH 087/664] Add Yi 1.5 34b Neva support (#10083) * Yi1.5 34b neva prompt template Signed-off-by: HuiyingLi * apply black and isort Signed-off-by: HuiyingLi * add tokenizer and inference configs in docstring Signed-off-by: HuiyingLi --------- Signed-off-by: HuiyingLi --- .../multimodal/data/neva/conversation.py | 36 ++++++ .../multimodal/data/neva/neva_dataset.py | 104 ++++++++++++++++++ .../common/text_generation_strategy.py | 23 ++++ .../modules/common/text_generation_utils.py | 3 + 4 files changed, 166 insertions(+) diff --git a/nemo/collections/multimodal/data/neva/conversation.py b/nemo/collections/multimodal/data/neva/conversation.py index 2e110eebe9e6..89f1ab24f0a9 100644 --- a/nemo/collections/multimodal/data/neva/conversation.py +++ b/nemo/collections/multimodal/data/neva/conversation.py @@ -49,6 +49,7 @@ class SeparatorStyle(Enum): LLAMA_3 = auto() MISTRAL = auto() NVGPT = auto() + YI34b = auto() 
@dataclasses.dataclass @@ -155,7 +156,31 @@ def get_prompt(self): ret += wrap_user(message) + self.sep else: ret += wrap_assistant(message) + (self.sep if message else "") + elif self.sep_style == SeparatorStyle.YI34b: + """ + {{ if .System }}<|im_start|>system + {{ .System }}<|im_end|> + {{ end }}{{ if .Prompt }}<|im_start|>user + {{ .Prompt }}<|im_end|> + {{ end }}<|im_start|>assistant + {{ .Response }}<|im_end|> + """ + wrap_sys = lambda msg: f"<|im_start|>system\n{msg}<|im_end|>" + wrap_user = lambda msg: f"<|im_start|>user\n{msg.strip()}<|im_end|>" + wrap_assistant = lambda msg: f"<|im_start|>assistant\n{msg}<|im_end|>" + ret = wrap_sys(self.system) if len(self.system) > 0 else "" + for i, (role, message) in enumerate(messages): + if i == 0: + assert message, "first message should not be none" + assert role == self.roles[0], "first message should come from user" + if type(message) is tuple: + message, _, _ = message + elif i % 2 == 0: + ret += wrap_user(message) + self.sep + else: + ret += wrap_assistant(message) + (self.sep if message else "") + ret = ret.strip() elif self.sep_style == SeparatorStyle.PLAIN: seps = [self.sep, self.sep2] ret = self.system @@ -322,6 +347,16 @@ def dict(self): sep2=f"{DEFAULT_SYSTEM_TOKEN}System\n", ) +conv_yi_34b = Conversation( + system="", + roles=('user', 'assistant'), + version="1.5", + messages=(), + offset=0, + sep_style=SeparatorStyle.YI34b, + sep="\n", +) + conv_vicuna_v0 = Conversation( system="A chat between a curious human and an artificial intelligence assistant. " "The assistant gives helpful, detailed, and polite answers to the human's questions.", @@ -490,6 +525,7 @@ def dict(self): "nv_steerlm": conv_nvgpt, "nv_dpo": conv_nv_dpo, "mistral": conv_mistral, + "yi_34b": conv_yi_34b, } if __name__ == "__main__": diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 8102d179757e..37f57ff21bba 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -598,6 +598,104 @@ def preprocess_llama_2( ) +def preprocess_yi_34b( + sources: dict, + tokenizer, + cfg, +) -> Dict: + """ + Preprocess sources for Yi-1.5 34b model configuration. + + The function applies prompt templates and tokenizes the conversations according to the Yi-1.5 34b model specifications. + It involves special handling of tokens, masking of labels, and adjustments based on configuration settings. + + This template works with the following tokenizer configs: + - model.tokenizer.library='huggingface' + - model.tokenizer.type='01-ai/Yi-1.5-34B' + - model.tokenizer.additional_special_tokens='{additional_special_tokens: ["", "", "", "", "", ""]}' + At inference time, add end string to stop sampling: + - inference.end_strings='["<|im_end|>"]' + + Parameters: + - sources (dict): A dictionary of sources containing conversations to be processed. + - tokenizer: The tokenizer to be used for processing the text. + - cfg: Configuration settings for preprocessing, including context length and additional tokens. + + Returns: + - Dict: A dictionary containing tokenized and labeled data suitable for the LLaMA 2 model. + This includes tokens, labels, and any special processing as defined in the configuration. 
+ """ + + """<|im_start|>user\n{prompt.strip()}<|im_end|>\n<|im_start|>assistant\n""" + + conv = conversation_lib.conv_yi_34b.copy() + + # apply prompt templates + conversations = [] + for i, source in enumerate(sources): + source = source["conversations"] + strip_end_for_inference = False + + for i, turn in enumerate(source): + + if i % 2 == 1: + turn["from"] = conv.roles[1] + value = turn["value"] + + conv.append_message(turn['from'], value) + if not turn["value"]: + strip_end_for_inference = True + else: + turn["from"] = conv.roles[0] + conv.append_message(turn["from"], turn["value"]) + context = conv.get_prompt() + if strip_end_for_inference and context.endswith("\n<|im_end|>"): + context = context[: -len("\n<|im_end|>")] + "\n" + conversations.append(context) + + add_extra_token = cfg.get("add_extra_token") + + tokens = tokenize( + texts=conversations, + tokenizer=tokenizer, + context_length=cfg.get("context_length"), + add_extra_token=add_extra_token, + ) + labels = tokens.clone().detach() + + round_sep = "<|im_start|>user\n" + sep = "<|im_start|>assistant\n" + for conversation, target in zip(conversations, labels): + rounds = conversation.split(round_sep) + rounds = [round_sep.join(rounds[:2])] + [(round_sep + x) for x in rounds[2:]] + assert len(conversation) == sum(map(len, rounds)) + cur_len = 0 + for i, rou in enumerate(rounds): + if rou == "": + break + parts = rou.split(sep) + if len(parts) != 2: + break + instruction_len = len(tokenizer.text_to_ids(parts[0] + sep)) + round_len = len(tokenizer.text_to_ids(rou)) + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if add_extra_token: + tokens = tokens[:, :-1].contiguous() + labels = labels[:, 1:].contiguous() + else: + labels = torch.roll(labels, shifts=-1, dims=-1) + labels[:, -1] = IGNORE_INDEX + + return dict( + tokens=tokens, + labels=labels, + ) + + def preprocess_v1( sources: dict, tokenizer, @@ -1160,6 +1258,12 @@ def expand2square(pil_img, background_color): self.tokenizer, self.multimodal_cfg, ) + elif self.conv_template == "yi_34b": + data_dict = preprocess_yi_34b( + sources, + self.tokenizer, + self.multimodal_cfg, + ) else: raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 8b9d7cf712c4..09f265ed2521 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -426,6 +426,7 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c preprocess_nv_dpo, preprocess_nvgpt, preprocess_v1, + preprocess_yi_34b, ) list_data_dict = [] @@ -486,6 +487,28 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents ) # HARDCODED FOR NOW data_dict = preprocess_llama_2(sources, tokenizer, multimodal_cfg) + elif multimodal_cfg["conv_template"] == "yi_34b": + record = { + 'conversations': [ + { + 'from': 'human', + 'value': prompt, + }, + { + 'from': 'gpt', + 'value': '', + }, + ], + } + for turn in record['conversations']: + if turn.get('value') is not None: + turn['value'] = re.sub('', f'{DEFAULT_IMAGE_TOKEN}\n', turn['value']) + list_data_dict.append(record) + sources = preprocess_multimodal( + copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents + ) # 
HARDCODED FOR NOW + data_dict = preprocess_yi_34b(sources, tokenizer, multimodal_cfg) + elif multimodal_cfg["conv_template"] == "llama_3": record = { 'conversations': [ diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index a5215b12bfae..6db533d0719b 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -288,6 +288,9 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para elif conv_template == "llama_3": clean_response = clean_response.rsplit("assistant<|end_header_id|>\n\n", 1)[-1] clean_response = re.sub(r"(<\|eot_id\|>)+$", "", clean_response) + elif conv_template == "yi_34b": + clean_response = clean_response.split("<|im_start|>assistant\n")[-1] + clean_response = clean_response.strip("<|im_end|>") elif conv_template == "v1": clean_response = clean_response.rsplit("ASSISTANT: ", 1)[-1] From 0ba99795932123a580aba49ade7058b1f0d812ba Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Fri, 30 Aug 2024 14:03:50 -0400 Subject: [PATCH 088/664] move to cpu only for log probs (#10316) * move to cpu only for log probs Signed-off-by: Nithin Rao Koluguri * Apply isort and black reformatting Signed-off-by: nithinraok --------- Signed-off-by: Nithin Rao Koluguri Signed-off-by: nithinraok Co-authored-by: Nithin Rao Koluguri Co-authored-by: nithinraok --- nemo/collections/asr/parts/utils/transcribe_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index c26fa6f4984d..7e1b69652fb1 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -535,9 +535,8 @@ def transcribe_partial_audio( if isinstance(asr_model, EncDecHybridRNNTCTCModel) and decoder_type == "ctc": logits = asr_model.ctc_decoder(encoder_output=logits) - logits = logits.cpu() - if logprobs: + logits = logits.cpu() logits = logits.numpy() # dump log probs per file for idx in range(logits.shape[0]): From b87e1e3dfd7665f161750d194c13287a30974838 Mon Sep 17 00:00:00 2001 From: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Date: Fri, 30 Aug 2024 19:04:39 -0400 Subject: [PATCH 089/664] [NeMo-UX] Don't create attention mask for GPTs (#10242) * remove attention mask as default Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * fix non TE case Signed-off-by: Jimmy Zhang * fix Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 --------- Signed-off-by: Jimmy Zhang Signed-off-by: JimmyZhang12 Co-authored-by: Jimmy Zhang Co-authored-by: JimmyZhang12 --- nemo/collections/llm/gpt/data/mock.py | 36 +++++++++++++++---- nemo/collections/llm/gpt/data/pre_training.py | 9 +++++ nemo/collections/llm/gpt/model/base.py | 16 +++++---- 3 files changed, 47 insertions(+), 14 deletions(-) diff --git a/nemo/collections/llm/gpt/data/mock.py b/nemo/collections/llm/gpt/data/mock.py index 37e255bf5aec..7d195bad9ec2 100644 --- a/nemo/collections/llm/gpt/data/mock.py +++ b/nemo/collections/llm/gpt/data/mock.py @@ -9,6 +9,12 @@ from nemo.lightning.pytorch.plugins import MegatronDataSampler +HAVE_TE = True +try: + import transformer_engine +except (ImportError, ModuleNotFoundError): + HAVE_TE = False + if TYPE_CHECKING: from nemo.collections.common.tokenizers.tokenizer_spec import 
TokenizerSpec @@ -27,6 +33,7 @@ def __init__( num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, + create_attention_mask: bool = False, ): super().__init__() self.seq_length = seq_length @@ -36,6 +43,7 @@ def __init__( self.num_workers = num_workers self.pin_memory = pin_memory self.persistent_workers = persistent_workers + self.create_attention_mask = create_attention_mask or not HAVE_TE from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer @@ -48,9 +56,15 @@ def __init__( ) def setup(self, stage: str = "") -> None: - self._train_ds = _MockGPTDataset(self.tokenizer, "train", self.num_train_samples, self.seq_length) - self._validation_ds = _MockGPTDataset(self.tokenizer, "valid", self.num_val_samples, self.seq_length) - self._test_ds = _MockGPTDataset(self.tokenizer, "test", self.num_test_samples, self.seq_length) + self._train_ds = _MockGPTDataset( + self.tokenizer, "train", self.num_train_samples, self.seq_length, self.create_attention_mask + ) + self._validation_ds = _MockGPTDataset( + self.tokenizer, "valid", self.num_val_samples, self.seq_length, self.create_attention_mask + ) + self._test_ds = _MockGPTDataset( + self.tokenizer, "test", self.num_test_samples, self.seq_length, self.create_attention_mask + ) def train_dataloader(self) -> TRAIN_DATALOADERS: if not hasattr(self, "_train_ds"): @@ -86,6 +100,7 @@ def __init__( num_samples: int, seq_length: int, seed: int = 42, + create_attention_mask: bool = False, ) -> None: super().__init__() self.name = name @@ -93,9 +108,12 @@ def __init__( self.vocab_size = tokenizer.vocab_size self.length = num_samples self.seed = seed + self.create_attention_mask = create_attention_mask + + if create_attention_mask: + self.attention_mask = torch.tril(torch.ones((self.seq_length, self.seq_length), device='cpu')).unsqueeze(0) + self.attention_mask = self.attention_mask < 0.5 - self.attention_mask = torch.tril(torch.ones((self.seq_length, self.seq_length))).unsqueeze(0) - self.attention_mask = self.attention_mask < 0.5 self.loss_mask = torch.ones(self.seq_length, dtype=torch.float) self.position_ids = torch.arange(self.seq_length, dtype=torch.int64) @@ -112,14 +130,18 @@ def __getitem__(self, idx) -> Dict[str, torch.Tensor]: tokens = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64)) labels = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64)) - return { + batch = { "tokens": tokens, "labels": labels, - "attention_mask": self.attention_mask, "loss_mask": self.loss_mask, "position_ids": self.position_ids, } + if self.create_attention_mask: + batch["attention_mask"] = self.attention_mask + + return batch + def _collate_fn(self, batch): """ A default implementation of a collation function. 
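With the mock-dataset change above, an explicit attention mask is only materialized when Transformer Engine is unavailable or when the caller asks for one; otherwise a batch carries just tokens, labels, loss_mask and position_ids and the fused/flash attention kernels derive the causal mask internally. A minimal usage sketch (illustrative only; the import path is inferred from the file location, and the remaining constructor arguments are assumed to have workable defaults):

    from nemo.collections.llm.gpt.data.mock import MockDataModule

    # Opt back into an explicit mask, e.g. when running without TE's fused attention.
    data = MockDataModule(seq_length=2048, create_attention_mask=True)
    data.setup()
    sample = data._train_ds[0]
    # 'attention_mask' is present only because it was requested (or TE is not installed).
    assert "attention_mask" in sample
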
diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py index 919a9b52b4bb..b5266b24ac0b 100644 --- a/nemo/collections/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -11,6 +11,12 @@ from nemo.lightning.io.mixin import IOMixin from nemo.lightning.pytorch.plugins import MegatronDataSampler +HAVE_TE = True +try: + import transformer_engine +except (ImportError, ModuleNotFoundError): + HAVE_TE = False + if TYPE_CHECKING: from megatron.core.datasets.gpt_dataset import GPTDatasetConfig @@ -69,6 +75,7 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, reset_position_ids: bool = False, + create_attention_mask: bool = False, reset_attention_mask: bool = False, eod_mask_loss: bool = False, seed: int = 1234, @@ -107,6 +114,7 @@ def __init__( self.pin_memory = pin_memory self.persistent_workers = persistent_workers self.reset_position_ids = reset_position_ids + self.create_attention_mask = create_attention_mask or not HAVE_TE self.reset_attention_mask = reset_attention_mask self.eod_mask_loss = eod_mask_loss self.seed = seed @@ -219,6 +227,7 @@ def gpt_dataset_config(self) -> "GPTDatasetConfig": tokenizer=self.tokenizer, path_to_cache=self.index_mapping_dir, reset_position_ids=self.reset_position_ids, + create_attention_mask=self.create_attention_mask, reset_attention_mask=self.reset_attention_mask, eod_mask_loss=self.eod_mask_loss, num_dataset_builder_threads=self.num_dataset_builder_threads, diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index c108415a085e..eaac9394e887 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -47,8 +47,6 @@ def gpt_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: required_keys.update(("tokens", "position_ids")) if parallel_state.is_pipeline_last_stage(): required_keys.update(("labels", "loss_mask")) - # if self.get_attention_mask_from_fusion: - # required_keys.remove('attention_mask') _batch = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in _batch.items()} # slice batch along sequence dimension for context parallelism @@ -61,10 +59,17 @@ def gpt_forward_step(model, batch) -> torch.Tensor: forward_args = { "input_ids": batch["tokens"], "position_ids": batch["position_ids"], - "attention_mask": batch["attention_mask"], "labels": batch["labels"], } + if 'attention_mask' not in batch: + assert ( + HAVE_TE + ), "The dataloader did not provide an attention mask, however Transformer Engine was not detected. \ + This requires Transformer Engine's implementation of fused or flash attention." + else: + forward_args["attention_mask"] = batch['attention_mask'] + if 'cu_seqlens' in batch: forward_args['packed_seq_params'] = get_packed_seq_params(batch) @@ -110,9 +115,6 @@ class GPTConfig(TransformerConfig, io.IOMixin): masked_softmax_fusion: bool = True deallocate_pipeline_outputs = True - # TODO: Move this to better places? 
- get_attention_mask_from_fusion: bool = False - transformer_layer_spec: Union[ModuleSpec, Callable[["GPTConfig"], ModuleSpec]] = default_layer_spec forward_step_fn: Callable = gpt_forward_step data_step_fn: Callable = gpt_data_step @@ -184,7 +186,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - attention_mask: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, decoder_input: Optional[torch.Tensor] = None, inference_params=None, From 9a2200546ff3c091d6d7d7e489b4d0e5931e083d Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:26:17 -0700 Subject: [PATCH 090/664] Make get_optim_config iterable (#10318) * Make get_optim_config iterable Signed-off-by: Alexandros Koumparoulis * Update error message Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/pytorch/plugins/mixed_precision.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index 79394cc4bbb1..37a895ea1875 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -28,8 +28,15 @@ def get_optim_config(optimizer: Optimizer): + extract_config = lambda x: x.config try: - return optimizer.mcore_optimizer.config + from megatron.core.optimizer import ChainedOptimizer + + if isinstance(optimizer.mcore_optimizer, ChainedOptimizer): + opts = optimizer.mcore_optimizer.chained_optimizers + else: + opts = [optimizer.mcore_optimizer] + yield from map(extract_config, opts) except: raise ValueError("Failed to extract optimizer config from module.") @@ -149,9 +156,9 @@ def convert_optimizer(self, optimizer: Optimizer) -> Optimizer: This is optional and depends on the precision limitations during optimization. """ - optim_config = get_optim_config(optimizer) - assert optim_config.bf16 == self.dtype_config.bf16, "BF16 enabled on model but not on optimizer" - assert optim_config.fp16 == self.dtype_config.fp16, "BF16 enabled on model but not on optimizer" + for optim_config in get_optim_config(optimizer): + assert optim_config.bf16 == self.dtype_config.bf16, "BF16 model/optim config mismatch" + assert optim_config.fp16 == self.dtype_config.fp16, "FP16 model/optim config mismatch" return optimizer def convert_input(self, data: AnyT) -> AnyT: From b698ae511a1d88c668cfcd73a1b3689a81243bfc Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Fri, 30 Aug 2024 17:52:51 -0700 Subject: [PATCH 091/664] Fix llama3 pretraining NeMo 2.0 script (#10307) Signed-off-by: Hemil Desai --- examples/llm/run/llama3_pretraining.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/llm/run/llama3_pretraining.py b/examples/llm/run/llama3_pretraining.py index 612b58e2169f..6cc001029a3b 100644 --- a/examples/llm/run/llama3_pretraining.py +++ b/examples/llm/run/llama3_pretraining.py @@ -140,7 +140,7 @@ def main(): # Uses configs from NeMo directly pretrain = MODEL_SIZE_MAPPING[args.size]["nemo"]["pretrain"]( name=exp_name, - ckpt_dir=f"/{exp_name}/checkpoints", + ckpt_dir="/nemo_run/checkpoints", ) # Overwrite the dataloader in the recipe to use your custom dataloader. 
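Stepping back to the get_optim_config change from PATCH 090 (#10318) above: the helper is now a generator that yields one config object per underlying Megatron optimizer, so a ChainedOptimizer wrapping several sub-optimizers is validated uniformly. A rough sketch of consuming it (assumptions: `optimizer` is a NeMo optimizer wrapper exposing `mcore_optimizer`, and the function is importable from the module path shown in the diff header):

    from nemo.lightning.pytorch.plugins.mixed_precision import get_optim_config

    # One config is yielded per chained sub-optimizer (or a single one for a plain optimizer).
    configs = list(get_optim_config(optimizer))
    assert all(cfg.bf16 == configs[0].bf16 for cfg in configs)
    assert all(cfg.fp16 == configs[0].fp16 for cfg in configs)
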
@@ -170,8 +170,6 @@ def main(): executor = local_executor_torchrun(nodes=pretrain.trainer.num_nodes, devices=pretrain.trainer.devices) with run.Experiment(f"{exp_name}{args.tag}") as exp: - pretrain.log.dir = f"/{exp_name}/checkpoints" - for i in range(1): exp.add( pretrain, From 78357ae99ff2cf9f179f53fbcb02c88a5a67defb Mon Sep 17 00:00:00 2001 From: Wil Kong Date: Mon, 2 Sep 2024 09:21:11 +0800 Subject: [PATCH 092/664] Support TE-DPA For Stable Diffusion (#10288) * [SD] Add te-dpa support Signed-off-by: Wil Kong * [SD] Add te-dpa support, resolve compatibility with TE-master Signed-off-by: Wil Kong * [SD] Add te-dpa support, add check for attention configs. Signed-off-by: Wil Kong * Fix bugs of flash-attn and dpa in SD. Signed-off-by: Wil Kong * Fix the issue of DPA API change. Signed-off-by: Wil Kong * Apply isort and black reformatting Signed-off-by: alpha0422 Signed-off-by: Wil Kong --------- Signed-off-by: Wil Kong Signed-off-by: alpha0422 Co-authored-by: Mengdi Wang --- .../modules/stable_diffusion/attention.py | 46 ++++++++++++++++--- .../diffusionmodules/openaimodel.py | 9 +++- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/nemo/collections/multimodal/modules/stable_diffusion/attention.py b/nemo/collections/multimodal/modules/stable_diffusion/attention.py index 492f68af032e..9d4d5de2d203 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/attention.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/attention.py @@ -41,6 +41,7 @@ from nemo.utils import logging try: + from transformer_engine.pytorch.attention import DotProductAttention from transformer_engine.pytorch.module import LayerNormLinear, LayerNormMLP HAVE_TE = True @@ -255,11 +256,21 @@ def __init__( dim_head=64, dropout=0.0, use_flash_attention=False, + use_te_dpa=False, lora_network_alpha=None, use_te=False, ): super().__init__() + assert not ( + use_te_dpa and use_flash_attention + ), 'use_te_dpa and use_flash_attention cannot be True together. Please specify the attention you want to use.' + + if use_flash_attention: + assert flash_attn_installed, 'Flash-attention must be installed.' + if use_te_dpa: + assert HAVE_TE, 'TransformerEngine is required to run with TE DPA.' 
+ self.inner_dim = dim_head * heads if context_dim is None: self.is_self_attn = True @@ -277,6 +288,7 @@ def __init__( self.to_k = LinearWrapper(context_dim, self.inner_dim, bias=False, lora_network_alpha=lora_network_alpha) self.to_v = LinearWrapper(context_dim, self.inner_dim, bias=False, lora_network_alpha=lora_network_alpha) + self.use_te_dpa = use_te_dpa self.use_te = use_te if use_te: return_layernorm_output = True if self.is_self_attn else False @@ -292,11 +304,21 @@ def __init__( ) self.use_flash_attention = use_flash_attention - if dim_head <= 160 and (dim_head % 8) == 0 and flash_attn_installed: - if context_dim == query_dim: - self.flash_attn = FlashSelfAttention(softmax_scale=self.scale) - else: - self.flash_attn = FlashCrossAttention(softmax_scale=self.scale) + if dim_head <= 160 and (dim_head % 8) == 0: + if self.use_flash_attention: + if context_dim == query_dim: + self.flash_attn = FlashSelfAttention(softmax_scale=self.scale) + else: + self.flash_attn = FlashCrossAttention(softmax_scale=self.scale) + elif self.use_te_dpa: + self.te_dpa = DotProductAttention( + kv_channels=dim_head, + num_attention_heads=self.inner_dim // dim_head, + attn_mask_type='no_mask', + attention_type='self' if context_dim == query_dim else 'cross', + qkv_format='bshd', # `sbhd`, `bshd`, `thd` + softmax_scale=self.scale, + ) def forward(self, x, context=None, mask=None, additional_tokens=None, n_times_crossframe_attn_in_self=0): h = self.heads @@ -338,7 +360,7 @@ def _attention(self, q, k, v, mask=None, additional_tokens=None): if ( not flash_attn_installed - or not self.use_flash_attention + or (not self.use_flash_attention and not self.use_te_dpa) or q.dtype == torch.float32 or (self.dim_head > 160 or (self.dim_head % 8) != 0) or mask is not None @@ -365,6 +387,13 @@ def _attention(self, q, k, v, mask=None, additional_tokens=None): # (b h) n d -> b n (h d) out = rearrange_heads_inner(out, h) + + elif self.use_te_dpa: + b, s_kv, hd = k.shape + s_q = q.shape[1] + d = hd // h + out = self.te_dpa(q.view(b, s_q, h, d), k.view(b, s_kv, h, d), v.view(b, s_kv, h, d)) + elif self.context_dim == self.query_dim: # self-attention qkv = torch.stack([q, k, v], dim=2) @@ -404,6 +433,7 @@ def __init__( gated_ff=True, use_checkpoint=False, use_flash_attention=False, + use_te_dpa=False, disable_self_attn=False, lora_network_alpha=None, use_te=False, @@ -416,6 +446,7 @@ def __init__( dim_head=d_head, dropout=dropout, use_flash_attention=use_flash_attention, + use_te_dpa=use_te_dpa, context_dim=context_dim if self.disable_self_attn else None, lora_network_alpha=lora_network_alpha, use_te=use_te, @@ -428,6 +459,7 @@ def __init__( dim_head=d_head, dropout=dropout, use_flash_attention=use_flash_attention, + use_te_dpa=use_te_dpa, lora_network_alpha=lora_network_alpha, use_te=use_te, ) # is self-attn if context is none @@ -485,6 +517,7 @@ def __init__( use_linear=False, use_checkpoint=False, use_flash_attention=False, + use_te_dpa=False, lora_network_alpha=None, use_te=False, ): @@ -527,6 +560,7 @@ def __init__( context_dim=context_dim[d], use_checkpoint=use_checkpoint, use_flash_attention=use_flash_attention, + use_te_dpa=use_te_dpa, disable_self_attn=disable_self_attn, lora_network_alpha=lora_network_alpha, use_te=use_te, diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py index b94624b33ba2..66df3c378bfb 100644 --- 
a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py @@ -609,6 +609,7 @@ def __init__( from_NeMo=False, # It must be specified when from pretrained is not None. It indicates loading unet from NeMo trained ckpt or HF use_flash_attention: bool = False, + use_te_dpa: bool = False, unet_precision: str = "fp32", lora_network_alpha=None, timesteps=1000, @@ -782,6 +783,7 @@ def __init__( use_linear=use_linear_in_transformer, use_checkpoint=use_checkpoint, use_flash_attention=use_flash_attention, + use_te_dpa=use_te_dpa, lora_network_alpha=lora_network_alpha, use_te=self.use_te_fp8, ) @@ -851,6 +853,7 @@ def __init__( use_linear=use_linear_in_transformer, use_checkpoint=use_checkpoint, use_flash_attention=use_flash_attention, + use_te_dpa=use_te_dpa, use_te=self.use_te_fp8, lora_network_alpha=lora_network_alpha, ) @@ -918,6 +921,7 @@ def __init__( use_linear=use_linear_in_transformer, use_checkpoint=use_checkpoint, use_flash_attention=use_flash_attention, + use_te_dpa=use_te_dpa, lora_network_alpha=lora_network_alpha, use_te=self.use_te_fp8, ) @@ -978,8 +982,8 @@ def __init__( self.convert_to_fp16() elif unet_precision == 'fp16': self.convert_to_fp16(enable_norm_layers=True) - elif self.use_te_fp8: - assert unet_precision != 'fp16', "fp8 training can't work with fp16 O2 amp recipe" + if self.use_te_fp8: + assert unet_precision == 'fp16', "fp8 training can't work with fp16 O2 amp recipe" convert_module_to_fp8(self) fp8_margin = int(os.getenv("FP8_MARGIN", '0')) @@ -1002,6 +1006,7 @@ def __init__( amax_history_len=fp8_amax_history_len, amax_compute_algo=fp8_amax_compute_algo, override_linear_precision=(False, False, not fp8_wgrad), + # fp8_dpa=use_te_dpa, # TODO; fp8 DPA kernel is not supported now. 
) old_state_dict = self.state_dict() new_state_dict = self.te_fp8_key_mapping(old_state_dict) From 8cd751bc96b521d2d3d4b0ea0493ad305ac7fb21 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:23:55 -0700 Subject: [PATCH 093/664] fix tokenizer restoration (#10336) Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/io/api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/lightning/io/api.py b/nemo/lightning/io/api.py index 4315b3211bf7..4f035b9a0248 100644 --- a/nemo/lightning/io/api.py +++ b/nemo/lightning/io/api.py @@ -125,7 +125,9 @@ def import_ckpt( raise ValueError("Model must be an instance of ConnectorMixin") importer: ModelConnector = model.importer(source) - return importer(overwrite=overwrite, output_path=output_path) + ckpt_path = importer(overwrite=overwrite, output_path=output_path) + importer.on_import_ckpt(model) + return ckpt_path def load_connector_from_trainer_ckpt(path: Path, target: str) -> ModelConnector: From 9472fc38656c5738fdc750a520771ce99ddc5461 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Tue, 3 Sep 2024 09:26:37 -0700 Subject: [PATCH 094/664] remove virtual pipeline parallel apex dependency (#10317) Signed-off-by: ashors1 --- nemo/collections/nlp/modules/common/megatron/megatron_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index d8fac724e63c..c060d140cb8c 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -76,7 +76,7 @@ try: - from apex.transformer.parallel_state import set_virtual_pipeline_model_parallel_world_size + from megatron.core.parallel_state import set_virtual_pipeline_model_parallel_world_size HAVE_INTERLEAVED = True From ac89593c3befeb146ba0dea40bc9e70f4d90116c Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Tue, 3 Sep 2024 10:00:53 -0700 Subject: [PATCH 095/664] Add option to selectively load context in nemo.lightning.io (#10279) * Add option to selectively load context in nemo.lightning.io Signed-off-by: Hemil Desai * Update docstring Signed-off-by: Hemil Desai * Fixes Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai Co-authored-by: Marc Romeyn Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> --- nemo/lightning/io/api.py | 10 ++++++++-- nemo/lightning/io/mixin.py | 40 ++++++++++++++++++++++++++++---------- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/nemo/lightning/io/api.py b/nemo/lightning/io/api.py index 4f035b9a0248..1bbbe43f8df9 100644 --- a/nemo/lightning/io/api.py +++ b/nemo/lightning/io/api.py @@ -9,21 +9,27 @@ from nemo.lightning.io.pl import TrainerContext -def load_context(path: Path) -> TrainerContext: +def load_context(path: Path, subpath: Optional[str] = None) -> TrainerContext: """ Loads a TrainerContext from a json-file or directory. Args: path (Path): The path to the json-file or directory containing 'io.json'. + subpath (Optional[str]): Subpath to selectively load only specific objects inside the TrainerContext. Defaults to None. Returns ------- TrainerContext: The loaded TrainerContext instance. 
Example: + # Load the entire context checkpoint: TrainerContext = load_ckpt("/path/to/checkpoint") + + # Load a subpath of the context, for eg: model.config + checkpoint: TrainerContext = load_ckpt("/path/to/checkpoint", subpath="model.config") + """ - return load(path, output_type=TrainerContext) + return load(path, output_type=TrainerContext, subpath=subpath) def model_importer(target: Type[ConnectorMixin], ext: str) -> Callable[[Type[ConnT]], Type[ConnT]]: diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index e249e2e318b6..ff6c925a64bb 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -22,8 +22,9 @@ from nemo.lightning.io.capture import IOProtocol from nemo.lightning.io.connector import ModelConnector from nemo.lightning.io.fdl_torch import enable as _enable_ext +from nemo.utils import logging -ConnT = TypeVar('ConnT', bound=ModelConnector) +ConnT = TypeVar("ConnT", bound=ModelConnector) CkptType = TypeVar("CkptType") _enable_ext() @@ -361,7 +362,7 @@ def track_io(target, artifacts: Optional[List[Artifact]] = None): """ def _add_io_to_class(cls): - if inspect.isclass(cls) and hasattr(cls, '__init__') and not hasattr(cls, '__io__'): + if inspect.isclass(cls) and hasattr(cls, "__init__") and not hasattr(cls, "__io__"): if cls in [str, int, float, tuple, list, dict, bool, type(None)]: return cls @@ -497,7 +498,6 @@ def _io_flatten_object(instance): def _io_unflatten_object(values, metadata): - assert hasattr(_thread_local, "output_dir") output_dir = _thread_local.output_dir @@ -512,7 +512,7 @@ def _io_unflatten_object(values, metadata): def _io_path_elements_fn(x): try: serialization.dump_json(x.__io__) - except (serialization.UnserializableValueError, AttributeError) as e: + except (serialization.UnserializableValueError, AttributeError): return (serialization.IdentityElement(),) return x.__io__.__path_elements__() @@ -552,13 +552,14 @@ def _artifact_transform_load(cfg: fdl.Config, path: Path): pass -def load(path: Path, output_type: Type[CkptType] = Any) -> CkptType: +def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] = None) -> CkptType: """ Loads a configuration from a pickle file and constructs an object of the specified type. Args: path (Path): The path to the pickle file or directory containing 'io.pkl'. output_type (Type[CkptType]): The type of the object to be constructed from the loaded data. + subpath (Optional[str]): Subpath to selectively load only specific objects inside the output_type. Defaults to None. Returns ------- @@ -571,29 +572,48 @@ def load(path: Path, output_type: Type[CkptType] = Any) -> CkptType: Example: loaded_model = load("/path/to/model", output_type=MyModel) """ - del output_type # Just for type-hint - _path = Path(path) _thread_local.output_dir = _path - if hasattr(_path, 'is_dir') and _path.is_dir(): + if hasattr(_path, "is_dir") and _path.is_dir(): _path = Path(_path) / "io.json" - elif hasattr(_path, 'isdir') and _path.isdir: + elif hasattr(_path, "isdir") and _path.isdir: _path = Path(_path) / "io.json" if not _path.is_file(): raise FileNotFoundError(f"No such file: '{_path}'") + if subpath: + subpath = "." 
+ subpath + ## add IO functionality to custom objects present in the json file with open(_path) as f: j = json.load(f) for obj, val in j["objects"].items(): clss = ".".join([val["type"]["module"], val["type"]["name"]]) + if subpath and "paths" in val: + if all(map(lambda p: subpath not in p, val["paths"])): + continue + if not serialization.find_node_traverser(locate(clss)): track_io(locate(clss)) with open(_path, "rb") as f: - config = serialization.load_json(f.read()) + json_config = json.loads(f.read()) + + root_key = None + for obj, val in json_config["objects"].items(): + if "paths" in val and subpath in val["paths"]: + root_key = obj + break + + if subpath and not root_key: + logging.warning(f"Could not find {subpath} for {output_type} in {_path}") + + if root_key: + json_config["root"]["key"] = root_key + + config = serialization.Deserialization(json_config).result _artifact_transform_load(config, path) return fdl.build(config) From 8eb1827511de930d3e275665f6dabd80b0448886 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 3 Sep 2024 10:39:48 -0700 Subject: [PATCH 096/664] Add EP to mixtral-8x22b recipe (#10337) Signed-off-by: Alexandros Koumparoulis --- nemo/collections/llm/recipes/mixtral_8x22b.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index aaf0149dbdac..807e414fa73d 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -30,6 +30,7 @@ def trainer( virtual_pipeline_parallelism: Optional[int], context_parallelism: int, sequence_parallelism: bool, + expert_parallelism: int, num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, @@ -43,6 +44,7 @@ def trainer( virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, context_parallel_size=context_parallelism, sequence_parallel=sequence_parallelism, + expert_model_parallel_size=expert_parallelism, gradient_as_bucket_view=True, ckpt_include_optimizer=True, ckpt_async_save=True, @@ -88,6 +90,7 @@ def pretrain_recipe( virtual_pipeline_parallelism=None, context_parallelism=1, sequence_parallelism=True, + expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, callbacks=[Config(TimingCallback)], From ab6aba32967421c06e7fcf93106c9a686361d4e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Kami=C5=84ski?= <67481570+Laplasjan107@users.noreply.github.com> Date: Tue, 3 Sep 2024 21:13:46 +0200 Subject: [PATCH 097/664] Bugfix: export to trt-llm multi_block_mode flag (#10334) * bugfix Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * add quantisation flags to deploy_triton.py Signed-off-by: Piotr Kaminski --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Co-authored-by: Piotr Kaminski Co-authored-by: Laplasjan107 --- scripts/deploy/nlp/deploy_triton.py | 37 +++++++++++++++++++++++++++++ scripts/export/export_to_trt_llm.py | 8 ++++--- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 0ec6264d6bf0..e3394726fa1c 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -18,6 +18,7 @@ import os import sys from pathlib import Path +from typing import Optional import uvicorn @@ -25,6 +26,11 @@ LOGGER = logging.getLogger("NeMo") + +class UsageError(Exception): + pass + + 
megatron_llm_supported = True try: from nemo.deploy.nlp import MegatronLLMDeployable @@ -202,7 +208,36 @@ def get_args(argv): help="Return the response from PyTriton server in OpenAI compatible format", ) parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode") + parser.add_argument( + "-fp8", + "--export_fp8_quantized", + default="auto", + type=str, + help="Enables exporting to an FP8-quantized TRT LLM checkpoint", + ) + parser.add_argument( + "-kv_fp8", + "--use_fp8_kv_cache", + default="auto", + type=str, + help="Enables exporting with an FP8-quantized KV-cache", + ) args = parser.parse_args(argv) + + def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]: + s = s.lower() + true_strings = ["true", "1"] + false_strings = ["false", "0"] + if s in true_strings: + return True + if s in false_strings: + return False + if optional and s == 'auto': + return None + raise UsageError(f"Invalid boolean value for argument --{name}: '{s}'") + + args.export_fp8_quantized = str_to_bool("export_fp8_quantized", args.export_fp8_quantized, optional=True) + args.use_fp8_kv_cache = str_to_bool("use_fp8_kv_cache", args.use_fp8_kv_cache, optional=True) return args @@ -304,6 +339,8 @@ def get_trtllm_deployable(args): multiple_profiles=args.multiple_profiles, gpt_attention_plugin=args.gpt_attention_plugin, gemm_plugin=args.gemm_plugin, + fp8_quantized=args.export_fp8_quantized, + fp8_kvcache=args.use_fp8_kv_cache, ) except Exception as error: raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) diff --git a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py index 3f5924fde80c..6b246131b69e 100644 --- a/scripts/export/export_to_trt_llm.py +++ b/scripts/export/export_to_trt_llm.py @@ -79,7 +79,8 @@ def get_args(argv): default=False, action='store_true', help='Split long kv sequence into multiple blocks (applied to generation MHA kernels). \ - It is beneficial when batchxnum_heads cannot fully utilize GPU. 
\ + Only available when using c++ runtime.', ) parser.add_argument( '--use_lora_plugin', @@ -165,7 +166,9 @@ def nemo_export_trt_llm(argv): return try: - trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository, load_model=False) + trt_llm_exporter = TensorRTLLM( + model_dir=args.model_repository, load_model=False, multi_block_mode=args.multi_block_mode + ) LOGGER.info("Export to TensorRT-LLM function is called.") trt_llm_exporter.export( @@ -183,7 +186,6 @@ def nemo_export_trt_llm(argv): paged_kv_cache=(not args.no_paged_kv_cache), remove_input_padding=(not args.disable_remove_input_padding), dtype=args.dtype, - enable_multi_block_mode=args.multi_block_mode, use_lora_plugin=args.use_lora_plugin, lora_target_modules=args.lora_target_modules, max_lora_rank=args.max_lora_rank, From a1fd8991b8c1be50e3107d7d7735bfbd21938798 Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:35:03 -0700 Subject: [PATCH 098/664] fix (#10339) Signed-off-by: yaoyu-33 --- .../models/text_to_image/stable_diffusion/ldm/ddpm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index 89b1d88819b8..854e9288368f 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -1945,6 +1945,8 @@ def process_batch(batch): def fwd_output_and_loss_func(dataloader_iter, model): batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch, _, _ = batch # PTL dataloader iter fix batch = process_batch(batch) batch = [x.cuda(non_blocking=True) for x in batch] if len(self.conditioning_keys) == 0: From 0d2d7c42db673d1ad1f5116089b8596c5cd187dc Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Wed, 4 Sep 2024 12:24:29 -0400 Subject: [PATCH 099/664] Add comment to address a frequently asked question (#10321) * Add comment to address a frequently asked question Signed-off-by: Chen Cui * wording Signed-off-by: Chen Cui * add docstring to GPTSFTPackedDataset Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui --- .../data/language_modeling/megatron/gpt_sft_dataset.py | 9 +++++++-- nemo/utils/sequence_packing_utils.py | 3 +++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index 2e21c57dddd3..c42249cec2f2 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -521,11 +521,16 @@ def collate_fn(self, batch): class GPTSFTPackedDataset(GPTSFTDataset): def __init__(self, file_path: str, tokenizer: TokenizerSpec, return_cu_seqlen: bool = True, **kwargs): + """ + file_path: See `file_path` in the parent class. + tokenizer: See `tokenizer` in the parent class. + return_cu_seqlen: Whether to return `cu_seqlen` to pass to the model. Having `cu_seqlen` in the model input + enables THD attention kernel, which is the correct format for training with packed sequence to prevent + cross-sequence attention. This flag should be True unless you have a specific use case. + """ np.random.seed(kwargs.get('seed', 1234)) super().__init__(file_path, tokenizer, **kwargs) assert self.virtual_tokens == 0, "P-Tuning with packed sequence is not supported." 
- - # Whether to return `cu_seqlen` to pass to model. This should be true for almost all use cases. self.return_cu_seqlen = return_cu_seqlen def __getitem__(self, idx): diff --git a/nemo/utils/sequence_packing_utils.py b/nemo/utils/sequence_packing_utils.py index 2a5a14f83823..cee2be248f73 100644 --- a/nemo/utils/sequence_packing_utils.py +++ b/nemo/utils/sequence_packing_utils.py @@ -118,6 +118,9 @@ def create_hist(dataset: np.array, truncate_seq_len: int): counts = [0] * truncate_seq_len for item_dict in dataset: + # Minus 1 here to account for the fact that transformer input and label have one less token than the full sequence + # Input is missing the last token and label is missing the first token (this way the tokens are aligned for next token prediction). + # We want pack size to be the length of the actual input and label, hence minus 1. seq_len = len(item_dict['input_ids']) - 1 sequences[seq_len].append(item_dict) counts[seq_len] += 1 From dd02d02660cbf88731bccd9b7450688b69bdf685 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Wed, 4 Sep 2024 10:26:11 -0700 Subject: [PATCH 100/664] Fix async checkpointing in nemo.lightning (#10324) * Fix async checkpointing in nemo.lightning Signed-off-by: Hemil Desai * PR feedback Signed-off-by: Hemil Desai * Add unit tests Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai --- .../pytorch/strategies/fsdp_strategy.py | 7 +++-- .../pytorch/strategies/megatron_strategy.py | 26 +++++++++-------- nemo/lightning/pytorch/strategies/utils.py | 11 +++----- .../strategies/test_megatron_strategy.py | 28 +++++++++++++++++++ 4 files changed, 51 insertions(+), 21 deletions(-) create mode 100644 tests/lightning/pytorch/strategies/test_megatron_strategy.py diff --git a/nemo/lightning/pytorch/strategies/fsdp_strategy.py b/nemo/lightning/pytorch/strategies/fsdp_strategy.py index 9bb08b3cbd7a..048c2f28d18d 100644 --- a/nemo/lightning/pytorch/strategies/fsdp_strategy.py +++ b/nemo/lightning/pytorch/strategies/fsdp_strategy.py @@ -22,8 +22,8 @@ from nemo.lightning import io from nemo.lightning.pytorch.strategies.utils import ( ckpt_to_dir, + create_checkpoint_io, fix_progress_bar, - get_checkpoint_io, init_model_parallel, mcore_to_pyt_sharded_state_dict, pyt_to_mcore_state_dict, @@ -159,7 +159,10 @@ def process_dataloader(self, dataloader: DataLoader) -> DataLoader: @property @override def checkpoint_io(self) -> CheckpointIO: - return get_checkpoint_io(self._checkpoint_io) + if not self._checkpoint_io: + self._checkpoint_io = create_checkpoint_io() + + return self._checkpoint_io @checkpoint_io.setter def checkpoint_io(self, io: CheckpointIO) -> None: diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index e719c50d8a50..67ac028d09a5 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -48,8 +48,8 @@ from nemo.lightning.pytorch.callbacks import ModelTransform from nemo.lightning.pytorch.strategies.utils import ( ckpt_to_dir, + create_checkpoint_io, fix_progress_bar, - get_checkpoint_io, init_model_parallel, setup_data_sampler, setup_parallel_ranks, @@ -652,17 +652,19 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr @property @override def checkpoint_io(self) -> CheckpointIO: - return get_checkpoint_io( - self._checkpoint_io, - 
save_ckpt_format=self.save_ckpt_format, - async_save=self.async_save, - torch_dist_multiproc=self.torch_dist_multiproc, - assume_constant_structure=self.assume_constant_structure, - parallel_save=self.parallel_save, - parallel_save_within_dp=self.parallel_save_within_dp, - parallel_load=self.parallel_load, - load_directly_on_device=self.load_directly_on_device, - ) + if not self._checkpoint_io: + self._checkpoint_io = create_checkpoint_io( + save_ckpt_format=self.save_ckpt_format, + async_save=self.async_save, + torch_dist_multiproc=self.torch_dist_multiproc, + assume_constant_structure=self.assume_constant_structure, + parallel_save=self.parallel_save, + parallel_save_within_dp=self.parallel_save_within_dp, + parallel_load=self.parallel_load, + load_directly_on_device=self.load_directly_on_device, + ) + + return self._checkpoint_io @checkpoint_io.setter def checkpoint_io(self, io: CheckpointIO) -> None: diff --git a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py index 86b7d58ae36b..5b10f2e46ad2 100644 --- a/nemo/lightning/pytorch/strategies/utils.py +++ b/nemo/lightning/pytorch/strategies/utils.py @@ -92,13 +92,10 @@ def ckpt_to_dir(filepath: Union[str, Path]) -> Path: return filepath -def get_checkpoint_io(checkpoint_io, **kwargs): - if checkpoint_io is None: - checkpoint_io = MegatronCheckpointIO(**kwargs) - if kwargs.get("async_save", False): - checkpoint_io = AsyncFinalizableCheckpointIO(checkpoint_io) - elif isinstance(checkpoint_io, _WrappingCheckpointIO): - checkpoint_io.checkpoint_io = MegatronCheckpointIO() +def create_checkpoint_io(**kwargs): + checkpoint_io = MegatronCheckpointIO(**kwargs) + if kwargs.get("async_save", False): + checkpoint_io = AsyncFinalizableCheckpointIO(checkpoint_io) return checkpoint_io diff --git a/tests/lightning/pytorch/strategies/test_megatron_strategy.py b/tests/lightning/pytorch/strategies/test_megatron_strategy.py new file mode 100644 index 000000000000..fc9bf3816db0 --- /dev/null +++ b/tests/lightning/pytorch/strategies/test_megatron_strategy.py @@ -0,0 +1,28 @@ +from unittest.mock import patch + +from nemo.lightning.pytorch.strategies import MegatronStrategy + + +class TestMegatronStrategy: + @patch('nemo.lightning.pytorch.strategies.megatron_strategy.create_checkpoint_io') + def test_checkpoint_io(self, mock_create_checkpoint_io): + class Dummy: ... + + mock_create_checkpoint_io.side_effect = lambda *args, **kwargs: Dummy() + strategy = MegatronStrategy() + + first_io = strategy.checkpoint_io + mock_create_checkpoint_io.assert_called_once() + + assert first_io == strategy.checkpoint_io + + new_io = object() + strategy.checkpoint_io = new_io + assert new_io == strategy.checkpoint_io + + strategy2 = MegatronStrategy() + second_io = strategy2.checkpoint_io + mock_create_checkpoint_io.assert_called() + + assert first_io != second_io + assert second_io == strategy2.checkpoint_io From 32ba9859330d229c5ba5502d45f14e1aeac7e790 Mon Sep 17 00:00:00 2001 From: BoxiangW <45734921+BoxiangW@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:56:07 -0700 Subject: [PATCH 101/664] [Draft]Add Nemotron4 recipes and Long Context Recipe (#10262) * Add Nemotron recipes and Long Context Recipe Signed-off-by: Boxiang Wang * Add Nemotron4 recipes Signed-off-by: Boxiang Wang * Change long context recipe for llama3 mixtral and nemotron. 
Add Expert Parallel Support Signed-off-by: Boxiang Wang * Revert Chainedoptimizer change Signed-off-by: Boxiang Wang * Update mixed_precision.py Signed-off-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> * Update mixtral_8x7b.py Signed-off-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> * Add option to not generate attention mask Signed-off-by: Boxiang Wang * Remove nemtron Signed-off-by: Boxiang Wang * Resolve mock conflict Signed-off-by: Boxiang Wang --------- Signed-off-by: Boxiang Wang Signed-off-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --- .../collections/llm/recipes/llama3_70b_16k.py | 27 ++++++++++---- .../collections/llm/recipes/llama3_70b_64k.py | 23 +++++++++--- nemo/collections/llm/recipes/llama3_8b.py | 1 - nemo/collections/llm/recipes/llama3_8b_16k.py | 23 +++++++++--- nemo/collections/llm/recipes/llama3_8b_64k.py | 23 +++++++++--- nemo/collections/llm/recipes/mixtral_8x22b.py | 15 +++----- nemo/collections/llm/recipes/mixtral_8x3b.py | 9 ++--- .../llm/recipes/mixtral_8x3b_16k.py | 30 ++++++++++++---- .../llm/recipes/mixtral_8x3b_64k.py | 29 +++++++++++---- nemo/collections/llm/recipes/mixtral_8x7b.py | 13 +++---- .../llm/recipes/mixtral_8x7b_16k.py | 29 +++++++++++---- .../llm/recipes/mixtral_8x7b_64k.py | 35 +++++++++++++------ 12 files changed, 177 insertions(+), 80 deletions(-) diff --git a/nemo/collections/llm/recipes/llama3_70b_16k.py b/nemo/collections/llm/recipes/llama3_70b_16k.py index 8829aa6b407b..87826661606f 100644 --- a/nemo/collections/llm/recipes/llama3_70b_16k.py +++ b/nemo/collections/llm/recipes/llama3_70b_16k.py @@ -3,8 +3,11 @@ import torch from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_70b -from nemo.collections.llm.utils import Partial +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback NAME = "llama3_70b_16k" @@ -16,21 +19,26 @@ def pretrain_recipe( name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn ) + model = llama3_70b.model() + model.config.seq_length = 16384 + trainer = llama3_70b.trainer( - tensor_parallelism=2, + tensor_parallelism=4, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=5, - context_parallelism=2, + context_parallelism=4, sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = llama3_70b.model() - model.config.seq_length = 16384 + + data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe @@ -40,6 +48,9 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node ) + model = llama3_70b.model() + model.config.seq_length = 16384 + trainer = llama3_70b.trainer( tensor_parallelism=2, pipeline_parallelism=4, @@ -49,11 +60,13 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = llama3_70b.model() - model.config.seq_length = 16384 + + data = 
Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe diff --git a/nemo/collections/llm/recipes/llama3_70b_64k.py b/nemo/collections/llm/recipes/llama3_70b_64k.py index 33f46f767a4d..5185e6b2ec45 100644 --- a/nemo/collections/llm/recipes/llama3_70b_64k.py +++ b/nemo/collections/llm/recipes/llama3_70b_64k.py @@ -3,8 +3,11 @@ import torch from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_70b -from nemo.collections.llm.utils import Partial +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback NAME = "llama3_70b_64k" @@ -16,6 +19,9 @@ def pretrain_recipe( name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn ) + model = llama3_70b.model() + model.config.seq_length = 65536 + trainer = llama3_70b.trainer( tensor_parallelism=8, pipeline_parallelism=4, @@ -25,12 +31,14 @@ def pretrain_recipe( sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = llama3_70b.model() - model.config.seq_length = 65536 + + data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe @@ -40,6 +48,9 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node ) + model = llama3_70b.model() + model.config.seq_length = 65536 + trainer = llama3_70b.trainer( tensor_parallelism=2, pipeline_parallelism=4, @@ -49,11 +60,13 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = llama3_70b.model() - model.config.seq_length = 65536 + + data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 340cfbdf6e26..792f545d0d32 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -2,7 +2,6 @@ import pytorch_lightning as pl import torch -from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py index a57b4ef37298..27762777c622 100644 --- a/nemo/collections/llm/recipes/llama3_8b_16k.py +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -3,8 +3,11 @@ import torch from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_8b -from nemo.collections.llm.utils import Partial +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b_16k" @@ -16,6 +19,9 @@ def pretrain_recipe( name=name, ckpt_dir=ckpt_dir, 
num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn ) + model = llama3_8b.model() + model.config.seq_length = 16384 + trainer = llama3_8b.trainer( tensor_parallelism=2, pipeline_parallelism=4, @@ -25,12 +31,14 @@ def pretrain_recipe( sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = llama3_8b.model() - model.config.seq_length = 16384 + + data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe @@ -40,6 +48,9 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node ) + model = llama3_8b.model() + model.config.seq_length = 16384 + trainer = llama3_8b.trainer( tensor_parallelism=2, pipeline_parallelism=4, @@ -49,11 +60,13 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = llama3_8b.model() - model.config.seq_length = 16384 + + data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py index d06c9b08a716..90001c6189a0 100644 --- a/nemo/collections/llm/recipes/llama3_8b_64k.py +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -3,8 +3,11 @@ import torch from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_8b -from nemo.collections.llm.utils import Partial +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b_64k" @@ -16,6 +19,9 @@ def pretrain_recipe( name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn ) + model = llama3_8b.model() + model.config.seq_length = 65536 + trainer = llama3_8b.trainer( tensor_parallelism=2, pipeline_parallelism=4, @@ -25,12 +31,14 @@ def pretrain_recipe( sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = llama3_8b.model() - model.config.seq_length = 65536 + + data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe @@ -40,6 +48,9 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node ) + model = llama3_8b.model() + model.config.seq_length = 65536 + trainer = llama3_8b.trainer( tensor_parallelism=2, pipeline_parallelism=4, @@ -49,11 +60,13 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = llama3_8b.model() - model.config.seq_length = 65536 + + data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe 
diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 807e414fa73d..aefab4f61699 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -2,7 +2,6 @@ import pytorch_lightning as pl import torch -from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -13,6 +12,7 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback @@ -49,11 +49,6 @@ def trainer( ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, - ddp=Config( - DistributedDataParallelConfig, - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - ), ) trainer = Config( @@ -68,7 +63,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + plugins=bf16_mixed_plugin(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -85,9 +80,9 @@ def pretrain_recipe( model=model(), trainer=trainer( tensor_parallelism=8, - pipeline_parallelism=1, - pipeline_parallelism_type=None, - virtual_pipeline_parallelism=None, + pipeline_parallelism=8, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=7, context_parallelism=1, sequence_parallelism=True, expert_parallelism=1, diff --git a/nemo/collections/llm/recipes/mixtral_8x3b.py b/nemo/collections/llm/recipes/mixtral_8x3b.py index 223fe68af05d..01d3d15089c3 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b.py @@ -2,7 +2,6 @@ import pytorch_lightning as pl import torch -from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -13,6 +12,7 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback @@ -49,11 +49,6 @@ def trainer( ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, - ddp=Config( - DistributedDataParallelConfig, - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - ), ) trainer = Config( @@ -68,7 +63,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + plugins=bf16_mixed_plugin(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py b/nemo/collections/llm/recipes/mixtral_8x3b_16k.py index e496349a35d6..dbf27f86415c 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b_16k.py @@ -3,8 +3,12 @@ import torch from nemo.collections.llm.api 
import pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x3b -from nemo.collections.llm.utils import Partial +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback + NAME = "mixtral_8x3b_16k" @@ -16,6 +20,10 @@ def pretrain_recipe( name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn ) + model = mixtral_8x3b.model() + model.config.seq_length = 16384 + model.config.max_position_embeddings = 16384 + trainer = mixtral_8x3b.trainer( tensor_parallelism=2, pipeline_parallelism=2, @@ -23,15 +31,17 @@ def pretrain_recipe( virtual_pipeline_parallelism=8, context_parallelism=2, sequence_parallelism=True, - expert_parallelism=2, + expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = mixtral_8x3b.model() - model.config.seq_length = 16384 + + data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe @@ -41,6 +51,10 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node ) + model = mixtral_8x3b.model() + model.config.seq_length = 16384 + model.config.max_position_embeddings = 16384 + trainer = mixtral_8x3b.trainer( tensor_parallelism=2, pipeline_parallelism=2, @@ -48,14 +62,16 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: virtual_pipeline_parallelism=8, context_parallelism=2, sequence_parallelism=True, - expert_parallelism=2, + expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = mixtral_8x3b.model() - model.config.seq_length = 16384 + + data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py b/nemo/collections/llm/recipes/mixtral_8x3b_64k.py index f034f30ecd94..b2a7724b35a9 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b_64k.py @@ -3,8 +3,11 @@ import torch from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x3b -from nemo.collections.llm.utils import Partial +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x3b_64k" @@ -16,6 +19,10 @@ def pretrain_recipe( name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn ) + model = mixtral_8x3b.model() + model.config.seq_length = 65536 + model.config.max_position_embeddings = 65536 + trainer = mixtral_8x3b.trainer( tensor_parallelism=4, pipeline_parallelism=4, @@ -23,15 +30,17 @@ def pretrain_recipe( virtual_pipeline_parallelism=8, context_parallelism=4, sequence_parallelism=True, - expert_parallelism=4, + expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = mixtral_8x3b.model() - model.config.seq_length = 65536 + + data = 
Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe @@ -41,6 +50,10 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node ) + model = mixtral_8x3b.model() + model.config.seq_length = 65536 + model.config.max_position_embeddings = 65536 + trainer = mixtral_8x3b.trainer( tensor_parallelism=2, pipeline_parallelism=2, @@ -48,14 +61,16 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: virtual_pipeline_parallelism=8, context_parallelism=4, sequence_parallelism=True, - expert_parallelism=2, + expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = mixtral_8x3b.model() - model.config.seq_length = 65536 + + data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index 1710727bd711..6d24255f0a5f 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -2,7 +2,6 @@ import pytorch_lightning as pl import torch -from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -13,6 +12,7 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback @@ -49,11 +49,6 @@ def trainer( ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, - ddp=Config( - DistributedDataParallelConfig, - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - ), ) trainer = Config( @@ -68,7 +63,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + plugins=bf16_mixed_plugin(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -85,8 +80,8 @@ def pretrain_recipe( model=model(), trainer=trainer( tensor_parallelism=8, - pipeline_parallelism=1, - pipeline_parallelism_type=None, + pipeline_parallelism=2, + pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=None, context_parallelism=1, sequence_parallelism=True, diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py index 352069fc6831..0542f22836d6 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py @@ -3,8 +3,11 @@ import torch from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x7b -from nemo.collections.llm.utils import Partial +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback NAME = 
"mixtral_8x7b_16k" @@ -16,6 +19,10 @@ def pretrain_recipe( name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn ) + model = mixtral_8x7b.model() + model.config.seq_length = 16384 + model.config.max_position_embeddings = 16384 + trainer = mixtral_8x7b.trainer( tensor_parallelism=2, pipeline_parallelism=4, @@ -23,15 +30,17 @@ def pretrain_recipe( virtual_pipeline_parallelism=8, context_parallelism=4, sequence_parallelism=True, - expert_parallelism=8, + expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = mixtral_8x7b.model() - model.config.seq_length = 16384 + + data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe @@ -41,21 +50,27 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node ) + model = mixtral_8x7b.model() + model.config.seq_length = 16384 + model.config.max_position_embeddings = 16384 + trainer = mixtral_8x7b.trainer( tensor_parallelism=2, pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=8, - context_parallelism=2, + context_parallelism=1, sequence_parallelism=True, expert_parallelism=8, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = mixtral_8x7b.model() - model.config.seq_length = 16384 + + data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py index 503c83ecb66a..4fb8de98063e 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py @@ -3,8 +3,11 @@ import torch from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x7b -from nemo.collections.llm.utils import Partial +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x7b_64k" @@ -16,22 +19,28 @@ def pretrain_recipe( name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn ) + model = mixtral_8x7b.model() + model.config.seq_length = 65536 + model.config.max_position_embeddings = 65536 + trainer = mixtral_8x7b.trainer( tensor_parallelism=4, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=4, + virtual_pipeline_parallelism=4, + context_parallelism=8, sequence_parallelism=True, - expert_parallelism=8, + expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = mixtral_8x7b.model() - model.config.seq_length = 65536 + + data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe @@ -41,21 +50,27 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node ) + model 
= mixtral_8x7b.model() + model.config.seq_length = 65536 + model.config.max_position_embeddings = 65536 + trainer = mixtral_8x7b.trainer( tensor_parallelism=2, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=8, - context_parallelism=2, + context_parallelism=4, sequence_parallelism=True, - expert_parallelism=8, + expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], ) - model = mixtral_8x7b.model() - model.config.seq_length = 65536 + + data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) recipe.model = model recipe.trainer = trainer + recipe.data = data return recipe From 8134f3322bce3bef875e8018c5ca5a36631045b8 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Wed, 4 Sep 2024 20:03:06 +0200 Subject: [PATCH 102/664] [NeMo-UX] Adding copyright to collections.llm & lightning (#10345) * Adding copyright to collections.llm & lightning Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: marcromeyn --- nemo/collections/llm/__init__.py | 14 ++++++++++++++ nemo/collections/llm/api.py | 14 ++++++++++++++ nemo/collections/llm/fn/__init__.py | 14 ++++++++++++++ nemo/collections/llm/fn/activation.py | 14 ++++++++++++++ nemo/collections/llm/fn/base.py | 14 ++++++++++++++ nemo/collections/llm/fn/mixin.py | 14 ++++++++++++++ nemo/collections/llm/gpt/__init__.py | 13 +++++++++++++ nemo/collections/llm/gpt/data/__init__.py | 14 ++++++++++++++ nemo/collections/llm/gpt/data/api.py | 14 ++++++++++++++ nemo/collections/llm/gpt/data/core.py | 14 ++++++++++++++ nemo/collections/llm/gpt/data/dolly.py | 14 ++++++++++++++ nemo/collections/llm/gpt/data/fine_tuning.py | 14 ++++++++++++++ nemo/collections/llm/gpt/data/mock.py | 14 ++++++++++++++ nemo/collections/llm/gpt/data/pre_training.py | 14 ++++++++++++++ nemo/collections/llm/gpt/data/squad.py | 13 +++++++++++++ nemo/collections/llm/gpt/model/__init__.py | 14 ++++++++++++++ nemo/collections/llm/gpt/model/baichuan.py | 14 ++++++++++++++ nemo/collections/llm/gpt/model/base.py | 14 ++++++++++++++ nemo/collections/llm/gpt/model/chatglm.py | 14 ++++++++++++++ nemo/collections/llm/gpt/model/gemma.py | 14 ++++++++++++++ nemo/collections/llm/gpt/model/llama.py | 14 ++++++++++++++ nemo/collections/llm/gpt/model/mistral.py | 14 ++++++++++++++ nemo/collections/llm/gpt/model/mixtral.py | 14 ++++++++++++++ nemo/collections/llm/gpt/model/nemotron.py | 14 ++++++++++++++ nemo/collections/llm/gpt/model/qwen2.py | 14 ++++++++++++++ nemo/collections/llm/gpt/model/starcoder.py | 14 ++++++++++++++ nemo/collections/llm/gpt/model/starcoder2.py | 14 ++++++++++++++ nemo/collections/llm/peft/__init__.py | 14 ++++++++++++++ nemo/collections/llm/peft/api.py | 14 ++++++++++++++ nemo/collections/llm/peft/lora.py | 14 ++++++++++++++ nemo/collections/llm/tokenizer.py | 14 ++++++++++++++ nemo/collections/llm/utils.py | 14 ++++++++++++++ nemo/lightning/__init__.py | 14 ++++++++++++++ nemo/lightning/_strategy_lib.py | 14 ++++++++++++++ nemo/lightning/base.py | 14 ++++++++++++++ nemo/lightning/data.py | 14 ++++++++++++++ nemo/lightning/megatron_parallel.py | 14 ++++++++++++++ nemo/lightning/nemo_logger.py | 13 +++++++++++++ nemo/lightning/pytorch/__init__.py | 13 +++++++++++++ nemo/lightning/pytorch/callbacks/__init__.py | 14 ++++++++++++++ .../pytorch/callbacks/ddp_parity_checker.py | 14 ++++++++++++++ 
.../pytorch/callbacks/garbage_collection.py | 14 ++++++++++++++ .../lightning/pytorch/callbacks/memory_profiler.py | 14 ++++++++++++++ .../lightning/pytorch/callbacks/model_transform.py | 14 ++++++++++++++ nemo/lightning/pytorch/callbacks/nsys.py | 14 ++++++++++++++ nemo/lightning/pytorch/callbacks/peft.py | 14 ++++++++++++++ nemo/lightning/pytorch/callbacks/progress_bar.py | 14 ++++++++++++++ .../pytorch/callbacks/progress_printer.py | 14 ++++++++++++++ nemo/lightning/pytorch/optim/__init__.py | 14 ++++++++++++++ nemo/lightning/pytorch/optim/base.py | 14 ++++++++++++++ nemo/lightning/pytorch/optim/lr_scheduler.py | 14 ++++++++++++++ nemo/lightning/pytorch/optim/megatron.py | 14 ++++++++++++++ nemo/lightning/pytorch/plugins/data_sampler.py | 14 ++++++++++++++ nemo/lightning/pytorch/strategies/__init__.py | 14 ++++++++++++++ nemo/lightning/pytorch/strategies/fsdp_strategy.py | 14 ++++++++++++++ .../pytorch/strategies/megatron_strategy.py | 14 ++++++++++++++ nemo/lightning/pytorch/strategies/utils.py | 14 ++++++++++++++ nemo/lightning/pytorch/trainer.py | 14 ++++++++++++++ nemo/lightning/resume.py | 14 ++++++++++++++ nemo/lightning/run/__init__.py | 13 +++++++++++++ nemo/lightning/run/plugins.py | 14 ++++++++++++++ tests/collections/llm/fn/__init__.py | 13 +++++++++++++ tests/collections/llm/fn/test_base.py | 14 ++++++++++++++ tests/collections/llm/fn/test_mixin.py | 14 ++++++++++++++ .../llm/gpt/data/test_pre_training_data.py | 14 ++++++++++++++ tests/lightning/__init__.py | 13 +++++++++++++ tests/lightning/fabric/__init__.py | 13 +++++++++++++ tests/lightning/fabric/test_conversion.py | 14 ++++++++++++++ tests/lightning/io/__init__.py | 13 +++++++++++++ tests/lightning/io/test_api.py | 14 ++++++++++++++ tests/lightning/io/test_mixin.py | 14 ++++++++++++++ tests/lightning/io/test_state.py | 14 ++++++++++++++ tests/lightning/pytorch/__init__.py | 13 +++++++++++++ tests/lightning/pytorch/callbacks/__init__.py | 13 +++++++++++++ .../pytorch/callbacks/test_model_transform.py | 14 ++++++++++++++ tests/lightning/pytorch/callbacks/test_nsys.py | 14 ++++++++++++++ tests/lightning/pytorch/callbacks/test_peft.py | 14 ++++++++++++++ .../lightning/pytorch/callbacks/test_preemption.py | 14 ++++++++++++++ tests/lightning/pytorch/test_trainer.py | 14 ++++++++++++++ tests/lightning/test_data.py | 14 ++++++++++++++ tests/lightning/test_ddp_parity_checker.py | 14 ++++++++++++++ tests/lightning/test_dist_ckpt.py | 14 ++++++++++++++ tests/lightning/test_megatron_parallel.py | 14 ++++++++++++++ tests/lightning/test_nemo_logger.py | 14 ++++++++++++++ tests/lightning/test_precision_plugin.py | 14 ++++++++++++++ tests/lightning/test_strategy_lib.py | 14 ++++++++++++++ 86 files changed, 1193 insertions(+) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 52c353ba16d7..361df944a856 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + # This is here to import it once, which improves the speed of launch when in debug-mode try: import transformer_engine # noqa diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 8bead26e653e..d330b42d08c4 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from copy import deepcopy diff --git a/nemo/collections/llm/fn/__init__.py b/nemo/collections/llm/fn/__init__.py index 621c748f0995..5c825d5397e8 100644 --- a/nemo/collections/llm/fn/__init__.py +++ b/nemo/collections/llm/fn/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from nemo.collections.llm.fn.base import map, walk from nemo.collections.llm.fn.mixin import FNMixin diff --git a/nemo/collections/llm/fn/activation.py b/nemo/collections/llm/fn/activation.py index fb638ee31f86..50e076a79d36 100644 --- a/nemo/collections/llm/fn/activation.py +++ b/nemo/collections/llm/fn/activation.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch diff --git a/nemo/collections/llm/fn/base.py b/nemo/collections/llm/fn/base.py index 41206e7afc4e..a8bc07e33e0f 100644 --- a/nemo/collections/llm/fn/base.py +++ b/nemo/collections/llm/fn/base.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import inspect from typing import Callable, Iterable, Protocol, TypeVar, Union, runtime_checkable diff --git a/nemo/collections/llm/fn/mixin.py b/nemo/collections/llm/fn/mixin.py index c566c6e9d392..cacb80b1faf5 100644 --- a/nemo/collections/llm/fn/mixin.py +++ b/nemo/collections/llm/fn/mixin.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from torch import nn from typing_extensions import Self diff --git a/nemo/collections/llm/gpt/__init__.py b/nemo/collections/llm/gpt/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/collections/llm/gpt/__init__.py +++ b/nemo/collections/llm/gpt/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/gpt/data/__init__.py b/nemo/collections/llm/gpt/data/__init__.py index f83da73c987b..45ca0788874f 100644 --- a/nemo/collections/llm/gpt/data/__init__.py +++ b/nemo/collections/llm/gpt/data/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from nemo.collections.llm.gpt.data.dolly import DollyDataModule from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule from nemo.collections.llm.gpt.data.mock import MockDataModule diff --git a/nemo/collections/llm/gpt/data/api.py b/nemo/collections/llm/gpt/data/api.py index e674fea91b79..a7fde4cfc8d8 100644 --- a/nemo/collections/llm/gpt/data/api.py +++ b/nemo/collections/llm/gpt/data/api.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytorch_lightning as pl from nemo.collections.llm.gpt.data.dolly import DollyDataModule diff --git a/nemo/collections/llm/gpt/data/core.py b/nemo/collections/llm/gpt/data/core.py index 6f8fe237e10a..d9aff2167023 100644 --- a/nemo/collections/llm/gpt/data/core.py +++ b/nemo/collections/llm/gpt/data/core.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from pathlib import Path from typing import TYPE_CHECKING, Optional diff --git a/nemo/collections/llm/gpt/data/dolly.py b/nemo/collections/llm/gpt/data/dolly.py index 7ed17e460e0f..9ab24f0f612b 100644 --- a/nemo/collections/llm/gpt/data/dolly.py +++ b/nemo/collections/llm/gpt/data/dolly.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import shutil from typing import TYPE_CHECKING, List, Optional diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 33a21990e8f7..062db00af41d 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math from functools import lru_cache from pathlib import Path diff --git a/nemo/collections/llm/gpt/data/mock.py b/nemo/collections/llm/gpt/data/mock.py index 7d195bad9ec2..cb4455549ea0 100644 --- a/nemo/collections/llm/gpt/data/mock.py +++ b/nemo/collections/llm/gpt/data/mock.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict, List, Optional import numpy as np diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py index b5266b24ac0b..285f790a77ac 100644 --- a/nemo/collections/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging import warnings from pathlib import Path diff --git a/nemo/collections/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py index 11104fe3cab2..a2dfa12af69e 100644 --- a/nemo/collections/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -1,3 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import json import shutil from typing import TYPE_CHECKING, List, Optional diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 7de5d5b5b5f4..81098040191c 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from nemo.collections.llm.gpt.model.baichuan import Baichuan2Config, Baichuan2Config7B, Baichuan2Model from nemo.collections.llm.gpt.model.base import ( GPTConfig, diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py index 382a90547caa..b60c0430b8be 100644 --- a/nemo/collections/llm/gpt/model/baichuan.py +++ b/nemo/collections/llm/gpt/model/baichuan.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Annotated, Callable, Optional diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index eaac9394e887..a32fdb582764 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional, Union diff --git a/nemo/collections/llm/gpt/model/chatglm.py b/nemo/collections/llm/gpt/model/chatglm.py index 10a3497070c4..3b6453b2b891 100644 --- a/nemo/collections/llm/gpt/model/chatglm.py +++ b/nemo/collections/llm/gpt/model/chatglm.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Annotated, Callable, Optional diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index 7d45b76e6034..753d75165197 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Annotated, Callable, Optional diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 4f7dd4d37a90..ec16750bdf44 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math from dataclasses import dataclass from pathlib import Path diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index 61a96917537c..b8c17f158cbf 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Callable, List, Optional diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index 7100b62c2aa6..2d6d657b3df2 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Callable, Optional, Union diff --git a/nemo/collections/llm/gpt/model/nemotron.py b/nemo/collections/llm/gpt/model/nemotron.py index d946e5f48cce..ebb3f41ff72b 100644 --- a/nemo/collections/llm/gpt/model/nemotron.py +++ b/nemo/collections/llm/gpt/model/nemotron.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Annotated, Callable, Optional diff --git a/nemo/collections/llm/gpt/model/qwen2.py b/nemo/collections/llm/gpt/model/qwen2.py index eb67dd9d4f0d..9268e884bf8b 100644 --- a/nemo/collections/llm/gpt/model/qwen2.py +++ b/nemo/collections/llm/gpt/model/qwen2.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Annotated, Callable, Optional diff --git a/nemo/collections/llm/gpt/model/starcoder.py b/nemo/collections/llm/gpt/model/starcoder.py index e99b707964fe..15deb0ba2191 100644 --- a/nemo/collections/llm/gpt/model/starcoder.py +++ b/nemo/collections/llm/gpt/model/starcoder.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Annotated, Callable, Optional diff --git a/nemo/collections/llm/gpt/model/starcoder2.py b/nemo/collections/llm/gpt/model/starcoder2.py index e53f1bde7012..18aec2e0cd71 100644 --- a/nemo/collections/llm/gpt/model/starcoder2.py +++ b/nemo/collections/llm/gpt/model/starcoder2.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Annotated, Callable, List, Optional diff --git a/nemo/collections/llm/peft/__init__.py b/nemo/collections/llm/peft/__init__.py index 69855f6f9c53..3dae5622b733 100644 --- a/nemo/collections/llm/peft/__init__.py +++ b/nemo/collections/llm/peft/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from nemo.collections.llm.peft.api import gpt_lora from nemo.collections.llm.peft.lora import LoRA diff --git a/nemo/collections/llm/peft/api.py b/nemo/collections/llm/peft/api.py index dc8fc76c752e..85c0ae6cae41 100644 --- a/nemo/collections/llm/peft/api.py +++ b/nemo/collections/llm/peft/api.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.utils import factory from nemo.lightning.pytorch.callbacks.peft import PEFT diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index 71b60d5df59f..0d2a98fa3dfb 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass, field from typing import List, Literal diff --git a/nemo/collections/llm/tokenizer.py b/nemo/collections/llm/tokenizer.py index 808642758ebe..77320c4b9c02 100644 --- a/nemo/collections/llm/tokenizer.py +++ b/nemo/collections/llm/tokenizer.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from nemo.lightning.io.artifact import FileArtifact from nemo.lightning.io.mixin import track_io diff --git a/nemo/collections/llm/utils.py b/nemo/collections/llm/utils.py index 5ff01a9b0a86..76c192b97f41 100644 --- a/nemo/collections/llm/utils.py +++ b/nemo/collections/llm/utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging from typing import Any, Callable, Generic, TypeVar, Union, overload diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index 9d9b0df4da39..8768d13192cb 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Union from lightning_fabric.plugins.environments import slurm diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index 9b4aaa8d0330..76e9ab3264f7 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import itertools import os from collections import defaultdict diff --git a/nemo/lightning/base.py b/nemo/lightning/base.py index 0684cbeee2da..b6ba14726818 100644 --- a/nemo/lightning/base.py +++ b/nemo/lightning/base.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import gc import os from pathlib import Path diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index 59acdec6f8e2..0f30dfe22851 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import abc import logging import os diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index dd10a726e67a..91864a6e190e 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import abc import collections.abc import functools diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index e5cd45181cc7..8d8932edbb57 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -1,3 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os import sys import time diff --git a/nemo/lightning/pytorch/__init__.py b/nemo/lightning/pytorch/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/lightning/pytorch/__init__.py +++ b/nemo/lightning/pytorch/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/lightning/pytorch/callbacks/__init__.py b/nemo/lightning/pytorch/callbacks/__init__.py index dd2908e6f5e6..6d3844a4301f 100644 --- a/nemo/lightning/pytorch/callbacks/__init__.py +++ b/nemo/lightning/pytorch/callbacks/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from nemo.lightning.pytorch.callbacks.ddp_parity_checker import DdpParityChecker from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.pytorch.callbacks.memory_profiler import MemoryProfileCallback diff --git a/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py b/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py index b5c2127433d7..391666fb8f32 100644 --- a/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py +++ b/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from functools import cache import torch diff --git a/nemo/lightning/pytorch/callbacks/garbage_collection.py b/nemo/lightning/pytorch/callbacks/garbage_collection.py index a2b2bb6498a3..ba4d378ee893 100644 --- a/nemo/lightning/pytorch/callbacks/garbage_collection.py +++ b/nemo/lightning/pytorch/callbacks/garbage_collection.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import gc from typing import Any diff --git a/nemo/lightning/pytorch/callbacks/memory_profiler.py b/nemo/lightning/pytorch/callbacks/memory_profiler.py index 089479637f61..5b2ee1d46e11 100644 --- a/nemo/lightning/pytorch/callbacks/memory_profiler.py +++ b/nemo/lightning/pytorch/callbacks/memory_profiler.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import torch diff --git a/nemo/lightning/pytorch/callbacks/model_transform.py b/nemo/lightning/pytorch/callbacks/model_transform.py index 8a07566f92c3..64602b501ac3 100644 --- a/nemo/lightning/pytorch/callbacks/model_transform.py +++ b/nemo/lightning/pytorch/callbacks/model_transform.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from functools import wraps from typing import Any, Callable, Optional, TypeVar diff --git a/nemo/lightning/pytorch/callbacks/nsys.py b/nemo/lightning/pytorch/callbacks/nsys.py index 70ebf943b333..2a5707d3166c 100644 --- a/nemo/lightning/pytorch/callbacks/nsys.py +++ b/nemo/lightning/pytorch/callbacks/nsys.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import List, Optional import torch diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index c7983af26752..a8f90c5d60f9 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json from abc import ABC, abstractmethod from pathlib import Path diff --git a/nemo/lightning/pytorch/callbacks/progress_bar.py b/nemo/lightning/pytorch/callbacks/progress_bar.py index d6acb02ae377..6912c3fc57d4 100644 --- a/nemo/lightning/pytorch/callbacks/progress_bar.py +++ b/nemo/lightning/pytorch/callbacks/progress_bar.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from pytorch_lightning.callbacks.progress import TQDMProgressBar from pytorch_lightning.callbacks.progress.tqdm_progress import _update_n diff --git a/nemo/lightning/pytorch/callbacks/progress_printer.py b/nemo/lightning/pytorch/callbacks/progress_printer.py index 8ddc97a6ddd6..d32f7d70cbdd 100644 --- a/nemo/lightning/pytorch/callbacks/progress_printer.py +++ b/nemo/lightning/pytorch/callbacks/progress_printer.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from collections import defaultdict from typing import Any diff --git a/nemo/lightning/pytorch/optim/__init__.py b/nemo/lightning/pytorch/optim/__init__.py index d23494a96a5f..1572e95e136a 100644 --- a/nemo/lightning/pytorch/optim/__init__.py +++ b/nemo/lightning/pytorch/optim/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from nemo.lightning.pytorch.optim.base import LRSchedulerModule, OptimizerModule from nemo.lightning.pytorch.optim.lr_scheduler import ( CosineAnnealingScheduler, diff --git a/nemo/lightning/pytorch/optim/base.py b/nemo/lightning/pytorch/optim/base.py index ef7f9d96843d..1d476142941a 100644 --- a/nemo/lightning/pytorch/optim/base.py +++ b/nemo/lightning/pytorch/optim/base.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import types from abc import ABC, abstractmethod from copy import deepcopy diff --git a/nemo/lightning/pytorch/optim/lr_scheduler.py b/nemo/lightning/pytorch/optim/lr_scheduler.py index 4e865443b8fc..9a0f276006a7 100644 --- a/nemo/lightning/pytorch/optim/lr_scheduler.py +++ b/nemo/lightning/pytorch/optim/lr_scheduler.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Optional from nemo.core.optim.lr_scheduler import ( diff --git a/nemo/lightning/pytorch/optim/megatron.py b/nemo/lightning/pytorch/optim/megatron.py index 1eb5290652a4..5252f7621859 100644 --- a/nemo/lightning/pytorch/optim/megatron.py +++ b/nemo/lightning/pytorch/optim/megatron.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import inspect from typing import Callable, List, Optional diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index bacb7cb0af5c..2a3b25f97cdc 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging from typing import Any, Dict, List, Literal, Optional diff --git a/nemo/lightning/pytorch/strategies/__init__.py b/nemo/lightning/pytorch/strategies/__init__.py index d946d8a9c149..9ef58bcc9023 100644 --- a/nemo/lightning/pytorch/strategies/__init__.py +++ b/nemo/lightning/pytorch/strategies/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from nemo.lightning.pytorch.strategies.fsdp_strategy import FSDPStrategy from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy diff --git a/nemo/lightning/pytorch/strategies/fsdp_strategy.py b/nemo/lightning/pytorch/strategies/fsdp_strategy.py index 048c2f28d18d..dba58e121048 100644 --- a/nemo/lightning/pytorch/strategies/fsdp_strategy.py +++ b/nemo/lightning/pytorch/strategies/fsdp_strategy.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import shutil from collections import OrderedDict from pathlib import Path diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index 67ac028d09a5..25d11e6f5cfb 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import functools import inspect import logging diff --git a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py index 5b10f2e46ad2..e2c1b1e22825 100644 --- a/nemo/lightning/pytorch/strategies/utils.py +++ b/nemo/lightning/pytorch/strategies/utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import io from pathlib import Path from typing import Any, Dict, Iterable, List, Tuple, Union, cast diff --git a/nemo/lightning/pytorch/trainer.py b/nemo/lightning/pytorch/trainer.py index 81f4d12bd3fb..164c07fe5b80 100644 --- a/nemo/lightning/pytorch/trainer.py +++ b/nemo/lightning/pytorch/trainer.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from copy import deepcopy import fiddle as fdl diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py index a2de3ce6f690..6c1da21288bb 100644 --- a/nemo/lightning/resume.py +++ b/nemo/lightning/resume.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from pathlib import Path, PosixPath, WindowsPath from typing import Optional, Union diff --git a/nemo/lightning/run/__init__.py b/nemo/lightning/run/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/lightning/run/__init__.py +++ b/nemo/lightning/run/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index 0f6a76d4799f..18850c9d607e 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import copy import logging import os diff --git a/tests/collections/llm/fn/__init__.py b/tests/collections/llm/fn/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/tests/collections/llm/fn/__init__.py +++ b/tests/collections/llm/fn/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/collections/llm/fn/test_base.py b/tests/collections/llm/fn/test_base.py index a000a3d032f2..84f2b677abdf 100644 --- a/tests/collections/llm/fn/test_base.py +++ b/tests/collections/llm/fn/test_base.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest import torch import torch.nn as nn diff --git a/tests/collections/llm/fn/test_mixin.py b/tests/collections/llm/fn/test_mixin.py index 3c5f0eaf7422..359e3ecc1482 100644 --- a/tests/collections/llm/fn/test_mixin.py +++ b/tests/collections/llm/fn/test_mixin.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from torch import nn from nemo.collections.llm import fn diff --git a/tests/collections/llm/gpt/data/test_pre_training_data.py b/tests/collections/llm/gpt/data/test_pre_training_data.py index c42434bbdf31..31a7b51cdf53 100644 --- a/tests/collections/llm/gpt/data/test_pre_training_data.py +++ b/tests/collections/llm/gpt/data/test_pre_training_data.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest import nemo.lightning as nl diff --git a/tests/lightning/__init__.py b/tests/lightning/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/tests/lightning/__init__.py +++ b/tests/lightning/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/lightning/fabric/__init__.py b/tests/lightning/fabric/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/tests/lightning/fabric/__init__.py +++ b/tests/lightning/fabric/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/lightning/fabric/test_conversion.py b/tests/lightning/fabric/test_conversion.py index 53d8d1a2dd49..e690557ec2eb 100644 --- a/tests/lightning/fabric/test_conversion.py +++ b/tests/lightning/fabric/test_conversion.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest from lightning_fabric import plugins as fl_plugins from lightning_fabric import strategies as fl_strategies diff --git a/tests/lightning/io/__init__.py b/tests/lightning/io/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/tests/lightning/io/__init__.py +++ b/tests/lightning/io/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/lightning/io/test_api.py b/tests/lightning/io/test_api.py index 93c6cc33307a..c0fb1a41ba3d 100644 --- a/tests/lightning/io/test_api.py +++ b/tests/lightning/io/test_api.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from functools import partial import pytest diff --git a/tests/lightning/io/test_mixin.py b/tests/lightning/io/test_mixin.py index 3a520b8e74ae..601384d7d589 100644 --- a/tests/lightning/io/test_mixin.py +++ b/tests/lightning/io/test_mixin.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from nemo.lightning import io diff --git a/tests/lightning/io/test_state.py b/tests/lightning/io/test_state.py index f368f3ce02ce..7e9cd4a70399 100644 --- a/tests/lightning/io/test_state.py +++ b/tests/lightning/io/test_state.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest from torch import nn diff --git a/tests/lightning/pytorch/__init__.py b/tests/lightning/pytorch/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/tests/lightning/pytorch/__init__.py +++ b/tests/lightning/pytorch/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/lightning/pytorch/callbacks/__init__.py b/tests/lightning/pytorch/callbacks/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/tests/lightning/pytorch/callbacks/__init__.py +++ b/tests/lightning/pytorch/callbacks/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/lightning/pytorch/callbacks/test_model_transform.py b/tests/lightning/pytorch/callbacks/test_model_transform.py index 9894f7d7bc58..c59a82895125 100644 --- a/tests/lightning/pytorch/callbacks/test_model_transform.py +++ b/tests/lightning/pytorch/callbacks/test_model_transform.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest import pytorch_lightning as pl from torch import nn diff --git a/tests/lightning/pytorch/callbacks/test_nsys.py b/tests/lightning/pytorch/callbacks/test_nsys.py index d87da58b8ad0..9653e707198e 100644 --- a/tests/lightning/pytorch/callbacks/test_nsys.py +++ b/tests/lightning/pytorch/callbacks/test_nsys.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from unittest.mock import MagicMock, patch import pytest diff --git a/tests/lightning/pytorch/callbacks/test_peft.py b/tests/lightning/pytorch/callbacks/test_peft.py index 99a22f82fa50..53f9016a3bac 100644 --- a/tests/lightning/pytorch/callbacks/test_peft.py +++ b/tests/lightning/pytorch/callbacks/test_peft.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from unittest.mock import MagicMock, call, patch import torch.nn as nn diff --git a/tests/lightning/pytorch/callbacks/test_preemption.py b/tests/lightning/pytorch/callbacks/test_preemption.py index a385582ea021..4152f7fcce59 100644 --- a/tests/lightning/pytorch/callbacks/test_preemption.py +++ b/tests/lightning/pytorch/callbacks/test_preemption.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import signal from unittest.mock import MagicMock, PropertyMock, patch diff --git a/tests/lightning/pytorch/test_trainer.py b/tests/lightning/pytorch/test_trainer.py index 65c247eae0ef..6a7941953a17 100644 --- a/tests/lightning/pytorch/test_trainer.py +++ b/tests/lightning/pytorch/test_trainer.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from nemo import lightning as nl diff --git a/tests/lightning/test_data.py b/tests/lightning/test_data.py index 7acdcc91b486..2519616766f4 100644 --- a/tests/lightning/test_data.py +++ b/tests/lightning/test_data.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from pathlib import Path from unittest.mock import MagicMock, patch diff --git a/tests/lightning/test_ddp_parity_checker.py b/tests/lightning/test_ddp_parity_checker.py index 7d180ba17dfe..892df513539a 100644 --- a/tests/lightning/test_ddp_parity_checker.py +++ b/tests/lightning/test_ddp_parity_checker.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import os diff --git a/tests/lightning/test_dist_ckpt.py b/tests/lightning/test_dist_ckpt.py index 98fe8d4a6107..1888d1eb12cc 100644 --- a/tests/lightning/test_dist_ckpt.py +++ b/tests/lightning/test_dist_ckpt.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from pathlib import Path diff --git a/tests/lightning/test_megatron_parallel.py b/tests/lightning/test_megatron_parallel.py index e504c7eb5c7c..7d723c543b5a 100644 --- a/tests/lightning/test_megatron_parallel.py +++ b/tests/lightning/test_megatron_parallel.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from collections import defaultdict from unittest.mock import MagicMock diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 3476f1361809..08902d03d786 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import time from pathlib import Path diff --git a/tests/lightning/test_precision_plugin.py b/tests/lightning/test_precision_plugin.py index bdd834c3bf7a..2c029ce23936 100644 --- a/tests/lightning/test_precision_plugin.py +++ b/tests/lightning/test_precision_plugin.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest import pytorch_lightning as pl import torch diff --git a/tests/lightning/test_strategy_lib.py b/tests/lightning/test_strategy_lib.py index 6a63450f37df..36143cedb8c4 100644 --- a/tests/lightning/test_strategy_lib.py +++ b/tests/lightning/test_strategy_lib.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from unittest.mock import ANY, MagicMock, patch import torch From 73bec061c8da695302d6ac503d31b5660e274532 Mon Sep 17 00:00:00 2001 From: Kunal Dhawan Date: Wed, 4 Sep 2024 11:29:06 -0700 Subject: [PATCH 103/664] added support for FC model in Diarization with ASR and timestamps (#10154) Signed-off-by: Kunal Dhawan Co-authored-by: Nithin Rao --- .../asr/parts/utils/decoder_timestamps_utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py b/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py index a740f899ca67..c39ff7da58d9 100644 --- a/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py +++ b/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py @@ -352,6 +352,17 @@ def set_asr_model(self): self.asr_batch_size = if_none_get_default(self.params['asr_batch_size'], 4) self.model_stride_in_secs = 0.02 + elif 'fastconformer' in self.ASR_model_name.lower(): + self.run_ASR = self.run_ASR_BPE_CTC + self.encdec_class = EncDecCTCModelBPE + self.decoder_delay_in_sec = if_none_get_default(self.params['decoder_delay_in_sec'], 0.08) + self.word_ts_anchor_offset = if_none_get_default(self.params['word_ts_anchor_offset'], 0.12) + self.asr_batch_size = if_none_get_default(self.params['asr_batch_size'], 16) + self.model_stride_in_secs = 0.08 + # FastConformer requires buffered inference and the parameters for buffered processing. 
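Aside (an editor's illustration, not part of the diff): the two buffered-processing values set on the following lines of this hunk, chunk_len_in_sec = 15 and total_buffer_in_secs = 30, combine with the 0.08 s FastConformer output stride roughly as sketched below. The helper name is hypothetical; only the arithmetic is implied by the defaults above.

def buffered_inference_geometry(chunk_len_in_sec=15.0, total_buffer_in_secs=30.0, model_stride_in_secs=0.08):
    # Each chunk is decoded inside a rolling buffer, leaving equal left/right context.
    context_per_side_sec = (total_buffer_in_secs - chunk_len_in_sec) / 2  # 7.5 s
    # One output token every model_stride_in_secs (8x-subsampled FastConformer).
    tokens_per_chunk = int(chunk_len_in_sec / model_stride_in_secs)       # 187
    tokens_per_buffer = int(total_buffer_in_secs / model_stride_in_secs)  # 375
    return context_per_side_sec, tokens_per_chunk, tokens_per_buffer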
+ self.chunk_len_in_sec = 15 + self.total_buffer_in_secs = 30 + elif 'conformer' in self.ASR_model_name.lower(): self.run_ASR = self.run_ASR_BPE_CTC self.encdec_class = EncDecCTCModelBPE From d8efee959a0cd684e161aa6f1f6711eb501e8568 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Wed, 4 Sep 2024 19:06:38 -0400 Subject: [PATCH 104/664] Remove apply_query_key_layer_scaling for GPT models (#10349) * remove qk layer scaling * add sc2 --- nemo/collections/llm/gpt/model/llama.py | 1 - nemo/collections/llm/gpt/model/mistral.py | 1 - nemo/collections/llm/gpt/model/mixtral.py | 1 - nemo/collections/llm/gpt/model/nemotron.py | 1 - nemo/collections/llm/gpt/model/qwen2.py | 1 - nemo/collections/llm/gpt/model/starcoder2.py | 1 - 6 files changed, 6 deletions(-) diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index ec16750bdf44..2c76b2fdd976 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -88,7 +88,6 @@ class Llama3Config(GPTConfig): add_bias_linear: bool = False activation_func: Callable = F.silu gated_linear_unit: bool = True - apply_query_key_layer_scaling: bool = False # Fusions bias_activation_fusion: bool = True masked_softmax_fusion: bool = True diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index b8c17f158cbf..73e6a34fd7c2 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -41,7 +41,6 @@ class MistralConfig7B(GPTConfig): position_embedding_type: str = "rope" add_bias_linear: bool = False gated_linear_unit: bool = True - apply_query_key_layer_scaling: bool = False # TODO: Should this be True? num_layers: int = 32 hidden_size: int = 4096 diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index 2d6d657b3df2..dc438320eeff 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -42,7 +42,6 @@ class MixtralConfig(GPTConfig): position_embedding_type: str = "rope" add_bias_linear: bool = False gated_linear_unit: bool = True - apply_query_key_layer_scaling: bool = False num_layers: int = 32 hidden_size: int = 4096 diff --git a/nemo/collections/llm/gpt/model/nemotron.py b/nemo/collections/llm/gpt/model/nemotron.py index ebb3f41ff72b..44f10c0bee60 100644 --- a/nemo/collections/llm/gpt/model/nemotron.py +++ b/nemo/collections/llm/gpt/model/nemotron.py @@ -43,7 +43,6 @@ class NemotronConfig(GPTConfig): hidden_dropout: float = 0.0 attention_dropout: float = 0.0 - apply_query_key_layer_scaling: bool = True rotary_percent: float = 0.5 masked_softmax_fusion: bool = True persist_layer_norm: bool = True diff --git a/nemo/collections/llm/gpt/model/qwen2.py b/nemo/collections/llm/gpt/model/qwen2.py index 9268e884bf8b..643bdda3ba8d 100644 --- a/nemo/collections/llm/gpt/model/qwen2.py +++ b/nemo/collections/llm/gpt/model/qwen2.py @@ -48,7 +48,6 @@ class Qwen2Config(GPTConfig): layernorm_epsilon: float = 1e-6 rotary_base: float = 1000000.0 position_embedding_type: str = "rope" - apply_query_key_layer_scaling: bool = True @dataclass diff --git a/nemo/collections/llm/gpt/model/starcoder2.py b/nemo/collections/llm/gpt/model/starcoder2.py index 18aec2e0cd71..c49af006c6f5 100644 --- a/nemo/collections/llm/gpt/model/starcoder2.py +++ b/nemo/collections/llm/gpt/model/starcoder2.py @@ -49,7 +49,6 @@ class Starcoder2Config(GPTConfig): kv_channels: int = None num_query_groups: int = None window_size: Optional[List[int]] = None - 
apply_query_key_layer_scaling: bool = True attention_softmax_in_fp32: bool = True bias_activation_fusion: bool = True bias_dropout_fusion: bool = True From 7738b1d84f13975c504aaf89b7160666df5d24b6 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Wed, 4 Sep 2024 22:49:23 -0700 Subject: [PATCH 105/664] remove grad clipping from mixed_precision plugin (#10303) * remove grad clipping from mixed_precision plugin Signed-off-by: Alexandros Koumparoulis * Raise an expection if user tries to use Trainer's clip_grad Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Remove gradient_clip_val from recipes Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/collections/llm/recipes/llama3_70b.py | 1 - nemo/collections/llm/recipes/llama3_8b.py | 1 - nemo/collections/llm/recipes/mixtral_8x22b.py | 1 - nemo/collections/llm/recipes/mixtral_8x3b.py | 1 - nemo/collections/llm/recipes/mixtral_8x7b.py | 1 - nemo/collections/llm/recipes/optim/adam.py | 1 + .../pytorch/plugins/mixed_precision.py | 17 +++++++++++++++++ 7 files changed, 18 insertions(+), 5 deletions(-) diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index c784989ac370..a0aaf3e8cbf1 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -56,7 +56,6 @@ def trainer( accumulate_grad_batches=1, callbacks=callbacks, devices=num_gpus_per_node, - gradient_clip_val=1.0, limit_test_batches=50, limit_val_batches=32, log_every_n_steps=10, diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 792f545d0d32..4f178fad025b 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -55,7 +55,6 @@ def trainer( accumulate_grad_batches=1, callbacks=callbacks, devices=num_gpus_per_node, - gradient_clip_val=1.0, limit_test_batches=50, limit_val_batches=32, log_every_n_steps=10, diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index aefab4f61699..7f292f4cfc2d 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -57,7 +57,6 @@ def trainer( accumulate_grad_batches=1, callbacks=callbacks, devices=num_gpus_per_node, - gradient_clip_val=1.0, limit_test_batches=50, limit_val_batches=32, log_every_n_steps=10, diff --git a/nemo/collections/llm/recipes/mixtral_8x3b.py b/nemo/collections/llm/recipes/mixtral_8x3b.py index 01d3d15089c3..0a75d08b419f 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b.py @@ -57,7 +57,6 @@ def trainer( accumulate_grad_batches=1, callbacks=callbacks, devices=num_gpus_per_node, - gradient_clip_val=1.0, limit_test_batches=50, limit_val_batches=32, log_every_n_steps=10, diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index 6d24255f0a5f..059dc74f9844 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -57,7 +57,6 @@ def trainer( accumulate_grad_batches=1, callbacks=callbacks, devices=num_gpus_per_node, - gradient_clip_val=1.0, limit_test_batches=50, limit_val_batches=32, log_every_n_steps=10, diff --git a/nemo/collections/llm/recipes/optim/adam.py 
b/nemo/collections/llm/recipes/optim/adam.py index d46f7d5d36d6..d38bbc09d8e6 100644 --- a/nemo/collections/llm/recipes/optim/adam.py +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -17,6 +17,7 @@ def distributed_fused_adam_with_cosine_annealing(max_lr: float = 1e-4) -> Config use_distributed_optimizer=True, overlap_grad_reduce=True, overlap_param_gather=True, + clip_grad=1.0, ) sched = Config( diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index 37a895ea1875..e2ac5d95aefb 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -187,6 +187,23 @@ def forward_context(self) -> Generator[None, None, None]: finally: pass + def clip_gradients( + self, + optimizer: Optimizer, + clip_val: Union[int, float] = 0.0, + gradient_clip_algorithm=None, + ) -> None: + if clip_val > 0.0: + raise ValueError( + "Gradient clipping is handled in Mcore's optimizer. Use the clip_grad attribute in OptimizerConfig." + ) + + def clip_grad_by_value(self, optimizer: Optimizer, clip_val: Union[int, float]) -> None: + return + + def clip_grad_by_norm(self, optimizer: Optimizer, clip_val: Union[int, float]) -> None: + return + def update_config_with_dtype_overrides(dtype_config, config): if hasattr(config, "__io__"): From 19f904e1845163fc82f12c44736df55656f3def9 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Thu, 5 Sep 2024 00:54:57 -0700 Subject: [PATCH 106/664] Add option to selectively restore model weights and optimizer states in AutoResume and MegatronStrategy (#10295) * Add option to selectively restore model weights and optimizer states in AutoResume and MegatronStrategy Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * PR feedback Signed-off-by: Hemil Desai * Fix PEFT checkpointing Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * address comments Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * Refactor selective restore Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * fix tokenizer issue for peft load Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * fix peft optimizer states loading Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * Fix fabric strategy Signed-off-by: Hemil Desai * PR feedback Signed-off-by: Hemil Desai * Rename SelectiveRestoreConfig to RestoreConfig Signed-off-by: Hemil Desai * Update hf llama 3 paths Signed-off-by: Hemil Desai * Fixes Signed-off-by: Hemil Desai * Fix tests Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: hemildesai Co-authored-by: Chen Cui Co-authored-by: cuichenx --- nemo/collections/llm/gpt/data/fine_tuning.py | 5 +- nemo/collections/llm/recipes/llama3_70b.py | 6 +- nemo/collections/llm/recipes/llama3_8b.py | 6 +- nemo/collections/llm/recipes/mistral.py | 2 +- nemo/collections/llm/recipes/mixtral_8x22b.py | 6 +- nemo/collections/llm/recipes/mixtral_8x3b.py | 6 +- nemo/collections/llm/recipes/mixtral_8x7b.py | 6 +- nemo/lightning/__init__.py | 3 + nemo/lightning/fabric/strategies.py | 2 +- nemo/lightning/pytorch/callbacks/peft.py | 60 +++- .../pytorch/strategies/fsdp_strategy.py | 16 +- .../pytorch/strategies/megatron_strategy.py | 83 +++-- nemo/lightning/pytorch/strategies/utils.py | 18 +- 
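A minimal sketch (not part of the diff) of where gradient clipping lives after the change above: on the Megatron OptimizerConfig, as the adam.py recipe now sets with clip_grad=1.0, while the Trainer's gradient_clip_val must stay at its default because a positive value now makes MegatronMixedPrecision.clip_gradients raise a ValueError. Argument values here are examples only.

from megatron.core.optimizer import OptimizerConfig
from nemo import lightning as nl

# Clipping is configured on the optimizer config, mirroring the recipe change.
optim = nl.MegatronOptimizerModule(
    config=OptimizerConfig(
        optimizer="adam",
        lr=1e-4,
        use_distributed_optimizer=True,
        clip_grad=1.0,  # handled inside Mcore's optimizer step
    )
)
# The optimizer module is attached to the model (e.g. llm.GPTModel(cfg, optim=optim)),
# not to the Trainer; it is shown here only to locate clip_grad.

# Leave gradient_clip_val unset; a positive value is rejected by the precision plugin.
trainer = nl.Trainer(
    devices=8,
    strategy=nl.MegatronStrategy(tensor_model_parallel_size=1),
    plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
)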
nemo/lightning/resume.py | 319 ++++++++++-------- .../collections/llm/test_mnist_model_nemo2.py | 1 - .../llm/test_mnist_model_nemo2_fsdp.py | 1 - tests/lightning/test_ddp_parity_checker.py | 3 +- tests/lightning/test_nemo_logger.py | 19 +- tests/lightning/test_precision_plugin.py | 6 +- 19 files changed, 360 insertions(+), 208 deletions(-) diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 062db00af41d..7fa5bd719581 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -68,10 +68,7 @@ def __init__( self.seq_length = seq_length self.seed = seed self.dataset_root = Path(dataset_root) - - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer - - self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "GPT2BPETokenizer") + self.tokenizer = tokenizer self.memmap_workers = memmap_workers self.num_workers = num_workers self.pin_memory = pin_memory diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index a0aaf3e8cbf1..60c3d3697449 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -45,7 +45,6 @@ def trainer( context_parallel_size=context_parallelism, sequence_parallel=sequence_parallelism, gradient_as_bucket_view=True, - ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, ) @@ -95,7 +94,10 @@ def pretrain_recipe( def hf_resume() -> Config[nl.AutoResume]: - return Config(nl.AutoResume, import_path="hf://meta-llama/Meta-Llama-3.1-70B") + return Config( + nl.AutoResume, + restore_config=Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-70B"), + ) def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 4f178fad025b..3f07d6b53c94 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -44,7 +44,6 @@ def trainer( context_parallel_size=context_parallelism, sequence_parallel=sequence_parallelism, gradient_as_bucket_view=True, - ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, ) @@ -94,7 +93,10 @@ def pretrain_recipe( def hf_resume() -> Config[nl.AutoResume]: - return Config(nl.AutoResume, import_path="hf://meta-llama/Meta-Llama-3.1-8B") + return Config( + nl.AutoResume, + restore_config=Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-8B"), + ) def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: diff --git a/nemo/collections/llm/recipes/mistral.py b/nemo/collections/llm/recipes/mistral.py index 061e82c9d9d2..c504340348fe 100644 --- a/nemo/collections/llm/recipes/mistral.py +++ b/nemo/collections/llm/recipes/mistral.py @@ -32,7 +32,7 @@ def trainer(devices=8) -> nl.Trainer: @factory(name=NAME + "_hf") def hf_resume() -> nl.AutoResume: - return nl.AutoResume(import_path="hf://mistralai/Mistral-7B-v0.3") + return nl.AutoResume(restore_config=nl.RestoreConfig(path="hf://mistralai/Mistral-7B-v0.3")) @factory(name=NAME, for_task="llm.pretrain") diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 7f292f4cfc2d..209a5926a008 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -46,7 +46,6 @@ def trainer( 
sequence_parallel=sequence_parallelism, expert_model_parallel_size=expert_parallelism, gradient_as_bucket_view=True, - ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, ) @@ -97,7 +96,10 @@ def pretrain_recipe( def hf_resume() -> Config[nl.AutoResume]: - return Config(nl.AutoResume, import_path="hf://mistralai/Mixtral-8x22B-v0.1") + return Config( + nl.AutoResume, + restore_config=Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x22B-v0.1"), + ) def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: diff --git a/nemo/collections/llm/recipes/mixtral_8x3b.py b/nemo/collections/llm/recipes/mixtral_8x3b.py index 0a75d08b419f..7dc8170e13e3 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b.py @@ -46,7 +46,6 @@ def trainer( sequence_parallel=sequence_parallelism, expert_model_parallel_size=expert_parallelism, gradient_as_bucket_view=True, - ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, ) @@ -97,7 +96,10 @@ def pretrain_recipe( def hf_resume() -> Config[nl.AutoResume]: - return Config(nl.AutoResume, import_path="hf://mistralai/Mixtral-8x7B-v0.1") + return Config( + nl.AutoResume, + restore_config=Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), + ) def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index 059dc74f9844..bacbfcab4e2d 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -46,7 +46,6 @@ def trainer( sequence_parallel=sequence_parallelism, expert_model_parallel_size=expert_parallelism, gradient_as_bucket_view=True, - ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, ) @@ -97,7 +96,10 @@ def pretrain_recipe( def hf_resume() -> Config[nl.AutoResume]: - return Config(nl.AutoResume, import_path="hf://mistralai/Mixtral-8x7B-v0.1") + return Config( + nl.AutoResume, + restore_config=Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), + ) def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index 8768d13192cb..bfb5a4f6fceb 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -33,6 +33,7 @@ from nemo.lightning.pytorch.plugins import MegatronDataSampler, MegatronMixedPrecision from nemo.lightning.pytorch.plugins import data_sampler as _data_sampler from nemo.lightning.pytorch.strategies import FSDPStrategy, MegatronStrategy +from nemo.lightning.pytorch.strategies.utils import RestoreConfig from nemo.lightning.pytorch.trainer import Trainer from nemo.lightning.resume import AutoResume @@ -59,6 +60,8 @@ def _is_slurm_interactive_mode(): "MegatronDataSampler", "MegatronMixedPrecision", "MegatronOptimizerModule", + "FSDPStrategy", + "RestoreConfig", "lr_scheduler", "NeMoLogger", "ModelCheckpoint", diff --git a/nemo/lightning/fabric/strategies.py b/nemo/lightning/fabric/strategies.py index a183c434dc52..aaa816205dd9 100644 --- a/nemo/lightning/fabric/strategies.py +++ b/nemo/lightning/fabric/strategies.py @@ -326,7 +326,7 @@ def checkpoint_io(self) -> CheckpointIO: @property def parallelism(self): - from nemo.lightning.pytorch.strategies import ParallelismConfig + from nemo.lightning.pytorch.strategies.megatron_strategy import ParallelismConfig 
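Illustrative usage (not part of the diff): the recipe hf_resume helpers above now express "initialize from Hugging Face weights" through a RestoreConfig attached to AutoResume, replacing the old import_path argument. The extra fields shown are the ones the new strategy code reads; their values here are examples, and resume_if_exists is an assumed AutoResume flag not shown in this patch.

from nemo import lightning as nl

resume = nl.AutoResume(
    restore_config=nl.RestoreConfig(
        path="hf://mistralai/Mistral-7B-v0.3",
        load_model_state=True,   # restore the pretrained weights
        load_optim_state=False,  # do not look for optimizer states in the HF import
    ),
    resume_if_exists=True,  # prefer the run's own checkpoints when present
)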
return ParallelismConfig( tensor_model_parallel_size=self.tensor_model_parallel_size, diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index a8f90c5d60f9..100f1df3f9ab 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -108,10 +108,13 @@ def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) def apply_transform(self, trainer): super().apply_transform(trainer) + adapter_sharded_state_dict = {} if self.wrapped_io.adapter_ckpt_path is not None: logging.info(f"Loading adapters from {self.wrapped_io.adapter_ckpt_path}") - adapter_state = self.wrapped_io.load_checkpoint(self.wrapped_io.adapter_ckpt_path) - trainer.strategy.load_model_state_dict(adapter_state, strict=False) + # create sharded state dict for adapter weights only to enable PEFT resume + adapter_sharded_state_dict['state_dict'] = { + k: v for k, v in trainer.model.sharded_state_dict().items() if self.adapter_key_filter(k) + } if hasattr(trainer.strategy, "init_model_parallel"): logging.info("Initializing model parallel") @@ -120,6 +123,19 @@ def apply_transform(self, trainer): if trainer.state.fn == TrainerFn.FITTING: logging.info("Setting up optimizers") trainer.strategy.setup_optimizers(trainer) + if self.wrapped_io.adapter_ckpt_path is not None and trainer.strategy.should_restore_optimizer_states(): + # PEFT resume, load optimizer state + adapter_sharded_state_dict['optimizer'] = [ + trainer.strategy.optimizer_sharded_state_dict(is_loading=True) + ] + + if adapter_sharded_state_dict: + adapter_state = self.wrapped_io.load_checkpoint( + self.wrapped_io.adapter_ckpt_path, sharded_state_dict=adapter_sharded_state_dict + ) + trainer.strategy.load_model_state_dict(adapter_state, strict=False) + if trainer.state.fn == TrainerFn.FITTING: + trainer.strategy.load_optimizer_state_dict(adapter_state, selective_restore=True) def on_save_checkpoint( self, trainer: pl.Trainer, pl_module: pl.LightningModule, checkpoint: Dict[str, Any] @@ -274,21 +290,53 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio def load_checkpoint( self, path: _PATH, sharded_state_dict=None, map_location: Optional[Callable] = None ) -> Dict[str, Any]: + """ + ===================== + Initial PEFT Training + ===================== + Initial PEFT training requires loading the base model weights. In this case, this function is called by + trainer.strategy.setup() -> megatron_strategy.restore_model() -> megatron_strategy.load_checkpoint(). + `path = PosixPath()`, and sharded_state_dict contains only base model weights + + =========== + PEFT Resume + =========== + PEFT resume requires loading two set of model weights, 1) base model weights and 2) adapter weights + Base model weights could be imported from e.g. HF, and is frozen during PEFT training. + Adapter weights contains the training metadata that will need to be loaded. + As such, this function will be entered twice during PEFT training resume. + + For the FIRST TIME this function is called by trainer._checkpoint_connector._restore_modules_and_callbacks. + `path = AdapterPath(, base_model_path=)`, and sharded_state_dict contains only base model weights + + For the SECOND TIME this function is called by PEFT.apply_transform (above, in the same file). + `path = PosixPath()`, and sharded_state_dict contains only adapter weights. 
+ """ + assert self.checkpoint_io is not None adapter_meta_path = ckpt_to_dir(path) / _ADAPTER_META_FILENAME - if getattr(path, "adapter_path", None): - self.model_ckpt_path = path - self.adapter_ckpt_path = path.adapter_path + adapter_ckpt = None + if getattr(path, "base_model_path", None): + ## PEFT Resume, FIRST TIME + self.adapter_ckpt_path = Path(str(path)) + adapter_ckpt = self.checkpoint_io.load_checkpoint(path) # Loads only metadata + # path is adapter path to restore the training metadata, but switch to loading base model here. + path = self.model_ckpt_path = path.base_model_path elif adapter_meta_path.exists(): + ## PEFT Resume, SECOND TIME with open(adapter_meta_path, "r") as f: metadata = json.load(f) self.model_ckpt_path = Path(metadata['model_ckpt_path']) self.adapter_ckpt_path = path else: + ## Initial PEFT Training self.model_ckpt_path = path # Note: this will include the Trainer-state of the model-checkpoint model_ckpt = self.checkpoint_io.load_checkpoint(path, sharded_state_dict, map_location) - + if adapter_ckpt is not None: + ## PEFT Resume, FIRST TIME + adapter_ckpt['state_dict'].update(model_ckpt['state_dict']) + return adapter_ckpt return model_ckpt diff --git a/nemo/lightning/pytorch/strategies/fsdp_strategy.py b/nemo/lightning/pytorch/strategies/fsdp_strategy.py index dba58e121048..24087f80aae4 100644 --- a/nemo/lightning/pytorch/strategies/fsdp_strategy.py +++ b/nemo/lightning/pytorch/strategies/fsdp_strategy.py @@ -70,14 +70,16 @@ def __init__( self, auto_wrap_policy={TransformerLayer}, state_dict_type="sharded", - ckpt_include_optimizer=False, + ckpt_load_optimizer: bool = True, + ckpt_save_optimizer: bool = True, data_sampler=None, **kwargs, ): super().__init__(auto_wrap_policy=auto_wrap_policy, state_dict_type=state_dict_type, **kwargs) self.data_sampler = data_sampler - self.ckpt_include_optimizer = ckpt_include_optimizer + self.ckpt_load_optimizer = ckpt_load_optimizer + self.ckpt_save_optimizer = ckpt_save_optimizer @override def setup_environment(self) -> None: @@ -210,7 +212,7 @@ def save_checkpoint( for optim_state in checkpoint['optimizer_states']: optim_state.pop("state") - if self.trainer.state.fn == TrainerFn.FITTING and self.ckpt_include_optimizer: + if self.trainer.state.fn == TrainerFn.FITTING and self.ckpt_save_optimizer: checkpoint['optimizer'] = get_optimizer_state_dict(self.model, self.optimizers) pyt_to_mcore_state_dict(checkpoint['optimizer']['state'], prefix="optimizer.state.") @@ -241,7 +243,7 @@ def load_checkpoint(self, checkpoint_path: str | Path) -> Dict[str, Any]: pyt_to_mcore_state_dict(msd) sharded_state_dict["sharded_state_dict"] = msd - if self.ckpt_include_optimizer and self.trainer.state.fn == TrainerFn.FITTING: + if self.ckpt_load_optimizer and self.trainer.state.fn == TrainerFn.FITTING: osd = get_optimizer_state_dict(self.model, self.optimizers, options=StateDictOptions(cpu_offload=True)) pyt_to_mcore_state_dict(osd['state'], prefix="optimizer.state.") sharded_state_dict["optimizer"] = osd @@ -249,14 +251,14 @@ def load_checkpoint(self, checkpoint_path: str | Path) -> Dict[str, Any]: checkpoint = self.checkpoint_io.load_checkpoint(path, sharded_state_dict=sharded_state_dict) mcore_to_pyt_sharded_state_dict(checkpoint['sharded_state_dict'], msd) - if self.ckpt_include_optimizer and self.trainer.state.fn == TrainerFn.FITTING: + if self.ckpt_load_optimizer and self.trainer.state.fn == TrainerFn.FITTING: mcore_to_pyt_sharded_state_dict(checkpoint['optimizer']['state'], osd['state']) set_state_dict( self.model, - self.optimizers if 
self.ckpt_include_optimizer else [], + self.optimizers if self.ckpt_load_optimizer else [], model_state_dict=checkpoint['sharded_state_dict'], - optim_state_dict=checkpoint['optimizer'] if self.ckpt_include_optimizer else None, + optim_state_dict=checkpoint['optimizer'] if self.ckpt_load_optimizer else None, ) return checkpoint diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index 25d11e6f5cfb..fae6df5be207 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -14,7 +14,6 @@ import functools import inspect -import logging import os import shutil from collections import OrderedDict @@ -61,6 +60,7 @@ from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction from nemo.lightning.pytorch.callbacks import ModelTransform from nemo.lightning.pytorch.strategies.utils import ( + RestoreConfig, ckpt_to_dir, create_checkpoint_io, fix_progress_bar, @@ -68,7 +68,8 @@ setup_data_sampler, setup_parallel_ranks, ) -from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO, AsyncFinalizerCallback +from nemo.utils import logging +from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizerCallback if TYPE_CHECKING: from nemo.lightning.pytorch.plugins.data_sampler import DataSampler @@ -118,7 +119,8 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): checkpoint_io: Checkpoint I/O handler. Defaults to None. find_unused_parameters (bool): Find unused parameters in DDP. Defaults to False. ckpt_type (TrainerCkptProtocol): Checkpoint type. Defaults to TrainerCheckpoint. - ckpt_include_optimizer (bool): Include optimizer state in checkpoint. Defaults to True. + ckpt_load_optimizer (bool): Load optimizer state from trainer.ckpt_path. Defaults to True. + ckpt_save_optimizer (bool): Save optimizer states in checkpoint. Defaults to True. ddp (Union[DDPLiteral, DistributedDataParallelConfig]): DDP configuration. Defaults to "megatron". lazy_init (bool): Use lazy initialization for model parallel parameters. Defaults to False. pipeline_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. Defaults to None. @@ -168,16 +170,17 @@ def __init__( sequence_parallel: bool = False, expert_model_parallel_size: int = 1, moe_extended_tp: bool = False, - data_sampler: Optional['DataSampler'] = None, + data_sampler: Optional["DataSampler"] = None, parallel_devices: Optional[List[torch.device]] = None, cluster_environment=None, # TODO: Add type-hint checkpoint_io=None, # TODO: Add type-hint find_unused_parameters: bool = False, - ckpt_include_optimizer: bool = True, + ckpt_load_optimizer: bool = True, + ckpt_save_optimizer: bool = True, ddp: Union[DDPLiteral, DistributedDataParallelConfig] = "megatron", lazy_init: bool = False, pipeline_dtype: Optional[torch.dtype] = None, - save_ckpt_format: str = 'torch_dist', + save_ckpt_format: str = "torch_dist", ckpt_async_save: bool = False, ckpt_torch_dist_multiproc: int = None, ## TODO(ashors): put elsewhere? 
ckpt_assume_constant_structure: bool = False, @@ -190,6 +193,7 @@ def __init__( init_model_parallel: bool = True, replace_progress_bar: bool = True, progress_interval: int = 1, + restore_config: Optional[RestoreConfig] = None, **kwargs, ) -> None: super().__init__( @@ -201,7 +205,7 @@ def __init__( ) self.megatron_callbacks = CallbackConnector() - self.data_sampler: Optional['DataSampler'] = data_sampler + self.data_sampler: Optional["DataSampler"] = data_sampler self.tensor_model_parallel_size = tensor_model_parallel_size self.pipeline_model_parallel_size = pipeline_model_parallel_size self.context_parallel_size = context_parallel_size @@ -210,7 +214,8 @@ def __init__( self.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size self.sequence_parallel = sequence_parallel self.lazy_init = lazy_init - self.ckpt_include_optimizer = ckpt_include_optimizer + self.ckpt_load_optimizer = ckpt_load_optimizer + self.ckpt_save_optimizer = ckpt_save_optimizer self.pipeline_dtype = pipeline_dtype self._setup_optimizers = setup_optimizers self._init_model_parallel = init_model_parallel @@ -230,6 +235,8 @@ def __init__( self.replace_progress_bar = replace_progress_bar self.progress_interval = progress_interval + self.restore_config = restore_config + self._ddp = ddp if ddp == "megatron": self.ddp_config = DistributedDataParallelConfig(check_for_nan_in_grad=True) @@ -252,7 +259,7 @@ def connect(self, model: pl.LightningModule) -> None: if _maybe_mcore_config: self._mcore_config = _maybe_mcore_config - dtype_config = getattr(self._precision_plugin, 'dtype_config', None) + dtype_config = getattr(self._precision_plugin, "dtype_config", None) if dtype_config: from nemo.lightning.pytorch.plugins.mixed_precision import update_config_with_dtype_overrides @@ -272,8 +279,6 @@ def connect(self, model: pl.LightningModule) -> None: self.ddp_config = update_config_with_dtype_overrides(dtype_config, self.ddp_config) if mcore_opt_config.use_distributed_optimizer != ddp_config.use_distributed_optimizer: - from nemo.utils import logging - logging.info("Fixing mis-match between ddp-config & mcore-optimizer config") ddp_config.use_distributed_optimizer = mcore_opt_config.use_distributed_optimizer @@ -346,6 +351,10 @@ def setup(self, trainer: pl.Trainer) -> None: if not have_async_callback: self.trainer.callbacks.append(AsyncFinalizerCallback()) + ## Restore model weights and optimizer states if needed + if self.restore_config and not self.trainer.ckpt_path: + self.selective_restore() + @override def setup_distributed(self) -> None: setup_parallel_ranks(self) @@ -465,14 +474,14 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP out = self.model(dataloader_iter, forward_only=False, *args, **kwargs) self.lightning_module.log( - 'global_step', + "global_step", self.trainer.global_step, prog_bar=True, batch_size=1, ) self.lightning_module.log( - 'step', + "step", self.trainer.global_step, ) @@ -496,7 +505,7 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP # p2p now, broadcast later at ckpt. 
only with pp, some ranks will log 0.0 # WHICH IS OK because we broadcast later at checkpoint time _strategy_lib._sync_from_last_pipeline_stage(out, broadcast=False) - self.lightning_module.log('reduced_train_loss', out, prog_bar=True, batch_size=1, sync_dist=False) + self.lightning_module.log("reduced_train_loss", out, prog_bar=True, batch_size=1, sync_dist=False) return out @@ -534,7 +543,7 @@ def validation_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OU # ranks that are not final pp stage have 0 for loss, and out will be mean-reduced over pp # groups (due to sync_dist), which divides val_loss by pp_size. so we multiply by pp_size to cancel out self.lightning_module.log( - 'val_loss', + "val_loss", out * pp_size, prog_bar=True, sync_dist=True, @@ -542,7 +551,7 @@ def validation_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OU on_epoch=True, ) else: - self.lightning_module.log('val_loss', out, prog_bar=True, on_epoch=True) + self.lightning_module.log("val_loss", out, prog_bar=True, on_epoch=True) return out @@ -602,7 +611,7 @@ def optimizer_sharded_state_dict(self, is_loading=False): # TODO: Fix when MainParamsOptimizerWrapper is not used optimizer = self.lightning_module.optimizers(use_pl_optimizer=False) - sharding_type = 'fully_sharded_model_space' if self.parallel_save_optim else 'dp_zero_gather_scatter' + sharding_type = "fully_sharded_model_space" if self.parallel_save_optim else "dp_zero_gather_scatter" return _strategy_lib.optimizer_sharded_state_dict( self.megatron_parallel, optimizer, is_loading=is_loading, sharding_type=sharding_type @@ -616,13 +625,19 @@ def save_checkpoint( # retrieve `sharded_state_dict` if it has not already been configured in `on_save_checkpoint` if "sharded_state_dict" not in checkpoint: checkpoint["sharded_state_dict"] = self.megatron_parallel.sharded_state_dict() - if self.trainer.state.fn == TrainerFn.FITTING and self.ckpt_include_optimizer: + if self.trainer.state.fn == TrainerFn.FITTING and self.ckpt_save_optimizer: checkpoint["optimizer"] = [self.optimizer_sharded_state_dict()] self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) + def should_restore_optimizer_states(self, selective_restore: bool = False) -> bool: + if selective_restore: + return self.restore_config.load_optim_state if self.restore_config else False + + return self.ckpt_load_optimizer + @override - def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: + def load_checkpoint(self, checkpoint_path: Union[str, Path], selective_restore: bool = False) -> Dict[str, Any]: """PTL method which we override to integrate distributed checkpoints for model parallel models. In order to load distributed checkpoints we need to provide the sharded_state_dict to the distributed load function. 
We get the sharded_state_dict from self.lightning_module @@ -634,7 +649,10 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: sharded_state_dict = {} sharded_state_dict["state_dict"] = self.megatron_parallel.sharded_state_dict() - if self.ckpt_include_optimizer and self.trainer.state.fn == TrainerFn.FITTING: + if ( + self.should_restore_optimizer_states(selective_restore=selective_restore) + and self.trainer.state.fn == TrainerFn.FITTING + ): if self.lightning_module.optimizers(use_pl_optimizer=False): sharded_state_dict["optimizer"] = [self.optimizer_sharded_state_dict(is_loading=True)] @@ -642,9 +660,30 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: return checkpoint + def selective_restore(self) -> None: + if not self.restore_config: + return + + logging.info(f"Doing selective restore from {self.restore_config}") + + checkpoint = self.load_checkpoint(checkpoint_path=self.restore_config.path, selective_restore=True) + + if self.restore_config.load_model_state: + logging.info(f"Restoring model weights from {self.restore_config}") + self.load_model_state_dict(checkpoint=checkpoint) + + if self.restore_config.load_optim_state: + logging.info(f"Restoring optimizer states from {self.restore_config}") + self.load_optimizer_state_dict(checkpoint=checkpoint, selective_restore=True) + + logging.info(f"Finished restoring from {self.restore_config}, cleaning up.") + torch.cuda.empty_cache() + # wait for all to catch up + self.trainer.strategy.barrier("MegatronStrategy.restore_end") + @override - def load_optimizer_state_dict(self, checkpoint: Mapping[str, Any]) -> None: - if not self.ckpt_include_optimizer: + def load_optimizer_state_dict(self, checkpoint: Mapping[str, Any], selective_restore: bool = False) -> None: + if not self.should_restore_optimizer_states(selective_restore=selective_restore): return optimizer_states = checkpoint["optimizer"] diff --git a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py index e2c1b1e22825..64345a378257 100644 --- a/nemo/lightning/pytorch/strategies/utils.py +++ b/nemo/lightning/pytorch/strategies/utils.py @@ -13,8 +13,9 @@ # limitations under the License. 
import io +from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Iterable, List, Tuple, Union, cast +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast import pytorch_lightning as pl import torch @@ -35,6 +36,14 @@ from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO +@dataclass(kw_only=True) +class RestoreConfig: + path: str + adapter_path: Optional[str] = None + load_model_state: bool = True + load_optim_state: bool = False + + def setup_parallel_ranks(strategy: pl.strategies.Strategy): from megatron.core.model_parallel_config import ModelParallelConfig @@ -172,7 +181,6 @@ def _convert(checkpoint, sharded_state_dict, k, device_mesh=None): def pyt_to_mcore_state_dict( state_dict: Dict[str, Any], prefix: str = "", device_mesh: DeviceMesh = None ) -> Dict[str, List[ShardedBase]]: - def _dtensor_to_mcore_sharded_tensor( key: str, dten: DTensor, @@ -302,16 +310,16 @@ def _convert(state_dict, k, sh_key, v, prepend_offsets, prefix="", allow_shape_m num_layers = 0 for k in state_dict: if k.startswith("module.decoder.layers."): - num_layers = max(num_layers, int(k.split('.')[3]) + 1) + num_layers = max(num_layers, int(k.split(".")[3]) + 1) for k, v in state_dict.items(): prepend_offsets = [] sh_key = k allow_shape_mismatch = k.endswith(".word_embeddings.weight") # vocab size can be different if k.startswith("module.decoder.layers."): - sh_key = k.split('.') + sh_key = k.split(".") global_layer_offset = int(sh_key.pop(3)) - sh_key = '.'.join(sh_key) + sh_key = ".".join(sh_key) prepend_offsets.append((0, global_layer_offset, num_layers)) _convert(state_dict, k, sh_key, v, prepend_offsets, prefix, allow_shape_mismatch, device_mesh) diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py index 6c1da21288bb..bce1964b6699 100644 --- a/nemo/lightning/resume.py +++ b/nemo/lightning/resume.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import os +from dataclasses import dataclass from pathlib import Path, PosixPath, WindowsPath from typing import Optional, Union @@ -20,178 +22,221 @@ import pytorch_lightning as pl from nemo.lightning import io -from nemo.lightning.io.mixin import IOMixin +from nemo.lightning.pytorch.strategies.utils import RestoreConfig from nemo.utils import logging from nemo.utils.app_state import AppState from nemo.utils.model_utils import uninject_model_parallel_rank # Dynamically inherit from the correct Path subclass based on the operating system. -if os.name == 'nt': +if os.name == "nt": BasePath = WindowsPath else: BasePath = PosixPath -class Resume(IOMixin): - def nemo_path(self, model=None) -> Optional[Path]: - """Returns the checkpoint to resume from.""" +@dataclass(kw_only=True) +class AutoResume: + """Class that handles the logic for setting checkpoint paths and restoring from + checkpoints in NeMo. + + Attributes: + restore_config (Optional[RestoreConfig]): Optional config for selectively restoring specific parts like model weights, optimizer states, etc. + If the config contains a path from HF or another non-NeMo checkpoint format, the checkpoint will be automatically converted to a NeMo compatible format. + resume_from_folder or the run's log_dir takes precedence over restore_config. + resume_from_directory (str): Path to the checkpointing directory to restore from. Defaults to /checkpoints + adapter_path (str): Path to any adapter checkpoints. 
+ resume_if_exists (bool): Whether this experiment is resuming from a previous run. If + True, it sets trainer._checkpoint_connector._ckpt_path so that the trainer should + auto-resume. exp_manager will move files under log_dir to log_dir/run_{int}. + Defaults to False. + resume_past_end (bool): By default, AutoResume throws an error if resume_if_exists is + True and a checkpoint matching ``*end.ckpt`` indicating a previous training run + fully completed. Setting resume_past_end=True disables this behavior and loads the + last checkpoint. + resume_ignore_no_checkpoint (bool): AutoResume throws an error if resume_if_exists is + True and no checkpoint could be found. Setting resume_ignore_no_checkpoint=True + disables this behavior, in which case exp_manager will print a message and + continue without restoring. + """ + + restore_config: Optional[RestoreConfig] = None + resume_from_directory: Optional[str] = None + adapter_path: Optional[str] = None + resume_if_exists: bool = False + resume_past_end: bool = False + resume_ignore_no_checkpoint: bool = False def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None): if isinstance(trainer, fl.Fabric): raise NotImplementedError("Fabric is not supported yet.") - ckpt_path = self.nemo_path(model) - if ckpt_path: - trainer.ckpt_path = ckpt_path - trainer.checkpoint_callback.last_model_path = ckpt_path - - -class AutoResume(Resume, io.IOMixin): - """Class that handles the logic for setting checkpoint paths and restoring from - checkpoints in NeMo. - """ - - def __init__( - self, - path: Optional[str] = None, ## old resume_from_checkpoint - dirpath: Optional[str] = None, ## optional path to checkpoint directory - import_path: Optional[str] = None, ## for importing from hf or other checkpoint formats - adapter_path: Optional[str] = None, - resume_if_exists: bool = False, - resume_past_end: bool = False, - resume_ignore_no_checkpoint: bool = False, - ): - """ - Args: - path (str): Can be used to specify a path to a specific checkpoint file to load from. - This will override any checkpoint found when resume_if_exists is True. - Defaults to None - dirpath (str): Path to the checkpointing directory to restore from. Defaults to /checkpoints - import_path (str): Path to specify if importing a checkpoint from HF or - another non-NeMo checkpoint format. If import_path is provided, other arguments - are unused. - adapter_path (str): Path to any adapter checkpoints. - resume_if_exists (bool): Whether this experiment is resuming from a previous run. If - True, it sets trainer._checkpoint_connector._ckpt_path so that the trainer should - auto-resume. exp_manager will move files under log_dir to log_dir/run_{int}. - Defaults to False. - resume_past_end (bool): By default, AutoResume throws an error if resume_if_exists is - True and a checkpoint matching ``*end.ckpt`` indicating a previous training run - fully completed. Setting resume_past_end=True disables this behavior and loads the - last checkpoint. - resume_ignore_no_checkpoint (bool): AutoResume throws an error if resume_if_exists is - True and no checkpoint could be found. Setting resume_ignore_no_checkpoint=True - disables this behavior, in which case exp_manager will print a message and - continue without restoring. 
- """ - if path and import_path: - raise ValueError("Only one of path or import_path can be set") - - self.path = path - self.dirpath = dirpath - self.import_path = import_path - self.adapter_path = adapter_path - self.resume_if_exists = resume_if_exists - self.resume_past_end = resume_past_end - self.resume_ignore_no_checkpoint = resume_ignore_no_checkpoint - - def nemo_path(self, model=None) -> Optional[Path]: + trainer_ckpt_path = self.get_trainer_ckpt_path(model) + if trainer_ckpt_path: + trainer.ckpt_path = trainer_ckpt_path + trainer.checkpoint_callback.last_model_path = trainer_ckpt_path + elif self.restore_config: + new_path = self._try_import_model( + model=model, + path=self.restore_config.path, + adapter_path=self.restore_config.adapter_path, + ) + if isinstance(new_path, AdapterPath): + self.restore_config.path = new_path.base_model_path + self.restore_config.adapter_path = str(new_path) + else: + self.restore_config.path = str(new_path) + trainer.strategy.restore_config = self.restore_config + + def _try_import_model( + self, model: Optional[io.ConnectorMixin], path: str, adapter_path: Optional[str] = None + ) -> BasePath: + if model is None: + raise ValueError("Model is needed to import checkpoint from HF or other non-NeMo checkpoint format.") + try: + new_path = model.import_ckpt(path) + except (ValueError, AttributeError): + # This is reached when the model connector does not exist for the particular path. + new_path = path + + if adapter_path: + new_path = AdapterPath(Path(adapter_path), base_model_path=new_path) + + if isinstance(new_path, str): + new_path = Path(new_path) + + return new_path + + def _resume_peft(self, adapter_meta_path, model): + with open(adapter_meta_path, "r") as f: + metadata = json.load(f) + + assert self.restore_config, "PEFT resume requires specifying restore_config" + assert ( + "://" in self.restore_config.path + ), "For now PEFT resume requires specifying the import path instead of local path" + base_model_path = self._try_import_model(model, self.restore_config.path) + if base_model_path != Path(metadata['model_ckpt_path']): + raise ValueError( + f"When trying to resume a PEFT training run, found mismatching values: " + f"your specified restore_path points to {base_model_path}, " + f"but the PEFT checkpoint was trained with " + f"model_ckpt_path={metadata['model_ckpt_path']}" + ) + return base_model_path + + def _find_trainer_ckpt_path(self) -> Optional[Path]: from nemo.utils.exp_manager import NotFoundError, _filter_out_unfinished_checkpoints - if self.import_path: - if model is None: - raise ValueError("Model is needed to import checkpoint from HF or other non-NeMo checkpoint format.") - output = model.import_ckpt(self.import_path) - if self.adapter_path: - return AdapterPath(output, adapter_path=Path(self.adapter_path)) - return output - - ### refactored from exp_manager - checkpoint = None app_state = AppState() log_dir = app_state.log_dir - app_state.restore = self.resume_if_exists - if self.path: - checkpoint = self.path - if self.resume_if_exists: - # Use /checkpoints/ unless `dirpath` is set - checkpoint_dir = Path(self.dirpath) if self.dirpath else Path(Path(log_dir) / "checkpoints") - - # when using distributed checkpointing, checkpoint_dir is a directory of directories - # we check for this here - dist_checkpoints = [d for d in list(checkpoint_dir.glob("*")) if d.is_dir()] - end_dist_checkpoints = [d for d in dist_checkpoints if d.match("*end")] - last_dist_checkpoints = [d for d in dist_checkpoints if d.match("*last")] - - 
end_chkpt_cnt = len(end_dist_checkpoints) - end_checkpoints = _filter_out_unfinished_checkpoints(end_dist_checkpoints) - finished_end_chkpt_cnt = len(end_checkpoints) - if end_chkpt_cnt > 0 and finished_end_chkpt_cnt == 0: - raise ValueError( - "End checkpoint is unfinished and cannot be used to resume the training." - " Please remove the checkpoint manually to avoid unexpected cosequences, such as" - " restarting from scratch." - ) - last_chkpt_cnt = len(last_dist_checkpoints) - last_checkpoints = _filter_out_unfinished_checkpoints(last_dist_checkpoints) - finished_last_chkpt_cnt = len(last_checkpoints) - if last_chkpt_cnt > 0 and finished_last_chkpt_cnt == 0: - raise ValueError( - "Last checkpoint is unfinished and cannot be used to resume the training." - " Please remove the checkpoint manually to avoid unexpected cosequences, such as" - " restarting from scratch. Hint: Iteration number can be added to the checkpoint name pattern" - " to maximize chance that there is at least one finished last checkpoint to resume from." - ) + checkpoint = None - if not checkpoint_dir.exists() or (not len(end_checkpoints) > 0 and not len(last_checkpoints) > 0): - if self.resume_ignore_no_checkpoint: - warn = f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :{checkpoint_dir}. " - if checkpoint is None: - warn += "Training from scratch." - elif checkpoint == self.path: - warn += f"Training from {self.path}." - logging.warning(warn) + # Use /checkpoints/ unless `dirpath` is set + checkpoint_dir = ( + Path(self.resume_from_directory) if self.resume_from_directory else Path(Path(log_dir) / "checkpoints") + ) + + # when using distributed checkpointing, checkpoint_dir is a directory of directories + # we check for this here + dist_checkpoints = [d for d in list(checkpoint_dir.glob("*")) if d.is_dir()] + end_dist_checkpoints = [d for d in dist_checkpoints if d.match("*end")] + last_dist_checkpoints = [d for d in dist_checkpoints if d.match("*last")] + + end_chkpt_cnt = len(end_dist_checkpoints) + end_checkpoints = _filter_out_unfinished_checkpoints(end_dist_checkpoints) + finished_end_chkpt_cnt = len(end_checkpoints) + if end_chkpt_cnt > 0 and finished_end_chkpt_cnt == 0: + raise ValueError( + "End checkpoint is unfinished and cannot be used to resume the training." + " Please remove the checkpoint manually to avoid unexpected cosequences, such as" + " restarting from scratch." + ) + + last_chkpt_cnt = len(last_dist_checkpoints) + last_checkpoints = _filter_out_unfinished_checkpoints(last_dist_checkpoints) + finished_last_chkpt_cnt = len(last_checkpoints) + if last_chkpt_cnt > 0 and finished_last_chkpt_cnt == 0: + raise ValueError( + "Last checkpoint is unfinished and cannot be used to resume the training." + " Please remove the checkpoint manually to avoid unexpected cosequences, such as" + " restarting from scratch. Hint: Iteration number can be added to the checkpoint name pattern" + " to maximize chance that there is at least one finished last checkpoint to resume from." + ) + + if not checkpoint_dir.exists() or (not len(end_checkpoints) > 0 and not len(last_checkpoints) > 0): + if self.resume_ignore_no_checkpoint: + warn = f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :{checkpoint_dir}. " + if checkpoint is None: + warn += "Training from scratch." + logging.warning(warn) + else: + if self.restore_config: + # resume_if_exists is True but run is not resumable. 
Do not fail and try to do selective restore later instead. + return None else: raise NotFoundError( f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Cannot resume." ) - elif len(end_checkpoints) > 0: - if self.resume_past_end: - if len(end_checkpoints) > 1: - if 'mp_rank' in str(end_checkpoints[0]): - checkpoint = end_checkpoints[0] - else: - raise ValueError(f"Multiple checkpoints {end_checkpoints} that matches *end.ckpt.") - else: - raise ValueError( - f"Found {end_checkpoints[0]} indicating that the last training run has already completed." - ) - elif len(last_checkpoints) > 1: - if any([s for s in ['mp_rank', 'tp_rank', 'fsdp_shard'] if s in str(last_checkpoints[0])]): - checkpoint = last_checkpoints[0] - checkpoint = uninject_model_parallel_rank(checkpoint) + elif len(end_checkpoints) > 0: + if not self.resume_past_end: + raise ValueError( + f"Found {end_checkpoints[0]} indicating that the last training run has already completed." + ) + + if len(end_checkpoints) > 1: + if "mp_rank" in str(end_checkpoints[0]): + checkpoint = end_checkpoints[0] else: - # Select the checkpoint with the latest modified time - checkpoint = sorted(last_checkpoints, key=lambda pth: pth.lstat().st_mtime, reverse=True)[0] - logging.warning( - f"Multiple checkpoints {last_checkpoints} matches *last.ckpt. Selecting one with the latest modified time." - ) - else: + raise ValueError(f"Multiple checkpoints {end_checkpoints} that matches *end.ckpt.") + elif len(last_checkpoints) > 1: + if any([s for s in ["mp_rank", "tp_rank", "fsdp_shard"] if s in str(last_checkpoints[0])]): checkpoint = last_checkpoints[0] + checkpoint = uninject_model_parallel_rank(checkpoint) + else: + # Select the checkpoint with the latest modified time + checkpoint = sorted(last_checkpoints, key=lambda pth: pth.lstat().st_mtime, reverse=True)[0] + logging.warning( + f"Multiple checkpoints {last_checkpoints} matches *last.ckpt. Selecting one with the latest modified time." 
+ ) + else: + checkpoint = last_checkpoints[0] + + return checkpoint + + def get_trainer_ckpt_path(self, model: Optional[io.ConnectorMixin] = None) -> Optional[Path]: + checkpoint = None + app_state = AppState() + app_state.restore = self.resume_if_exists + if self.resume_if_exists: + checkpoint = self._find_trainer_ckpt_path() if checkpoint: if self.adapter_path: - return AdapterPath(checkpoint, adapter_path=Path(self.adapter_path)) - return Path(checkpoint) + return AdapterPath(Path(self.adapter_path), base_model_path=checkpoint) + else: + from nemo.lightning.pytorch.callbacks.peft import _ADAPTER_META_FILENAME + + adapter_meta_path = checkpoint / _ADAPTER_META_FILENAME + if adapter_meta_path.exists(): + base_model_path = self._resume_peft(adapter_meta_path, model) + return AdapterPath(checkpoint, base_model_path=base_model_path) + else: + return Path(checkpoint) return None class AdapterPath(BasePath): - adapter_path: Optional[Path] + """Path object for adapter paths which include a field for the base model the adapters are trained on + to facilitate model loading.""" - def __new__(cls, *args, adapter_path: Optional[Path] = None, **kwargs): + base_model_path: Optional[Path] + + def __new__(cls, *args, base_model_path: Optional[Path] = None, **kwargs): output = super().__new__(cls, *args, **kwargs) - output.adapter_path = adapter_path + output.base_model_path = base_model_path return output + + def __repr__(self): + return "{}({!r}, base_model_path={})".format(self.__class__.__name__, self.as_posix(), self.base_model_path) diff --git a/tests/collections/llm/test_mnist_model_nemo2.py b/tests/collections/llm/test_mnist_model_nemo2.py index c78306201751..c9507ab66bb3 100644 --- a/tests/collections/llm/test_mnist_model_nemo2.py +++ b/tests/collections/llm/test_mnist_model_nemo2.py @@ -543,7 +543,6 @@ def run_train_mnist_litautoencoder_with_megatron_strategy_single_gpu(): trainer=trainer, log=nemo_logger, resume=resume.AutoResume( - path=None, # Overrides the path found by resume_if_exists when set. resume_if_exists=True, # Looks for the -last checkpoint to continue training. resume_ignore_no_checkpoint=True, # When false this will throw an error with no existing checkpoint. ), diff --git a/tests/collections/llm/test_mnist_model_nemo2_fsdp.py b/tests/collections/llm/test_mnist_model_nemo2_fsdp.py index 32fde23bceb9..025f589e2f39 100644 --- a/tests/collections/llm/test_mnist_model_nemo2_fsdp.py +++ b/tests/collections/llm/test_mnist_model_nemo2_fsdp.py @@ -560,7 +560,6 @@ def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu(): trainer=trainer, log=nemo_logger, resume=resume.AutoResume( - path=None, # Overrides the path found by resume_if_exists when set. resume_if_exists=True, # Looks for the -last checkpoint to continue training. resume_ignore_no_checkpoint=True, # When false this will throw an error with no existing checkpoint. 
), diff --git a/tests/lightning/test_ddp_parity_checker.py b/tests/lightning/test_ddp_parity_checker.py index 892df513539a..a93cd3243890 100644 --- a/tests/lightning/test_ddp_parity_checker.py +++ b/tests/lightning/test_ddp_parity_checker.py @@ -86,7 +86,8 @@ def make_trainer_optim(args): max_steps=4, accelerator="gpu", strategy=nl.MegatronStrategy( - ckpt_include_optimizer=False, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, ), plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), limit_val_batches=1, diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 08902d03d786..54636f56472a 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -96,21 +96,20 @@ def test_resume(self, trainer, tmp_path): # Error because explicit_log_dir does not exist with pytest.raises(NotFoundError): nl.AutoResume( - dirpath=str(tmp_path / "test_resume"), + resume_from_directory=str(tmp_path / "test_resume"), resume_if_exists=True, ).setup(trainer) # Error because checkpoints folder does not exist with pytest.raises(NotFoundError): nl.AutoResume( - dirpath=str(tmp_path / "test_resume" / "does_not_exist"), - path="does_not_exist", + resume_from_directory=str(tmp_path / "test_resume" / "does_not_exist"), resume_if_exists=True, ).setup(trainer) # No error because we tell autoresume to ignore notfounderror nl.AutoResume( - dirpath=str(tmp_path / "test_resume" / "does_not_exist"), + resume_from_directory=str(tmp_path / "test_resume" / "does_not_exist"), resume_if_exists=True, resume_ignore_no_checkpoint=True, ).setup(trainer) @@ -119,7 +118,7 @@ def test_resume(self, trainer, tmp_path): # Error because checkpoints do not exist in folder with pytest.raises(NotFoundError): nl.AutoResume( - dirpath=path, + resume_from_directory=path, resume_if_exists=True, ).setup(trainer) @@ -127,7 +126,7 @@ def test_resume(self, trainer, tmp_path): # Error because *end.ckpt is in folder indicating that training has already finished with pytest.raises(ValueError): nl.AutoResume( - dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), + resume_from_directory=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), resume_if_exists=True, ).setup(trainer) Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end").rmdir() @@ -137,7 +136,7 @@ def test_resume(self, trainer, tmp_path): # Error because *end.ckpt is unfinished, should raise an error despite resume_ignore_no_checkpoint=True with pytest.raises(ValueError): nl.AutoResume( - dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), + resume_from_directory=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), resume_if_exists=True, resume_past_end=True, resume_ignore_no_checkpoint=True, @@ -150,7 +149,7 @@ def test_resume(self, trainer, tmp_path): # Error because *last.ckpt is unfinished, should raise an error despite resume_ignore_no_checkpoint=True with pytest.raises(ValueError): nl.AutoResume( - dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), + resume_from_directory=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), resume_if_exists=True, resume_ignore_no_checkpoint=True, ).setup(trainer) @@ -167,7 +166,7 @@ def test_resume(self, trainer, tmp_path): Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel3--last-unfinished").touch() nl.AutoResume( - dirpath=Path(tmp_path / "test_resume" / 
"default" / "version_0" / "checkpoints"), + resume_from_directory=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), resume_if_exists=True, ).setup(trainer) assert str(trainer.ckpt_path) == str( @@ -203,6 +202,6 @@ def test_resume(self, trainer, tmp_path): logger.setup(trainer) nl.AutoResume( resume_if_exists=True, - dirpath=str(dirpath_checkpoint_dir), + resume_from_directory=str(dirpath_checkpoint_dir), ).setup(trainer) assert Path(trainer.ckpt_path).resolve() == dirpath_checkpoint.resolve() diff --git a/tests/lightning/test_precision_plugin.py b/tests/lightning/test_precision_plugin.py index 2c029ce23936..44ffa5939fab 100644 --- a/tests/lightning/test_precision_plugin.py +++ b/tests/lightning/test_precision_plugin.py @@ -48,7 +48,8 @@ def connect(self, model: pl.LightningModule) -> None: strategy=nl.MegatronStrategy( tensor_model_parallel_size=2, sequence_parallel=True, - ckpt_include_optimizer=False, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, ), plugins=nl.MegatronMixedPrecision(precision="bf16-mixed", fp8='e4m3'), limit_val_batches=0.0, @@ -79,7 +80,8 @@ def test_precision_plugin_precision_params_override(self): strategy=nl.MegatronStrategy( tensor_model_parallel_size=2, sequence_parallel=True, - ckpt_include_optimizer=False, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, ), plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), limit_val_batches=0.0, From e6db2f3fe5e4f92d586ef619ab02b231d0a5f442 Mon Sep 17 00:00:00 2001 From: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Date: Thu, 5 Sep 2024 21:56:15 +0530 Subject: [PATCH 107/664] alltoall (#10357) Signed-off-by: Malay Nagda --- nemo/collections/llm/gpt/model/mixtral.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index dc438320eeff..b0f40a2fc785 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -56,8 +56,12 @@ class MixtralConfig(GPTConfig): # MoE num_moe_experts: int = 8 + moe_aux_loss_coeff: float = 0.01 + moe_expert_capacity_factor: float = 1.0 + moe_pad_expert_input_to_capacity: bool = True moe_router_topk: int = 1 moe_router_pre_softmax: bool = True + moe_token_dispatcher_type: str = "alltoall" init_method_std: float = 0.02 layernorm_epsilon: float = 1e-5 From a5673806aa33a0235a72ad6bfade267f9b6e2238 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Thu, 5 Sep 2024 10:43:09 -0600 Subject: [PATCH 108/664] Fix links (#10359) * fix links Signed-off-by: eharper * fix links Signed-off-by: eharper --------- Signed-off-by: eharper --- nemo/lightning/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/lightning/README.md b/nemo/lightning/README.md index 7b9266d3fa30..df887303d27c 100644 --- a/nemo/lightning/README.md +++ b/nemo/lightning/README.md @@ -2,12 +2,12 @@ The NeMo Lightning directory provides custom PyTorch Lightning-compatible objects for seamlessly training NeMo 2.0 models using PTL. NeMo 2.0 models are implemented using [Megatron Core](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). NeMo Lightning provides the bridge between higher-level, object-oriented PTL APIs and lower-level Megatron APIs. -For detailed tutorials and documentation on NeMo 2.0, refer to the [docs](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo_2.0/index.html). 
+For detailed tutorials and documentation on NeMo 2.0, refer to the [docs](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/index.html). Some of the helpful classes provided here include: -- [`Trainer`](./pytorch/trainer.py): A lightweight wrapper around PTL's `Trainer` object which provides some additional support for capturing the arguments used to initialized the trainer. More information on NeMo 2's serialization mechanisms is available [here](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo_2.0/design/serialization.html). -- [`MegatronStrategy`](./pytorch/strategies.py): A PTL strategy that enables training of Megatron models on NVIDIA GPUs. +- [`Trainer`](./pytorch/trainer.py): A lightweight wrapper around PTL's `Trainer` object which provides some additional support for capturing the arguments used to initialized the trainer. More information on NeMo 2's serialization mechanisms is available [here](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/design/serialization.html). +- [`MegatronStrategy`](./pytorch/strategies/megatron_strategy.py): A PTL strategy that enables training of Megatron models on NVIDIA GPUs. - [`MegatronParallel`](./megatron_parallel.py): Class which sets up and manages Megatron's distributed model parallelism. - [`MegatronMixedPrecision`](./pytorch/plugins/mixed_precision.py): A specialized precision plugin for training Megatron-based models in PTL. -More information on `MegatronStrategy`, `MegatronParallel`, and `MegatronMixedPrecision` can be found in [this document](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo_2.0/design/megatron.html). +More information on `MegatronStrategy`, `MegatronParallel`, and `MegatronMixedPrecision` can be found in [this document](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/design/megatron.html). From a9746a654d37d3451bcc33ad58cf8378efe787b7 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Thu, 5 Sep 2024 12:06:01 -0700 Subject: [PATCH 109/664] Improve TE import guards (#10322) * improve TE import guards Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * small fixes Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * bug fixes Signed-off-by: ashors1 --------- Signed-off-by: ashors1 Signed-off-by: ashors1 Co-authored-by: ashors1 --- nemo/collections/llm/__init__.py | 7 +- nemo/collections/llm/gpt/data/mock.py | 7 +- nemo/collections/llm/gpt/data/pre_training.py | 7 +- nemo/collections/llm/gpt/model/base.py | 7 +- .../clip/megatron_clip_models.py | 11 +- .../modules/stable_diffusion/attention.py | 13 +- .../diffusionmodules/openaimodel.py | 10 +- .../gpt_full_te_layer_autocast_spec.py | 25 +- .../language_modeling/megatron_gpt_model.py | 17 +- .../language_modeling/megatron_retro_model.py | 12 +- .../modules/common/megatron/adapters/qlora.py | 5 +- .../modules/common/megatron/transformer.py | 16 +- nemo/core/optim/distributed_adam.py | 4 +- nemo/lightning/__init__.py | 7 +- nemo/utils/import_utils.py | 391 ++++++++++++++++++ nemo/utils/te_utils.py | 10 +- tests/lightning/io/test_api.py | 4 +- 17 files changed, 452 insertions(+), 101 deletions(-) create mode 100644 nemo/utils/import_utils.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 361df944a856..8da00b0edd7f 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -13,10 +13,9 @@ # limitations under the License. 
# This is here to import it once, which improves the speed of launch when in debug-mode -try: - import transformer_engine # noqa -except ImportError: - pass +from nemo.utils.import_utils import safe_import + +safe_import("transformer_engine") from nemo.collections.llm import peft, tokenizer from nemo.collections.llm.api import export_ckpt, finetune, import_ckpt, pretrain, train, validate diff --git a/nemo/collections/llm/gpt/data/mock.py b/nemo/collections/llm/gpt/data/mock.py index cb4455549ea0..1c5e01c89bbd 100644 --- a/nemo/collections/llm/gpt/data/mock.py +++ b/nemo/collections/llm/gpt/data/mock.py @@ -22,12 +22,9 @@ from torch.utils.data import DataLoader, Dataset from nemo.lightning.pytorch.plugins import MegatronDataSampler +from nemo.utils.import_utils import safe_import -HAVE_TE = True -try: - import transformer_engine -except (ImportError, ModuleNotFoundError): - HAVE_TE = False +_, HAVE_TE = safe_import("transformer_engine") if TYPE_CHECKING: from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py index 285f790a77ac..ccb2d21729ed 100644 --- a/nemo/collections/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -24,12 +24,9 @@ from nemo.lightning.data import WrappedDataLoader from nemo.lightning.io.mixin import IOMixin from nemo.lightning.pytorch.plugins import MegatronDataSampler +from nemo.utils.import_utils import safe_import -HAVE_TE = True -try: - import transformer_engine -except (ImportError, ModuleNotFoundError): - HAVE_TE = False +_, HAVE_TE = safe_import("transformer_engine") if TYPE_CHECKING: from megatron.core.datasets.gpt_dataset import GPTDatasetConfig diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index a32fdb582764..d13e86ce2ca2 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -28,12 +28,9 @@ from nemo.lightning.megatron_parallel import MaskedTokenLossReduction from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule from nemo.utils import logging +from nemo.utils.import_utils import safe_import -HAVE_TE = True -try: - import transformer_engine -except (ImportError, ModuleNotFoundError): - HAVE_TE = False +_, HAVE_TE = safe_import("transformer_engine") if TYPE_CHECKING: from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py index 99bb8a23cf47..393acdef35de 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py @@ -51,6 +51,7 @@ from nemo.collections.vision.modules.vit.vit_backbone import VitBackbone from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.import_utils import safe_import, safe_import_from try: from apex.transformer.enums import AttnMaskType @@ -102,14 +103,8 @@ logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") from apex.transformer.pipeline_parallel.utils import get_num_microbatches -try: - import transformer_engine - from transformer_engine.pytorch import module as te_module - - HAVE_TE = True - -except (ImportError, 
ModuleNotFoundError): - HAVE_TE = False +_, HAVE_TE = safe_import("transformer_engine") +te_module, _ = safe_import_from("transformer_engine.pytorch", "module") @cache diff --git a/nemo/collections/multimodal/modules/stable_diffusion/attention.py b/nemo/collections/multimodal/modules/stable_diffusion/attention.py index 9d4d5de2d203..646540e88a5e 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/attention.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/attention.py @@ -39,15 +39,12 @@ ) from nemo.core import adapter_mixins from nemo.utils import logging +from nemo.utils.import_utils import safe_import_from -try: - from transformer_engine.pytorch.attention import DotProductAttention - from transformer_engine.pytorch.module import LayerNormLinear, LayerNormMLP - - HAVE_TE = True - -except (ImportError, ModuleNotFoundError): - HAVE_TE = False +DotProductAttention, HAVE_DPA = safe_import_from("transformer_engine.pytorch.attention", "DotProductAttention") +LayerNormLinear, HAVE_LN_LINEAR = safe_import_from("transformer_engine.pytorch.module", "LayerNormLinear") +LayerNormMLP, HAVE_LN_MLP = safe_import_from("transformer_engine.pytorch.module", "LayerNormMLP") +HAVE_TE = HAVE_DPA and HAVE_LN_LINEAR and HAVE_LN_MLP def check_cuda(): diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py index 66df3c378bfb..528048b04950 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py @@ -40,15 +40,9 @@ zero_module, ) from nemo.utils import logging +from nemo.utils.import_utils import safe_import -try: - # FP8 related import - import transformer_engine - - HAVE_TE = True - -except (ImportError, ModuleNotFoundError): - HAVE_TE = False +transformer_engine, HAVE_TE = safe_import("transformer_engine") try: from apex.contrib.group_norm import GroupNorm diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py index a2d85ebe3006..cdff4a0036d1 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py @@ -20,19 +20,13 @@ from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults from nemo.collections.nlp.parts import utils_funcs +from nemo.utils.import_utils import safe_import_from -try: - from transformer_engine.pytorch import TransformerLayer - - HAVE_TE = True - -except (ImportError, ModuleNotFoundError) as e: +TransformerLayer, HAVE_TE = safe_import_from("transformer_engine.pytorch", "TransformerLayer") +if not HAVE_TE: TransformerLayer = ApexGuardDefaults - HAVE_TE = False - IMPORT_ERROR = e - try: from megatron.core import parallel_state, tensor_parallel from megatron.core.fusions.fused_layer_norm import FusedLayerNorm @@ -88,8 +82,9 @@ def __init__( device: str = 'cuda', **kwargs, ) -> None: - if not HAVE_MEGATRON_CORE or not HAVE_TE: - raise ImportError(IMPORT_ERROR) + assert ( + HAVE_MEGATRON_CORE and HAVE_TE + ), "AutocastTransformerLayer requires Megatron Core and Transformer Engine to be installed." 
transformer_layer_args = { "hidden_size": hidden_size, @@ -182,8 +177,9 @@ def forward( class TETransformerLayerAutocast(AutocastTransformerLayer, BaseTransformerLayer): def __init__(self, config, layer_number=1, hidden_dropout=None): - if not HAVE_MEGATRON_CORE or not HAVE_TE: - raise ImportError(IMPORT_ERROR) + assert ( + HAVE_MEGATRON_CORE and HAVE_TE + ), "TETransformerLayerAutocast requires Megatron Core and Transformer Engine to be installed." self.config = config self.is_first_microbatch = True @@ -325,8 +321,7 @@ def sharded_state_dict(self, prefix: str = '', sharded_offsets: tuple = (), meta # Use this spec to use the full Transformer layer from Transformer Engine def get_gpt_full_te_layer_autocast_spec(transformer_config) -> ModuleSpec: - if not HAVE_MEGATRON_CORE or not HAVE_TE: - raise ImportError(IMPORT_ERROR) + assert HAVE_MEGATRON_CORE and HAVE_TE, "Please ensure Megatron Core and Transformer Engine are installed." num_layers = get_num_layers_to_build(transformer_config) return TransformerBlockSubmodules( layer_specs=[ModuleSpec(module=TETransformerLayerAutocast)] * num_layers, layer_norm=FusedLayerNorm diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index ee2d891e83e4..4e34782e99e4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -74,6 +74,7 @@ from nemo.core.classes.common import PretrainedModelInfo from nemo.core.neural_types import ChannelType, NeuralType from nemo.utils import logging +from nemo.utils.import_utils import safe_import, safe_import_from from nemo.utils.te_utils import is_float8tensor try: @@ -127,16 +128,12 @@ update_num_microbatches, ) -try: - import transformer_engine - from transformer_engine.pytorch import module as te_module - - from nemo.collections.nlp.modules.common.hyena.hyena_spec import get_gpt_layer_with_te_and_hyena_spec - - HAVE_TE = True - -except (ImportError, ModuleNotFoundError): - HAVE_TE = False +transformer_engine, HAVE_TE = safe_import("transformer_engine") +te_module, HAVE_TE_MODULE = safe_import_from("transformer_engine.pytorch", "module") +get_gpt_layer_with_te_and_hyena_spec, HAVE_HYENA_SPEC = safe_import_from( + "nemo.collections.nlp.modules.common.hyena.hyena_spec", "get_gpt_layer_with_te_and_hyena_spec" +) +HAVE_TE = HAVE_TE and HAVE_TE_MODULE and HAVE_HYENA_SPEC @cache diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py index 9061f430e722..a6bf75fb9444 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py @@ -66,6 +66,7 @@ from nemo.core.classes.common import PretrainedModelInfo from nemo.core.neural_types import ChannelType, NeuralType from nemo.utils import logging +from nemo.utils.import_utils import safe_import, safe_import_from try: from megatron.core import InferenceParams, parallel_state @@ -98,14 +99,9 @@ logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") from apex.transformer.pipeline_parallel.utils import get_num_microbatches -try: - import transformer_engine - from transformer_engine.pytorch import module as te_module - - HAVE_TE = True - -except (ImportError, ModuleNotFoundError): - HAVE_TE = False +transformer_engine, HAVE_TE = safe_import("transformer_engine") 
+te_module, HAVE_TE_MODULE = safe_import_from("transformer_engine.pytorch", "module")
+HAVE_TE = HAVE_TE and HAVE_TE_MODULE
 
 
 class MegatronRetroModel(MegatronGPTModel):
diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/qlora.py b/nemo/collections/nlp/modules/common/megatron/adapters/qlora.py
index 4a180234e3cf..afe05991b03c 100644
--- a/nemo/collections/nlp/modules/common/megatron/adapters/qlora.py
+++ b/nemo/collections/nlp/modules/common/megatron/adapters/qlora.py
@@ -22,6 +22,7 @@
 
 from nemo.collections.nlp.parts.peft_config import LORA_CONFIG_TO_MCORE_MAP, get_target_modules
 from nemo.utils import logging
+from nemo.utils.import_utils import safe_import_from
 
 te_version = packaging.version.Version(version("transformer-engine"))
 
@@ -178,11 +179,11 @@ def _create_layer_norm_fn(self):
         since this is for QLoRA.
         '''
         if self.normalization == 'LayerNorm':
-            from transformer_engine.pytorch.module.layernorm import _LayerNorm
+            _LayerNorm, _ = safe_import_from("transformer_engine.pytorch.module.layernorm", "_LayerNorm")
 
             layer_norm_fn = _LayerNorm.apply
         elif self.normalization == 'RMSNorm':
-            from transformer_engine.pytorch.module.rmsnorm import _RMSNorm
+            _RMSNorm, _ = safe_import_from("transformer_engine.pytorch.module.rmsnorm", "_RMSNorm")
 
             layer_norm_fn = _RMSNorm.apply
         else:
diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py
index e803a622f75d..ab10b0d0e8b3 100644
--- a/nemo/collections/nlp/modules/common/megatron/transformer.py
+++ b/nemo/collections/nlp/modules/common/megatron/transformer.py
@@ -45,6 +45,7 @@
 from nemo.collections.nlp.parts import utils_funcs
 from nemo.core import adapter_mixins
 from nemo.utils import logging
+from nemo.utils.import_utils import safe_import_from
 
 try:
     from apex.normalization import MixedFusedRMSNorm
@@ -70,16 +71,13 @@
     HAVE_MEGATRON_CORE = False
 
-try:
-    from transformer_engine.common import recipe
-    from transformer_engine.pytorch import TransformerLayer, fp8_autocast
-    from transformer_engine.pytorch.distributed import checkpoint as te_checkpoint
-
-    HAVE_TE = True
-
-except:
-    HAVE_TE = False
+recipe, HAVE_RECIPE = safe_import_from("transformer_engine.common", "recipe")
+TransformerLayer, HAVE_LAYER = safe_import_from("transformer_engine.pytorch", "TransformerLayer")
+fp8_autocast, HAVE_AUTOCAST = safe_import_from("transformer_engine.pytorch", "fp8_autocast")
+te_checkpoint, HAVE_CKPT = safe_import_from("transformer_engine.pytorch.distributed", "checkpoint")
+HAVE_TE = HAVE_RECIPE and HAVE_LAYER and HAVE_AUTOCAST and HAVE_CKPT
 
+if not HAVE_TE:
     # fake missing class
     class TransformerLayer(ApexGuardDefaults):
         def __init__(self):
diff --git a/nemo/core/optim/distributed_adam.py b/nemo/core/optim/distributed_adam.py
index 716c905493e0..1d35456b42ff 100644
--- a/nemo/core/optim/distributed_adam.py
+++ b/nemo/core/optim/distributed_adam.py
@@ -33,11 +33,13 @@
 from megatron.core.dist_checkpointing.dict_utils import dict_list_map_inplace
 from megatron.core.dist_checkpointing.mapping import ShardedTensor
 from megatron.core.dist_checkpointing.optimizer import get_param_id_to_sharded_param_map, optim_state_to_sharding_state
-from transformer_engine.pytorch.cpp_extensions import cast_to_fp8
 
 from nemo.utils import logging, str_to_dtype
+from nemo.utils.import_utils import safe_import_from
 from nemo.utils.te_utils import is_float8tensor
 
+cast_to_fp8, _ = safe_import_from("transformer_engine.pytorch.cpp_extensions", "cast_to_fp8")
+
 _distribute_within_nodes_pgs = {}
 
diff
--git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index bfb5a4f6fceb..2cc720e148d4 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -18,10 +18,9 @@ from pytorch_lightning import plugins as _pl_plugins # This is here to import it once, which improves the speed of launch when in debug-mode -try: - import transformer_engine # noqa -except ImportError: - pass +from nemo.utils.import_utils import safe_import + +safe_import("transformer_engine") from nemo.lightning.base import get_vocab_size, teardown from nemo.lightning.fabric.fabric import Fabric diff --git a/nemo/utils/import_utils.py b/nemo/utils/import_utils.py new file mode 100644 index 000000000000..a5b646eb3aa8 --- /dev/null +++ b/nemo/utils/import_utils.py @@ -0,0 +1,391 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is taken from https://github.com/NVIDIA/NeMo-Curator, which is adapted from cuML's safe_imports module: +# https://github.com/rapidsai/cuml/blob/e93166ea0dddfa8ef2f68c6335012af4420bc8ac/python/cuml/internals/safe_imports.py + + +import importlib +import logging +import traceback +from contextlib import contextmanager + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logger.addHandler(logging.StreamHandler()) + +GPU_INSTALL_STRING = """Install GPU packages via `pip install --extra-index-url https://pypi.nvidia.com nemo-curator[cuda12x]` +or use `pip install --extra-index-url https://pypi.nvidia.com ".[cuda12x]"` if installing from source""" + + +class UnavailableError(Exception): + """Error thrown if a symbol is unavailable due to an issue importing it""" + + +@contextmanager +def null_decorator(*args, **kwargs): + if len(kwargs) == 0 and len(args) == 1 and callable(args[0]): + return args[0] + else: + + def inner(func): + return func + + return inner + + +class UnavailableMeta(type): + """A metaclass for generating placeholder objects for unavailable symbols + + This metaclass allows errors to be deferred from import time to the time + that a symbol is actually used in order to streamline the usage of optional + dependencies. This is particularly useful for attempted imports of GPU-only + modules which will only be invoked if GPU-only functionality is + specifically used. + + If an attempt to import a symbol fails, this metaclass is used to generate + a class which stands in for that symbol. Any attempt to call the symbol + (instantiate the class) or access its attributes will throw an + UnavailableError exception. Furthermore, this class can be used in + e.g. isinstance checks, since it will (correctly) fail to match any + instance it is compared against. + + In addition to calls and attribute access, a number of dunder methods are + implemented so that other common usages of imported symbols (e.g. + arithmetic) throw an UnavailableError, but this is not guaranteed for + all possible uses. In such cases, other exception types (typically + TypeErrors) will be thrown instead. 
+ """ + + def __new__(meta, name, bases, dct): + if dct.get("_msg", None) is None: + dct["_msg"] = f"{name} could not be imported" + name = f"MISSING{name}" + return super(UnavailableMeta, meta).__new__(meta, name, bases, dct) + + def __call__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __getattr__(cls, name): + raise UnavailableError(cls._msg) + + def __eq__(cls, other): + raise UnavailableError(cls._msg) + + def __lt__(cls, other): + raise UnavailableError(cls._msg) + + def __gt__(cls, other): + raise UnavailableError(cls._msg) + + def __le__(cls, other): + raise UnavailableError(cls._msg) + + def __ge__(cls, other): + raise UnavailableError(cls._msg) + + def __ne__(cls, other): + raise UnavailableError(cls._msg) + + def __abs__(cls): + raise UnavailableError(cls._msg) + + def __add__(cls, other): + raise UnavailableError(cls._msg) + + def __radd__(cls, other): + raise UnavailableError(cls._msg) + + def __iadd__(cls, other): + raise UnavailableError(cls._msg) + + def __floordiv__(cls, other): + raise UnavailableError(cls._msg) + + def __rfloordiv__(cls, other): + raise UnavailableError(cls._msg) + + def __ifloordiv__(cls, other): + raise UnavailableError(cls._msg) + + def __lshift__(cls, other): + raise UnavailableError(cls._msg) + + def __rlshift__(cls, other): + raise UnavailableError(cls._msg) + + def __mul__(cls, other): + raise UnavailableError(cls._msg) + + def __rmul__(cls, other): + raise UnavailableError(cls._msg) + + def __imul__(cls, other): + raise UnavailableError(cls._msg) + + def __ilshift__(cls, other): + raise UnavailableError(cls._msg) + + def __pow__(cls, other): + raise UnavailableError(cls._msg) + + def __rpow__(cls, other): + raise UnavailableError(cls._msg) + + def __ipow__(cls, other): + raise UnavailableError(cls._msg) + + def __rshift__(cls, other): + raise UnavailableError(cls._msg) + + def __rrshift__(cls, other): + raise UnavailableError(cls._msg) + + def __irshift__(cls, other): + raise UnavailableError(cls._msg) + + def __sub__(cls, other): + raise UnavailableError(cls._msg) + + def __rsub__(cls, other): + raise UnavailableError(cls._msg) + + def __isub__(cls, other): + raise UnavailableError(cls._msg) + + def __truediv__(cls, other): + raise UnavailableError(cls._msg) + + def __rtruediv__(cls, other): + raise UnavailableError(cls._msg) + + def __itruediv__(cls, other): + raise UnavailableError(cls._msg) + + def __divmod__(cls, other): + raise UnavailableError(cls._msg) + + def __rdivmod__(cls, other): + raise UnavailableError(cls._msg) + + def __neg__(cls): + raise UnavailableError(cls._msg) + + def __invert__(cls): + raise UnavailableError(cls._msg) + + def __hash__(cls): + raise UnavailableError(cls._msg) + + def __index__(cls): + raise UnavailableError(cls._msg) + + def __iter__(cls): + raise UnavailableError(cls._msg) + + def __delitem__(cls, name): + raise UnavailableError(cls._msg) + + def __setitem__(cls, name, value): + raise UnavailableError(cls._msg) + + def __enter__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __get__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __delete__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __len__(cls): + raise UnavailableError(cls._msg) + + +def is_unavailable(obj): + """Helper to check if given symbol is actually a placeholder""" + return type(obj) is UnavailableMeta + + +class UnavailableNullContext: + """A placeholder class for unavailable context managers + + This context manager will return a value which will throw an + 
UnavailableError if used in any way, but the context manager itself can be + safely invoked. + """ + + def __init__(self, *args, **kwargs): + pass + + def __enter__(self): + return UnavailableMeta( + "MissingContextValue", + (), + {"_msg": "Attempted to make use of placeholder context return value."}, + ) + + def __exit__(self, *args, **kwargs): + pass + + +def safe_import(module, *, msg=None, alt=None): + """A function used to import modules that may not be available + + This function will attempt to import a module with the given name, but it + will not throw an ImportError if the module is not found. Instead, it will + return a placeholder object which will raise an exception only if used. + + Parameters + ---------- + module: str + The name of the module to import. + msg: str or None + An optional error message to be displayed if this module is used + after a failed import. + alt: object + An optional module to be used in place of the given module if it + fails to import + + Returns + ------- + Tuple(object, bool) + The imported module, the given alternate, or a class derived from + UnavailableMeta, and a boolean indicating whether the intended import was successful. + """ + try: + return importlib.import_module(module), True + except ImportError: + exception_text = traceback.format_exc() + logger.debug(f"Import of {module} failed with: {exception_text}") + except Exception: + exception_text = traceback.format_exc() + raise + if msg is None: + msg = f"{module} could not be imported" + if alt is None: + return UnavailableMeta(module.rsplit(".")[-1], (), {"_msg": msg}), False + else: + return alt, False + + +def safe_import_from(module, symbol, *, msg=None, alt=None): + """A function used to import symbols from modules that may not be available + + This function will attempt to import a symbol with the given name from + the given module, but it will not throw an ImportError if the symbol is not + found. Instead, it will return a placeholder object which will raise an + exception only if used. + + Parameters + ---------- + module: str + The name of the module in which the symbol is defined. + symbol: str + The name of the symbol to import. + msg: str or None + An optional error message to be displayed if this symbol is used + after a failed import. + alt: object + An optional object to be used in place of the given symbol if it fails + to import + + Returns + ------- + Tuple(object, bool) + The imported symbol, the given alternate, or a class derived from + UnavailableMeta, and a boolean indicating whether the intended import was successful. + """ + try: + imported_module = importlib.import_module(module) + return getattr(imported_module, symbol), True + except ImportError: + exception_text = traceback.format_exc() + logger.debug(f"Import of {module} failed with: {exception_text}") + except AttributeError: + exception_text = traceback.format_exc() + logger.info(f"Import of {symbol} from {module} failed with: {exception_text}") + except Exception: + exception_text = traceback.format_exc() + raise + if msg is None: + msg = f"{module}.{symbol} could not be imported" + if alt is None: + return UnavailableMeta(symbol, (), {"_msg": msg}), False + else: + return alt, False + + +def gpu_only_import(module, *, alt=None): + """A function used to import modules required only in GPU installs + + This function will attempt to import a module with the given name. 
+def gpu_only_import(module, *, alt=None):
+    """A function used to import modules required only in GPU installs
+
+    This function will attempt to import a module with the given name, but it
+    will not throw an ImportError if the module is not found. Instead, it will
+    return a placeholder object which will raise an exception only if used,
+    with instructions on installing a GPU build.
+
+    Parameters
+    ----------
+    module: str
+        The name of the module to import.
+    alt: object
+        An optional module to be used in place of the given module if it
+        fails to import in a non-GPU-enabled install
+
+    Returns
+    -------
+    Tuple(object, bool)
+        The imported module, the given alternate, or a class derived from
+        UnavailableMeta, and a boolean indicating whether the intended import was successful.
+    """
+
+    return safe_import(
+        module,
+        msg=f"{module} is not enabled in non GPU-enabled installations or environments. {GPU_INSTALL_STRING}",
+        alt=alt,
+    )
+
+
+def gpu_only_import_from(module, symbol, *, alt=None):
+    """A function used to import symbols required only in GPU installs
+
+    This function will attempt to import a symbol with the given name from
+    the given module, but it will not throw an ImportError if the symbol is not
+    found. Instead, it will return a placeholder object which will raise an
+    exception only if used, with instructions on installing a GPU build.
+
+    Parameters
+    ----------
+    module: str
+        The name of the module to import.
+    symbol: str
+        The name of the symbol to import.
+    alt: object
+        An optional object to be used in place of the given symbol if it fails
+        to import in a non-GPU-enabled install
+
+    Returns
+    -------
+    Tuple(object, bool)
+        The imported symbol, the given alternate, or a class derived from
+        UnavailableMeta, and a boolean indicating whether the intended import was successful.
+    """
+    return safe_import_from(
+        module,
+        symbol,
+        msg=f"{module}.{symbol} is not enabled in non GPU-enabled installations or environments. {GPU_INSTALL_STRING}",
+        alt=alt,
+    )
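A minimal sketch of how the two GPU-only wrappers are meant to be consumed; `cupy`, `numpy`, and `some_gpu_pkg` below are illustrative stand-ins, not dependencies touched by this patch:

    import numpy as np

    from nemo.utils.import_utils import gpu_only_import, gpu_only_import_from

    # On a CPU-only install, `cp` falls back to the `alt` module (numpy) and the
    # flag is False, so callers can branch instead of crashing at import time.
    cp, HAVE_CUPY = gpu_only_import("cupy", alt=np)

    # Without `alt`, a failed import returns an UnavailableMeta placeholder that
    # raises UnavailableError (mentioning GPU_INSTALL_STRING) on first use.
    GpuOp, HAVE_GPU_OP = gpu_only_import_from("some_gpu_pkg", "GpuOp")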
diff --git a/nemo/utils/te_utils.py b/nemo/utils/te_utils.py
index 8f073e211681..abf8fc14fdd9 100644
--- a/nemo/utils/te_utils.py
+++ b/nemo/utils/te_utils.py
@@ -13,16 +13,10 @@
 # limitations under the License.
 
 import torch
+from nemo.utils.import_utils import safe_import_from
 
 # Check if Transformer Engine has Float8Tensor class
-HAVE_TE_FLOAT8TENSOR = False
-try:
-    from transformer_engine.pytorch.float8_tensor import Float8Tensor
-
-    HAVE_TE_FLOAT8TENSOR = True
-except (ImportError, ModuleNotFoundError):
-    # Float8Tensor not found
-    pass
+Float8Tensor, HAVE_TE_FLOAT8TENSOR = safe_import_from("transformer_engine.pytorch.float8_tensor", "Float8Tensor")
 
 
 def is_float8tensor(tensor: torch.Tensor) -> bool:
diff --git a/tests/lightning/io/test_api.py b/tests/lightning/io/test_api.py
index c0fb1a41ba3d..83f77390ec6e 100644
--- a/tests/lightning/io/test_api.py
+++ b/tests/lightning/io/test_api.py
@@ -15,13 +15,15 @@
 from functools import partial
 
 import pytest
-import transformer_engine as te
 from pytorch_lightning.loggers import TensorBoardLogger
 
 from nemo import lightning as nl
 from nemo.collections import llm
 from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
 from nemo.lightning import io
+from nemo.utils.import_utils import safe_import
+
+te, HAVE_TE = safe_import("transformer_engine")
 
 
 def dummy_extra(a, b, c=5):

From 5bd2b899964057efb2ab72fd040b5b386c5b557d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?=
Date: Thu, 5 Sep 2024 12:31:21 -0700
Subject: [PATCH 110/664] ci: Detect secrets (#10343)

* ci: Add secrets detector

Signed-off-by: Oliver Koenig

* chore: Add baseline

Signed-off-by: Oliver Koenig

---------

Signed-off-by: Oliver Koenig
---
 .github/workflows/config/.secrets.baseline | 2078 ++++++++++++++++++++
 .github/workflows/secrets-detector.yml     |   32 +
 2 files changed, 2110 insertions(+)
 create mode 100644 .github/workflows/config/.secrets.baseline
 create mode 100644 .github/workflows/secrets-detector.yml

diff --git a/.github/workflows/config/.secrets.baseline b/.github/workflows/config/.secrets.baseline
new file mode 100644
index 000000000000..2bf4e372565c
--- /dev/null
+++ b/.github/workflows/config/.secrets.baseline
@@ -0,0 +1,2078 @@
+{ + "version": "1.5.0", + "plugins_used": [ + { + "name": "ArtifactoryDetector" + }, + { + "name": "AWSKeyDetector" + }, + { + "name": "AzureStorageKeyDetector" + }, + { + "name": "Base64HighEntropyString", + "limit": 4.5 + }, + { + "name": "BasicAuthDetector" + }, + { + "name": "CloudantDetector" + }, + { + "name": "DiscordBotTokenDetector" + }, + { + "name": "GitHubTokenDetector" + }, + { + "name": "GitLabTokenDetector" + }, + { + "name": "HexHighEntropyString", + "limit": 3.0 + }, + { + "name": "IbmCloudIamDetector" + }, + { + "name": "IbmCosHmacDetector" + }, + { + "name": "IPPublicDetector" + }, + { + "name": "JwtTokenDetector" + }, + { + "name": "KeywordDetector", + "keyword_exclude": "" + }, + { + "name": "MailchimpDetector" + }, + { + "name": "NpmDetector" + }, + { + "name": "OpenAIDetector" + }, + { + "name": "PrivateKeyDetector" + }, + { + "name": "PypiTokenDetector" + }, + { + "name": "SendGridDetector" + }, + { + "name": "SlackDetector" + }, + { + "name": "SoftlayerDetector" + }, + { + "name": "SquareOAuthDetector" + }, + { + "name": "StripeDetector" + }, + { + "name": "TelegramBotTokenDetector" + }, + { + "name": "TwilioKeyDetector" + } + ], + "filters_used": [ + { + "path": "detect_secrets.filters.allowlist.is_line_allowlisted" + }, + { + "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", + "min_level": 2 + }, + { + "path": "detect_secrets.filters.heuristic.is_indirect_reference" + }, + { + "path": "detect_secrets.filters.heuristic.is_likely_id_string" + }, + { + "path": 
"detect_secrets.filters.heuristic.is_lock_file" + }, + { + "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_potential_uuid" + }, + { + "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign" + }, + { + "path": "detect_secrets.filters.heuristic.is_sequential_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_swagger_file" + }, + { + "path": "detect_secrets.filters.heuristic.is_templated_secret" + } + ], + "results": { + "docs/source/nlp/question_answering.rst": [ + { + "type": "Hex High Entropy String", + "filename": "docs/source/nlp/question_answering.rst", + "hashed_secret": "22e6f19f702bbd215acc1862da6acba7e874674e", + "is_verified": false, + "line_number": 130 + } + ], + "examples/multimodal/multimodal_llm/neva/conf/lita_config.yaml": [ + { + "type": "Base64 High Entropy String", + "filename": "examples/multimodal/multimodal_llm/neva/conf/lita_config.yaml", + "hashed_secret": "a90ca639abde504aba67797b0663923a0075fe6e", + "is_verified": false, + "line_number": 75 + } + ], + "examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 50 + } + ], + "examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 165 + } + ], + "examples/nlp/language_modeling/conf/megatron_baichuan2_inference.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/nlp/language_modeling/conf/megatron_baichuan2_inference.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 38 + } + ], + "examples/nlp/language_modeling/conf/megatron_chatglm_inference.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/nlp/language_modeling/conf/megatron_chatglm_inference.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 38 + } + ], + "examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 37 + } + ], + "examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 41 + } + ], + "examples/nlp/language_modeling/conf/megatron_griffin_generate_config.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/nlp/language_modeling/conf/megatron_griffin_generate_config.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 239 + } + ], + "examples/nlp/language_modeling/conf/megatron_llama_inference.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/nlp/language_modeling/conf/megatron_llama_inference.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": 
false, + "line_number": 38 + } + ], + "examples/nlp/language_modeling/conf/megatron_mamba_inference.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/nlp/language_modeling/conf/megatron_mamba_inference.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 40 + } + ], + "examples/nlp/language_modeling/conf/megatron_qwen2_inference.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/nlp/language_modeling/conf/megatron_qwen2_inference.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 38 + } + ], + "examples/nlp/language_modeling/tuning/conf/megatron_gpt_generate_config.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/nlp/language_modeling/tuning/conf/megatron_gpt_generate_config.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 160 + } + ], + "examples/nlp/language_modeling/tuning/conf/megatron_mamba_generate_config.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/nlp/language_modeling/tuning/conf/megatron_mamba_generate_config.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 168 + } + ], + "examples/nlp/language_modeling/tuning/conf/megatron_t5_generate_config.yaml": [ + { + "type": "Secret Keyword", + "filename": "examples/nlp/language_modeling/tuning/conf/megatron_t5_generate_config.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 157 + } + ], + "scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py": [ + { + "type": "Hex High Entropy String", + "filename": "scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py", + "hashed_secret": "e0308bd21bffc156d79208f9ecf130370a015002", + "is_verified": false, + "line_number": 460 + } + ], + "scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py": [ + { + "type": "Base64 High Entropy String", + "filename": "scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py", + "hashed_secret": "adfce53cfc5a36ea58ba816ea6d005231db6455c", + "is_verified": false, + "line_number": 40 + } + ], + "scripts/nlp_language_modeling/service_launch_scripts/conf/retro_web_server.yaml": [ + { + "type": "Secret Keyword", + "filename": "scripts/nlp_language_modeling/service_launch_scripts/conf/retro_web_server.yaml", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 9 + } + ], + "scripts/nlp_language_modeling/service_launch_scripts/env_variables.sh": [ + { + "type": "Secret Keyword", + "filename": "scripts/nlp_language_modeling/service_launch_scripts/env_variables.sh", + "hashed_secret": "109f4b3c50d7b0df729d299bc6f8e9ef9066971f", + "is_verified": false, + "line_number": 33 + } + ], + "scripts/tts_dataset_files/cmudict-0.7b_nv22.10": [ + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/cmudict-0.7b_nv22.10", + "hashed_secret": "27b998c0976876189b861934085bba2964e49a11", + "is_verified": false, + "line_number": 4873 + } + ], + "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict": [ + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "6a1c698aa2f0b96b536710f5a2abd0ca64fdb2c1", + "is_verified": false, + "line_number": 8415 + }, + { + "type": "Artifactory Credentials", + "filename": 
"scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "9a611ff964db23d7cd8a6f63beb7471144f6be92", + "is_verified": false, + "line_number": 8421 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "f2f0bc1252ccb8bbc113859849f8c36be203fd9e", + "is_verified": false, + "line_number": 8424 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "b65cab561c4acd1e20bafd08ce19a9e7b5ea9e1c", + "is_verified": false, + "line_number": 8431 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "c08ba5eff4953649130e83149817e1087c183358", + "is_verified": false, + "line_number": 8433 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "b182776899d0959f28f3d80c7d50317de5c0fb2b", + "is_verified": false, + "line_number": 8441 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "e466719c0309450475aa6b0e0c53a2211db0176d", + "is_verified": false, + "line_number": 8442 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "0e7ae57946d9c88ce5f017ac50640e2c93c5277b", + "is_verified": false, + "line_number": 8443 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "6d13086a143b48a00e3aa745a955095fbbc075b2", + "is_verified": false, + "line_number": 8444 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "d206dd2b79987f5989b23a7a43d6163ed48312f0", + "is_verified": false, + "line_number": 8446 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "be10bcbbe755bdd51e856af50af01bb4f95bfedb", + "is_verified": false, + "line_number": 8449 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "2f00d2108827b933019dbedc8cc3f9c84f3bfe1e", + "is_verified": false, + "line_number": 8451 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "a705c12d23346a892256fa95c892304bd8f4d265", + "is_verified": false, + "line_number": 8454 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "2471810862c167bed6c3a8383dd410aedb8cb7e1", + "is_verified": false, + "line_number": 8456 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "34befc5cf63e9c9259c7e75714c81db892419549", + "is_verified": false, + "line_number": 8464 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "343d4559dcb06080b6fc5a9f42fbb88b6e343526", + "is_verified": false, + "line_number": 8470 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "96631b5faaa50b4d24076072a357a13f48f1435f", + "is_verified": false, + "line_number": 8472 + }, + { + "type": 
"Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "c5be9fd8c751b6d2ff6219c63de90e4aa9f97546", + "is_verified": false, + "line_number": 8477 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "4f68de76b2c605cb67d99b1e97b1461ef4618db2", + "is_verified": false, + "line_number": 8479 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "90d1db77f31a86c5e0081468e0dbda8172b70131", + "is_verified": false, + "line_number": 8508 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "6f3ce73b5dfbe7e4948bdad86c804ce74e17e0cf", + "is_verified": false, + "line_number": 8510 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "a1f22c6489b0cf0016b9364619a7d7eed95b6364", + "is_verified": false, + "line_number": 8512 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "b55ad569f88dc7e42868047f4f4a59c3cb6c110a", + "is_verified": false, + "line_number": 8519 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "f2c207cddfe45b6c893619c3757f264f721025c6", + "is_verified": false, + "line_number": 8521 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "d1371bfdfaf5d02731488ca6b6f12e51617813ab", + "is_verified": false, + "line_number": 8549 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "76d22e171aa272dbdd6d1d67c23d9f700362f09a", + "is_verified": false, + "line_number": 8555 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "0efe19de03cdaa4a382654246b4bff22dd79e518", + "is_verified": false, + "line_number": 8568 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "d318e8342db9deb2f6827f39fad6229ca4e4521a", + "is_verified": false, + "line_number": 8581 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "fb99a4a93e08e86cf5f4475aba66e85def711752", + "is_verified": false, + "line_number": 8586 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "04ebb8445e6050d78c4636b19a17221387216b0d", + "is_verified": false, + "line_number": 8588 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "2ec592daffb8a6e8c402690f7aadac6475c5f2ec", + "is_verified": false, + "line_number": 8591 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "f7df3933dce6e3f5843b7e55ccff8f2516302229", + "is_verified": false, + "line_number": 8601 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "fe860f7095f48e2f6a63b97124da4afd8c987895", + "is_verified": false, + 
"line_number": 8606 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "abd3d69f7d8c98bb7fe98cb6aa41bf3d42d23911", + "is_verified": false, + "line_number": 8609 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "2070834a550a6fbae2a3912e1117fa9f4270752b", + "is_verified": false, + "line_number": 8619 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "e7d64bf030b802c205af2832114589c141ea1f33", + "is_verified": false, + "line_number": 8622 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "c84bccf09830b2804e4d59e8bf82393ae76a8b03", + "is_verified": false, + "line_number": 8625 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "86888ce709ad4a2590221da4f4d602a254443470", + "is_verified": false, + "line_number": 8628 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "ae5da5b749fe3b2aea4957c7ee83938a75622bb7", + "is_verified": false, + "line_number": 8645 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "94ccf1bc68c1a19f5733d9ac6e5bb60166e6530d", + "is_verified": false, + "line_number": 8647 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "892dbf65134c4dfb554a16f5fc387ef153e269ed", + "is_verified": false, + "line_number": 8697 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "61bfa79fd6923bc44a8c8c500b8992a66f5ae340", + "is_verified": false, + "line_number": 8703 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "db880cb9754850e54f247dc1f7d1cce5f80ec056", + "is_verified": false, + "line_number": 8718 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "a020736e7f661dc0d7985bfdcfadb7dce8db6216", + "is_verified": false, + "line_number": 8719 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "ee8ddcdb18d90b8ddc8d9d4bb28e7fe700965ea8", + "is_verified": false, + "line_number": 8720 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "f5726d5cd941bc5fd7cd26fdbf6d1ed5750e55bd", + "is_verified": false, + "line_number": 8721 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "49e5b074ecc6a910f32617b1d2103849177a254e", + "is_verified": false, + "line_number": 8750 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "e0a73b397eafd55a2f9cac7b031bf8716bc2b79e", + "is_verified": false, + "line_number": 8751 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": 
"9c593b1ecb2cfed50a12793d23fb3fc0cd5f5e21", + "is_verified": false, + "line_number": 8762 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "e1db0a80a43c626e55171566c5823588cfb02fd8", + "is_verified": false, + "line_number": 8764 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "a5f7d1f0a545e46e361e56191788d0f5fee625fb", + "is_verified": false, + "line_number": 8772 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "4d199d4ccaea5f654b23dbc22e54acd79cc76e8c", + "is_verified": false, + "line_number": 8774 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "865d1b874b4cd866febe48e024ed94af5303f808", + "is_verified": false, + "line_number": 8775 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "fa25b96c74c4f77bb3eb89c55a7ab88550d12de5", + "is_verified": false, + "line_number": 8776 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "a21fc67d6e3ff4d5f39596793a9e2b01c608d69f", + "is_verified": false, + "line_number": 8777 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "b2e00015ad1a9cb779655b48a19eff8c834bc36e", + "is_verified": false, + "line_number": 8778 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "ca51add4d292a3744a883a8844bcbf067b34b468", + "is_verified": false, + "line_number": 8779 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "8479cbf554082f4f4b3c9587f21c62f406940e93", + "is_verified": false, + "line_number": 8782 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "d1edb7ebbefe0dd0dfcddce8e0b7e63e74430a5b", + "is_verified": false, + "line_number": 8784 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "c8b9b705e819ebeba33d4ff4f6dc638a9ee1f47c", + "is_verified": false, + "line_number": 8787 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "250c748db4a13f7a3cff6ff0934bb0338e9f3190", + "is_verified": false, + "line_number": 8788 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "6035f3432d36e33f2a57c3c6ee62014a9fb95a19", + "is_verified": false, + "line_number": 8789 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict", + "hashed_secret": "82b23ffc5cea766826613f29718bc1e3023f58e8", + "is_verified": false, + "line_number": 8790 + } + ], + "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict": [ + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "6a1c698aa2f0b96b536710f5a2abd0ca64fdb2c1", + "is_verified": false, + "line_number": 8743 + }, + { + "type": "Artifactory 
Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "9a611ff964db23d7cd8a6f63beb7471144f6be92", + "is_verified": false, + "line_number": 8751 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "f2f0bc1252ccb8bbc113859849f8c36be203fd9e", + "is_verified": false, + "line_number": 8755 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "b65cab561c4acd1e20bafd08ce19a9e7b5ea9e1c", + "is_verified": false, + "line_number": 8764 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "c08ba5eff4953649130e83149817e1087c183358", + "is_verified": false, + "line_number": 8766 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "b182776899d0959f28f3d80c7d50317de5c0fb2b", + "is_verified": false, + "line_number": 8774 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "e466719c0309450475aa6b0e0c53a2211db0176d", + "is_verified": false, + "line_number": 8775 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "0e7ae57946d9c88ce5f017ac50640e2c93c5277b", + "is_verified": false, + "line_number": 8776 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "6d13086a143b48a00e3aa745a955095fbbc075b2", + "is_verified": false, + "line_number": 8777 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "d206dd2b79987f5989b23a7a43d6163ed48312f0", + "is_verified": false, + "line_number": 8779 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "be10bcbbe755bdd51e856af50af01bb4f95bfedb", + "is_verified": false, + "line_number": 8782 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "2f00d2108827b933019dbedc8cc3f9c84f3bfe1e", + "is_verified": false, + "line_number": 8784 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "a705c12d23346a892256fa95c892304bd8f4d265", + "is_verified": false, + "line_number": 8787 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "2471810862c167bed6c3a8383dd410aedb8cb7e1", + "is_verified": false, + "line_number": 8789 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "34befc5cf63e9c9259c7e75714c81db892419549", + "is_verified": false, + "line_number": 8797 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "343d4559dcb06080b6fc5a9f42fbb88b6e343526", + "is_verified": false, + "line_number": 8803 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "96631b5faaa50b4d24076072a357a13f48f1435f", + "is_verified": false, + "line_number": 8805 + 
}, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "c5be9fd8c751b6d2ff6219c63de90e4aa9f97546", + "is_verified": false, + "line_number": 8810 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "4f68de76b2c605cb67d99b1e97b1461ef4618db2", + "is_verified": false, + "line_number": 8812 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "90d1db77f31a86c5e0081468e0dbda8172b70131", + "is_verified": false, + "line_number": 8841 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "6f3ce73b5dfbe7e4948bdad86c804ce74e17e0cf", + "is_verified": false, + "line_number": 8843 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "a1f22c6489b0cf0016b9364619a7d7eed95b6364", + "is_verified": false, + "line_number": 8845 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "b55ad569f88dc7e42868047f4f4a59c3cb6c110a", + "is_verified": false, + "line_number": 8854 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "f2c207cddfe45b6c893619c3757f264f721025c6", + "is_verified": false, + "line_number": 8856 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "d1371bfdfaf5d02731488ca6b6f12e51617813ab", + "is_verified": false, + "line_number": 8884 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "76d22e171aa272dbdd6d1d67c23d9f700362f09a", + "is_verified": false, + "line_number": 8890 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "0efe19de03cdaa4a382654246b4bff22dd79e518", + "is_verified": false, + "line_number": 8903 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "d318e8342db9deb2f6827f39fad6229ca4e4521a", + "is_verified": false, + "line_number": 8916 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "fb99a4a93e08e86cf5f4475aba66e85def711752", + "is_verified": false, + "line_number": 8921 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "04ebb8445e6050d78c4636b19a17221387216b0d", + "is_verified": false, + "line_number": 8923 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "2ec592daffb8a6e8c402690f7aadac6475c5f2ec", + "is_verified": false, + "line_number": 8926 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "f7df3933dce6e3f5843b7e55ccff8f2516302229", + "is_verified": false, + "line_number": 8936 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "fe860f7095f48e2f6a63b97124da4afd8c987895", + "is_verified": 
false, + "line_number": 8941 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "abd3d69f7d8c98bb7fe98cb6aa41bf3d42d23911", + "is_verified": false, + "line_number": 8944 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "2070834a550a6fbae2a3912e1117fa9f4270752b", + "is_verified": false, + "line_number": 8954 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "e7d64bf030b802c205af2832114589c141ea1f33", + "is_verified": false, + "line_number": 8958 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "c84bccf09830b2804e4d59e8bf82393ae76a8b03", + "is_verified": false, + "line_number": 8962 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "86888ce709ad4a2590221da4f4d602a254443470", + "is_verified": false, + "line_number": 8966 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "ae5da5b749fe3b2aea4957c7ee83938a75622bb7", + "is_verified": false, + "line_number": 8997 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "94ccf1bc68c1a19f5733d9ac6e5bb60166e6530d", + "is_verified": false, + "line_number": 8999 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "892dbf65134c4dfb554a16f5fc387ef153e269ed", + "is_verified": false, + "line_number": 9050 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "61bfa79fd6923bc44a8c8c500b8992a66f5ae340", + "is_verified": false, + "line_number": 9058 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "db880cb9754850e54f247dc1f7d1cce5f80ec056", + "is_verified": false, + "line_number": 9076 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "a020736e7f661dc0d7985bfdcfadb7dce8db6216", + "is_verified": false, + "line_number": 9077 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "ee8ddcdb18d90b8ddc8d9d4bb28e7fe700965ea8", + "is_verified": false, + "line_number": 9078 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "f5726d5cd941bc5fd7cd26fdbf6d1ed5750e55bd", + "is_verified": false, + "line_number": 9079 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "49e5b074ecc6a910f32617b1d2103849177a254e", + "is_verified": false, + "line_number": 9108 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "e0a73b397eafd55a2f9cac7b031bf8716bc2b79e", + "is_verified": false, + "line_number": 9109 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": 
"9c593b1ecb2cfed50a12793d23fb3fc0cd5f5e21", + "is_verified": false, + "line_number": 9120 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "e1db0a80a43c626e55171566c5823588cfb02fd8", + "is_verified": false, + "line_number": 9122 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "a5f7d1f0a545e46e361e56191788d0f5fee625fb", + "is_verified": false, + "line_number": 9130 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "4d199d4ccaea5f654b23dbc22e54acd79cc76e8c", + "is_verified": false, + "line_number": 9132 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "865d1b874b4cd866febe48e024ed94af5303f808", + "is_verified": false, + "line_number": 9133 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "fa25b96c74c4f77bb3eb89c55a7ab88550d12de5", + "is_verified": false, + "line_number": 9134 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "a21fc67d6e3ff4d5f39596793a9e2b01c608d69f", + "is_verified": false, + "line_number": 9135 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "b2e00015ad1a9cb779655b48a19eff8c834bc36e", + "is_verified": false, + "line_number": 9136 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "ca51add4d292a3744a883a8844bcbf067b34b468", + "is_verified": false, + "line_number": 9137 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "8479cbf554082f4f4b3c9587f21c62f406940e93", + "is_verified": false, + "line_number": 9140 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "d1edb7ebbefe0dd0dfcddce8e0b7e63e74430a5b", + "is_verified": false, + "line_number": 9142 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "c8b9b705e819ebeba33d4ff4f6dc638a9ee1f47c", + "is_verified": false, + "line_number": 9145 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "250c748db4a13f7a3cff6ff0934bb0338e9f3190", + "is_verified": false, + "line_number": 9146 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "6035f3432d36e33f2a57c3c6ee62014a9fb95a19", + "is_verified": false, + "line_number": 9147 + }, + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/es_LA/es_LA_nv230301.dict", + "hashed_secret": "82b23ffc5cea766826613f29718bc1e3023f58e8", + "is_verified": false, + "line_number": 9148 + } + ], + "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt": [ + { + "type": "Artifactory Credentials", + "filename": "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt", + "hashed_secret": "27b998c0976876189b861934085bba2964e49a11", + "is_verified": false, + "line_number": 4886 + } + ], + 
"tests/hydra/config1_invalid.yaml": [ + { + "type": "Secret Keyword", + "filename": "tests/hydra/config1_invalid.yaml", + "hashed_secret": "e5e9fa1ba31ecd1ae84f75caaa474f3a663f05f4", + "is_verified": false, + "line_number": 2 + } + ], + "tests/hydra/config_subdir/config2_invalid.yaml": [ + { + "type": "Secret Keyword", + "filename": "tests/hydra/config_subdir/config2_invalid.yaml", + "hashed_secret": "ebf1f14a3530cd22650b951908b5159f1f2a3ca8", + "is_verified": false, + "line_number": 2 + } + ], + "tests/infer_data_path.py": [ + { + "type": "Base64 High Entropy String", + "filename": "tests/infer_data_path.py", + "hashed_secret": "e3fb89ccb261c88146519164f7e8a47786d33fee", + "is_verified": false, + "line_number": 271 + } + ], + "tutorials/asr/Multilang_ASR.ipynb": [ + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "23ab3508f39164c81139e2bc866ebe46b69248f3", + "is_verified": false, + "line_number": 562 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "bfbd2a23ecc348e67d5cb55f2db6bf9f9cebb325", + "is_verified": false, + "line_number": 563 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "81dee32e25f1d0c256140df030b2531dde332acb", + "is_verified": false, + "line_number": 564 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "4af542b2d6e243a440ad4cf2815d6e2237808df5", + "is_verified": false, + "line_number": 565 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "a67ed767388e7eece04b5c6f7b568ddf29edac7c", + "is_verified": false, + "line_number": 566 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "c022505b9f732f0cb352fe8ed5fc278912b46488", + "is_verified": false, + "line_number": 567 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "182d4019337b9c8786b6e34a83441a21bad1874b", + "is_verified": false, + "line_number": 568 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "740eb48367bc73edfc56bc4414e9b2662736566c", + "is_verified": false, + "line_number": 569 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "b1aef696788110cd1e18595d6cebb0959d36adfa", + "is_verified": false, + "line_number": 570 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "c7fe8bf177365ffe460c15da553c7368ddfdbaa7", + "is_verified": false, + "line_number": 571 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "f3112d03411e69f7a23d5d1582079e1173cd032e", + "is_verified": false, + "line_number": 572 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "46859f9c2007195e9762d88c36247d018859f09d", + "is_verified": false, + "line_number": 676 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "afc8f88c8c20204370c5f35925ef36fc71c71379", + "is_verified": false, + "line_number": 677 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": 
"df94d52f84db3fa667993f544ea80aca4f9a2125", + "is_verified": false, + "line_number": 678 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "783c5002441b0a6225cc26f71864f7f8504ef819", + "is_verified": false, + "line_number": 679 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "469b7833ff898575016f6df77172c9fc910efdf7", + "is_verified": false, + "line_number": 680 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "90771a6f73123f8665fe199fa920d06ff407ec72", + "is_verified": false, + "line_number": 681 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "cbf4db9d163e190540825e529b55b551ba21733a", + "is_verified": false, + "line_number": 682 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "4f58de63d50f78243fadbf94a08fc58fb51d1428", + "is_verified": false, + "line_number": 683 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "2dc56a67a4ab217c7197c4ce468e0dbe7f20d54b", + "is_verified": false, + "line_number": 684 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "a4d526869e953d0eebcfd7948f798ba1d39397c4", + "is_verified": false, + "line_number": 685 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "8109bf627c2f367f0f75ec0fc4079040d46ea65b", + "is_verified": false, + "line_number": 686 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "05d48a4ba182e2966da054c756e527a7304b586e", + "is_verified": false, + "line_number": 1148 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "a091c7128789cea09adabf026a3dbd7b0febcfef", + "is_verified": false, + "line_number": 1149 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "38403bcc580ba265c4918eaea606399cf37689a2", + "is_verified": false, + "line_number": 1150 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "d572b2cd7c13cbb69a638f2b978da146b34340df", + "is_verified": false, + "line_number": 1151 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "f97db3647a1e5869756fc4019bf198e900034e67", + "is_verified": false, + "line_number": 1152 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "ae0722be63cd884e5788cc8153a9f7728d54f72c", + "is_verified": false, + "line_number": 1153 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "d694e89fef75e8d868accaba50c224a1f2dd8399", + "is_verified": false, + "line_number": 1154 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "8e2fe85dc67f14fa2f02ffa465974dec89655856", + "is_verified": false, + "line_number": 1155 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "e23d0990cdb54a3d1dc3c9bcf922242fc7bacc64", + "is_verified": 
false, + "line_number": 1156 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "d49a849228aa07e957274e8b1349405868626e56", + "is_verified": false, + "line_number": 1157 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "a32050f253ef8a9f61b37ac26180efb6acf0bb30", + "is_verified": false, + "line_number": 1158 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "834213b15871dbd6b30f517de52ec9df2730eb7b", + "is_verified": false, + "line_number": 1196 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "b0555871dc9fc34a149b82e62fc72fc654b84667", + "is_verified": false, + "line_number": 1197 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "32fe03ac71cc6c8a31c72e21c9aec64557cb4fba", + "is_verified": false, + "line_number": 1198 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "3e712f05fedce2068adf4f6ac44b3759a1fbbd59", + "is_verified": false, + "line_number": 1199 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "f8472c93db4afd582aafddcc5a2dd8d568cb3e43", + "is_verified": false, + "line_number": 1200 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "1d8c14b1e24558e9cdbd47644f36c5675edcdcca", + "is_verified": false, + "line_number": 1201 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "2a11395949beadb049a0cd129b6f5e3aa3782814", + "is_verified": false, + "line_number": 1202 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "edae2d92434824df126daa3340589b60da2afa67", + "is_verified": false, + "line_number": 1203 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "cc9e0c1b12b1abaffa7eb6cbccbee72f65ac1535", + "is_verified": false, + "line_number": 1204 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "e80f107771869ec656d0cf6e8486486b69f922ae", + "is_verified": false, + "line_number": 1205 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "160fb2093e3388000d26c76d25428170a035bdcf", + "is_verified": false, + "line_number": 1206 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "9488ce4f041c4c3b404ff21110170940ce5295f2", + "is_verified": false, + "line_number": 2028 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "4fb50b7293c24d257d3b39eec254517e73cd27be", + "is_verified": false, + "line_number": 2029 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "2d3ad63f96d6f447f63a2ec9d451ba34a8c8013e", + "is_verified": false, + "line_number": 2030 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "7c0a1424bcd4d5086497ba05ff5d2b5c11112655", + "is_verified": false, + "line_number": 2031 + }, + { + "type": "Hex 
High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "5582b5110b9376a45e2329d2d127f972673287a9", + "is_verified": false, + "line_number": 2032 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "e99b32711f4beb0cd40130d75016b45ec188eac0", + "is_verified": false, + "line_number": 2033 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "5cbed1a68bafa82a8a0ff59f03c8f28148081f1e", + "is_verified": false, + "line_number": 2034 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "d7604ec9987dc19cfd2e512d1fdab6f1ceb41a7c", + "is_verified": false, + "line_number": 2035 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "b329420ee2aa11facc5a652f2bb32ca0f186c442", + "is_verified": false, + "line_number": 2036 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "f16bb2651aa872a8d12dd98e11efb58bd23da895", + "is_verified": false, + "line_number": 2037 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "74032a5243a1cf863a7e84e1749a3fac81e7290f", + "is_verified": false, + "line_number": 2038 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "b5b39748fce09b60e94b21b466c52c791d627cba", + "is_verified": false, + "line_number": 2039 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "db98703d26bdf4616c4e65036986b6355611211f", + "is_verified": false, + "line_number": 2040 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "4d14a5871469d7490614a53cab3fbfc52a7279e5", + "is_verified": false, + "line_number": 2041 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "258ddab91bab22bd68bc3015ea889521fa3fdacd", + "is_verified": false, + "line_number": 2042 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "6d7a940238710f5f734f1816fc47959ef7f95636", + "is_verified": false, + "line_number": 2043 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "de52434f2efe55aed0df2256de9abd275539cf0b", + "is_verified": false, + "line_number": 2044 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "4cd4d2a18051aaad9320442a4377cd18577f73f9", + "is_verified": false, + "line_number": 2045 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "954ca4decf6425679bb28e223ff6b137c1e9ce6e", + "is_verified": false, + "line_number": 2046 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "f6b0d563b01f1250220b29dbdf5f70c14d206be7", + "is_verified": false, + "line_number": 2047 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "99285fb5e057fc901d1877f1f0c8b8c94ecd1358", + "is_verified": false, + "line_number": 2048 + }, + { + "type": "Hex High Entropy String", + "filename": 
"tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "faa5674f871179e8abe110c410403b51893c917f", + "is_verified": false, + "line_number": 2049 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "92a0615e0edaf88003fb0b6f70dc77de93828bb8", + "is_verified": false, + "line_number": 2103 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "77351aa360f4c3e50ef4cd700bba74219d5b5aa2", + "is_verified": false, + "line_number": 2104 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "c7226bd151222bb3ab574a4d21d83d97ed054112", + "is_verified": false, + "line_number": 2105 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "f73edb9e83b0f41052050055ac563973a6850afb", + "is_verified": false, + "line_number": 2106 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "346a54ded37ddfd7de0b98b3f8a6fefb92c97ec5", + "is_verified": false, + "line_number": 2107 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "078cde4cd776c14c9166304412a643cff409faa0", + "is_verified": false, + "line_number": 2108 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "4a3c4394031a7842a72b787cf5b08a165e8b7206", + "is_verified": false, + "line_number": 2109 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "406aa8259676acb13859b17f5e1f94fcdb69c0d6", + "is_verified": false, + "line_number": 2110 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "3607002f0fabd7bbbe4e90715beb1c85a6ade333", + "is_verified": false, + "line_number": 2111 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "465ee282a0d7b2cf762d9dbce0b1f356d75bc15d", + "is_verified": false, + "line_number": 2112 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "44608c743318ec5489b23a388f4f6e0f724ac0a8", + "is_verified": false, + "line_number": 2113 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "1a4251d6b7ba19233f16766868ac29e3428db4f0", + "is_verified": false, + "line_number": 2203 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "55dd3f4c1b8682cf8ec7b21243cdffe0127f0e03", + "is_verified": false, + "line_number": 2204 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "220d1fa41654cb432843792dd63e8f446eedefa9", + "is_verified": false, + "line_number": 2205 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "35654d3cde31330d1e3fd05a04d4cbea41c1f993", + "is_verified": false, + "line_number": 2206 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "119aea04e3efd1c741bfd98fe51439d6bb280a95", + "is_verified": false, + "line_number": 2207 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + 
"hashed_secret": "b5c60cd211f3aac2db08a223c0571d1e53e7c60d", + "is_verified": false, + "line_number": 2208 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "4039b99a1c2bdca7737ed4f7934e5a9db2e76f8c", + "is_verified": false, + "line_number": 2209 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "6c04b8e8560c27e7c586a92b046c532da6b1e50c", + "is_verified": false, + "line_number": 2210 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "6a76bf0f171be18b2c3bd53219235c3ab16fdc7f", + "is_verified": false, + "line_number": 2211 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "025bef6ec65b88f3999f1b25157b7a6e17f5bfcd", + "is_verified": false, + "line_number": 2212 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "026954128506e379050cbea083edbddd1c1777b6", + "is_verified": false, + "line_number": 2213 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "745cfb125e37dee46e2952e02a0a4d5d5d56f546", + "is_verified": false, + "line_number": 2314 + }, + { + "type": "Base64 High Entropy String", + "filename": "tutorials/asr/Multilang_ASR.ipynb", + "hashed_secret": "543a58dd6dadfca9bc28a7f164e6b36c0b6e9a96", + "is_verified": false, + "line_number": 4799 + } + ], + "tutorials/multimodal/DreamBooth Tutorial.ipynb": [ + { + "type": "Base64 High Entropy String", + "filename": "tutorials/multimodal/DreamBooth Tutorial.ipynb", + "hashed_secret": "effe1444a4c73d23e5fc2a9f28048e1dd0653e82", + "is_verified": false, + "line_number": 223 + } + ], + "tutorials/multimodal/Multimodal Data Preparation.ipynb": [ + { + "type": "Hex High Entropy String", + "filename": "tutorials/multimodal/Multimodal Data Preparation.ipynb", + "hashed_secret": "5251fc51eb9d18d0de87c87ea09e4af9d1bf7a9f", + "is_verified": false, + "line_number": 18 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/multimodal/Multimodal Data Preparation.ipynb", + "hashed_secret": "194c1dff6224941326fe1d95936ba0e22fd60bb2", + "is_verified": false, + "line_number": 65 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/multimodal/Multimodal Data Preparation.ipynb", + "hashed_secret": "fe852a949d021998ad30bdd6a73a20c82221d486", + "is_verified": false, + "line_number": 217 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/multimodal/Multimodal Data Preparation.ipynb", + "hashed_secret": "b641cbe299c9e27b480cc8a823bb020d45962236", + "is_verified": false, + "line_number": 660 + } + ], + "tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb": [ + { + "type": "Artifactory Credentials", + "filename": "tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb", + "hashed_secret": "b5077fe6f2a92ca029dc6b5c022321b3828a2998", + "is_verified": false, + "line_number": 876 + } + ], + "tutorials/nlp/Token_Classification-BioMegatron.ipynb": [ + { + "type": "Hex High Entropy String", + "filename": "tutorials/nlp/Token_Classification-BioMegatron.ipynb", + "hashed_secret": "308d692d0307f53df800c91720fb271d62078391", + "is_verified": false, + "line_number": 689 + } + ], + "tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb": [ + { + "type": "Hex High Entropy String", + "filename": "tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb", + 
"hashed_secret": "80903ddedcf4ec0a2ee5911cefa7e1ad52419dcc", + "is_verified": false, + "line_number": 989 + } + ], + "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb": [ + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "3d52dd8e15de7d018930a034453752599cecfb95", + "is_verified": false, + "line_number": 18 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "8dd2391af4e9a5dc76b28f46ff201e1e7c9942f0", + "is_verified": false, + "line_number": 38 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "af149e43b0d4f9e4554b6b7a3260eafbfe1b5d8e", + "is_verified": false, + "line_number": 89 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "bb0c7befac27daf54e33d8e8b179598866feae04", + "is_verified": false, + "line_number": 112 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "645a085cdd80cde400cc162bfa2c24a286ca24c8", + "is_verified": false, + "line_number": 132 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "592340cf9486ccc3b0eff57e8f78ccf1f90add85", + "is_verified": false, + "line_number": 147 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "a9fdd91c2d3e03d3a8f039126c275d27c4adc5b2", + "is_verified": false, + "line_number": 198 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "eb455e1717efd60ca3896632c94a2b41d87855f2", + "is_verified": false, + "line_number": 212 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "096179c125fa827fde3bb88f53873aeaa3f7709f", + "is_verified": false, + "line_number": 263 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "9b895d433fc031a41bf872d697038c218e909066", + "is_verified": false, + "line_number": 283 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "daf16ebe45194e3ebed5c2cc59f334279d51fa0f", + "is_verified": false, + "line_number": 334 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "0d39ffb347d6fcd6c7b3519310756c1020091c6c", + "is_verified": false, + "line_number": 357 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "8eedca919f4fb2dafb68c2afc1faf76088ab12e0", + "is_verified": false, + "line_number": 377 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "7bc61a4c215f078b7de911363bedd91b471fa213", + "is_verified": false, + "line_number": 392 + }, + { + "type": "Hex High Entropy String", + "filename": 
"tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "3088c565074b06f94684576b96b4d5b59560497c", + "is_verified": false, + "line_number": 443 + }, + { + "type": "Hex High Entropy String", + "filename": "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb", + "hashed_secret": "209b86707a2f8f8253ce09587ca7ac1f6eefd4da", + "is_verified": false, + "line_number": 457 + } + ], + "tutorials/tts/FastPitch_Finetuning.ipynb": [ + { + "type": "Hex High Entropy String", + "filename": "tutorials/tts/FastPitch_Finetuning.ipynb", + "hashed_secret": "e915298ac5414db160aada21c7a235431ddfa98d", + "is_verified": false, + "line_number": 717 + } + ], + "tutorials/tts/FastPitch_MixerTTS_Training.ipynb": [ + { + "type": "Hex High Entropy String", + "filename": "tutorials/tts/FastPitch_MixerTTS_Training.ipynb", + "hashed_secret": "e915298ac5414db160aada21c7a235431ddfa98d", + "is_verified": false, + "line_number": 611 + } + ], + "tutorials/tts/Inference_ModelSelect.ipynb": [ + { + "type": "Hex High Entropy String", + "filename": "tutorials/tts/Inference_ModelSelect.ipynb", + "hashed_secret": "98307baca81149afa8a07a7cbe7ebc48c447d4be", + "is_verified": false, + "line_number": 315 + } + ], + "tutorials/tts/NeMo_TTS_Primer.ipynb": [ + { + "type": "Hex High Entropy String", + "filename": "tutorials/tts/NeMo_TTS_Primer.ipynb", + "hashed_secret": "e915298ac5414db160aada21c7a235431ddfa98d", + "is_verified": false, + "line_number": 2063 + } + ] + }, + "generated_at": "2024-09-04T00:45:39Z" +} diff --git a/.github/workflows/secrets-detector.yml b/.github/workflows/secrets-detector.yml new file mode 100644 index 000000000000..4de052535cc1 --- /dev/null +++ b/.github/workflows/secrets-detector.yml @@ -0,0 +1,32 @@ +# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+name: Secrets detector + +on: + pull_request: + +jobs: + main: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ inputs.branch-name || github.head_ref }} + + - name: Install secrets detector + run: pip install detect-secrets + + - name: Run on change-set + run: git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .github/workflows/config/.secrets.baseline \ No newline at end of file From fdf1979257ce82056f8ffe0c9985b4791fd88143 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 5 Sep 2024 14:14:45 -0700 Subject: [PATCH 111/664] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let'?= =?UTF-8?q?s=20bump=20`Dockerfile.ci`=20to=203396356=20!=20(#10353)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index 43f137bf0c89..3d9a9d9b08a1 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -34,7 +34,7 @@ WORKDIR /workspace # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.15.0 -ARG MCORE_TAG=9ab31cbd6265f83640008801e1c3efbf80892cea +ARG MCORE_TAG=3396356ab4ca83cc4c4d3272530b142a1702606e ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ From 1d5de5946035ad555a913ad3e1a9b13909426513 Mon Sep 17 00:00:00 2001 From: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Date: Fri, 6 Sep 2024 01:14:38 -0400 Subject: [PATCH 112/664] [NeMo-UX] Turn on mcore performance optimizations (#10209) * expose TP overlap Signed-off-by: Jieming Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * add tp overlap recipes Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * turn on pipeline parallel overlap Signed-off-by: Jimmy Zhang * refactor Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * Update base.py Signed-off-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> * Update megatron_parallel.py Signed-off-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> * remove env var Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * add optimization config Signed-off-by: Jimmy Zhang * fix typo Signed-off-by: Jimmy Zhang * refactor into megatron parallel setup Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * refactor Signed-off-by: Jimmy Zhang * fix config ordering, add wgrad deferral Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * cleanup Signed-off-by: Jimmy Zhang * use config Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * clean Signed-off-by: Jimmy Zhang * enable wgrad defferal Signed-off-by: Jimmy Zhang * add grad bucket size Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * move everthing into a callback Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * cleanup Signed-off-by: Jimmy Zhang * fix imports Signed-off-by: Jimmy Zhang * Apply isort and black 
reformatting Signed-off-by: JimmyZhang12 * move userbuffer init Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * cleanup Signed-off-by: Jimmy Zhang * fix VP Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * address comments Signed-off-by: Jimmy Zhang * add gradient accum guard Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * Update base.py Signed-off-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> * address comments Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 * address comments Signed-off-by: Jimmy Zhang * Apply isort and black reformatting Signed-off-by: JimmyZhang12 --------- Signed-off-by: Jieming Zhang Signed-off-by: JimmyZhang12 Signed-off-by: Jimmy Zhang Signed-off-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Jieming Zhang Co-authored-by: JimmyZhang12 --- nemo/collections/llm/gpt/model/base.py | 11 + nemo/collections/llm/recipes/llama3_70b.py | 25 ++ nemo/collections/llm/recipes/llama3_8b.py | 20 ++ .../recipes/tp_overlap_configs/__init__.py | 0 .../recipes/tp_overlap_configs/userbuffers.py | 73 +++++ .../callbacks/megatron_comm_overlap.py | 268 ++++++++++++++++++ 6 files changed, 397 insertions(+) create mode 100644 nemo/collections/llm/recipes/tp_overlap_configs/__init__.py create mode 100644 nemo/collections/llm/recipes/tp_overlap_configs/userbuffers.py create mode 100644 nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index d13e86ce2ca2..a6b53f4e859d 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -32,6 +32,15 @@ _, HAVE_TE = safe_import("transformer_engine") +# Gradient accumulation fusion may be enabled if available, for more information see: +# https://github.com/NVIDIA/Megatron-LM/blob/01945b98d1ea3a2acb5e8301e181a328104f4856/megatron/core/tensor_parallel/layers.py#L575 +# TODO: Clean this up with a getter and install instructions +_grad_accum_fusion_available = True +try: + import fused_weight_gradient_mlp_cuda +except ImportError: + _grad_accum_fusion_available = False + if TYPE_CHECKING: from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel @@ -124,6 +133,8 @@ class GPTConfig(TransformerConfig, io.IOMixin): seq_length: int = 1024 attention_softmax_in_fp32: bool = False masked_softmax_fusion: bool = True + cross_entropy_loss_fusion: bool = True + gradient_accumulation_fusion: bool = _grad_accum_fusion_available deallocate_pipeline_outputs = True transformer_layer_spec: Union[ModuleSpec, Callable[["GPTConfig"], ModuleSpec]] = default_layer_spec diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index 60c3d3697449..cbf6b5e2e7a1 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -14,7 +14,9 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin +from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 from nemo.collections.llm.utils import Config, Partial +from 
nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "llama3_70b" @@ -93,6 +95,29 @@ def pretrain_recipe( ) +def pretrain_recipe_performance( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + """'pretrain_recipe_performance' turns on performance optimizations that cannot be enabled by default + due to being model specific or lacking sufficient support. For better compatibility please use + the default 'pretrain_recipe()' above.""" + recipe = pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn + ) + + recipe.trainer.callbacks.append( + Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=22, + ) + ) + + return recipe + + def hf_resume() -> Config[nl.AutoResume]: return Config( nl.AutoResume, diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 3f07d6b53c94..17d4e8b168b3 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -14,6 +14,7 @@ from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin from nemo.collections.llm.utils import Config, Partial +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b" @@ -92,6 +93,25 @@ def pretrain_recipe( ) +def pretrain_recipe_performance( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + """'pretrain_recipe_performance' turns on performance optimizations that cannot be enabled by default + due to being model specific or lacking sufficient support. 
For better compatibility please use + the default 'pretrain_recipe()' above.""" + recipe = pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn + ) + + recipe.trainer.callbacks.append( + Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + def hf_resume() -> Config[nl.AutoResume]: return Config( nl.AutoResume, diff --git a/nemo/collections/llm/recipes/tp_overlap_configs/__init__.py b/nemo/collections/llm/recipes/tp_overlap_configs/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/llm/recipes/tp_overlap_configs/userbuffers.py b/nemo/collections/llm/recipes/tp_overlap_configs/userbuffers.py new file mode 100644 index 000000000000..a17a77c3b45b --- /dev/null +++ b/nemo/collections/llm/recipes/tp_overlap_configs/userbuffers.py @@ -0,0 +1,73 @@ +from dataclasses import dataclass + + +@dataclass +class TPOverlapCfg: + pass + + +@dataclass +class PipelineOverlapCfg(TPOverlapCfg): + num_sm: int + cga_size: int + num_splits: int + set_sm_margin: bool + fp8_buf: bool = (False,) + method: str = 'pipeline' + + +@dataclass +class RingExchangeOverlapCfg(TPOverlapCfg): + aggregate: bool = False + method: str = 'ring_exchange' + + +@dataclass +class BulkOverlapCfg(TPOverlapCfg): + num_sm: int + cga_size: int + set_sm_margin: bool + method: str = 'bulk' + + +@dataclass +class TransformerLayerTPOverlapCfg: + qkv_dgrad: TPOverlapCfg + qkv_wgrad: TPOverlapCfg + fc1_dgrad: TPOverlapCfg + fc1_wgrad: TPOverlapCfg + qkv_fprop: TPOverlapCfg + proj_dgrad: TPOverlapCfg + fc1_fprop: TPOverlapCfg + fc2_dgrad: TPOverlapCfg + proj_fprop: TPOverlapCfg + fc2_fprop: TPOverlapCfg + + +# TODO: Add more configs and create a getter function for expose a single api +# Model configs: H100/70B/TP8/MBS1/SeqLen8K +userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg( + qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False), + qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False), + fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False), + fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False), + qkv_fprop=RingExchangeOverlapCfg(aggregate=False), + proj_dgrad=RingExchangeOverlapCfg(aggregate=False), + fc1_fprop=RingExchangeOverlapCfg(aggregate=False), + fc2_dgrad=RingExchangeOverlapCfg(aggregate=False), + proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True), + fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True), +) + +userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg( + qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False), + qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False), + fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False), + fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False), + qkv_fprop=RingExchangeOverlapCfg(aggregate=False), + proj_dgrad=RingExchangeOverlapCfg(aggregate=False), + fc1_fprop=RingExchangeOverlapCfg(aggregate=False), + fc2_dgrad=RingExchangeOverlapCfg(aggregate=False), + proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True), + fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True), +) diff --git a/nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py b/nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py new file mode 100644 index 
000000000000..f9181e8ad70e --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py @@ -0,0 +1,268 @@ +from dataclasses import asdict, dataclass, fields +import pytorch_lightning as pl + +from megatron.core import ModelParallelConfig +from megatron.core.distributed import DistributedDataParallelConfig +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import TransformerLayerTPOverlapCfg +from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy, ParallelismConfig +from nemo.utils import logging + +try: + from megatron.core.num_microbatches_calculator import get_micro_batch_size +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_micro_batch_size + +try: + import transformer_engine + + HAVE_TE = True +except (ImportError, ModuleNotFoundError): + HAVE_TE = False + + +@dataclass +class _CommOverlapConfig: + # Tensor parallel communication overlap (experimental) + tp_comm_overlap: bool = None + tp_comm_overlap_cfg: dict = None + # Pipeline parallel communication overlap + overlap_p2p_comm: bool = None + batch_p2p_comm: bool = None + # Data parallel communication overlap + overlap_grad_reduce: bool = None + overlap_param_gather: bool = None + overlap_param_gather_with_optimizer_step: bool = None + align_param_gather: bool = None + bucket_size: int = None + # Pipeline bubble overlap + defer_embedding_wgrad_compute: bool = None + wgrad_deferral_limit: int = None + + +class MegatronCommOverlapCallback(Callback): + """ + A PyTorch Lightning callback to enable communication compute overlap. 
+ This callback enables the following: + - tensor parallel communication overlap + - pipeline parallel communication overlap + - data parallel communication overlap + - pipeline bubble overlap + + Args: + tp_comm_overlap (bool): Enable tensor parallel overlap (experimental) + tp_comm_overlap_cfg (TransformerLayerTPOverlapCfg): Tensor parallel overlap config + overlap_p2p_comm (bool): Enable pipeline parallel overlap + batch_p2p_comm (bool): Batch pipeline parallel send/recv into a single op + overlap_grad_reduce (bool): Overlap data parallel gradient reduction with compute + overlap_param_gather (bool): Overlap data parallel parameter gather with compute + overlap_param_gather_with_optimizer_step (bool): Overlap data parallel parameter gather with the optimizer step + align_param_gather (bool): Align data parallel parameter gather across virtual pipeline chunks + bucket_size (int): The DDP bucket size, controls the data parallel overlap granularity + defer_embedding_wgrad_compute (bool): Overlap wgrads with the pipeline drain bubble for the last pipeline stage + wgrad_deferral_limit (int): Limit of how many outstanding wgrads may be overlapped with the pipeline drain bubble + + Example: + >>> callback = MegatronCommOverlapCallback(tp_comm_overlap=True) + >>> trainer = Trainer(callbacks=[callback]) + """ + + def __init__( + self, + tp_comm_overlap: bool = None, + tp_comm_overlap_cfg: TransformerLayerTPOverlapCfg = None, + overlap_p2p_comm: bool = None, + batch_p2p_comm: bool = None, + overlap_grad_reduce: bool = None, + overlap_param_gather: bool = None, + overlap_param_gather_with_optimizer_step: bool = None, + align_param_gather: bool = None, + bucket_size: int = None, + defer_embedding_wgrad_compute: bool = None, + wgrad_deferral_limit: int = None, + ): + + self.user_comm_overlap_cfg = _CommOverlapConfig( + tp_comm_overlap=tp_comm_overlap, + tp_comm_overlap_cfg=tp_comm_overlap_cfg, + overlap_p2p_comm=overlap_p2p_comm, + batch_p2p_comm=batch_p2p_comm, + overlap_grad_reduce=overlap_grad_reduce, + overlap_param_gather=overlap_param_gather, + overlap_param_gather_with_optimizer_step=overlap_param_gather_with_optimizer_step, + align_param_gather=align_param_gather, + bucket_size=bucket_size, + defer_embedding_wgrad_compute=defer_embedding_wgrad_compute, + wgrad_deferral_limit=wgrad_deferral_limit, + ) + + self.tp_comm_overlap_cfg = None + self.need_tp_overlap_ub_init = False + + def _get_model_comm_overlap_cfgs( + self, + parallelism_cfg: ParallelismConfig, + ) -> _CommOverlapConfig: + comm_overlap_cfg = _CommOverlapConfig() + + vp_size = parallelism_cfg.virtual_pipeline_model_parallel_size + if vp_size is None: + vp_size = 1 + + # Optimizations disabled by default, can be overridden by user + comm_overlap_cfg.tp_comm_overlap = False + comm_overlap_cfg.tp_comm_overlap_cfg = None + comm_overlap_cfg.defer_embedding_wgrad_compute = False + comm_overlap_cfg.wgrad_deferral_limit = -1 + + # Check if TP overlap can be safely enabled + if self.user_comm_overlap_cfg.tp_comm_overlap is True: + if parallelism_cfg.tensor_model_parallel_size < 2: + logging.warning("Disabling tensor parallel communication overlap due to TP size < 2.") + self.user_comm_overlap_cfg.tp_comm_overlap = False + elif not parallelism_cfg.sequence_parallel: + logging.warning("Disabling tensor parallel communication overlap due to sequence_parallel=False.") + self.user_comm_overlap_cfg.tp_comm_overlap = False + elif not HAVE_TE: + logging.warning("Disabling tensor parallel communication overlap due to TE not detected.") + 
self.user_comm_overlap_cfg.tp_comm_overlap = False + + # PP overlap + if parallelism_cfg.pipeline_model_parallel_size > 1: + if vp_size > 1: + comm_overlap_cfg.overlap_p2p_comm = True + comm_overlap_cfg.batch_p2p_comm = False + else: + comm_overlap_cfg.overlap_p2p_comm = False + comm_overlap_cfg.batch_p2p_comm = True + else: + comm_overlap_cfg.overlap_p2p_comm = False + comm_overlap_cfg.batch_p2p_comm = False + + comm_overlap_cfg = self._override_user_cfgs(comm_overlap_cfg) + return comm_overlap_cfg + + def _get_optimizer_overlap_cfgs(self, parallelism_cfg: ParallelismConfig) -> _CommOverlapConfig: + from nemo.utils import AppState + + app_state = AppState() + data_parallel_size = app_state.data_parallel_size + + vp_size = parallelism_cfg.virtual_pipeline_model_parallel_size + if vp_size is None: + vp_size = 1 + + comm_overlap_cfg = _CommOverlapConfig() + comm_overlap_cfg.bucket_size = None + comm_overlap_cfg.overlap_grad_reduce = False + comm_overlap_cfg.overlap_param_gather = False + comm_overlap_cfg.overlap_param_gather_with_optimizer_step = False + comm_overlap_cfg.align_param_gather = False + + if data_parallel_size > 1: + comm_overlap_cfg.bucket_size = 128 * 1024 * 1024 + comm_overlap_cfg.overlap_grad_reduce = True + comm_overlap_cfg.overlap_param_gather = True + if parallelism_cfg.pipeline_model_parallel_size > 1 and vp_size > 1: + comm_overlap_cfg.overlap_param_gather_with_optimizer_step = True + comm_overlap_cfg.align_param_gather = True + + comm_overlap_cfg = self._override_user_cfgs(comm_overlap_cfg) + return comm_overlap_cfg + + def _apply_cfgs(self, src_cfg, dest_cfg): + # apply optimizations into dest_cfg + for field in fields(src_cfg): + if hasattr(dest_cfg, field.name): + setattr(dest_cfg, field.name, getattr(src_cfg, field.name)) + + def _override_user_cfgs(self, comm_overlap_cfg): + # override default configs with any user provided configs + if isinstance(self.user_comm_overlap_cfg, _CommOverlapConfig): + for field in fields(self.user_comm_overlap_cfg): + user_value = getattr(self.user_comm_overlap_cfg, field.name) + if user_value is not None: + setattr(comm_overlap_cfg, field.name, user_value) + + return comm_overlap_cfg + + def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) -> None: + assert isinstance(trainer.strategy, MegatronStrategy), "MegatronCommOverlapCallback requires MegatronStrategy" + parallelism_cfg = trainer.strategy.parallelism + + if hasattr(trainer.model, "config") and isinstance(trainer.model.config, ModelParallelConfig): + comm_overlap_cfg = self._get_model_comm_overlap_cfgs(parallelism_cfg) + self._apply_cfgs(comm_overlap_cfg, trainer.model.config) + if hasattr(trainer.model, '__io__'): + self._apply_cfgs(comm_overlap_cfg, trainer.model.__io__.config) + + if trainer.model.config.tp_comm_overlap: + self.tp_comm_overlap_cfg = comm_overlap_cfg.tp_comm_overlap_cfg + self.need_tp_overlap_ub_init = True + + # Data parallel overlap is only available with the Megatron DDP and Distributed optimizer + if ( + hasattr(trainer.model.optim, "config") + and isinstance(trainer.model.optim.config, OptimizerConfig) + and isinstance(trainer.strategy.ddp_config, DistributedDataParallelConfig) + and trainer.strategy.ddp_config.use_distributed_optimizer + ): + comm_overlap_cfg = self._get_optimizer_overlap_cfgs(parallelism_cfg) + self._apply_cfgs(comm_overlap_cfg, trainer.model.optim.config) + self._apply_cfgs(comm_overlap_cfg, trainer.strategy.ddp_config) + if hasattr(trainer.model, '__io__'): + self._apply_cfgs(comm_overlap_cfg, 
trainer.model.__io__.optim.config) + + def _init_te_userbuffers(self, model_parallel_cfg: ModelParallelConfig): + from megatron.core import parallel_state + + if self.tp_comm_overlap_cfg is None: + logging.warning( + "Tensor parallel overlap: No overlap config provided. Initializing TP comm overlap with the default config." + ) + else: + # ub_cfgs is a dataclass, however TE needs a dict, so convert here + self.tp_comm_overlap_cfg = asdict(self.tp_comm_overlap_cfg) + + micro_batch_size = get_micro_batch_size() + hidden_size = model_parallel_cfg.hidden_size + sequence_length = model_parallel_cfg.seq_length + fp8 = model_parallel_cfg.fp8 is not None + + input_shape = [ + sequence_length * micro_batch_size // parallel_state.get_context_parallel_world_size(), + hidden_size, + ] + + try: + transformer_engine.pytorch.module.base.initialize_ub( + shape=input_shape, + tp_size=parallel_state.get_tensor_model_parallel_world_size(), + use_fp8=fp8, + ub_cfgs=self.tp_comm_overlap_cfg, + ) + except Exception as error: + raise Exception(f"Tensor parallel overlap: userbuffer initialization failed with {error}") + + self.need_tp_overlap_ub_init = False + + # _init_te_userbuffers must run once before any stages, however there isnt such a + # unified callback, so add a hook for every stage + def on_fit_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: + if self.need_tp_overlap_ub_init: + self._init_te_userbuffers(trainer.model.config) + + def on_validation_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: + if self.need_tp_overlap_ub_init: + self._init_te_userbuffers(trainer.model.config) + + def on_test_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: + if self.need_tp_overlap_ub_init: + self._init_te_userbuffers(trainer.model.config) + + def on_predict_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: + if self.need_tp_overlap_ub_init: + self._init_te_userbuffers(trainer.model.config) From 34393c642505fec19164f77c5a8f8d03b023a5ad Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Fri, 6 Sep 2024 00:26:10 -0700 Subject: [PATCH 113/664] [NeMo-UX] checkpointing improvements (#10241) * save model weights and artifacts to separate directories Signed-off-by: ashors1 * add save_artifacts_on_train_end Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * do not save optimizer states in final checkpoint Signed-off-by: ashors1 * WIP support for saving only last k optimizer states Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * minor cleanup Signed-off-by: ashors1 * Revert support for saving last k optimizer states. This will be addressed in a subsequent PR. 
* use storage_options to determine when to skip saving optimizer states Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * fix variable names, make checkpoint load work when optimizer states don't exist in the checkpoint Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * FSDP updates, provide option to save optimizer states on_train_end Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * simplify implementation, remove save_best_model option Signed-off-by: ashors1 * update default value of ckpt_include_optimizer for fsdp Signed-off-by: ashors1 * remove unused imports Signed-off-by: ashors1 * remove unused import Signed-off-by: ashors1 * cleanup Signed-off-by: ashors1 * make storage_options optional again Signed-off-by: ashors1 * fix failing tests Signed-off-by: ashors1 * address some comments Signed-off-by: ashors1 * use save_weights_only to determine whether to save optimizer states Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * add some comments Signed-off-by: ashors1 * fix tests Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * fixes Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * remove unnecessary line Signed-off-by: ashors1 --------- Signed-off-by: ashors1 Signed-off-by: ashors1 Co-authored-by: ashors1 --- examples/llm/megatron_gpt_pretraining.py | 1 - nemo/lightning/io/mixin.py | 4 +- .../pytorch/callbacks/model_checkpoint.py | 83 +++++++++---------- .../pytorch/strategies/fsdp_strategy.py | 14 ++-- .../pytorch/strategies/megatron_strategy.py | 11 ++- nemo/lightning/resume.py | 16 ++++ .../collections/llm/test_mnist_model_nemo2.py | 6 +- .../llm/test_mnist_model_nemo2_fsdp.py | 6 +- tests/lightning/test_nemo_logger.py | 12 ++- 9 files changed, 93 insertions(+), 60 deletions(-) diff --git a/examples/llm/megatron_gpt_pretraining.py b/examples/llm/megatron_gpt_pretraining.py index cfdb6a6acb4b..bf36971d35d6 100644 --- a/examples/llm/megatron_gpt_pretraining.py +++ b/examples/llm/megatron_gpt_pretraining.py @@ -71,7 +71,6 @@ def get_args(): strategy = nl.MegatronStrategy() checkpoint_callback = ModelCheckpoint( every_n_train_steps=5000, - enable_nemo_ckpt_io=False, ) callbacks = [checkpoint_callback] diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index ff6c925a64bb..eee2d9ef751a 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -141,7 +141,7 @@ def io_dump(self, output: Path): will be stored. """ output_path = Path(output) - local_artifacts_dir = "artifacts" + local_artifacts_dir = "." 
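+ # With local_artifacts_dir set to ".", artifacts are written directly into the output directory rather than a separate "artifacts/" subdirectory.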
artifacts_dir = output_path / local_artifacts_dir artifacts_dir.mkdir(parents=True, exist_ok=True) @@ -518,7 +518,7 @@ def _io_path_elements_fn(x): return x.__io__.__path_elements__() -def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "artifacts"): +def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "."): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): current_val = getattr(cfg, artifact.attr) if current_val is None: diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index db48ded0d10d..7ebeed138d2c 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -40,21 +40,25 @@ class ModelCheckpoint(PTLModelCheckpoint): verbose: Verbosity mode. save_last: When ``True``, saves a `*-last` copy whenever a checkpoint file gets saved. save_top_k: When ``True``, saves the top-k checkpoints according to ``monitor``. - save_weights_only: if ``True``, then only the model's weights will be saved. + save_weights_only: if ``True``, then only the model's weights will be saved. Optimizer states will + be omitted from all checkpoints. mode: One of {min, max}. Whether the objective is to minimize or maximize the monitored quantity. every_n_epochs: Number of epochs between checkpoints. every_n_train_steps: Number of train steps between checkpoints. train_time_interval: After each interval, monitor checkpoints. Not to be used with ``every_n_epochs`` or ``every_n_train_steps``. - save_best_model: When ``True``, reloads and saves the best checkpoint. save_on_train_epoch_end: Whether to run checkpointing at the end of the training epoch - enable_nemo_ckpt_io: Whether to dump the current model model state, including the - config file, to allow for reproducibility of experiments. + save_optim_on_train_end: Whether to include the optimizer states in the final checkpoint + at the end of training. Only applicable when save_weights_only is ``False``. + always_save_context: Whether to dump the artifacts needed to reinitialize the current + model, trainer, and dataloader to allow for reproducibility of experiments. + save_context_on_train_end: Whether to dump the artifacts on_train_end regardless of whether + ``always_save_context`` is ``True``. async_save: Whether to enable asynchronous checkpointing. - try_restore_best_ckpt: Whether to restore the best model path. """ UNFINISHED_CHECKPOINT_SUFFIX = "-unfinished" + WEIGHTS_PATH = "weights" def __init__( self, @@ -67,21 +71,21 @@ def __init__( every_n_epochs: int = None, every_n_train_steps: Optional[int] = None, train_time_interval: Optional[timedelta] = None, - save_best_model: bool = False, save_on_train_epoch_end: Optional[bool] = False, # Save after training, not after validation - enable_nemo_ckpt_io: bool = True, - try_restore_best_ckpt: bool = True, + save_optim_on_train_end: Optional[bool] = False, + always_save_context: bool = False, + save_context_on_train_end: bool = True, **kwargs, ): - self.save_best_model = save_best_model - self.previous_best_path = "" - self.enable_nemo_ckpt_io = enable_nemo_ckpt_io + self.always_save_context = always_save_context + self.save_context_on_train_end = save_context_on_train_end + self.save_optim_on_train_end = save_optim_on_train_end + # Checkpoints which removal is deferred until async save is done. 
# Each element of `deferred_ckpts_to_remove` is a growing list # that `self._remove_checkpoint` adds to. Once `self._save_checkpoint` # is called, the last element is frozen and a new element is added. self.deferred_ckpts_to_remove: List[List[str]] = [] - self.try_restore_best_ckpt = try_restore_best_ckpt # Call the parent class constructor with the remaining kwargs. super().__init__( @@ -251,11 +255,9 @@ def setup(self, trainer, *args, **kwargs) -> None: self.async_save = getattr(trainer.strategy, "async_save", False) super().setup(trainer, *args, **kwargs) - def on_save_checkpoint(self, trainer, pl_module, checkpoint): - output = super().on_save_checkpoint(trainer, pl_module, checkpoint) - return output - def on_train_end(self, trainer, pl_module): + from nemo.utils.get_rank import is_global_rank_zero + if trainer.fast_dev_run: return None @@ -272,26 +274,11 @@ def on_train_end(self, trainer, pl_module): logging.debug(f'Last checkpoint {self.last_model_path} already saved') else: super()._save_last_checkpoint(trainer, monitor_candidates) + if self.save_context_on_train_end and not self.always_save_context and is_global_rank_zero(): + TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(self.last_model_path) / "context") # Call parent on_train_end() to save the -last checkpoint super().on_train_end(trainer, pl_module) - # Load the best model and then re-save it - if self.save_best_model: - # wait for all processes - trainer.strategy.barrier("SaveBestCheckpointConnector.resume_end") - if self.best_model_path == "": - logging.warning( - f"{self} was told to save the best checkpoint at the end of training, but no saved checkpoints " - "were found. Saving latest model instead." - ) - - else: - if os.path.isdir(self.best_model_path.split('.ckpt')[0]): - self.best_model_path = self.best_model_path.split('.ckpt')[0] - if self.try_restore_best_ckpt: - self.best_model_path = trainer.strategy.broadcast(self.best_model_path) - trainer._checkpoint_connector.restore(self.best_model_path) - def _del_model_without_trainer(self, filepath: str) -> None: from nemo.utils.get_rank import is_global_rank_zero @@ -409,8 +396,11 @@ def _monitor_candidates(self, trainer: "pl.Trainer") -> Dict[str, torch.Tensor]: return monitor_candidates def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) -> None: + from nemo.utils.get_rank import is_global_rank_zero + # barrier_after=True, so all ranks continue after the unfinished checkpoint marker is placed. # if anything goes wrong during checkpointing, we should be able to detect that data is incomplete. + ckpt_filepath = ckpt_to_dir(filepath) / ModelCheckpoint.WEIGHTS_PATH self.set_checkpoint_unfinished_marker(filepath, barrier_after=True) ema_callback = self._ema_callback(trainer) @@ -420,17 +410,26 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) if self.async_save: raise ValueError('async_save with EMA not supported') with ema_callback.save_original_optimizer_state(trainer): - super()._save_checkpoint(trainer, filepath) + super()._save_checkpoint(trainer, ckpt_filepath) # save EMA copy of the model as well. 
with ema_callback.save_ema_model(trainer): - rank_zero_info(f"Saving EMA weights to separate checkpoint {filepath}") - filepath = self._ema_format_filepath(filepath) + rank_zero_info(f"Saving EMA weights to separate checkpoint {ckpt_filepath}") + ckpt_filepath = self._ema_format_filepath(ckpt_filepath) if self.verbose: - rank_zero_info(f"Saving EMA weights to separate checkpoint {filepath}") - super()._save_checkpoint(trainer, filepath) + rank_zero_info(f"Saving EMA weights to separate checkpoint {ckpt_filepath}") + super()._save_checkpoint(trainer, ckpt_filepath) self.remove_checkpoint_unfinished_marker(filepath, barrier_before=True) else: + ## Determine whether to include optimizer states in the checkpoint + ## optimizer states are included when + ## 1. save_weights_only is False and + ## 2. either save_optim_on_train_end is True, or save_optim_on_train_end is False but the checkpoint + ## is an intermediate checkpoint. + save_weights_only = self.save_weights_only or ( + not self.save_optim_on_train_end and trainer.global_step == trainer.max_steps + ) + # Async save passes the finalization function to checkpoint_io, # sync save calls the finalization function immediately after save. finalize_fn = self._get_finalize_save_checkpoint_callback(trainer, filepath, trainer.global_step) @@ -445,13 +444,11 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) self.deferred_ckpts_to_remove.append([]) else: storage_options = None - trainer.save_checkpoint(filepath, self.save_weights_only, storage_options=storage_options) + trainer.save_checkpoint(ckpt_filepath, save_weights_only, storage_options=storage_options) - ## NOTE: saving context happens synchronously always - from nemo.utils.get_rank import is_global_rank_zero + if self.always_save_context and is_global_rank_zero(): + TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(filepath) / "context") - if self.enable_nemo_ckpt_io and is_global_rank_zero(): - TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(filepath)) if self.async_save: logging.info(f'Scheduled async checkpoint save for {filepath}') else: diff --git a/nemo/lightning/pytorch/strategies/fsdp_strategy.py b/nemo/lightning/pytorch/strategies/fsdp_strategy.py index 24087f80aae4..2a210c9bd7f0 100644 --- a/nemo/lightning/pytorch/strategies/fsdp_strategy.py +++ b/nemo/lightning/pytorch/strategies/fsdp_strategy.py @@ -208,11 +208,15 @@ def save_checkpoint( checkpoint["sharded_state_dict"] = pyt_to_mcore_state_dict(checkpoint.pop("state_dict")) checkpoint["state_dict"] = OrderedDict([]) - # TODO: do we still need to keep this? - for optim_state in checkpoint['optimizer_states']: - optim_state.pop("state") - - if self.trainer.state.fn == TrainerFn.FITTING and self.ckpt_save_optimizer: + ## replace unsharded optimizer_states with sharded dict. + ## note that if trainer.save_checkpoint(path, save_weights_only=True) is called, + ## the checkpoint will contain only model weights. Optimizer states will be omitted. 
+ if ( + "optimizer_states" in checkpoint + and self.trainer.state.fn == TrainerFn.FITTING + and self.ckpt_save_optimizer + ): + del checkpoint["optimizer_states"] checkpoint['optimizer'] = get_optimizer_state_dict(self.model, self.optimizers) pyt_to_mcore_state_dict(checkpoint['optimizer']['state'], prefix="optimizer.state.") diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index fae6df5be207..4bf8c42ece02 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -625,7 +625,16 @@ def save_checkpoint( # retrieve `sharded_state_dict` if it has not already been configured in `on_save_checkpoint` if "sharded_state_dict" not in checkpoint: checkpoint["sharded_state_dict"] = self.megatron_parallel.sharded_state_dict() - if self.trainer.state.fn == TrainerFn.FITTING and self.ckpt_save_optimizer: + + ## replace unsharded optimizer_states with sharded dict. + ## note that if trainer.save_checkpoint(path, save_weights_only=True) is called, + ## the checkpoint will contain only model weights. Optimizer states will be omitted. + if ( + "optimizer_states" in checkpoint + and self.trainer.state.fn == TrainerFn.FITTING + and self.ckpt_save_optimizer + ): + del checkpoint["optimizer_states"] checkpoint["optimizer"] = [self.optimizer_sharded_state_dict()] self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py index bce1964b6699..c8cefb4dd8d3 100644 --- a/nemo/lightning/resume.py +++ b/nemo/lightning/resume.py @@ -66,6 +66,11 @@ class AutoResume: resume_past_end: bool = False resume_ignore_no_checkpoint: bool = False + WEIGHTS_PATH = "weights" + + def get_model_weights_path(self, path): + return Path(path) / self.WEIGHTS_PATH + def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None): if isinstance(trainer, fl.Fabric): raise NotImplementedError("Fabric is not supported yet.") @@ -90,6 +95,7 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None): def _try_import_model( self, model: Optional[io.ConnectorMixin], path: str, adapter_path: Optional[str] = None ) -> BasePath: + if model is None: raise ValueError("Model is needed to import checkpoint from HF or other non-NeMo checkpoint format.") try: @@ -99,6 +105,11 @@ def _try_import_model( new_path = path if adapter_path: + + maybe_model_weights_path = self.get_model_weights_path(adapter_path) + if os.path.isdir(maybe_model_weights_path): + adapter_path = maybe_model_weights_path + new_path = AdapterPath(Path(adapter_path), base_model_path=new_path) if isinstance(new_path, str): @@ -211,6 +222,11 @@ def get_trainer_ckpt_path(self, model: Optional[io.ConnectorMixin] = None) -> Op if self.resume_if_exists: checkpoint = self._find_trainer_ckpt_path() + if checkpoint: + maybe_model_weights_path = self.get_model_weights_path(checkpoint) + if os.path.isdir(maybe_model_weights_path): + checkpoint = maybe_model_weights_path + if checkpoint: if self.adapter_path: return AdapterPath(Path(self.adapter_path), base_model_path=checkpoint) diff --git a/tests/collections/llm/test_mnist_model_nemo2.py b/tests/collections/llm/test_mnist_model_nemo2.py index c9507ab66bb3..616d845f590f 100644 --- a/tests/collections/llm/test_mnist_model_nemo2.py +++ b/tests/collections/llm/test_mnist_model_nemo2.py @@ -496,13 +496,12 @@ def run_train_mnist_litautoencoder_with_megatron_strategy_single_gpu(): # Configure our 
custom Checkpointer name = "test_experiment" checkpoint_callback = nl_callbacks.ModelCheckpoint( - save_best_model=True, save_last=True, monitor="val_loss", save_top_k=1, every_n_train_steps=5, # Enables the .nemo file-like checkpointing where all IOMixins are under SerDe - enable_nemo_ckpt_io=True, + always_save_context=True, ) root_dir = tmpdir save_dir = root_dir / name @@ -571,6 +570,9 @@ def run_train_mnist_litautoencoder_with_megatron_strategy_single_gpu(): ckpt_path = checkpoint_callback.last_model_path.replace( ".ckpt", "" ) # strip .ckpt off the end of the last path + ckpt_path = ( + Path(ckpt_path) / "weights" + ) ## weights are saved to the "weights" directory within the checkpoint assert Path( ckpt_path diff --git a/tests/collections/llm/test_mnist_model_nemo2_fsdp.py b/tests/collections/llm/test_mnist_model_nemo2_fsdp.py index 025f589e2f39..3ef0f14f10d8 100644 --- a/tests/collections/llm/test_mnist_model_nemo2_fsdp.py +++ b/tests/collections/llm/test_mnist_model_nemo2_fsdp.py @@ -519,13 +519,12 @@ def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu(): # Configure our custom Checkpointer name = "test_experiment" checkpoint_callback = nl_callbacks.ModelCheckpoint( - save_best_model=True, save_last=True, monitor="val_loss", save_top_k=1, every_n_train_steps=5, # Enables the .nemo file-like checkpointing where all IOMixins are under SerDe - enable_nemo_ckpt_io=True, + always_save_context=True, ) root_dir = tmpdir save_dir = root_dir / name @@ -583,6 +582,9 @@ def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu(): ckpt_path = checkpoint_callback.last_model_path.replace( ".ckpt", "" ) # strip .ckpt off the end of the last path + ckpt_path = ( + Path(ckpt_path) / "weights" + ) ## weights are saved to the "weights" directory within the checkpoint assert Path( ckpt_path diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 54636f56472a..387d3540930f 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +import shutil import time from pathlib import Path from unittest.mock import patch @@ -159,7 +160,10 @@ def test_resume(self, trainer, tmp_path): ## if there are multiple "-last" checkpoints, choose the most recent one Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last").mkdir() time.sleep(1) ## sleep for a second so the checkpoints are created at different times - Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last").mkdir() + ## make a "weights" dir within the checkpoint + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last" / "weights").mkdir( + parents=True + ) time.sleep(1) # unfinished last, that should be ignored Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel3--last").mkdir() @@ -169,11 +173,11 @@ def test_resume(self, trainer, tmp_path): resume_from_directory=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), resume_if_exists=True, ).setup(trainer) + ## if "weights" exists, we should restore from there assert str(trainer.ckpt_path) == str( - Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last") + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last" / "weights") ) - # Finally succeed logger = nl.NeMoLogger( name="default", dir=str(tmp_path) + "/test_resume", @@ -181,7 +185,7 @@ def test_resume(self, trainer, tmp_path): use_datetime_version=False, ) logger.setup(trainer) - Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last").rmdir() + shutil.rmtree(Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last")) nl.AutoResume( resume_if_exists=True, ).setup(trainer) From ad5ef750e351edbb5eeb7eb6df2d0c804819600f Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 6 Sep 2024 11:01:56 -0700 Subject: [PATCH 114/664] [Nemo Unit Tests] Split CPU unit tests (#10365) * Split CPU unit tests * Split CPU unit tests * Fix:Run pytest in specific paths * Fix:Run pytest in specific paths * Fix:Run pytest in specific paths --- .github/workflows/cicd-main.yml | 135 +++++++++++++++++++++++++++++--- 1 file changed, 124 insertions(+), 11 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 7375f81c4b09..dd74e050a533 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -133,16 +133,119 @@ jobs: NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads IS_OPTIONAL: true - # # TODO refactor: Commenting this test out until it is fixed & works properly again (test passes again) - # OPTIONAL_L0_Unit_Tests_CPU: - # needs: [cicd-test-container-setup] - # uses: ./.github/workflows/_test_template.yml - # with: - # RUNNER: self-hosted-azure-cpu - # TIMEOUT: 60 - # SCRIPT: | - # CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - # IS_OPTIONAL: true + # L0: CPU unit tests + L0_Unit_Tests_CPU_ASR: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-cpu + TIMEOUT: 20 + SCRIPT: | + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + IS_OPTIONAL: true + + L0_Unit_Tests_CPU_Audio: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: 
self-hosted-azure-cpu + SCRIPT: | + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + IS_OPTIONAL: true + + L0_Unit_Tests_CPU_Common: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-cpu + SCRIPT: | + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + IS_OPTIONAL: true + + L0_Unit_Tests_CPU_LLM: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-cpu + SCRIPT: | + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + IS_OPTIONAL: true + + L0_Unit_Tests_CPU_Multimodal: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-cpu + SCRIPT: | + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + IS_OPTIONAL: true + + L0_Unit_Tests_CPU_NLP: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-cpu + SCRIPT: | + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + IS_OPTIONAL: true + + L0_Unit_Tests_CPU_TTS: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-cpu + SCRIPT: | + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + IS_OPTIONAL: true + + L0_Unit_Tests_CPU_Core: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-cpu + SCRIPT: | + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/core tests/core_ptl -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + IS_OPTIONAL: true + + L0_Unit_Tests_CPU_Hydra: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-cpu + SCRIPT: | + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + IS_OPTIONAL: true + + L0_Unit_Tests_CPU_Lightning: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-cpu + SCRIPT: | + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + IS_OPTIONAL: true + + L0_Unit_Tests_CPU_Ohers: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-cpu + SCRIPT: | + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat \ + --ignore=tests/collections/asr \ + --ignore=tests/collections/audio \ + --ignore=tests/collections/common \ + --ignore=tests/collections/llm \ + --ignore=tests/collections/multimodal \ + --ignore=tests/collections/nlp \ + --ignore=tests/collections/tts \ + --ignore=tests/core \ + --ignore=tests/core_ptl \ + --ignore=tests/hydra \ + --ignore=tests/lightning \ + --ignore=tests/utils + IS_OPTIONAL: true + L0_Setup_Test_Data_And_Models: 
needs: [cicd-test-container-setup] @@ -4868,7 +4971,17 @@ jobs: - gpu-test - cicd-test-container-setup - L0_Unit_Tests_GPU - #- OPTIONAL_L0_Unit_Tests_CPU + - L0_Unit_Tests_CPU_ASR + - L0_Unit_Tests_CPU_Audio + - L0_Unit_Tests_CPU_Common + - L0_Unit_Tests_CPU_LLM + - L0_Unit_Tests_CPU_Multimodal + - L0_Unit_Tests_CPU_NLP + - L0_Unit_Tests_CPU_TTS + - L0_Unit_Tests_CPU_Core + - L0_Unit_Tests_CPU_Hydra + - L0_Unit_Tests_CPU_Lightning + - L0_Unit_Tests_CPU_Ohers - L2_Community_LLM_Checkpoints_tests_Bert - L2_Community_LLM_Checkpoints_tests_Mamba2 - L2_Community_LLM_Checkpoints_tests_Llama From 95944ee48e03786fe185a590483348f82e2b9e4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 6 Sep 2024 14:52:55 -0700 Subject: [PATCH 115/664] ci: Fix checkout of secrets detector (#10381) * ci: Fix checkout of secrets detector Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/workflows/secrets-detector.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/secrets-detector.yml b/.github/workflows/secrets-detector.yml index 4de052535cc1..a7793a9c62db 100644 --- a/.github/workflows/secrets-detector.yml +++ b/.github/workflows/secrets-detector.yml @@ -23,10 +23,14 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 with: + path: ${{ github.run_id }} ref: ${{ inputs.branch-name || github.head_ref }} + fetch-depth: 0 - name: Install secrets detector run: pip install detect-secrets - name: Run on change-set - run: git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .github/workflows/config/.secrets.baseline \ No newline at end of file + run: | + cd ${{ github.run_id }} + git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .github/workflows/config/.secrets.baseline \ No newline at end of file From 7ba06811d8b5d46067690a090ec0fb1d27a3012c Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Fri, 6 Sep 2024 16:07:52 -0700 Subject: [PATCH 116/664] only log consumed samples during training (#10371) Signed-off-by: ashors1 --- nemo/lightning/pytorch/plugins/data_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 2a3b25f97cdc..060ec7915ec0 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -111,7 +111,7 @@ def on_megatron_step_end(self, trainer: pl.Trainer, pl_module: pl.LightningModul self.prev_global_batch_size = self.current_global_batch_size consumed_samples = self.compute_consumed_samples(trainer.global_step + 1 - self.init_global_step) - if self.output_log: + if self.output_log and self.trainer.training: # You may need to turn off logging, for example when doing trainer.predict(model, data) pl_module.log( 'consumed_samples', From 62c1dce1dacca85509d4a1ffb33d07f75325abed Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Sat, 7 Sep 2024 08:24:48 -0500 Subject: [PATCH 117/664] Alit/mamba 2 0 migration (#10338) --- nemo/collections/llm/__init__.py | 16 + nemo/collections/llm/gpt/data/fine_tuning.py | 5 + nemo/collections/llm/gpt/data/squad.py | 2 + nemo/collections/llm/gpt/model/__init__.py | 18 + nemo/collections/llm/gpt/model/ssm.py | 317 ++++++++++++++++++ 
.../language_modeling/megatron_mamba_model.py | 3 - nemo/lightning/io/mixin.py | 8 +- nemo/lightning/io/state.py | 2 +- .../convert_mamba2_pyt_to_nemo.py | 7 +- .../llm/megatron_ssm_finetuning.py | 110 ++++++ .../llm/megatron_ssm_pretraining.py | 129 +++++++ 11 files changed, 607 insertions(+), 10 deletions(-) create mode 100644 nemo/collections/llm/gpt/model/ssm.py create mode 100644 tests/collections/llm/megatron_ssm_finetuning.py create mode 100644 tests/collections/llm/megatron_ssm_pretraining.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 8da00b0edd7f..a5ce0c82a0e0 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -31,6 +31,11 @@ Baichuan2Config, Baichuan2Config7B, Baichuan2Model, + BaseMambaConfig1_3B, + BaseMambaConfig2_7B, + BaseMambaConfig130M, + BaseMambaConfig370M, + BaseMambaConfig780M, ChatGLM2Config6B, ChatGLM3Config6B, ChatGLMConfig, @@ -71,12 +76,15 @@ Nemotron4Config340B, NemotronConfig, NemotronModel, + NVIDIAMambaConfig8B, + NVIDIAMambaHybridConfig8B, Qwen2Config, Qwen2Config1P5B, Qwen2Config7B, Qwen2Config72B, Qwen2Config500M, Qwen2Model, + SSMConfig, Starcoder2Config, Starcoder2Config3B, Starcoder2Config7B, @@ -120,6 +128,14 @@ "Nemotron4Config22B", "Nemotron4Config340B", "NemotronConfig", + "SSMConfig", + "BaseMambaConfig130M", + "BaseMambaConfig370M", + "BaseMambaConfig780M", + "BaseMambaConfig1_3B", + "BaseMambaConfig2_7B", + "NVIDIAMambaConfig8B", + "NVIDIAMambaHybridConfig8B", "LlamaConfig", "Llama2Config7B", "Llama2Config13B", diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 7fa5bd719581..46cab3163368 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -63,6 +63,7 @@ def __init__( num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, + pad_to_max_length: bool = False, ): super().__init__() self.seq_length = seq_length @@ -78,6 +79,7 @@ def __init__( self.rampup_batch_size = rampup_batch_size self.data_sampler = None self.max_train_samples = None + self.pad_to_max_length = pad_to_max_length def setup(self, stage: str): self.data_sampler = MegatronDataSampler( @@ -97,6 +99,7 @@ def train_dataloader(self) -> DataLoader: self._create_dataset( str(self.train_path), max_num_samples=self.max_train_samples, + pad_to_max_length=self.pad_to_max_length, ) ) @@ -105,6 +108,7 @@ def val_dataloader(self) -> DataLoader: self._create_dataset( str(self.validation_path), is_test=True, + pad_to_max_length=self.pad_to_max_length, ), ) @@ -114,6 +118,7 @@ def test_dataloader(self) -> DataLoader: str(self.test_path), tokens_to_generate=32, is_test=True, + pad_to_max_length=self.pad_to_max_length, ) ) diff --git a/nemo/collections/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py index a2dfa12af69e..3f73d67ec61d 100644 --- a/nemo/collections/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -53,6 +53,7 @@ def __init__( num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, + pad_to_max_length: bool = False, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -69,6 +70,7 @@ def __init__( num_workers=num_workers, pin_memory=pin_memory, persistent_workers=persistent_workers, + pad_to_max_length=pad_to_max_length, ) def prepare_data(self) -> None: diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 81098040191c..9785889aaf92 
100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -71,6 +71,16 @@ Qwen2Config500M, Qwen2Model, ) +from nemo.collections.llm.gpt.model.ssm import ( + BaseMambaConfig1_3B, + BaseMambaConfig2_7B, + BaseMambaConfig130M, + BaseMambaConfig370M, + BaseMambaConfig780M, + NVIDIAMambaConfig8B, + NVIDIAMambaHybridConfig8B, + SSMConfig, +) from nemo.collections.llm.gpt.model.starcoder import StarcoderConfig, StarcoderConfig15B, StarcoderModel from nemo.collections.llm.gpt.model.starcoder2 import ( Starcoder2Config, @@ -137,6 +147,14 @@ "Qwen2Config7B", "Qwen2Config72B", "Qwen2Model", + "SSMConfig", + "BaseMambaConfig130M", + "BaseMambaConfig370M", + "BaseMambaConfig780M", + "BaseMambaConfig1_3B", + "BaseMambaConfig2_7B", + "NVIDIAMambaConfig8B", + "NVIDIAMambaHybridConfig8B", "MaskedTokenLossReduction", "gpt_data_step", "gpt_forward_step", diff --git a/nemo/collections/llm/gpt/model/ssm.py b/nemo/collections/llm/gpt/model/ssm.py new file mode 100644 index 000000000000..954fa8bfe9f7 --- /dev/null +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -0,0 +1,317 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, Literal, Optional + +import torch + +from nemo.utils import logging + +try: + from megatron.core import parallel_state + from megatron.core.models.mamba import MambaModel as MCoreMambaModel + from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec + + HAVE_MEGATRON_CORE_OR_TE = True + +except (ImportError, ModuleNotFoundError): + logging.warning("The package `megatron.core` was not imported in this environment which is needed for SSMs.") + HAVE_MEGATRON_CORE_OR_TE = False + +from megatron.core.transformer.transformer_config import TransformerConfig +from nemo.collections.llm.gpt.model.base import GPTModel, gpt_data_step +from nemo.lightning import get_vocab_size, io, teardown + + +def ssm_forward_step(model, batch) -> torch.Tensor: + + forward_args = { + "input_ids": batch["tokens"], + "position_ids": batch["position_ids"], + "labels": batch["labels"], + } + forward_args["attention_mask"] = None + return model(**forward_args) + + +@dataclass +class SSMConfig(TransformerConfig, io.IOMixin): + # From megatron.core.models.mamba.mamba_model.MambaModel + fp16_lm_cross_entropy: bool = False + parallel_output: bool = True + share_embeddings_and_output_weights: bool = False + num_layers: int = 2 + mamba_ssm_ngroups: int = 8 + num_attention_heads: int = 1 + hybrid_attention_ratio: float = 0.0 + hybrid_mlp_ratio: float = 0.0 + hybrid_override_pattern: str = None + post_process: bool = True + pre_process: bool = True + seq_length: int = 2048 + # Mamba with no attention has no need for position embeddings, so none is default + position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'none' + rotary_percent: float = 1.0 + rotary_base: int = 10000 + 
seq_len_interpolation_factor: Optional[float] = None + apply_rope_fusion: bool = True + make_vocab_size_divisible_by: int = 128 + gated_linear_unit: bool = False + fp32_residual_connections: bool = True + normalization: str = 'RMSNorm' + add_bias_linear: bool = False + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + layernorm_epsilon: float = 1e-5 + # TODO: Move this to better places? + get_attention_mask_from_fusion: bool = False + + forward_step_fn: Callable = ssm_forward_step + data_step_fn: Callable = gpt_data_step + + def configure_model(self, tokenizer) -> "MCoreMambaModel": + + return MCoreMambaModel( + self, + mamba_stack_spec=mamba_stack_spec, + vocab_size=get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by), + max_sequence_length=self.seq_length, + mamba_ssm_ngroups=self.mamba_ssm_ngroups, + hybrid_attention_ratio=self.hybrid_attention_ratio, + hybrid_mlp_ratio=self.hybrid_mlp_ratio, + hybrid_override_pattern=self.hybrid_override_pattern, + position_embedding_type=self.position_embedding_type, + rotary_percent=self.rotary_percent, + rotary_base=self.rotary_base, + seq_len_interpolation_factor=self.seq_len_interpolation_factor, + pre_process=parallel_state.is_pipeline_first_stage(), + post_process=parallel_state.is_pipeline_last_stage(), + ) + + +@io.model_importer(GPTModel, "pytorch") +class PyTorchSSMImporter(io.ModelConnector["GPTModel", GPTModel]): + + def __new__(cls, path: str, model_config=None): + instance = super().__new__(cls, path) + instance.model_config = model_config + return instance + + def init(self) -> GPTModel: + + return GPTModel(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + + source = torch.load(str(self), map_location='cpu') + if 'model' in source: + source = source['model'] + + class ModelState: + def __init__(self, state_dict): + self._state_dict = state_dict + + def state_dict(self): + return self._state_dict + + source = ModelState(source) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + logging.info(f"Converted SSM model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, source, target): + + if self.model_config.mapping_type == "base": + mapping = { + 'backbone.embedding.weight': 'embedding.word_embeddings.weight', + 'backbone.layers.*.mixer.A_log': 'decoder.layers.*.mixer.A_log', + 'backbone.layers.*.mixer.D': 'decoder.layers.*.mixer.D', + 'backbone.layers.*.mixer.conv1d.weight': 'decoder.layers.*.mixer.conv1d.weight', + 'backbone.layers.*.mixer.conv1d.bias': 'decoder.layers.*.mixer.conv1d.bias', + 'backbone.layers.*.mixer.in_proj.weight': 'decoder.layers.*.mixer.in_proj.weight', + 'backbone.layers.*.mixer.dt_bias': 'decoder.layers.*.mixer.dt_bias', + 'backbone.layers.*.mixer.out_proj.weight': 'decoder.layers.*.mixer.out_proj.weight', + 'backbone.layers.*.mixer.norm.weight': 'decoder.layers.*.mixer.norm.weight', + 'backbone.layers.*.norm.weight': 'decoder.layers.*.mixer.in_proj.layer_norm_weight', + 'backbone.norm_f.weight': 'decoder.final_norm.weight', + 'lm_head.weight': 'output_layer.weight', + } + elif "nvidia" in self.model_config.mapping_type: + mapping = { + 'embedding.word_embeddings.weight': 'embedding.word_embeddings.weight', + 'decoder.layers.*.mixer.A_log': 'decoder.layers.*.mixer.A_log', + 'decoder.layers.*.mixer.D': 'decoder.layers.*.mixer.D', + 
'decoder.layers.*.mixer.conv1d.weight': 'decoder.layers.*.mixer.conv1d.weight', + 'decoder.layers.*.mixer.conv1d.bias': 'decoder.layers.*.mixer.conv1d.bias', + 'decoder.layers.*.mixer.in_proj.weight': 'decoder.layers.*.mixer.in_proj.weight', + 'decoder.layers.*.mixer.dt_bias': 'decoder.layers.*.mixer.dt_bias', + 'decoder.layers.*.mixer.out_proj.weight': 'decoder.layers.*.mixer.out_proj.weight', + 'decoder.layers.*.mixer.norm.weight': 'decoder.layers.*.mixer.norm.weight', + 'decoder.layers.*.norm.weight': 'decoder.layers.*.mixer.in_proj.layer_norm_weight', + 'decoder.final_norm.weight': 'decoder.final_norm.weight', + 'output_layer.weight': 'output_layer.weight', + } + if "hybrid" in self.model_config.mapping_type: + mapping.update( + { + 'decoder.layers.*.mlp.linear_fc1.layer_norm_weight': 'decoder.layers.*.mlp.linear_fc1.layer_norm_weight', + 'decoder.layers.*.mlp.linear_fc1.weight': 'decoder.layers.*.mlp.linear_fc1.weight', + 'decoder.layers.*.mlp.linear_fc2.weight': 'decoder.layers.*.mlp.linear_fc2.weight', + 'decoder.layers.*.self_attention.linear_proj.weight': 'decoder.layers.*.self_attention.linear_proj.weight', + 'decoder.layers.*.self_attention.linear_qkv.layer_norm_weight': 'decoder.layers.*.self_attention.linear_qkv.layer_norm_weight', + 'decoder.layers.*.self_attention.linear_qkv.weight': 'decoder.layers.*.self_attention.linear_qkv.weight', + } + ) + else: + raise AttributeError(f"mapping type [{self.mapping_type}] not found.") + return io.apply_transforms(source, target, mapping=mapping) + + @property + def tokenizer(self): + from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + + tokenizer = get_nmt_tokenizer( + library=self.model_config.tokenizer_library, + model_name=self.model_config.tokenizer_name, + tokenizer_model=self.model_config.tokenizer_model_path, + use_fast=True, + ) + + return tokenizer + + @property + def config(self) -> SSMConfig: + return self.model_config + + +@dataclass +class BaseMambaConfig130M(SSMConfig): + hybrid_override_pattern: str = "M" * 24 + num_layers: int = 24 + seq_length: int = 2048 + hidden_size: int = 768 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 768 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class BaseMambaConfig370M(SSMConfig): + hybrid_override_pattern: str = "M" * 48 + num_layers: int = 48 + seq_length: int = 2048 + hidden_size: int = 1024 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 1024 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class BaseMambaConfig780M(SSMConfig): + hybrid_override_pattern: str = "M" * 48 + num_layers: int = 48 + seq_length: int = 2048 + hidden_size: int = 1536 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 1536 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class BaseMambaConfig1_3B(SSMConfig): + hybrid_override_pattern: str = "M" * 48 + num_layers: int = 48 + seq_length: int = 2048 + hidden_size: int = 2048 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 2048 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class 
BaseMambaConfig2_7B(SSMConfig): + hybrid_override_pattern: str = "M" * 64 + num_layers: int = 64 + seq_length: int = 2048 + hidden_size: int = 2560 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 2560 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class NVIDIAMambaConfig8B(SSMConfig): + hybrid_override_pattern: str = "M" * 56 + num_layers: int = 56 + seq_length: int = 4096 + hidden_size: int = 4096 + mamba_ssm_ngroups: int = 8 + ffn_hidden_size: int = 4096 + make_vocab_size_divisible_by: int = 128 + tokenizer_library: str = 'megatron' + tokenizer_name: str = "GPTSentencePieceTokenizer" + mapping_type: str = "nvidia-pure" + + +@dataclass +class NVIDIAMambaHybridConfig8B(SSMConfig): + hybrid_override_pattern: str = "M-M-M--M-M*-M-M-M-M--M*-M-M-M-M-M*--M-M-M-M-M*-M--M-M-M-" + num_layers: int = 56 + seq_length: int = 4096 + hidden_size: int = 4096 + mamba_ssm_ngroups: int = 8 + ffn_hidden_size: int = 16384 + num_attention_heads: int = 32 + num_query_groups: int = 8 + make_vocab_size_divisible_by: int = 128 + tokenizer_library: str = 'megatron' + tokenizer_name: str = "GPTSentencePieceTokenizer" + mapping_type: str = "nvidia-hybrid" + + +__all__ = [ + "SSMConfig", + "BaseMambaConfig130M", + "BaseMambaConfig370M", + "BaseMambaConfig780M", + "BaseMambaConfig1_3B", + "BaseMambaConfig2_7B", + "NVIDIAMambaConfig8B", + "NVIDIAMambaHybridConfig8B", +] diff --git a/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py index 54dff1cd7887..afbe85e0edbb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py @@ -67,9 +67,6 @@ def on_validation_epoch_end(self): averaged_loss = torch.tensor(0.0, dtype=torch.float32).cuda() return averaged_loss - def sharded_state_dict(self, prefix: str = ''): - return None - def _reset_activation_checkpointing_args(self): return diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index eee2d9ef751a..f8abc97dc7fc 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -281,7 +281,7 @@ def exporter(cls, ext: str, path: Union[str, Path]) -> ModelConnector: """ return cls._get_connector(ext, path, importer=False) - def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Path] = None) -> Path: + def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Path] = None, **kwargs) -> Path: """ Imports a checkpoint from a specified path, potentially overwriting existing files. @@ -299,14 +299,14 @@ def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Pa ------ FileNotFoundError: If the checkpoint file does not exist at the specified path. """ - connector = self._get_connector(path) + connector = self._get_connector(path, **kwargs) ckpt_path: Path = connector.local_path(base_path=base_path) ckpt_path = connector(ckpt_path, overwrite=overwrite) connector.on_import_ckpt(self) return ckpt_path @classmethod - def _get_connector(cls, ext, path=None, importer=True) -> ModelConnector: + def _get_connector(cls, ext, path=None, importer=True, **kwargs) -> ModelConnector: """ Retrieves the appropriate model connector based on the file extension and path, distinguishing between importers and exporters. 
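The `**kwargs` threaded through `import_ckpt` and `_get_connector` here is what lets the new `pytorch://` SSM importer receive its model config. A minimal sketch of the resulting call, condensed from the finetuning test added later in this commit (paths are placeholders), might look like:

```python
# Sketch only, condensed from tests/collections/llm/megatron_ssm_finetuning.py in this commit.
# "pytorch://" routes the checkpoint to PyTorchSSMImporter; model_config is forwarded to the
# connector through the new **kwargs on import_ckpt/_get_connector.
from nemo.collections import llm

model_config = llm.BaseMambaConfig130M()
model_config.tokenizer_model_path = None  # the HF gpt-neox-20b tokenizer needs no local model file

model = llm.GPTModel(model_config)
ckpt_path = model.import_ckpt(
    path="pytorch://" + "/path/to/mamba2_checkpoint.pt",  # placeholder PyTorch checkpoint
    model_config=model_config,
)
```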
@@ -341,7 +341,7 @@ def _get_connector(cls, ext, path=None, importer=True) -> ModelConnector: return connector() - return connector(_path) + return connector(_path, **kwargs) def track_io(target, artifacts: Optional[List[Artifact]] = None): diff --git a/nemo/lightning/io/state.py b/nemo/lightning/io/state.py index 18e0865171c7..b79ca69c5845 100644 --- a/nemo/lightning/io/state.py +++ b/nemo/lightning/io/state.py @@ -321,7 +321,7 @@ def call_transform(self, ctx: TransformCTX, *args, **kwargs): def _match_keys(keys: List[str], pattern: str) -> np.ndarray: - regex_pattern = re.compile("^" + pattern.replace("*", "(.*)") + "$") + regex_pattern = re.compile("^" + pattern.replace("*", r"([^.]+)") + "$") wildcard_matches = [[] for _ in range(pattern.count("*"))] for key in filter(lambda x: x is not None, keys): diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py index 7a7484bf9c20..635cf6db2fde 100644 --- a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py @@ -128,7 +128,7 @@ def convert(args): key = key[:-11] + 'mixer.in_proj.layer_norm_weight' new_state_dict["model." + key] = value - # Tokenizer settings + # NVIDIA Mamba Model Tokenizer Settings tokenizer_library = 'megatron' tokenizer_type = 'GPTSentencePieceTokenizer' tokenizer_model = args.tokenizer_model_dir @@ -180,8 +180,11 @@ def convert(args): trainer = MegatronLMPPTrainerBuilder(nemo_config).create_trainer() nemo_model_from_pyt = MegatronMambaModel(nemo_config.model, trainer) - # Setting strict=False for the _extra_state + for k, v in nemo_model_from_pyt.state_dict().items(): + if "_extra" in k: + new_state_dict[k] = v + # Setting strict=False for the _extra_state nemo_model_from_pyt.load_state_dict(new_state_dict, strict=False) dtype = torch_dtype_from_precision(args.precision) nemo_model_from_pyt = nemo_model_from_pyt.to(dtype=dtype) diff --git a/tests/collections/llm/megatron_ssm_finetuning.py b/tests/collections/llm/megatron_ssm_finetuning.py new file mode 100644 index 000000000000..187384e15dcd --- /dev/null +++ b/tests/collections/llm/megatron_ssm_finetuning.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +## NOTE: This script is present for github-actions testing only. +## There are no guarantees that this script is up-to-date with latest NeMo. 
+ +import argparse +import torch +from megatron.core.optimizer import OptimizerConfig +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule + + +def get_args(): + parser = argparse.ArgumentParser(description='Train a small GPT model using NeMo 2.0') + parser.add_argument('--devices', type=int, help="Number of devices to use for training") + parser.add_argument('--max-steps', type=int, help="Number of steps to train for") + parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to") + parser.add_argument('--model-path', type=str, help="Path to model checkpoint") + parser.add_argument( + '--tokenizer-model-path', type=str, default=None, help="Path to tokenizer model, defaults to None" + ) + return parser.parse_args() + + +if __name__ == "__main__": + + args = get_args() + + # Checkpoint callback setup + checkpoint_callback = nl.ModelCheckpoint( + save_best_model=True, + save_last=False, + monitor="reduced_train_loss", + save_top_k=1, + every_n_train_steps=10, + enable_nemo_ckpt_io=False, + dirpath=args.experiment_dir, + ) + + trainer = nl.Trainer( + devices=args.devices, + max_steps=args.max_steps, + accelerator="gpu", + strategy=nl.MegatronStrategy( + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + tensor_model_parallel_size=1, + ), + plugins=nl.MegatronMixedPrecision( + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + log_every_n_steps=1, + limit_val_batches=5, + val_check_interval=10, + num_sanity_val_steps=0, + ) + + opt_config = OptimizerConfig( + optimizer='adam', + lr=1e-5, + min_lr=1e-5, + use_distributed_optimizer=False, + clip_grad=1.0, + bf16=True, + ) + + optim = MegatronOptimizerModule(config=opt_config) + model_config = llm.BaseMambaConfig130m() + model_config.tokenizer_model_path = args.tokenizer_model_path + + tokenizer = get_nmt_tokenizer( + library=model_config.tokenizer_library, + model_name=model_config.tokenizer_name, + tokenizer_model=model_config.tokenizer_model_path, + use_fast=True, + ) + + model = llm.GPTModel(model_config, optim=optim, tokenizer=tokenizer) + + ckpt_path = model.import_ckpt( + path="pytorch://" + args.model_path, + model_config=model_config, + ) + + data = llm.SquadDataModule( + seq_length=512, + micro_batch_size=2, + global_batch_size=4, + tokenizer=model.tokenizer, + num_workers=0, + pad_to_max_length=True, + ) + + trainer.fit(model, data, ckpt_path=ckpt_path) diff --git a/tests/collections/llm/megatron_ssm_pretraining.py b/tests/collections/llm/megatron_ssm_pretraining.py new file mode 100644 index 000000000000..ed7e551cba7b --- /dev/null +++ b/tests/collections/llm/megatron_ssm_pretraining.py @@ -0,0 +1,129 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +## NOTE: This script is present for github-actions testing only. +## There are no guarantees that this script is up-to-date with latest NeMo. + +import argparse +import torch +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.loggers import TensorBoardLogger +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import train +from nemo.collections.llm.gpt.data import PreTrainingDataModule +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.lightning import NeMoLogger +from nemo.lightning.pytorch.callbacks import ModelCheckpoint +from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule + + +def get_args(): + parser = argparse.ArgumentParser(description='Train a Mamba model using NeMo 2.0') + parser.add_argument('--devices', type=int, help="Number of devices to use for training") + parser.add_argument('--max-steps', type=int, help="Number of steps to train for") + parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to") + parser.add_argument('--data-path', type=str, help="Path to data file") + parser.add_argument('--tokenizer-path', type=str, default=None, help="Path to tokenizer model") + + return parser.parse_args() + + +if __name__ == '__main__': + + args = get_args() + + seq_length = 512 + + tokenizer = get_nmt_tokenizer( + "huggingface", + "EleutherAI/gpt-neox-20b", + tokenizer_model=None, + use_fast=True, + ) + data = PreTrainingDataModule( + paths=args.data_path, + seq_length=seq_length, + micro_batch_size=2, + global_batch_size=16, + seed=1234, + tokenizer=tokenizer, + ) + ssm_config = llm.SSMConfig( + hybrid_override_pattern="M-M*", + num_layers=4, + hidden_size=1024, + ffn_hidden_size=1024, + num_attention_heads=4, + seq_length=seq_length, + init_method_std=0.02, + hidden_dropout=0.0, + attention_dropout=0.0, + layernorm_epsilon=1e-5, + make_vocab_size_divisible_by=16, + ) + model = llm.GPTModel(ssm_config, tokenizer=data.tokenizer) + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + ) + checkpoint_callback = ModelCheckpoint( + every_n_train_steps=10, + enable_nemo_ckpt_io=False, + ) + callbacks = [checkpoint_callback] + + loggers = [] + tensorboard_logger = TensorBoardLogger( + save_dir='dummy', ## NOTE: this gets overwritten by default + ) + loggers.append(tensorboard_logger) + + opt_config = OptimizerConfig( + optimizer='adam', + lr=6e-4, + min_lr=6e-5, + clip_grad=1.0, + use_distributed_optimizer=False, + bf16=True, + ) + opt = MegatronOptimizerModule(config=opt_config) + + trainer = nl.Trainer( + devices=args.devices, + max_steps=args.max_steps, + accelerator="gpu", + strategy=strategy, + logger=loggers, + callbacks=callbacks, + log_every_n_steps=1, + limit_val_batches=2, + plugins=nl.MegatronMixedPrecision( + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + ) + + nemo_logger = NeMoLogger( + dir=args.experiment_dir, + ) + + train( + model=model, + data=data, + trainer=trainer, + log=nemo_logger, + tokenizer='data', + optim=opt, + ) From 9e372d3433b61fba00caa507cf07afa43a4b7566 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Sat, 7 Sep 2024 09:38:37 -0700 Subject: [PATCH 118/664] [NeMo-UX] Checkpointing fixes (#10376) * remove save_best_model from default logger Signed-off-by: ashors1 * fix broken checkpoint restore Signed-off-by: ashors1 * fix fsdp Signed-off-by: ashors1 * rename weights path to 
avoid confusion Signed-off-by: ashors1 * Revert "rename weights path to avoid confusion". We'll add this in a separate PR This reverts commit 72bae8bdf4dd7444d549cdcc1ed48ac5fb33c0de. --------- Signed-off-by: ashors1 --- nemo/collections/llm/recipes/log/default.py | 1 - nemo/lightning/pytorch/strategies/fsdp_strategy.py | 2 +- nemo/lightning/pytorch/strategies/megatron_strategy.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py index 4d5e9223b535..94e595bdb811 100644 --- a/nemo/collections/llm/recipes/log/default.py +++ b/nemo/collections/llm/recipes/log/default.py @@ -32,7 +32,6 @@ def default_log( ) -> Config[nl.NeMoLogger]: ckpt = Config( nl.ModelCheckpoint, - save_best_model=False, save_last=True, save_top_k=10, every_n_train_steps=200, diff --git a/nemo/lightning/pytorch/strategies/fsdp_strategy.py b/nemo/lightning/pytorch/strategies/fsdp_strategy.py index 2a210c9bd7f0..d34d1716e6b4 100644 --- a/nemo/lightning/pytorch/strategies/fsdp_strategy.py +++ b/nemo/lightning/pytorch/strategies/fsdp_strategy.py @@ -216,7 +216,7 @@ def save_checkpoint( and self.trainer.state.fn == TrainerFn.FITTING and self.ckpt_save_optimizer ): - del checkpoint["optimizer_states"] + checkpoint["optimizer_states"] = {} checkpoint['optimizer'] = get_optimizer_state_dict(self.model, self.optimizers) pyt_to_mcore_state_dict(checkpoint['optimizer']['state'], prefix="optimizer.state.") diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index 4bf8c42ece02..3a0a0368bcef 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -634,7 +634,7 @@ def save_checkpoint( and self.trainer.state.fn == TrainerFn.FITTING and self.ckpt_save_optimizer ): - del checkpoint["optimizer_states"] + checkpoint["optimizer_states"] = {} checkpoint["optimizer"] = [self.optimizer_sharded_state_dict()] self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) From cda2a637e9c1fefaa419e7b31ab2203d72d9819f Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Sun, 8 Sep 2024 00:10:15 +0300 Subject: [PATCH 119/664] add auto configurator to NeMo (#10270) * add base configs Signed-off-by: dimapihtar * add auto configurator functionality Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * add runner Signed-off-by: dimapihtar * add end-to-end example for auto configurator Signed-off-by: dimapihtar * add unit tests for auto configurator Signed-off-by: dimapihtar * add GPT configs Signed-off-by: dimapihtar * add GPT configs Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * switch to dataclass Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * switch to dataclass Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * fix dataclasses usage Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * remove unused imports Signed-off-by: dimapihtar * remove extra function Signed-off-by: dimapihtar * fix docstring style Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * take Config object as input for model Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * add 
nemotron support Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * remove search_config.py Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * move configs creation to Basic class Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * move to common basic class Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * rename main config Signed-off-by: dimapihtar * remove base configs for models Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: artbataev * change auto conf functionality Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * fix docstring Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * remove unused imports Signed-off-by: dimapihtar * add changes Signed-off-by: dimapihtar * remove activations_checkpoint_num_layers Signed-off-by: dimapihtar * remove gbs from config Signed-off-by: dimapihtar * fix logs Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * fix performance calculation Signed-off-by: dimapihtar * fix end-to-end example Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * fix model config Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * minor changes Signed-off-by: dimapihtar * minor changes Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * fix unit tests Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * add README Signed-off-by: dimapihtar * fix README Signed-off-by: dimapihtar * fix README Signed-off-by: dimapihtar * fix readme Signed-off-by: dimapihtar * fix readme Signed-off-by: dimapihtar * remove extra arg Signed-off-by: dimapihtar * remove unused imports Signed-off-by: dimapihtar * add nemo-run installation Signed-off-by: dimapihtar * fix unit tests Signed-off-by: dimapihtar * fix unit tests Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Signed-off-by: dimapihtar Signed-off-by: artbataev Co-authored-by: dimapihtar Co-authored-by: artbataev --- Dockerfile.ci | 4 + examples/llm/auto_configurator/README.md | 85 ++ examples/llm/auto_configurator/auto_config.py | 81 ++ nemo/collections/llm/__init__.py | 6 + nemo/collections/llm/gpt/model/__init__.py | 6 + nemo/collections/llm/gpt/model/base.py | 54 ++ .../llm/tools/auto_configurator/__init__.py | 2 + .../tools/auto_configurator/core/__init__.py | 13 + .../auto_configurator/core/base_config.py | 367 +++++++ .../core/calculate_performance.py | 334 +++++++ .../auto_configurator/core/training_config.py | 892 ++++++++++++++++++ .../llm/tools/auto_configurator/core/utils.py | 470 +++++++++ .../llm/tools/auto_configurator/runner.py | 246 +++++ .../llm/auto_conf/test_autoconf_utils.py | 131 +++ .../llm/auto_conf/test_base_configs.py | 341 +++++++ .../llm/auto_conf/test_generate_configs.py | 307 ++++++ 16 files changed, 3339 insertions(+) create mode 100644 examples/llm/auto_configurator/README.md create mode 100644 examples/llm/auto_configurator/auto_config.py create mode 100644 nemo/collections/llm/tools/auto_configurator/__init__.py create mode 100644 nemo/collections/llm/tools/auto_configurator/core/__init__.py create mode 100644 nemo/collections/llm/tools/auto_configurator/core/base_config.py 
create mode 100644 nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py create mode 100644 nemo/collections/llm/tools/auto_configurator/core/training_config.py create mode 100644 nemo/collections/llm/tools/auto_configurator/core/utils.py create mode 100644 nemo/collections/llm/tools/auto_configurator/runner.py create mode 100644 tests/collections/llm/auto_conf/test_autoconf_utils.py create mode 100644 tests/collections/llm/auto_conf/test_base_configs.py create mode 100644 tests/collections/llm/auto_conf/test_generate_configs.py diff --git a/Dockerfile.ci b/Dockerfile.ci index 3d9a9d9b08a1..33490a6d9079 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -31,6 +31,10 @@ EOF WORKDIR /workspace +RUN pip install hatchling # needed to install nemo-run +ARG NEMU_RUN_TAG=34259bd3e752fef94045a9a019e4aaf62bd11ce2 +RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMU_RUN_TAG} + # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.15.0 diff --git a/examples/llm/auto_configurator/README.md b/examples/llm/auto_configurator/README.md new file mode 100644 index 000000000000..26cf5cd75263 --- /dev/null +++ b/examples/llm/auto_configurator/README.md @@ -0,0 +1,85 @@ +> [!IMPORTANT] +> This is an early version of the Auto Configurator, and the code base can be modified as it will be integrated into the CLI. + +Use Auto Configurator to Find the Optimal Configuration +------------------------------------------------------- + +Auto Configurator searches for hyperparameters (HPs) that achieve the maximum highest training throughput when working with Large Language Models (LLMs) utilizing the NeMo Framework. + +> [!NOTE] +> Auto Configurator is only supported now for GPT-based models: GPT3, LLama, Mixtral, Mistral, Gemma and Nemotron. + +Auto Configurator Capabilities +------------------------------ + +Auto Configurator is intended to iterate over different model configurations quickly and find the best configuration, that is, the configuration that minimizes both time and financial expenditure. It offers a range of features to facilitate this, as detailed in the list below. + +- **Model size recommendation**: finds the optimal model size if the parameter is not specified. +- **Training time estimation**: estimates model training time based on input parameters. +- **Base configuration generation**: returns a basic model configuration. +- **Hyperparameters recommendation**: finds the optimal list of hyperparameters to be trained. +- **Optimal configuration recommendation**: calculates the performance after a short training of candidate configurations and finds the optimal model configuration. + +Model Size Recommendation +------------------------- + +If you have not decided what model size you want to train, Auto Configurator can recommend a model size for your use case. If you know the number of GPUs, TFLOPS per GPU, the maximum time to train, and the number of tokens to train for, it can recommend a model size that can be trained with the specified hardware and time constraints. + +For example, if you had 20 NVIDIA DGX nodes available (in 80 GB GPU memory), and wanted to train a GPT model for a maximum of 5 days, Auto Configurator would recommend using a 5B parameter GPT model. + +Training Time Estimation +------------------------ + +Auto Configurator calculates the estimated training time for your model. It provides a projection of the training time in days, based on the input dataset and parameters you provide. 
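Under the hood this projection is a simple closed-form estimate. A rough sketch of the arithmetic, mirroring `_estimate_training_time` in `core/base_config.py` (all numbers are illustrative placeholders), is:

```python
# Illustrative sketch of the training-time projection used by Auto Configurator.
model_size_in_b = 5      # model size in billions of parameters
gpu_count = 160          # e.g. 20 nodes x 8 GPUs
tflops_per_gpu = 140     # assumed sustained TFLOPS per GPU
num_tokens_in_b = 300    # training tokens, in billions

days = (model_size_in_b * 1e9 * 8 * num_tokens_in_b * 1e9) / (
    3600 * 24 * gpu_count * tflops_per_gpu * 1e12
)
print(round(days, 2))  # ~6.2 days for these example inputs
```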
+ +Base Configuration Generation +----------------------------- + +When you provide the model size, or Auto Configurator has suggested one, it generates a base configuration for the target model. The base configuration is a valid configuration in NeMo 2.0 format. The optimization of throughput, however, is conducted in the next step. + +Hyperparameters Recommendation +------------------------------ + +After Auto Configurator generates the base configuration, it searches over four critical hyperparameters that have a great impact on training throughput but do not affect model convergence. These hyperparameters include Tensor Parallelism (TP), Pipeline Parallelism (PP), Context Parallelism (CP), Expert Parallelism (EP), Micro Batch Size (MBS), and Activation Checkpointing Layers (ActCkpt). Auto Configurator will also provide optimal Global Batch Size (GBS) if it's not specified. + +Auto Configurator initially applies heuristics to identify suitable candidates for the four key parameters, subsequently generating a grid of candidate configurations. It returns all of the candidate configurations in NeMo 2.0 format. + +> [!NOTE] +> Some of the candidate configurations may not work due to high-memory usage or other issues. + +Once the candidate configurations are generated, you can use NeMo Framework to launch the most promising candidates. + +When running the candidates on the cluster, you can limit job time and job max steps by using ``max_minutes_per_run`` and ``max_steps_per_run`` parameters. During this search, the jobs will run with the number of nodes specified in the configuration files, using the ``num_nodes`` parameter. Once all of the jobs have finished running, you'll need to run compare_throughput.py to get a ``.csv`` table with performance results for each succeeded job. + +Optimal Configuration Recommendation +------------------------------------ + +After all of the candidate jobs are done, Auto Configurator calculates performance parameters for each of the candidates. +Auto Configurator generates two ``.csv`` files: one detailing the performance measures of the candidates and another listing the candidates that failed due to out-of-memory errors. + +End-To-End Example +------------------ + +The following list shows the required input parameters for the Auto Configurator runner: + +- ``model``: model configuration based on NeMo 2.0. +- ``num_nodes``: number of nodes to be used for the training. +- ``seq_length``: sequence length to be used for the training. +- ``data_paths``: dataset to be used for the training. +- ``tokenizer_path``: path to tokenizer model if custom tokenizer will be used. + +The following list shows the optional parameters for the Auto Configurator runner: + +- ``global_batch_size``: global batch size to be used. +- ``tensor_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``pipeline_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``context_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``expert_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``micro_batch_sizes``: a list, such as ``[1, 2, 4]``. +- ``min_model_parallel_size``: a value for the minimum desired parallelism. +- ``max_model_parallel_size``: a value for the maximum desired parallelism. + +For each of the optional parameters, Auto Configurator will find the optimal value if the parameter is not specified. To view the full list of parameters, please visit [this page](https://github.com/NVIDIA/NeMo/blob/dpykhtar/nemo_autoconf/nemo/collections/llm/tools/auto_configurator/runner.py#L51). 
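A minimal invocation could look like the following sketch (the model choice, sizes, and paths are placeholders; omitted optional parameters are chosen or searched automatically):

```python
import nemo_run as run
from nemo.collections.llm import GPTConfig126M
from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs

# Placeholder values; see the end-to-end example referenced below for a runnable script.
runner = AutoConfigurator(
    model=run.Config(GPTConfig126M),
    num_nodes=1,
    gpus_per_node=8,
    seq_length=2048,
    global_batch_size=256,
    data_paths="/path/to/preprocessed/dataset",
    path_to_logs="/path/to/logs",
)
base_cfg, configs = generate_configs(runner)  # candidate configs in NeMo 2.0 format
```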
+ +To view an end-to-end example of how to generate candidate configs, train them, and calculate the performance using Auto Configurator with NeMo Framework, please visit [this page](https://github.com/NVIDIA/NeMo/blob/dpykhtar/nemo_autoconf/examples/llm/auto_configurator/auto_config.py). + diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py new file mode 100644 index 000000000000..c202d4d33325 --- /dev/null +++ b/examples/llm/auto_configurator/auto_config.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +import fiddle as fdl +import nemo_run as run + +from nemo.collections.llm import GPTConfig126M +from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs, get_results + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--run_number", type=int, help="Number of config to run") + parser.add_argument("--logs_dir", type=str, help="Path where to save training logs") + parser.add_argument("--data_path", type=str, help="Path to the dataset") + parser.add_argument("--get_results", action="store_true") + + return parser.parse_args() + + +def train_config(args): + # GPT-3 126M + # This example will generate 3 configs. + # It is expected that this script will be run 3 times with changing --run_number flag for each run from 0 to 2. + # After all configurations are trained, please trigger the script using --get_results flag. 
+ runner = AutoConfigurator( + model=run.Config(GPTConfig126M), + num_nodes=1, + gpus_per_node=1, + gpu_memory_gb=40, + global_batch_size=16, + seq_length=512, + tensor_parallel_sizes=[1], + pipeline_parallel_sizes=[1], + micro_batch_sizes=[1, 2, 4], + max_training_days=1, + max_steps_per_run=25, + num_tokens_in_b=10, + vocab_size=51200, + data_paths=args.data_path, + path_to_logs=args.logs_dir, + ) + + base_cfg, configs = generate_configs(runner) + if not args.get_results: + # Get generated configs + partials = list(configs.values()) + names = list(configs.keys()) + + # Run pre-training + partial = partials[args.run_number - 1] + partial.log.dir = os.path.join(args.logs_dir, names[args.run_number - 1]) + pretrain = fdl.build(partial) + pretrain() + else: + # # Get Auto Configurator results + get_results(base_cfg, runner, args.logs_dir) + print(f"The results were successfully saved to {args.logs_dir}.") + + +def main(): + args = get_args() + train_config(args) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index a5ce0c82a0e0..614af0df400c 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -51,6 +51,12 @@ GemmaConfig7B, GemmaModel, GPTConfig, + GPTConfig5B, + GPTConfig7B, + GPTConfig20B, + GPTConfig40B, + GPTConfig126M, + GPTConfig175B, GPTModel, Llama2Config7B, Llama2Config13B, diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 9785889aaf92..aa3615b3ddfd 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -15,6 +15,12 @@ from nemo.collections.llm.gpt.model.baichuan import Baichuan2Config, Baichuan2Config7B, Baichuan2Model from nemo.collections.llm.gpt.model.base import ( GPTConfig, + GPTConfig5B, + GPTConfig7B, + GPTConfig20B, + GPTConfig40B, + GPTConfig126M, + GPTConfig175B, GPTModel, MaskedTokenLossReduction, gpt_data_step, diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index a6b53f4e859d..e0d752bf3411 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -182,6 +182,60 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel": ) +@dataclass +class GPTConfig126M(GPTConfig): + seq_length: int = 2048 + num_layers: int = 12 + hidden_size: int = 768 + ffn_hidden_size: int = 3072 + num_attention_heads: int = 12 + + +@dataclass +class GPTConfig5B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 24 + hidden_size: int = 4096 + ffn_hidden_size: int = 16384 + num_attention_heads: int = 32 + + +@dataclass +class GPTConfig7B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 32 + hidden_size: int = 4096 + ffn_hidden_size: int = 10880 + num_attention_heads: int = 32 + + +@dataclass +class GPTConfig20B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 44 + hidden_size: int = 6144 + ffn_hidden_size: int = 24576 + num_attention_heads: int = 48 + + +@dataclass +class GPTConfig40B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 48 + hidden_size: int = 8192 + ffn_hidden_size: int = 32768 + num_attention_heads: int = 64 + + +@dataclass +class GPTConfig175B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 96 + hidden_size: int = 12288 + ffn_hidden_size: int = 49152 + num_attention_heads: int = 96 + + class GPTModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin): def __init__( self, diff --git 
a/nemo/collections/llm/tools/auto_configurator/__init__.py b/nemo/collections/llm/tools/auto_configurator/__init__.py new file mode 100644 index 000000000000..5c6bde2c285a --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/__init__.py @@ -0,0 +1,2 @@ +from nemo.collections.llm.tools.auto_configurator.core.calculate_performance import get_results +from nemo.collections.llm.tools.auto_configurator.runner import AutoConfigurator, generate_configs diff --git a/nemo/collections/llm/tools/auto_configurator/core/__init__.py b/nemo/collections/llm/tools/auto_configurator/core/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py new file mode 100644 index 000000000000..ee1579f6f6e8 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -0,0 +1,367 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.loggers import TensorBoardLogger + +from nemo import lightning as nl +from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer +from nemo.collections.llm import PreTrainingDataModule +from nemo.collections.llm.utils import Config +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule +from nemo.utils.exp_manager import TimingCallback + +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class BaseConfig: + def __init__(self, config=None): + """ + Args: + config (AutoConfigurator): auto configurator runner config. 
+ """ + + self.config = config + + self.model = self.get_model() + self.optim = self.get_optim() + self.trainer = self.get_trainer() + self.data = self.get_data() + self.log = self.get_logger() + self.run = self.get_run_config() + self.tokenizer = self.get_tokenizer(config.tokenizer_type, config.tokenizer_path) + + def get_model(self): + """Function that returns model config. + + Returns: + Config: model config. + """ + + self.config.model.seq_length = self.config.seq_length + + return self.config.model + + def get_optim(self) -> Config[OptimizerConfig]: + """Function that returns optimizer config. + + Returns: + Config[OptimizerConfig]: optimizer config. + """ + optim_params = { + "optimizer": "adam", + "lr": 1e-4, + "min_lr": 1e-5, + "use_distributed_optimizer": True, + "bf16": True, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "overlap_grad_reduce": True, + "overlap_param_gather": True, + "clip_grad": 1.0, + "adam_eps": 1e-5, + } + + optim_config = Config( + OptimizerConfig, + **optim_params, + ) + + sched = Config( + CosineAnnealingScheduler, + warmup_steps=10, + constant_steps=0, + min_lr=optim_config.min_lr, + ) + + return Config( + MegatronOptimizerModule, + config=optim_config, + lr_scheduler=sched, + ) + + def get_trainer(self) -> Config[nl.Trainer]: + """Function that returns config for PTL trainer. + + Returns: + Config[nl.Trainer]: trainer config. + """ + + trainer_config = { + "accelerator": "gpu", + "enable_checkpointing": False, + "use_distributed_sampler": False, + "max_epochs": None, + "log_every_n_steps": 1, + "limit_val_batches": 1, + "limit_test_batches": 1, + "accumulate_grad_batches": 1, + "num_nodes": self.config.num_nodes, + "devices": self.config.num_gpus, + "max_steps": self.config.max_steps_per_run, + "val_check_interval": self.config.max_steps_per_run, + } + + strategy = Config( + nl.MegatronStrategy, + pipeline_dtype=torch.bfloat16, + ) + + return Config( + nl.Trainer, + **trainer_config, + strategy=strategy, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + callbacks=[Config(TimingCallback)], + ) + + def get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config: + """Function that returns the tokenizer config. + + Args: + tokenizer_type (str): tokenizer type. + tokenizer_path (str): path to the tokenizer. + + Returns: + Config: tokenizer config. + """ + + if tokenizer_type == "sentencepiece": + return Config(SentencePieceTokenizer, model_path=tokenizer_path) + else: + return Config(AutoTokenizer, pretrained_model_name=tokenizer_path) + + def get_data(self) -> Config[PreTrainingDataModule]: + """Function that returns dataset config. + + Returns: + Config[PreTrainingDataModule]: data config. + """ + + # Data config + data_config = { + "paths": self.config.data_paths, + "seq_length": self.config.seq_length, + "global_batch_size": self.config.global_batch_size, + "num_workers": 2, + "index_mapping_dir": None, + } + + # Define the tokenizer + tokenizer = self.get_tokenizer( + self.config.tokenizer_type, + self.config.tokenizer_path, + ) + + return Config( + PreTrainingDataModule, + **data_config, + tokenizer=tokenizer, + ) + + def get_logger(self) -> Config[nl.NeMoLogger]: + """Function that returns the training strategy. + + Returns: + Config[nl.NeMoLogger]: NeMo Logger config. 
+ """ + + # Define TensorBoard Logger + tb_logger = Config(TensorBoardLogger, save_dir="tb_logs") + + ckpt = Config( + nl.ModelCheckpoint, + monitor="reduced_train_loss", + save_last=False, + save_top_k=0, + ) + + return Config( + nl.NeMoLogger, + ckpt=ckpt, + tensorboard=tb_logger, + wandb=None, + dir=self.config.path_to_logs, + ) + + def get_run_config(self) -> dict: + """Function that returns config for cluster job. + + Returns: + dict: cluster job config. + """ + + run_config = { + "name": self.config.model.__class__.__name__, + "time_limit": f"0-00:{self.config.max_minutes_per_run}:00", + } + + return run_config + + +def calculate_model_size( + gpu_count: int, + max_training_days: float, + model_size_in_b: float = None, + tflops_per_gpu: int = 140, + num_tokens_in_b: int = 300, + model_name: str = "gpt3", +) -> float: + """Estimates a model size to be trained given the constraints. If the + model_size is provided, it estimates the time to train it with the given + constraints. + + Example: + output 5B params to train for 7 days with 160 GPUs. + + Args: + gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). + max_training_days (float): number of days to train the model for. + model_size_in_b (float): number of parameters in the model, if known. + tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. + num_tokens_in_b (int): number of tokens to train the model for. + model_name (str): name of the model. + + Returns: + float: number of parameters to use for training. + """ + + # Model size is not known, must be estimated. + if model_size_in_b is None: + model_size_in_b = _estimate_model_size( + max_training_days=max_training_days, + gpu_count=gpu_count, + tflops_per_gpu=tflops_per_gpu, + num_tokens_in_b=num_tokens_in_b, + model_name=model_name, + ) + # Model size is known, so only time to train estimate is needed. + else: + max_training_days = _estimate_training_time( + model_size_in_b=model_size_in_b, + gpu_count=gpu_count, + tflops_per_gpu=tflops_per_gpu, + num_tokens_in_b=num_tokens_in_b, + model_name=model_name, + ) + + print( + f"You can train a {model_size_in_b}B parameter model in " + f"{max_training_days} days using {gpu_count} GPUs. This result assumes " + f"you are training to {num_tokens_in_b}B tokens, and each GPU achieves " + f"{tflops_per_gpu} TFLOPS." + ) + return model_size_in_b + + +def _estimate_model_size( + max_training_days: float, + gpu_count: int, + tflops_per_gpu: int, + num_tokens_in_b: int, + model_name: str, +) -> float: + """Estimates model size given time and hardware constraints. It's only used if the model size is not provided by the user. + + Args: + max_training_days (float): number of days to train the model for. + gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). + tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. + num_tokens_in_b (int): number of tokens to train the model for. + model_name (str): name of the model, such as gpt3, t5, mt5... + + Returns: + float: number of parameters to use for training. + + Raises: + NotImplementedError: if the model_name is not one of the supported models. 
+ """ + + model_penalty = 0.87 if model_name == "mt5" else 1.0 + valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma", "nemotron"] + try: + if model_name in valid_models: + return round( + model_penalty + * (max_training_days * 3600 * 24 * gpu_count * tflops_per_gpu * 1e12) + / (8 * num_tokens_in_b * 1e9) + / 1e9, + 2, + ) + else: + raise NotImplementedError + except ValueError as err: + print(f"Input values were not valid: {err}") + except ZeroDivisionError as err: + print(f"Cannot divide by zero. This can happen if num_tokens_in_b is zero: {err}") + except NotImplementedError as err: + print(f"Model size estimation is only available for {valid_models}: {err}") + return None + + +def _estimate_training_time( + model_size_in_b: float, + gpu_count: int, + tflops_per_gpu: int, + num_tokens_in_b: int, + model_name: str, +) -> float: + """Estimates training time for a given model size and hardware constraint. To be used when a model size is provided by the user. + + Args: + model_size_in_b (float): number of parameters to use for training. + gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). + tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. + num_tokens_in_b (int): number of tokens to train the model for. + model_name (str): name of the model, such as gpt3, t5, mt5... + + Returns: + float: number of days it will take to train the model. + + Raises: + NotImplementedError: if the model_name is not one of the supported models. + """ + + model_penalty = 1.15 if model_name == "mt5" else 1.0 + valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma", "nemotron"] + try: + if model_name in valid_models: + return round( + model_penalty + * (model_size_in_b * 1e9 * 8 * num_tokens_in_b * 1e9) + / (3600 * 24 * gpu_count * tflops_per_gpu * 1e12), + 2, + ) + else: + raise NotImplementedError + except ValueError as err: + print(f"Input values were not valid: {err}") + except ZeroDivisionError as err: + print(f"Cannot divide by zero. This can happen if gpu_count or tflops_per_gpu are zero: {err}") + except NotImplementedError as err: + print(f"Training time estimation is only available for {valid_models}: {err}") + return None diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py new file mode 100644 index 000000000000..5b7ac0ebc4d3 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -0,0 +1,334 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from typing import Optional + +import pandas as pd +from tensorboard.backend.event_processing import event_accumulator + + +def get_results( + base_config=None, + train_config=None, + path_to_save: str = None, + output_top_n: Optional[int] = 10, +): + """Generates performance results. 
+ + Args: + config (AutoConfigurator): auto configurator runner config. + path_to_save (str): path where to save performance results. + output_top_n (Optional[int]): Number of configs to be printed out as best configs. + """ + + # Define needed variables + model_name = train_config.model_type + model_size = train_config.model_size_in_b + global_batch_size = base_config.data.global_batch_size + seq_length = base_config.data.seq_length + + vocab_size = train_config.vocab_size + num_nodes = train_config.num_nodes + gpus_per_node = train_config.gpus_per_node + + layers = base_config.model.num_layers + hs = base_config.model.hidden_size + ffn_hs = base_config.model.ffn_hidden_size + + training_logs = path_to_save + final_result_logs = path_to_save + + result_columns = [ + "Model Name", + "Model Size", + "Seq Length", + "TP", + "PP", + "CP", + "EP", + "MBS", + "Act Ckpt Layers", + "Act Ckpt Micro Bathes", + "Act Ckpt Layers per Pipeline", + "Num Layers", + "Hidden Size", + "FFN Hidden Size", + "GBS", + "Nodes", + "GPUs per Node", + "Time per Step", + "Samples per Second", + "Model TFLOPS / GPU", + "Model TFLOPS Aggregate", + ] + error_columns = [ + "Model Name", + "Model Size", + "Seq Length", + "TP", + "PP", + "CP", + "EP", + "MBS", + "Act Ckpt Layers", + "Act Ckpt Micro Bathes", + "Act Ckpt Layers per Pipeline", + "Num Layers", + "Hidden Size", + "FFN Hidden Size", + "GBS", + "Nodes", + "GPUs per Node", + "Error Message", + ] + result = [] + errors = [] + dirs = [f.path for f in os.scandir(training_logs) if f.is_dir()] + + for candidate_dir in dirs: + logs_dir = os.path.join(training_logs, candidate_dir, "tb_logs/lightning_logs") + logs_folder = [f.path for f in os.scandir(logs_dir) if f.is_dir()][0] + tp, pp, cp, ep, mbs, act_ckpt, num_mbs_act, act_per_pipe = get_config(candidate_dir) + + for f in os.listdir(logs_folder): + if f.endswith("0.txt"): + error_file = os.path.join(logs_folder, f) + error = find_error(error_file) + if error: + errors.append( + [ + model_name, + model_size, + seq_length, + tp, + pp, + cp, + ep, + mbs, + act_ckpt, + num_mbs_act, + act_per_pipe, + layers, + hs, + ffn_hs, + global_batch_size, + num_nodes, + gpus_per_node, + error, + ] + ) + + files = os.listdir(logs_folder) + for f in files: + if f.startswith("events"): + event_file = os.path.join(logs_folder, f) + ea = event_accumulator.EventAccumulator(event_file) + ea.Reload() + try: + timing_list = ea.Scalars("train_step_timing in s") + if len(timing_list) <= 6: + continue + timing_list = [x.value for x in timing_list[5:]] + avg_global_step_time = round(sum(timing_list) / len(timing_list), 4) + samples_per_s = round(global_batch_size / avg_global_step_time, 2) + m_tflops, m_tflops_gpu = calculate_tflops( + model_name=model_name, + gbs=global_batch_size, + enc_seq_len=seq_length, + dec_seq_len=seq_length, + hs=hs, + ffn_hs=ffn_hs, + layers=layers, + vocab=vocab_size, + nodes=num_nodes, + gpus_per_node=gpus_per_node, + time_per_step=avg_global_step_time, + ) + config_name = f"tp{tp}_pp{pp}_cp{cp}_ep{ep}_mbs{mbs}_act_{act_ckpt}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" + result.append( + [ + model_name, + model_size, + seq_length, + tp, + pp, + cp, + ep, + mbs, + act_ckpt, + num_mbs_act, + act_per_pipe, + layers, + hs, + ffn_hs, + global_batch_size, + num_nodes, + gpus_per_node, + avg_global_step_time, + samples_per_s, + m_tflops_gpu, + m_tflops, + ] + ) + finally: + continue + result.sort(key=lambda x: x[17]) + print(f"Top {min(output_top_n, len(result))} configs sorted from fastest to slowest:") + for i, 
res in enumerate(result):
+        print(f"Config #{i+1}: {res[-1]} with {res[17]:.4f}s per global step.")
+        if i + 1 == output_top_n:
+            break
+
+    top_config = f"{model_name}_{model_size}b_{num_nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_cp_{result[0][5]}_ep_{result[0][6]}_mbs_{result[0][7]}_act_ckpt_{result[0][8]}_num_mbs_act_{result[0][9]}_act_per_pipe_{result[0][10]}"
+    print("\n==================================================")
+    print(f"Optimal config: {top_config} with {result[0][17]:.4f}s per global step.")
+    print("==================================================\n")
+
+    # Save results as a CSV file.
+    os.makedirs(final_result_logs, exist_ok=True)
+    result_df = pd.DataFrame(result, columns=result_columns)
+    result_df.to_csv(os.path.join(final_result_logs, f"final_summary_{num_nodes}nodes.csv"), index=False)
+
+    error_df = pd.DataFrame(errors, columns=error_columns)
+    error_df.to_csv(os.path.join(final_result_logs, f"failed_jobs_{num_nodes}nodes.csv"), index=False)
+
+
+def calculate_tflops(
+    model_name,
+    gbs,
+    enc_seq_len,
+    dec_seq_len,
+    hs,
+    ffn_hs,
+    layers,
+    vocab,
+    nodes,
+    gpus_per_node,
+    time_per_step,
+):
+    """Calculates model and hardware TFLOPS for each model.
+
+    GPT-3 Formula:
+        Model FLOPs = (24Bsh^2 + 4Bs^2h) x (3 x num_layers) + 6Bshv
+    T5/mT5 Formula:
+        Model FLOPs = 3 x forward-pass FLOPs (encoder layers + decoder layers + logits head); see the per-layer breakdown below.
+    Bert Formula:
+        Model FLOPs = 72BLsh^2 * ( 1 + (s/6h) + (v/12hL))
+
+    where B = global batch size, s = sequence length, h = hidden size, v = vocab size, L = number of layers.
+    """
+
+    if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral"]:
+        # Model FLOPS calculation
+        model_flops = (
+            (24 * gbs * enc_seq_len * hs * hs + 4 * gbs * enc_seq_len * enc_seq_len * hs) * (3 * layers)
+            + (6 * gbs * enc_seq_len * hs * vocab)
+        ) / time_per_step
+        model_flops_per_gpu = model_flops / (nodes * gpus_per_node)
+
+        model_tflops = model_flops / 1e12
+        model_tflops_per_gpu = model_flops_per_gpu / 1e12
+
+    elif model_name == "bert":
+        model_flops = (
+            72 * gbs * layers * enc_seq_len * hs * hs * (1 + (enc_seq_len / (6 * hs)) + (vocab / (12 * hs * layers)))
+        ) / time_per_step
+        model_flops_per_gpu = model_flops / (nodes * gpus_per_node)
+        model_tflops = model_flops / 1e12
+        model_tflops_per_gpu = model_flops_per_gpu / 1e12
+
+    elif model_name in ["t5", "mt5"]:
+        # Encoder Layer FLOPS: include self attention + MLP
+        flops_self_attn_enc = 8 * gbs * enc_seq_len * hs * hs + 4 * gbs * enc_seq_len * enc_seq_len * hs
+        flops_mlp_enc = 6 * gbs * enc_seq_len * hs * ffn_hs  # geglu needs two gemms for h -> ffn_h
+        flops_enc_layer = flops_self_attn_enc + flops_mlp_enc
+
+        # Decoder Layer FLOPS: include self_attn + cross_attn + MLP
+        flops_self_attn_dec = 8 * gbs * dec_seq_len * hs * hs + 4 * gbs * dec_seq_len * dec_seq_len * hs
+        flops_cross_attn_dec = (
+            4 * gbs * enc_seq_len * hs * hs
+            + 4 * gbs * dec_seq_len * hs * hs
+            + 4 * gbs * enc_seq_len * dec_seq_len * hs
+        )
+        flops_mlp_dec = 6 * gbs * dec_seq_len * hs * ffn_hs  # geglu needs two gemms for h -> ffn_h
+        flops_dec_layer = flops_self_attn_dec + flops_cross_attn_dec + flops_mlp_dec
+
+        # FLOPs of logits layer in the head
+        flops_logits = 2 * gbs * dec_seq_len * hs * vocab
+
+        # FLOPs of fprop
+        flops_fprop = (flops_enc_layer + flops_dec_layer) * (layers // 2) + flops_logits
+
+        # FLOPs of each train step (FLOPs of bprop is 2*fprop)
+        model_flops = 3 * flops_fprop / time_per_step
+        model_flops_per_gpu = model_flops / (nodes * gpus_per_node)
+        model_tflops = model_flops / 1e12
+        model_tflops_per_gpu = model_flops_per_gpu / 1e12
+
+    else:
+        raise NotImplementedError("Model type not supported.")
+    return round(model_tflops, 2),
round(model_tflops_per_gpu, 2)
+
+
+def find_error(error_file: str, errors: list = ["CUDA out of memory"]):
+    """Function that finds a known error in the job output.
+
+    Args:
+        error_file (str): path to the job output.
+        errors (list): list of "popular" errors.
+
+    Returns:
+        str: error message if the job failed because of one of the listed errors, or None otherwise.
+    """
+
+    error = None
+    with open(error_file, "r") as f:
+        output = f.read()
+    for e in errors:
+        if e in output:
+            error = e
+    return error
+
+
+def get_config(run_name: str) -> tuple:
+    """Function that extracts model parallelism parameters from the run name.
+
+    Args:
+        run_name (str): name of the run.
+
+    Returns:
+        tuple: model parallelism parameters.
+    """
+    pattern = r'_(tp|pp|cp|ep|mbs|act_ckpt|num_mbs_act|act_per_pipe)_([^_]+)'
+
+    # Find all matches in the input string
+    matches = re.findall(pattern, run_name)
+
+    # Convert matches to a dictionary
+    params = {param: value for param, value in matches}
+
+    return (
+        params["tp"],
+        params["pp"],
+        params["cp"],
+        params["ep"],
+        params["mbs"],
+        params["act_ckpt"],
+        params["num_mbs_act"],
+        params["act_per_pipe"],
+    )
diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py
new file mode 100644
index 000000000000..087bf3c6fb0e
--- /dev/null
+++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py
@@ -0,0 +1,892 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import List, Tuple
+
+from nemo.collections.llm.tools.auto_configurator.core import utils
+
+
+GPT_BASED_MODELS = [
+    "gpt3",
+    "bert",
+    "llama",
+    "baichuan2",
+    "chatglm",
+    "qwen2",
+    "mixtral",
+    "mistral",
+    "gemma",
+    "nemotron",
+]
+
+
+def generate_grid_search_configs(
+    base_cfg: dict,
+    train_cfg: dict,
+) -> Tuple[dict, dict]:
+    """Generates the grid of all possible configurations for the given model, and stores each different configuration in a yaml file.
+
+    Args:
+        base_cfg (dict): base configuration of the model to be trained.
+        train_cfg (dict): train configuration of the model to be trained.
+
+    Returns:
+        dict: base config.
+        dict: generated configs.
+    """
+
+    model_name = train_cfg.model_type
+    model_size_in_b = train_cfg.model_size_in_b
+
+    # 2 * num_layers is needed because of encoder/decoder architecture.
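+    # For example, a T5 config with 12 encoder and 12 decoder layers is treated as
+    # 24 transformer layers when validating pipeline-parallel splits below.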
+ multiplier = 1 if model_name in GPT_BASED_MODELS else 2 + + seq_length = base_cfg.model.seq_length + num_layers = base_cfg.model.num_layers if model_name in GPT_BASED_MODELS else base_cfg.model.encoder.num_layers + + if model_name in GPT_BASED_MODELS: + act_method = None + else: + act_method = base_cfg.model.encoder.activations_checkpoint_method + + params = _calculate_tp_pp_mbs_grid( + model_size_in_b=model_size_in_b, + num_layers=num_layers, + model_name=model_name, + seq_length=seq_length, + train_cfg=train_cfg, + ) + + max_minutes = train_cfg.max_minutes_per_run + max_steps = train_cfg.max_steps_per_run + num_nodes = train_cfg.num_nodes + + valid_tp_pp_list = [] + for tp in params.tp: + for pp in params.pp: + for cp in params.cp: + for ep in params.ep: + for mbs in params.mbs: + num_gpus = base_cfg.trainer.num_nodes * base_cfg.trainer.devices + base_cfg.data.global_batch_size = params.gbs + if model_name in GPT_BASED_MODELS: + att_heads = base_cfg.model.num_attention_heads + num_layers = base_cfg.model.num_layers + else: + att_heads = base_cfg.model.encoder.num_attention_heads + num_layers = base_cfg.model.encoder.num_layers + model_parallelism = (tp * pp * cp * ep) if (cp and ep) else (tp * pp) + mod_gbs = params.gbs % (mbs * num_gpus / model_parallelism) + mod_att_heads = att_heads % tp + mod_layers = (multiplier * num_layers) % pp + mod_cp = cp if cp else 1 + mod_ep = ep if ep else 1 + if ( + mod_gbs == 0 + and mod_att_heads == 0 + and mod_layers == 0 + and (tp, pp, cp, ep) not in valid_tp_pp_list + and (mod_cp // mod_ep == mod_cp or mod_ep // mod_cp == mod_ep) + and params.min_model_parallel <= model_parallelism <= params.max_model_parallel + ): + valid_tp_pp_list.append((tp, pp, cp, ep)) + + # Generate grid search configs. + configs = {} + for tp, pp, cp, ep in valid_tp_pp_list: + ( + virtual_pipelines, + act_ckpt_layers, + num_micro_batches_partial_act_ckpt, + act_ckpt_layers_per_pipeline, + ) = _set_activations_checkpoint_params( + tp, + pp, + cp, + ep, + num_layers, + act_method, + multiplier, + model_size_in_b, + model_name, + ) + for mbs in params.mbs: + kwargs = { + "base_cfg": base_cfg, + "act": None, + "num_mbs_act": None, + "act_per_pipe": None, + "tp": tp, + "pp": pp, + "cp": cp, + "ep": ep, + "virtual_pipelines": virtual_pipelines, + "mbs": mbs, + "max_minutes": max_minutes, + "max_steps": max_steps, + "num_nodes": num_nodes, + "model_name": model_name, + "model_size": model_size_in_b, + } + if act_ckpt_layers[0] is not None: + if act_layers is not None and act_layers != "auto": + act_ckpt_layers = act_layers + for act in act_ckpt_layers: + for num_mbs_act in num_micro_batches_partial_act_ckpt: + for act_per_pipe in act_ckpt_layers_per_pipeline: + kwargs["act"] = act + kwargs["num_mbs_act"] = num_mbs_act + kwargs["act_per_pipe"] = act_per_pipe + new_cfg = utils.modify_cfg(**kwargs) + if new_cfg: # Save candidate cfg. + configs[new_cfg["run"]["name"]] = new_cfg + else: + new_cfg = utils.modify_cfg(**kwargs) + if new_cfg: # Save candidate cfg. + config_name = new_cfg["run"]["name"] + new_cfg.pop("run") + configs[config_name] = new_cfg + + print(f"\nAll candidate configurations created correctly. 
Total number of configs: {len(configs)}.\n") + return base_cfg, configs + + +def _set_activations_checkpoint_params( + tp, pp, cp, ep, num_layers, act_method, multiplier, model_size_in_b, model_name +): + act_multiple = 4 // pp + if act_method == "block": + if 1.0 <= model_size_in_b < 11.3: + act_multiple = 8 // pp + elif 11.3 <= model_size_in_b < 26.0: + act_multiple = 16 // pp + elif 26.0 <= model_size_in_b < 60.0: + act_multiple = 16 // pp + elif 60.0 <= model_size_in_b: + act_multiple = 32 // pp + act_multiple = max(act_multiple, 1) + + virtual_pipelines = None + # Num micro batches with partial act ckpt + min_micro_b = 0 # 0 will not be used, minimum will be set to 1 later in the code. + max_micro_b = pp + interval_micro_b = 1 + # Act ckpt layers per pipeline + min_layers_per_pipe = 0 + max_layers_per_pipe = num_layers + interval_layers_per_pipe = act_multiple + if model_name in GPT_BASED_MODELS and pp > 2: # Interleaved pipeline scheduling. + virtual_pipelines = num_layers // pp # TODO: verify that this is the best value. + act_multiple = 1 + max_micro_b = pp * (virtual_pipelines - 1) + (pp - 1) * 2 + 1 + interval_micro_b = virtual_pipelines * 8 + max_layers_per_pipe = multiplier * num_layers // pp // virtual_pipelines + 1 + + ( + act_ckpt_layers, + num_micro_batches_partial_act_ckpt, + act_ckpt_layers_per_pipeline, + ) = ([None], [None], [None]) + if act_method == "block": + # Act ckpt num layers + if virtual_pipelines is None: + act_ckpt_layers = range(0, multiplier * num_layers // pp + 1, act_multiple) + else: + act_ckpt_layers = range(0, multiplier * num_layers // pp // virtual_pipelines + 1, act_multiple) + + if pp > 1 and model_name in GPT_BASED_MODELS: + # Num micro batches with partial act ckpt + num_micro_batches_partial_act_ckpt = list(range(min_micro_b, max_micro_b + 1, interval_micro_b)) + if num_micro_batches_partial_act_ckpt[0] == 0: + num_micro_batches_partial_act_ckpt[0] = 1 + + # Act ckpt layers per pipeline + act_ckpt_layers_per_pipeline = range( + min_layers_per_pipe, max_layers_per_pipe + 1, interval_layers_per_pipe + ) + + return ( + virtual_pipelines, + act_ckpt_layers, + num_micro_batches_partial_act_ckpt, + act_ckpt_layers_per_pipeline, + ) + + +@dataclass +class GPT3GridSearch: + """Selects grid search space for TP, PP, CP, EP, MBS parameters for GPT-3 and 80GB GPUs. + + Args: + model_size_in_b (float): number of parameters in the model. + valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. + seq_length (int): sequence length to use for training. + gpu_memory_gb (int): size of GPU memory in GB. 
+ """ + + model_size_in_b: int + valid_pp: List[int] + seq_length: int + gpu_memory_gb: int + + tp = [1, 2, 4, 8] + pp = [1] + cp = [1] + ep = [1] + mbs = [1, 2, 4, 8] + + gbs: int = 1024 + min_model_parallel: int = 1 + max_model_parallel: int = 8 + + def init_params(self): + model_size_in_b = self.model_size_in_b + gpu_memory_gb = self.gpu_memory_gb + seq_length = self.seq_length + + if gpu_memory_gb == 80: + if seq_length == 2048: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [1, 2, 4] + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [1, 2, 4, 8] + self.gbs = 2048 + elif model_size_in_b <= 23.0: + self.tp = [1, 2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1, 2, 4] + self.min_model_parallel = 4 + self.max_model_parallel = 8 + self.gbs = 2048 + elif model_size_in_b <= 45.0: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1, 2, 4] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 95: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2, 4, 8] + self.min_model_parallel = 8 + self.max_model_parallel = 64 + self.gbs = 2048 + elif model_size_in_b <= 130.0: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 16] + self.mbs = [1, 2, 4, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 128 + self.gbs = 2048 + elif model_size_in_b <= 195.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 16] + self.mbs = [1, 2, 4] + self.min_model_parallel = 32 + self.max_model_parallel = 256 + self.gbs = 2048 + elif model_size_in_b <= 395.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 32] + self.mbs = [1, 2, 4] + self.min_model_parallel = 64 + self.max_model_parallel = 512 + self.gbs = 2048 + elif model_size_in_b <= 790.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 100] + self.mbs = [1, 2, 4] + self.min_model_parallel = 128 + self.max_model_parallel = 1024 + self.gbs = 2048 + elif model_size_in_b <= 1100.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 16 <= x <= 130] + self.mbs = [1, 2, 4] + self.min_model_parallel = 256 + self.max_model_parallel = 2048 + self.gbs = 2048 + elif seq_length == 4096: + if model_size_in_b <= 1.0: + self.tp = [1, 2, 4] + self.mbs = [1, 2, 4, 8] + self.gbs = 128 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.mbs = [1, 2, 4, 8] + self.gbs = 512 + elif model_size_in_b <= 8.0: + self.tp = [1, 2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 13.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 23.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 1024 + elif model_size_in_b <= 45.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 4] + self.mbs = [1, 2] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 1024 + elif model_size_in_b <= 95: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2] + self.min_model_parallel = 8 + self.max_model_parallel = 64 + self.gbs = 1024 + elif seq_length == 8192: 
+ if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 64 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 128 + elif model_size_in_b <= 8.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 13.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 23.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 256 + elif model_size_in_b <= 45.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 8] + self.mbs = [1] + self.min_model_parallel = 32 + self.max_model_parallel = 64 + self.gbs = 256 + elif seq_length == 16384: + if model_size_in_b <= 1.0: + self.tp = [2, 4] + self.mbs = [1, 2] + self.gbs = 32 + elif model_size_in_b <= 4.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 64 + elif model_size_in_b <= 8.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 128 + elif model_size_in_b <= 13.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 128 + elif model_size_in_b <= 23.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 4] + self.mbs = [1] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 128 + elif seq_length == 32768: + if model_size_in_b <= 1.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 16 + elif model_size_in_b <= 4.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 32 + elif model_size_in_b <= 8.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.mbs = [1] + self.gbs = 64 + elif model_size_in_b <= 13.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.mbs = [1] + self.gbs = 64 + elif model_size_in_b <= 23.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 4] + self.mbs = [1] + self.min_model_parallel = 16 + self.max_model_parallel = 32 + self.gbs = 64 + elif gpu_memory_gb == 40: + if model_size_in_b <= 1.0: + self.tp = [1, 2, 4] + self.mbs = [1, 2, 4, 8] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4, 8] + self.mbs = [1, 2, 4, 8] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.pp = [1, 2] + self.mbs = [1, 2, 4] + self.min_model_parallel = 2 + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [4, 8] + self.pp = [1, 2, 4] + self.mbs = [1, 2, 4] + self.min_model_parallel = 4 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 23.0: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.min_model_parallel = 8 + self.max_model_parallel = 64 + self.gbs = 2048 + elif model_size_in_b <= 45.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 12] + self.mbs = [1, 2, 4] + self.min_model_parallel = 16 + self.max_model_parallel = 128 + self.gbs = 2048 + elif model_size_in_b <= 
95: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 16] + self.mbs = [1, 2, 4] + self.min_model_parallel = 16 + self.max_model_parallel = 256 + self.gbs = 2048 + elif model_size_in_b <= 130.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 26] + self.mbs = [1, 2] + self.min_model_parallel = 32 + self.max_model_parallel = 512 + self.gbs = 2048 + elif model_size_in_b <= 195.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 32] + self.mbs = [1, 2] + self.min_model_parallel = 64 + self.max_model_parallel = 1024 + self.gbs = 2048 + elif model_size_in_b <= 395.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 64] + self.mbs = [1, 2] + self.min_model_parallel = 128 + self.max_model_parallel = 2048 + self.gbs = 2048 + elif model_size_in_b <= 790.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 128] + self.mbs = [1, 2] + self.min_model_parallel = 256 + self.max_model_parallel = 4096 + self.gbs = 2048 + elif model_size_in_b <= 1100.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 192] + self.mbs = [1, 2] + self.min_model_parallel = 512 + self.max_model_parallel = 8192 + self.gbs = 2048 + + +@dataclass +class T5GridSearch: + """Selects grid search space for TP, PP, MBS parameters for T5/mT5 and 80GB GPUs. + + Args: + model_size_in_b (float): number of parameters in the model. + valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. + seq_length (int): sequence length to use for training. + gpu_memory_gb (int): size of GPU memory in GB. + """ + + model_size_in_b: int + seq_length: int + gpu_memory_gb: int + valid_pp: List[int] + + tp = [1, 2, 4, 8] + pp = [1] + cp = [None] + ep = [None] + mbs = [1, 2, 4, 6, 8, 12, 16] + + gbs: int = 1920 + min_model_parallel: int = 1 + max_model_parallel: int = 8 + + def init_params(self): + model_size_in_b = self.model_size_in_b + gpu_memory_gb = self.gpu_memory_gb + seq_length = self.seq_length + + if gpu_memory_gb == 80: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.mbs = [16, 32, 64, 128] + self.gbs = 2048 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.mbs = [4, 6, 8, 12, 16, 24, 32, 48] + self.gbs = 1920 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.mbs = [4, 6, 8, 12, 16, 24, 32] + self.gbs = 1920 + elif model_size_in_b <= 14.5: + self.tp = [4, 8] + self.mbs = [2, 4, 6, 8, 12, 16, 24] + self.gbs = 1920 + elif model_size_in_b <= 25.9: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 1920 + elif model_size_in_b <= 43.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 1920 + elif model_size_in_b <= 85.5: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 64 + self.gbs = 1920 + elif model_size_in_b <= 165.5: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 16] + self.mbs = [1, 2, 4, 6] + self.min_model_parallel = 32 + self.max_model_parallel = 128 + self.gbs = 1920 + elif model_size_in_b <= 250: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 32] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 64 + self.max_model_parallel = 256 + self.gbs = 1920 + elif 
gpu_memory_gb == 40: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.mbs = [16, 32, 64, 128] + self.gbs = 2048 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.mbs = [4, 8, 12, 16, 24, 32, 48] + self.gbs = 1920 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.mbs = [4, 6, 8, 12, 16, 24] + self.gbs = 1920 + elif model_size_in_b <= 14.5: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [2, 4, 6, 8, 12, 16] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 1920 + elif model_size_in_b <= 25.9: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 1920 + elif model_size_in_b <= 43.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 32 + self.gbs = 1920 + elif model_size_in_b <= 85.5: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 32 + self.max_model_parallel = 64 + self.gbs = 1920 + elif model_size_in_b <= 165.5: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 32] + self.mbs = [1, 2, 4] + self.min_model_parallel = 64 + self.max_model_parallel = 128 + self.gbs = 1920 + elif model_size_in_b <= 250: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 64] + self.mbs = [1, 2, 4] + self.min_model_parallel = 128 + self.max_model_parallel = 256 + self.gbs = 1920 + + +@dataclass +class BertGridSearch: + """Selects grid search space for TP, PP, MBS parameters for BERT and 80GB GPUs. + + Args: + model_size_in_b (float): number of parameters in the model. + valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. + seq_length (int): sequence length to use for training. + gpu_memory_gb (int): size of GPU memory in GB. 
+ """ + + model_size_in_b: int + seq_length: int + gpu_memory_gb: int + valid_pp: List[int] + + tp = [1, 2, 4, 8] + pp = [1] + cp = [None] + ep = [None] + mbs = [1, 2, 4, 6, 8, 12, 16] + + gbs: int = 1920 + min_model_parallel: int = 1 + max_model_parallel: int = 8 + + def init_params(self): + model_size_in_b = self.model_size_in_b + gpu_memory_gb = self.gpu_memory_gb + seq_length = self.seq_length + + if gpu_memory_gb == 80: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.min_model_parallel = 2 + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [2, 4, 8] + self.mbs = [1, 2, 3, 4, 6] + self.min_model_parallel = 2 + self.gbs = 2048 + elif model_size_in_b <= 25.0: + self.tp = [4, 8] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 4 + self.gbs = 2048 + elif model_size_in_b <= 46.5: + self.tp = [4, 8] + self.pp = [1, 2, 4] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 2048 + elif model_size_in_b <= 87.5: + self.tp = [4, 8] + self.pp = [2, 4, 6, 8] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 165.5: + self.tp = [4, 8] + self.pp = [4, 6, 8, 16] + self.mbs = [2, 4, 6, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 128 + self.gbs = 2048 + elif model_size_in_b <= 250.5: + self.tp = [8] + self.pp = [4, 8, 16, 32] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 32 + self.max_model_parallel = 256 + self.gbs = 2048 + else: + raise ValueError("No BERT model larger than 250B parameters is supported.") + elif gpu_memory_gb == 40: + if model_size_in_b <= 1.0: + self.tp = [1, 2, 4] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4, 8] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.mbs = [1, 2, 4] + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [2, 4, 8] + self.mbs = [1, 2, 4] + self.gbs = 2048 + elif model_size_in_b <= 25.0: + self.tp = [2, 4, 8] + self.pp = [1, 2] + self.mbs = [1, 2, 4] + self.min_model_parallel = 2 + self.max_model_parallel = 16 + self.gbs = 2048 + elif model_size_in_b <= 46.5: + self.tp = [4, 8] + self.pp = [1, 2, 4, 8] + self.mbs = [1, 2, 3] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 87.5: + self.tp = [4, 8] + self.pp = [2, 4, 6, 8] + self.mbs = [1, 2, 3] + self.min_model_parallel = 16 + self.max_model_parallel = 64 + self.gbs = 2048 + elif model_size_in_b <= 165.5: + self.tp = [8] + self.pp = [4, 6, 8, 16] + self.mbs = [1, 2] + self.min_model_parallel = 32 + self.max_model_parallel = 256 + self.gbs = 2048 + elif model_size_in_b <= 250.5: + self.tp = [8] + self.pp = [8, 16, 32] + self.mbs = [1, 2] + self.min_model_parallel = 64 + self.max_model_parallel = 512 + self.gbs = 2048 + else: + raise ValueError("No BERT model larger than 250B parameters is supported.") + + +def _calculate_tp_pp_mbs_grid( + model_size_in_b: float, + num_layers: int, + model_name: str, + seq_length: int, + train_cfg: dict, +) -> Tuple[int, int, int]: + """Selects grid search space for TP, PP, MBS parameters for any model, and calls the necessary heuristics function accordingly. + + Args: + model_size_in_b (float): number of parameters in the model. + num_layers (int): number of layers in the model config. 
+ model_name (str): name of the model to be used, such as gpt3, t5, mt5... + seq_length (int): sequence length to use for training. + train_cfg (dict): config of the model that will be launched. + + Returns: + dataclass object with model parallelism parameters. + + Raises: + NotImplementedError: if the model_name is not one of the supported models. + """ + + tp_sizes = train_cfg.tensor_parallel_sizes + pp_sizes = train_cfg.pipeline_parallel_sizes + cp_sizes = train_cfg.context_parallel_sizes + ep_sizes = train_cfg.expert_parallel_sizes + min_model_parallel_size = train_cfg.min_model_parallel_size + max_model_parallel_size = train_cfg.max_model_parallel_size + mbs_sizes = train_cfg.micro_batch_sizes + gbs_size = train_cfg.global_batch_size + gpu_memory_gb = train_cfg.gpu_memory_gb + multiplier = 1 if model_name in GPT_BASED_MODELS else 2 + init_pp = [] if model_name in GPT_BASED_MODELS else [1] + valid_pp = init_pp + [ + multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0 + ] # Only divisors of num_layers are possible. + + kwargs = { + "model_size_in_b": model_size_in_b, + "valid_pp": valid_pp, + "seq_length": seq_length, + "gpu_memory_gb": gpu_memory_gb, + } + + if model_name in GPT_BASED_MODELS: + search_class = GPT3GridSearch + elif model_name in ["t5", "mt5"]: + search_class = T5GridSearch + elif model_name == "bert": + search_class = BertGridSearch + else: + raise NotImplementedError("Model name not implemented.") + + params = search_class(**kwargs) + params.init_params() + + # Override the tp, pp, mbs search if indicated in the config params. + if tp_sizes is not None and tp_sizes != "auto": + params.tp = tp_sizes + if pp_sizes is not None and pp_sizes != "auto": + params.pp = pp_sizes + if cp_sizes is not None and cp_sizes != "auto": + params.cp = cp_sizes + if ep_sizes is not None and ep_sizes != "auto": + params.ep = ep_sizes + if mbs_sizes is not None and mbs_sizes != "auto": + params.mbs = mbs_sizes + if gbs_size is not None and gbs_size != "auto": + params.gbs = gbs_size + if min_model_parallel_size is not None and min_model_parallel_size != "auto": + params.min_model_parallel = min_model_parallel_size + if max_model_parallel_size is not None and max_model_parallel_size != "auto": + params.max_model_parallel = max_model_parallel_size + return params diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py new file mode 100644 index 000000000000..3441c7cdbf9b --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -0,0 +1,470 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
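As an illustration of the heuristic just above, _calculate_tp_pp_mbs_grid only considers pipeline sizes that divide the layer count evenly. A minimal sketch with illustrative numbers (24-layer GPT-style model), mirroring the valid_pp construction in the function:

num_layers = 24
multiplier = 1   # GPT-based models; encoder/decoder models such as T5 use 2
init_pp = []     # GPT-based models start from an empty list
valid_pp = init_pp + [multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0]
# valid_pp == [1, 2, 3, 4, 6, 8, 12, 24]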
+ +from dataclasses import dataclass + + +GPT_BASED_MODELS = [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + "nemotron", +] + + +@dataclass +class ModelSizeParams: + """Calculates the parameters that affect model_size: hidden size, attention heads, KV channels, and FFN size. It also calculates the learning rate. + + Args: + model_size_in_b (float): number of parameters in the desired model config, in billions. + vocab_size (int): size of the vocabulary to use for training. + seq_length (int): sequence length to be used during training. + model_name (str): name of the model to be trained, i.e. gpt3, t5, mt5... + + Raises: + ValueError: if the model size is larger than the max supported model size. + NotImplementedError: if the model name is not supported. + """ + + model_size_in_b: float + vocab_size: int + seq_length: int + model_name: str + + # Model size params + layers: int = None + hs: int = None + att_h: int = None + ffn: int = None + kv: int = None + lr: float = None + + def init_params(self): + model_name = self.model_name + model_size_in_b = self.model_size_in_b + if model_name in GPT_BASED_MODELS: + if model_size_in_b < 0.25: + self.hs, self.att_h, self.lr = 768, 12, 6e-4 + elif model_size_in_b < 0.5: + self.hs, self.att_h, self.lr = 1024, 16, 3e-4 + elif model_size_in_b < 1: + self.hs, self.att_h, self.lr = 1536, 16, 2.5e-4 + elif model_size_in_b < 2: + self.hs, self.att_h, self.lr = 2048, 16, 2e-4 + elif model_size_in_b < 3: + self.hs, self.att_h, self.lr = 2560, 32, 1.6e-4 + elif model_size_in_b < 4.5: + self.hs, self.att_h, self.lr = 3072, 32, 1.4e-4 + elif model_size_in_b < 8: + self.hs, self.att_h, self.lr = 4096, 32, 1.2e-4 + elif model_size_in_b < 15: + self.hs, self.att_h, self.lr = 5120, 40, 1e-4 + elif model_size_in_b < 25: + self.hs, self.att_h, self.lr = 6144, 48, 1e-4 + elif model_size_in_b < 52: + self.hs, self.att_h, self.lr = 8192, 64, 0.8e-4 + elif model_size_in_b < 105: + self.hs, self.att_h, self.lr = 10240, 80, 0.7e-4 + elif model_size_in_b < 205: + self.hs, self.att_h, self.lr = 12288, 96, 0.6e-4 + elif model_size_in_b < 405: + self.hs, self.att_h, self.lr = 20480, 128, 0.5e-4 + elif model_size_in_b < 805: + self.hs, self.att_h, self.lr = 20480, 128, 0.4e-4 + elif model_size_in_b < 1105: + self.hs, self.att_h, self.lr = 25600, 160, 0.3e-4 + else: + raise ValueError("Model_size for GPT-3 must be smaller than 1.1T parameters.") + elif model_name == "t5": + self.kv, self.lr = 64, 1e-4 + if model_size_in_b < 0.1: + self.hs, self.att_h, self.ffn = 512, 6, 1024 + elif model_size_in_b < 0.4: + self.hs, self.att_h, self.ffn = 768, 12, 2048 + elif model_size_in_b < 1: + self.hs, self.att_h, self.ffn = 1024, 16, 2816 + elif model_size_in_b < 5: + self.hs, self.att_h, self.ffn = 2048, 32, 5120 + elif model_size_in_b < 15: + self.hs, self.att_h, self.ffn = 4096, 64, 10240 + elif model_size_in_b < 25.9: + self.hs, self.att_h, self.ffn = 5120, 80, 10880 + elif model_size_in_b < 43.0: + self.hs, self.att_h, self.ffn = 6144, 96, 10880 + elif model_size_in_b <= 85.5: + self.hs, self.att_h, self.ffn = 6144, 96, 16384 + elif model_size_in_b <= 165.5: + self.hs, self.att_h, self.ffn, kv = 7680, 96, 20480, 128 + elif model_size_in_b <= 250: + self.hs, self.att_h, self.ffn, kv = 12288, 96, 32768, 128 + else: + raise ValueError("Model_size for T5 must be smaller than 250B parameters.") + elif model_name == "mt5": + self.kv, self.lr = 64, 1e-4 + if model_size_in_b < 0.25: + self.hs, self.att_h, self.ffn = 512, 6, 1024 + 
elif model_size_in_b < 0.5: + self.hs, self.att_h, self.ffn = 768, 12, 2048 + elif model_size_in_b < 1.2: + self.hs, self.att_h, self.ffn = 1024, 16, 2816 + elif model_size_in_b < 5: + self.hs, self.att_h, self.ffn = 2048, 32, 5120 + elif model_size_in_b < 15: + self.hs, self.att_h, self.ffn = 4096, 64, 10240 + elif model_size_in_b < 25.9: + self.hs, self.att_h, self.ffn = 5120, 80, 10880 + elif model_size_in_b < 43.0: + self.hs, self.att_h, self.ffn = 6144, 96, 10880 + elif model_size_in_b <= 85.5: + self.hs, self.att_h, self.ffn = 6144, 96, 16384 + elif model_size_in_b <= 165.5: + self.hs, self.att_h, self.ffn, kv = 7680, 96, 20480, 128 + elif model_size_in_b <= 250: + self.hs, self.att_h, self.ffn, kv = 12288, 96, 32768, 128 + else: + raise ValueError("Model_size for mT5 must be smaller than 250B parameters.") + elif model_name == "bert": + self.lr = 1e-4 + if model_size_in_b < 0.25: + self.hs, self.att_h, self.lr = 768, 12, 2e-4 + elif model_size_in_b < 0.5: + self.hs, self.att_h, self.lr = 1024, 16, 2e-4 + elif model_size_in_b < 1: + self.hs, self.att_h = 1536, 16 + elif model_size_in_b < 2: + self.hs, self.att_h = 2048, 16 + elif model_size_in_b < 3: + self.hs, self.att_h = 2560, 32 + elif model_size_in_b < 4.5: + self.hs, self.att_h = 2560, 32 + elif model_size_in_b < 8: + self.hs, self.att_h = 4096, 32 + elif model_size_in_b < 15: + self.hs, self.att_h = 5120, 40 + elif model_size_in_b <= 25: + self.hs, self.att_h = 6144, 48 + elif model_size_in_b <= 46.5: + self.hs, self.att_h = 7680, 48 + elif model_size_in_b <= 87.5: + self.hs, self.att_h = 9216, 96 + elif model_size_in_b <= 165.5: + self.hs, self.att_h = 9216, 96 + elif model_size_in_b <= 250.5: + self.hs, self.att_h = 12288, 96 + else: + raise ValueError("Model_size for BERT must be smaller than 25B parameters.") + self.ffn = 4 * self.hs + else: + raise NotImplementedError("Model name is not valid.") + + # Try powers of 2 + margin = 0.01 + for attempt in range(0, 10): + for layers in (2**p for p in range(1, 10)): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + # Try multiples of 16 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(16, 201, 16): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + # Try multiples of 2 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(2, 201, 2): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. 
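+        # Note: self.layers may already have been set by one of the coarser searches above;
+        # the `and not self.layers` guard keeps the first match and makes later passes no-ops.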
+ + # Try multiples of 5 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(5, 201, 5): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + # Try any valid number + margin = 0.01 + for attempt in range(0, 10): + for layers in range(1, 200): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + if not self.layers: + raise Exception("Number of layers not found, config is not possible.") + + +def _calculate_model_size( + vocab_size: int = None, + seq_length: int = None, + hidden_size: int = None, + num_layers: int = None, + ffn_size: int = None, + kv_channels: int = None, + att_heads: int = None, + model_name: str = "gpt3", +): + """Calculates the model size (number of parameters in billions), given the model parameters and name. + + Args: + vocab_size (int): vocabulary size to be used during training. + seq_length (int): input sequence length to be used during training. + hidden_size (int): size of the hidden layers of the model. + num_layers (int): number of layers in the model. + ffn_size (int): FFN size of the model. + kv_channels (int): number of KV channels in the transformer layers. + att_heads (int): number of attention heads in the transformer layers. + model_name (str): name of the model, i.e gpt3, t5, mt5... + + Returns: + float: size of the model in billions of parameters. + + Raises: + NotImplementedError: if the model name is not valid. + """ + + if model_name in GPT_BASED_MODELS: + model_size = ( + 12 + * num_layers + * hidden_size**2 + * (1 + (13 / (12 * hidden_size)) + ((vocab_size + seq_length) / (12 * num_layers * hidden_size))) + / 1e9 + ) + elif model_name in ["t5", "mt5"]: + # 2 L F + 3 L P + H (2 + 4 L F + L (21 + 12 P) + 1 S + 1 V) + proj_size = att_heads * kv_channels + model_size = ( + 2 * num_layers * 1.5 * ffn_size + + 3 * num_layers * proj_size + + hidden_size + * (2 + 4 * num_layers * 1.5 * ffn_size + num_layers * (21 + 12 * proj_size) + seq_length + vocab_size) + ) / 1e9 + elif model_name == "bert": + model_size = ( + num_layers * (ffn_size + hidden_size * (4 * hidden_size + 3 * att_heads + 2 * ffn_size + 6)) + + hidden_size * (vocab_size + seq_length + hidden_size + 5) + ) / 1e9 + + else: + raise NotImplementedError("Model name is not valid.") + + return model_size + + +def generic_base_config(config) -> dict: + """Generates a base config dictionary from a base config python file. + + Args: + config (AutoConfigurator): config object for the Auto Configurator tool. + + Returns: + BaseConfig: base configuration for the model. + AutoConfigurator: config object for the Auto Configurator tool. 
+ """ + + from nemo.collections.llm.tools.auto_configurator.core.base_config import BaseConfig, calculate_model_size + + default_model = False if config.model_size_in_b else True + + model_size_in_b = calculate_model_size( + config.gpu_count, + config.max_training_days, + config.model_size_in_b, + config.tflops_per_gpu, + config.num_tokens_in_b, + config.model_type, + ) + base_cfg = BaseConfig(config) + + if default_model: + params = ModelSizeParams( + model_size_in_b, + config.vocab_size, + config.seq_length, + config.model_type, + ) + params.init_params() + + if config.model_type in GPT_BASED_MODELS: + base_cfg.model.num_layers = params.layers + base_cfg.model.hidden_size = params.hs + base_cfg.model.num_attention_heads = params.att_h + base_cfg.model.kv_channels = params.kv + if not params.ffn: + base_cfg.model.ffn_hidden_size = params.hs * 4 + else: + base_cfg.model.ffn_hidden_size = params.ffn + + config.model_size_in_b = model_size_in_b + + return base_cfg, config + + +def modify_cfg( + base_cfg: dict, + act: int, + num_mbs_act: int, + act_per_pipe: int, + tp: int, + pp: int, + cp: int, + ep: int, + virtual_pipelines: int, + mbs: int, + max_minutes: int, + max_steps: int, + num_nodes: int, + model_name: str, + model_size, +) -> dict: + """Modify the base configuration for the model with the new parameters that are specific to the current model, which the Auto Configurator tool heuristics selected. + + Args: + base_cfg (dict): base configuration for the current model, which will be modified in this function. + act (int): number of activation checkpointing layers to use for the model. + num_mbs_act (int): sets the number of micro-batches where only a partial number of Transformer layers get checkpointed and recomputed within a window of micro-batches. + act_per_pipe (int): sets the number of Transformer layers to skip checkpointing at later pipeline stages. + tp (int): Tensor Parallelism (TP) value to be set for the model. + pp (int): Pipeline Parallelism (PP) value to be set for the model. + cp (int): Context Parallelism (CP) value to be set for the model. + ep (int): Expert Parallelism (EP) value to be set for the model. + virtual_pipelines (int): Virtual Pipelines value to be set for the model. + mbs (int): Micro Batch Size (MBS) value to be set for the model. + max_minutes (int): maximum amount of time to run this model for. + max_steps (int): maximum number of steps to run this model for. + num_nodes (int): number of nodes to use for the training run. + model_name (str): name of the model, i.e. gpt3, t5, mt5... + + Returns: + dict: dictionary containing the updated model configuration parameters. 
+ """ + + if model_name in GPT_BASED_MODELS: + att_heads = base_cfg.model.num_attention_heads + num_layers = base_cfg.model.num_layers + else: + att_heads = base_cfg.model.encoder.num_attention_heads + num_layers = base_cfg.model.encoder.num_layers + + # gbs = mbs * num_gpus * accumulate_grad_batches / (tp * pp) + num_gpus = base_cfg.trainer.num_nodes * base_cfg.trainer.devices + gbs = base_cfg.data.global_batch_size + seq_len = base_cfg.model.seq_length + + new_cfg = dict(run=base_cfg.run) + if act is not None: + if model_name in GPT_BASED_MODELS: + new_cfg["activations_checkpoint_num_layers"] = act + else: + new_cfg["encoder"]["activations_checkpoint_num_layers"] = act // 2 + new_cfg["decoder"]["activations_checkpoint_num_layers"] = act // 2 + + if num_mbs_act is not None and model_name in GPT_BASED_MODELS: + new_cfg["num_micro_batches_with_partial_activation_checkpoints"] = num_mbs_act + + if act_per_pipe is not None and model_name in GPT_BASED_MODELS: + new_cfg["activations_checkpoint_layers_per_pipeline"] = act_per_pipe + + if virtual_pipelines is not None and model_name in GPT_BASED_MODELS: + new_cfg["virtual_pipeline_model_parallel_size"] = virtual_pipelines + + new_cfg["tensor_model_parallel_size"] = tp + new_cfg["pipeline_model_parallel_size"] = pp + new_cfg["micro_batch_size"] = mbs + new_cfg["global_batch_size"] = gbs + + if cp is not None: + new_cfg["context_parallel_size"] = cp + + if ep is not None: + new_cfg["expert_model_parallel_size"] = ep + + mod_gbs = gbs % (mbs * num_gpus / (tp * pp)) + mod_att_heads = att_heads % tp + mod_layers = num_layers % pp + if mod_gbs == 0 and mod_att_heads == 0 and mod_layers == 0: + # Valid config + new_cfg["run"][ + "name" + ] = f"{model_name}_{str(model_size)}b_{num_nodes}nodes_tp_{tp}_pp_{pp}_cp_{cp}_ep_{ep}_mbs_{mbs}_act_ckpt_{act}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" + print( + f"Valid config: SeqLen={seq_len}, GBS={gbs}, MBS={mbs}, TP={tp}, PP={pp}, CP={cp}, EP={ep}, act_ckpt_layers={act}, num_mbs_act={num_mbs_act}, act_per_pipe={act_per_pipe}. Adding to directory." + ) + return new_cfg + return None diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py new file mode 100644 index 000000000000..0c80c9a21a9e --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -0,0 +1,246 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import re + +from typing import List, Optional + +from nemo.collections.llm import GPTModel +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.tools.auto_configurator.core.training_config import generate_grid_search_configs +from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config +from nemo.collections.llm.utils import Config, Partial +from nemo.utils import logging + +SUPPORTED_MODELS = [ + "gpt3", + "llama", + "mixtral", + "mistral", + "gemma", + "nemotron", +] + +SUPPORTED_TOKENIZERS = [ + "autotokenizer", + "sentencepiece", + "huggingface", +] + + +class AutoConfigurator: + """Auto Configurator runner config class.""" + + def __init__( + self, + model: Config = None, + num_nodes: int = None, + data_paths: List = None, + path_to_logs: str = None, + tokenizer_type: Optional[str] = "autotokenizer", + tokenizer_path: Optional[str] = "GPT2BPETokenizer", + gpus_per_node: Optional[int] = 8, + gpu_memory_gb: Optional[int] = 80, + seq_length: Optional[int] = 2048, + global_batch_size: Optional[int] = "auto", + tensor_parallel_sizes: Optional[List[int]] = "auto", + pipeline_parallel_sizes: Optional[List[int]] = "auto", + micro_batch_sizes: Optional[List[int]] = "auto", + context_parallel_sizes: Optional[List[int]] = [1], + expert_parallel_sizes: Optional[List[int]] = [1], + min_model_parallel_size: Optional[int] = "auto", + max_model_parallel_size: Optional[int] = "auto", + num_tokens_in_b: Optional[int] = 300, + tflops_per_gpu: Optional[int] = 140, + max_minutes_per_run: Optional[int] = 30, + max_training_days: Optional[int] = 2, + max_steps_per_run: Optional[int] = 50, + vocab_size: Optional[int] = 51200, + ): + """ + Args: + model_type (Config): model type to be used for training. + num_nodes (int): number of nodes to be used for training. + data_paths (List): list of datafiles to be used for training. + path_to_logs (str): path to the directory where the logs will be stored. + tokenizer_type (Optional[str]): tokenizer type. + tokenizer_path (Optional[str]): path to the tokenizer model. + model_size (Optional[int]): size of model to be trained. + gpus_per_node (Optional[int]): number of GPUs per node to be used. + gpu_memory_gb (Optional[int]): memory per GPU, in GB. Currently 40GB and 80GB A100s/H100s supported. + seq_length (Optional[int]): model sequence length. Available seq_length list for GPT-based models: [2048, 4096, 8192, 16384, 32768]. + global_batch_size (Optional[int]): model global batch size. Set to "auto" if you want auto configurator to find optimal gbs. + tensor_parallel_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. + pipeline_parallel_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. + micro_batch_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. + context_parallel_sizes (Optional[List[int]]): model context parallel size. A list, such as [1, 2, 4, 8]. + expert_parallel_sizes (Optional[List[int]]): model expert parallel size. A list, such as [1, 2, 4, 8]. + min_model_parallel_size (Optional[int]): set to "auto" to use our recommendation, or a value for the minimum desired parallelism. + max_model_parallel_size (Optional[int]): set to "auto" to use our recommendation, or a value for the maximum desired parallelism. + num_tokens_in_b (Optional[int]): number of tokens in billions in train dataset. 
+            tflops_per_gpu (Optional[int]): estimated tflops per GPU.
+            max_minutes_per_run (Optional[int]): maximum number of minutes per run for the grid search.
+            max_training_days (Optional[int]): number of days expected model to be trained.
+            max_steps_per_run (Optional[int]): maximum number of steps per run for the grid search.
+            vocab_size (Optional[int]): size of tokenizer vocabulary.
+        """
+
+        # Print out the config
+        config = locals()
+        config.pop('self')
+        for key, value in config.items():
+            setattr(self, key, value)
+        logging.info(self._get_message(config))
+
+        model_type = self._get_model_type(model)
+        assert model_type in SUPPORTED_MODELS, f"model_type must be set to one of {SUPPORTED_MODELS}."
+        assert tokenizer_type in SUPPORTED_TOKENIZERS, f"tokenizer_type must be set to one of {SUPPORTED_TOKENIZERS}."
+        assert num_nodes, "num_nodes value must be specified."
+        assert data_paths, "training data must be specified."
+        assert path_to_logs, "path_to_logs parameter must be specified."
+        gpu_count = num_nodes * gpus_per_node
+        assert gpu_count > 0, "num_nodes * gpus_per_node must be an int larger than zero."
+        assert gpu_memory_gb in (
+            40,
+            80,
+        ), "gpu_memory_gb can only be 40 or 80."
+        assert max_minutes_per_run >= 10, "max_minutes_per_run must be an int and be at least 10 minutes."
+
+        self.model_type = model_type
+        self.model_size_in_b = self._get_model_size(model)
+        self.gpu_count = gpu_count
+        self.num_gpus = gpus_per_node
+
+    def _get_message(self, config: dict) -> str:
+        """
+        Function that returns runner config line by line.
+
+        Args:
+            config (dict): runner config.
+
+        Returns:
+            str: runner config params.
+        """
+
+        message = "AutoConfigurator runner config:\n"
+        for key, value in config.items():
+            message += f"{key}: {value}\n"
+
+        return message
+
+    def _get_model_type(self, model: Config) -> str:
+        """
+        Function that returns model type from model class name.
+
+        Args:
+            model (Config): model object.
+
+        Returns:
+            str: model type.
+        """
+
+        match = re.search(r"\w+\d+[MB]", str(model))
+        if match:
+            model = match.group(0)
+
+        if "GPT" in model:
+            return "gpt3"
+        elif "Llama" in model:
+            return "llama"
+        elif "Mixtral" in model:
+            return "mixtral"
+        elif "Mistral" in model:
+            return "mistral"
+        elif "Gemma" in model:
+            return "gemma"
+        elif "Nemotron" in model:
+            return "nemotron"
+        else:
+            return None
+
+    def _get_model_size(self, model: Config) -> int:
+        """
+        Function that returns model size from model class name.
+
+        Args:
+            model (Config): model class name.
+
+        Returns:
+            int: model size.
+        """
+        match = re.search(r'(\d+)([BM])', str(model))
+        if match:
+            size = int(match.group(1))
+            measure = match.group(2)
+            if measure == 'B':
+                return size
+            elif measure == 'M':
+                return size / 1000  # Convert millions to billions
+        return None
+
+
+def generate_configs(runner_config: AutoConfigurator = None) -> tuple:
+    """
+    Function that generates a base config and a dictionary of Partial configs.
+
+    Args:
+        runner_config (AutoConfigurator): Auto Configurator object.
+
+    Returns:
+        tuple: the base config and a dictionary of Partial pretraining configs.
+ """ + + # Generate base config for the given model size + base_cfg, train_cfg = generic_base_config(runner_config) + + # Launch grid search for training constraints + base_config, train_configs = generate_grid_search_configs(base_cfg, train_cfg) + + tokenizer = base_config.tokenizer + model = Config(GPTModel, config=base_config.model, tokenizer=tokenizer) + + configs = {} + for name, config in train_configs.items(): + trainer = copy.deepcopy(base_config.trainer) + data = copy.deepcopy(base_config.data) + log = copy.deepcopy(base_config.log) + + # Set data params + data.micro_batch_size = config.get("micro_batch_size") + data.global_batch_size = config.get("global_batch_size") + + # Set strategy params + trainer.strategy.tensor_model_parallel_size = config.get("tensor_model_parallel_size") + trainer.strategy.pipeline_model_parallel_size = config.get("pipeline_model_parallel_size") + trainer.strategy.context_parallel_size = config.get("context_parallel_size") + trainer.strategy.expert_model_parallel_size = config.get("expert_model_parallel_size") + trainer.strategy.virtual_pipeline_model_parallel_size = config.get( + "virtual_pipeline_model_parallel_size", None + ) + if config.get("tensor_model_parallel_size") > 1: + trainer.strategy.sequence_parallel = True + + # Set the directory where to save the logs + configs[name] = Partial( + pretrain, + model=model, + trainer=trainer, + data=data, + optim=base_config.optim, + log=log, + resume=None, + ) + + return base_cfg, configs diff --git a/tests/collections/llm/auto_conf/test_autoconf_utils.py b/tests/collections/llm/auto_conf/test_autoconf_utils.py new file mode 100644 index 000000000000..0faa86c13016 --- /dev/null +++ b/tests/collections/llm/auto_conf/test_autoconf_utils.py @@ -0,0 +1,131 @@ +from nemo.collections.llm.tools.auto_configurator.core.base_config import _estimate_training_time, calculate_model_size + + +class TestUtils: + def test_calculate_model_size(self): + # GPT + model_size = calculate_model_size( + 8, + 7, + None, + 140, + 300, + "gpt3", + ) + assert model_size == 0.28, f"expected model_size is 0.28 but got {model_size}." + + # Llama + model_size = calculate_model_size( + 128, + 30, + None, + 100, + 3000, + "llama", + ) + assert model_size == 1.38, f"expected model_size is 1.38 but got {model_size}." + + # Mixtral + model_size = calculate_model_size( + 256, + 20, + None, + 140, + 600, + "mixtral", + ) + assert model_size == 12.9, f"expected model_size is 12.9 but got {model_size}." + + # Mistral + model_size = calculate_model_size( + 1028, + 30, + None, + 240, + 100, + "mistral", + ) + assert model_size == 799.37, f"expected model_size is 799.37 but got {model_size}." + + # Gemma + model_size = calculate_model_size( + 512, + 30, + None, + 240, + 100, + "gemma", + ) + assert model_size == 398.13, f"expected model_size is 398.13 but got {model_size}." + + # Nemotron + model_size = calculate_model_size( + 256, + 15, + None, + 240, + 120, + "gemma", + ) + assert model_size == 82.94, f"expected model_size is 82.94 but got {model_size}." + + def test_calculate_train_time(self): + # GPT + train_time = _estimate_training_time( + 175, + 1024, + 140, + 300, + "gpt3", + ) + assert train_time == 33.91, f"expected train_time is 33.91 but got {train_time}." + + # Llama + train_time = _estimate_training_time( + 35, + 512, + 60, + 3000, + "llama", + ) + assert train_time == 316.48, f"expected train_time is 316.48 but got {train_time}." 
+ + # Mixtral + train_time = _estimate_training_time( + 0.8, + 128, + 140, + 1000, + "mixtral", + ) + assert train_time == 4.13, f"expected train_time is 4.13 but got {train_time}." + + # Mistral + train_time = _estimate_training_time( + 11, + 24, + 60, + 250, + "mistral", + ) + assert train_time == 176.83, f"expected train_time is 176.83 but got {train_time}." + + # Gemma + train_time = _estimate_training_time( + 7, + 8, + 55, + 100, + "gemma", + ) + assert train_time == 147.31, f"expected train_time is 147.31 but got {train_time}." + + # Nemotron + train_time = _estimate_training_time( + 14, + 12, + 11, + 55, + "nemotron", + ) + assert train_time == 540.12, f"expected train_time is 540.12 but got {train_time}." diff --git a/tests/collections/llm/auto_conf/test_base_configs.py b/tests/collections/llm/auto_conf/test_base_configs.py new file mode 100644 index 000000000000..46ee49ae0629 --- /dev/null +++ b/tests/collections/llm/auto_conf/test_base_configs.py @@ -0,0 +1,341 @@ +import nemo_run as run +import torch + +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.loggers import TensorBoardLogger + +from nemo import lightning as nl +from nemo.collections.common.tokenizers import AutoTokenizer +from nemo.collections.llm import ( + GemmaConfig2B, + GPTConfig126M, + Llama3Config8B, + MistralConfig7B, + MixtralConfig8x3B, + Nemotron4Config22B, + PreTrainingDataModule, +) +from nemo.collections.llm.tools.auto_configurator import AutoConfigurator +from nemo.collections.llm.tools.auto_configurator.core.base_config import BaseConfig +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule +from nemo.utils.exp_manager import TimingCallback + + +def get_tokenizer() -> run.Config: + return run.Config(AutoTokenizer, pretrained_model_name="GPT2BPETokenizer") + + +def get_data(seq_length, global_batch_size) -> run.Config[PreTrainingDataModule]: + config = { + "paths": "/", + "seq_length": seq_length, + "global_batch_size": global_batch_size, + "num_workers": 2, + "index_mapping_dir": None, + } + + return run.Config( + PreTrainingDataModule, + **config, + tokenizer=get_tokenizer(), + ) + + +def get_trainer(num_nodes) -> run.Config[nl.Trainer]: + trainer_config = { + "accelerator": "gpu", + "enable_checkpointing": False, + "use_distributed_sampler": False, + "max_epochs": None, + "log_every_n_steps": 1, + "limit_val_batches": 1, + "limit_test_batches": 1, + "accumulate_grad_batches": 1, + "num_nodes": num_nodes, + "devices": 8, + "max_steps": 50, + "val_check_interval": 50, + } + + strategy = run.Config( + nl.MegatronStrategy, + pipeline_dtype=torch.bfloat16, + ) + + return run.Config( + nl.Trainer, + **trainer_config, + strategy=strategy, + plugins=run.Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + callbacks=[run.Config(TimingCallback)], + ) + + +def get_optim() -> run.Config[OptimizerConfig]: + optim_params = { + "optimizer": "adam", + "lr": 1e-4, + "min_lr": 1e-5, + "use_distributed_optimizer": True, + "bf16": True, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "overlap_grad_reduce": True, + "overlap_param_gather": True, + "clip_grad": 1.0, + "adam_eps": 1e-5, + } + + optim_config = run.Config( + OptimizerConfig, + **optim_params, + ) + + sched = run.Config( + CosineAnnealingScheduler, + warmup_steps=10, + constant_steps=0, + min_lr=optim_config.min_lr, + ) + + return run.Config( + MegatronOptimizerModule, + config=optim_config, + lr_scheduler=sched, + ) + + +def get_logger() -> run.Config[nl.NeMoLogger]: + tb_logger = 
run.Config(TensorBoardLogger, save_dir="tb_logs") + + ckpt = run.Config( + nl.ModelCheckpoint, + monitor="reduced_train_loss", + save_last=False, + save_top_k=0, + ) + + return run.Config( + nl.NeMoLogger, + ckpt=ckpt, + tensorboard=tb_logger, + wandb=None, + dir="/", + ) + + +class TestBaseConfigs: + def test_gpt3_base_config(self): + # GPT3 7B + model_config = run.Config(GPTConfig126M) + runner = AutoConfigurator(model=model_config, num_nodes=8, path_to_logs="/", data_paths="/") + base_config = BaseConfig(runner) + model_size = runner._get_model_size(model_config) + model_type = runner._get_model_type(model_config) + data_config = get_data(2048, 'auto') + trainer_config = get_trainer(8) + optim_config = get_optim() + logger_config = get_logger() + + assert ( + base_config.model == model_config + ), f"{model_config} is expected class object but got {base_config.model}" + assert model_size == 0.126, f"0.126 is expected size for {model_config} but got {model_size}" + assert model_type == "gpt3", f"gpt3 is expected model type for {model_config} but got {model_type}" + assert ( + base_config.data == data_config + ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" + assert ( + base_config.trainer == trainer_config + ), f"f{trainer_config} is expected trainer config for {model_config} but got {base_config.trainer}" + assert ( + base_config.optim == optim_config + ), f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" + assert ( + base_config.log == logger_config + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" + + def test_llama_base_config(self): + # Llama3 8B + model_config = run.Config(Llama3Config8B) + runner = AutoConfigurator( + model=model_config, + num_nodes=16, + path_to_logs="/", + data_paths="/", + seq_length=8192, + global_batch_size=2048, + ) + base_config = BaseConfig(runner) + model_size = runner._get_model_size(model_config) + model_type = runner._get_model_type(model_config) + data_config = get_data(8192, 2048) + trainer_config = get_trainer(16) + optim_config = get_optim() + logger_config = get_logger() + + assert ( + base_config.model == model_config + ), f"{model_config} is expected class object but got {base_config.model}" + assert model_size == 8, f"8 is expected size for {model_config} but got {model_size}" + assert model_type == "llama", f"llama is expected model type for {model_config} but got {model_type}" + assert ( + base_config.data == data_config + ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" + assert ( + base_config.trainer == trainer_config + ), f"f{trainer_config} is expected trainer config for {model_config} but got {base_config.trainer}" + assert ( + base_config.optim == optim_config + ), f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" + assert ( + base_config.log == logger_config + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" + + def test_mistral_base_config(self): + # Mistral 7B + model_config = run.Config(MistralConfig7B) + runner = AutoConfigurator( + model=model_config, + num_nodes=16, + path_to_logs="/", + data_paths="/", + seq_length=32768, + global_batch_size=2048, + ) + base_config = BaseConfig(runner) + model_size = runner._get_model_size(model_config) + model_type = runner._get_model_type(model_config) + data_config = get_data(32768, 2048) + trainer_config = get_trainer(16) + optim_config 
= get_optim() + logger_config = get_logger() + + assert ( + base_config.model == model_config + ), f"{model_config} is expected class object but got {base_config.model}" + assert model_size == 7, f"7 is expected size for {model_config} but got {model_size}" + assert model_type == "mistral", f"mistral is expected model type for {model_config} but got {model_type}" + assert ( + base_config.data == data_config + ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" + assert ( + base_config.trainer == trainer_config + ), f"f{trainer_config} is expected trainer config for {model_config} but got {base_config.trainer}" + assert ( + base_config.optim == optim_config + ), f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" + assert ( + base_config.log == logger_config + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" + + def test_mixtral_base_config(self): + # Mixtral 8x3B + model_config = run.Config(MixtralConfig8x3B) + runner = AutoConfigurator( + model=model_config, + num_nodes=16, + path_to_logs="/", + data_paths="/", + seq_length=4096, + global_batch_size=2048, + ) + base_config = BaseConfig(runner) + model_size = runner._get_model_size(model_config) + model_type = runner._get_model_type(model_config) + data_config = get_data(4096, 2048) + trainer_config = get_trainer(16) + optim_config = get_optim() + logger_config = get_logger() + + assert ( + base_config.model == model_config + ), f"{model_config} is expected class object but got {base_config.model}" + assert model_size == 3, f"3 is expected size for {model_config} but got {model_size}" + assert model_type == "mixtral", f"mixtral is expected model type for {model_config} but got {model_type}" + assert ( + base_config.data == data_config + ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" + assert ( + base_config.trainer == trainer_config + ), f"f{trainer_config} is expected trainer config for {model_config} but got {base_config.trainer}" + assert ( + base_config.optim == optim_config + ), f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" + assert ( + base_config.log == logger_config + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" + + def test_gemma_base_config(self): + # Gemma 2B + model_config = run.Config(GemmaConfig2B) + runner = AutoConfigurator( + model=model_config, + num_nodes=8, + path_to_logs="/", + data_paths="/", + seq_length=4096, + global_batch_size=1024, + ) + base_config = BaseConfig(runner) + model_size = runner._get_model_size(model_config) + model_type = runner._get_model_type(model_config) + data_config = get_data(4096, 1024) + trainer_config = get_trainer(8) + optim_config = get_optim() + logger_config = get_logger() + + assert ( + base_config.model == model_config + ), f"{model_config} is expected class object but got {base_config.model}" + assert model_size == 2, f"2 is expected size for {model_config} but got {model_size}" + assert model_type == "gemma", f"gemma is expected model type for {model_config} but got {model_type}" + assert ( + base_config.data == data_config + ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" + assert ( + base_config.trainer == trainer_config + ), f"f{trainer_config} is expected trainer config for {model_config} but got {base_config.trainer}" + assert ( + base_config.optim == optim_config + ), 
f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" + assert ( + base_config.log == logger_config + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" + + def test_nemotron_base_config(self): + # Nemotron 22B + model_config = run.Config(Nemotron4Config22B) + runner = AutoConfigurator( + model=model_config, + num_nodes=64, + path_to_logs="/", + data_paths="/", + seq_length=4096, + global_batch_size=2048, + ) + base_config = BaseConfig(runner) + model_size = runner._get_model_size(model_config) + model_type = runner._get_model_type(model_config) + data_config = get_data(4096, 2048) + trainer_config = get_trainer(64) + optim_config = get_optim() + logger_config = get_logger() + + assert ( + base_config.model == model_config + ), f"{model_config} is expected class object but got {base_config.model}" + assert model_size == 22, f"22 is expected size for {model_config} but got {model_size}" + assert model_type == "nemotron", f"nemotron is expected model type for {model_config} but got {model_type}" + assert ( + base_config.data == data_config + ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" + assert ( + base_config.trainer == trainer_config + ), f"f{trainer_config} is expected trainer config for {model_config} but got {base_config.trainer}" + assert ( + base_config.optim == optim_config + ), f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" + assert ( + base_config.log == logger_config + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" diff --git a/tests/collections/llm/auto_conf/test_generate_configs.py b/tests/collections/llm/auto_conf/test_generate_configs.py new file mode 100644 index 000000000000..efb3bcf9a0ba --- /dev/null +++ b/tests/collections/llm/auto_conf/test_generate_configs.py @@ -0,0 +1,307 @@ +import nemo_run as run + +from nemo.collections.llm import ( + GemmaConfig7B, + GPTConfig5B, + Llama3Config70B, + MistralConfig7B, + MixtralConfig8x22B, + Nemotron3Config8B, +) +from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs + + +def get_auto_configs(configs): + auto_configs = [] + for run_name, config in configs.items(): + auto_configs.append( + [ + config.trainer.strategy.tensor_model_parallel_size, + config.trainer.strategy.pipeline_model_parallel_size, + config.trainer.strategy.context_parallel_size, + config.trainer.strategy.expert_model_parallel_size, + config.data.micro_batch_size, + ] + ) + + return auto_configs + + +class TestGenerateConfgis: + def test_gpt_model(self): + # GPT3 126M + runner = AutoConfigurator( + model=run.Config(GPTConfig5B), + num_nodes=16, + seq_length=2048, + global_batch_size=2048, + tensor_parallel_sizes=[4], + pipeline_parallel_sizes=[2], + micro_batch_sizes=[1, 2], + context_parallel_sizes=[1], + expert_parallel_sizes=[1], + min_model_parallel_size=8, + max_model_parallel_size=8, + data_paths="/", + path_to_logs="/", + ) + + _, configs = generate_configs(runner) + + mbs = [1, 2] + for run_name, config, mb in zip(configs.keys(), configs.values(), mbs): + assert config.data.micro_batch_size == mb + assert config.data.seq_length == 2048 + assert config.data.global_batch_size == 2048 + + assert len(configs) == 2, f"{len(configs)} configurations were generated but 2 were expected." 
+
+        auto_configs = get_auto_configs(configs)
+        assert auto_configs[0] == [
+            4,
+            2,
+            1,
+            1,
+            1,
+        ], f"[4, 2, 1, 1, 1] is expected configuration output but got {auto_configs[0]}."
+
+        assert auto_configs[1] == [
+            4,
+            2,
+            1,
+            1,
+            2,
+        ], f"[4, 2, 1, 1, 2] is expected configuration output but got {auto_configs[1]}."
+
+    def test_llama_model(self):
+        # Llama3 70B
+        runner = AutoConfigurator(
+            model=run.Config(Llama3Config70B),
+            num_nodes=128,
+            seq_length=8192,
+            global_batch_size=2048,
+            tensor_parallel_sizes="auto",
+            pipeline_parallel_sizes="auto",
+            micro_batch_sizes=[1],
+            context_parallel_sizes=[1, 2, 4],
+            expert_parallel_sizes=[1],
+            min_model_parallel_size=16,
+            max_model_parallel_size=64,
+            data_paths="/",
+            path_to_logs="/",
+        )
+
+        _, configs = generate_configs(runner)
+
+        mbs = [1, 1, 1]
+        for run_name, config, mb in zip(configs.keys(), configs.values(), mbs):
+            assert config.data.micro_batch_size == mb
+            assert config.data.seq_length == 8192
+            assert config.data.global_batch_size == 2048
+
+        assert len(configs) == 3, f"{len(configs)} configurations were generated but 3 were expected."
+
+        auto_configs = get_auto_configs(configs)
+        assert auto_configs[0] == [
+            4,
+            1,
+            4,
+            1,
+            1,
+        ], f"[4, 1, 4, 1, 1] is expected configuration output but got {auto_configs[0]}."
+
+        assert auto_configs[1] == [
+            8,
+            1,
+            2,
+            1,
+            1,
+        ], f"[8, 1, 2, 1, 1] is expected configuration output but got {auto_configs[1]}."
+
+        assert auto_configs[2] == [
+            8,
+            1,
+            4,
+            1,
+            1,
+        ], f"[8, 1, 4, 1, 1] is expected configuration output but got {auto_configs[2]}."
+
+    def test_mistral_model(self):
+        # Mistral 7B
+        runner = AutoConfigurator(
+            model=run.Config(MistralConfig7B),
+            num_nodes=16,
+            seq_length=4096,
+            global_batch_size=2048,
+            tensor_parallel_sizes=[4],
+            pipeline_parallel_sizes=[1, 2],
+            micro_batch_sizes=[1],
+            context_parallel_sizes=[1],
+            expert_parallel_sizes=[1],
+            min_model_parallel_size=4,
+            max_model_parallel_size=8,
+            data_paths="/",
+            path_to_logs="/",
+        )
+
+        _, configs = generate_configs(runner)
+
+        mbs = [1, 1]
+        for run_name, config, mb in zip(configs.keys(), configs.values(), mbs):
+            assert config.data.micro_batch_size == mb
+            assert config.data.seq_length == 4096
+            assert config.data.global_batch_size == 2048
+
+        assert len(configs) == 2, f"{len(configs)} configurations were generated but 2 were expected."
+
+        auto_configs = get_auto_configs(configs)
+        assert auto_configs[0] == [
+            4,
+            1,
+            1,
+            1,
+            1,
+        ], f"[4, 1, 1, 1, 1] is expected configuration output but got {auto_configs[0]}."
+
+        assert auto_configs[1] == [
+            4,
+            2,
+            1,
+            1,
+            1,
+        ], f"[4, 2, 1, 1, 1] is expected configuration output but got {auto_configs[1]}."
+
+    def test_mixtral_model(self):
+        # Mixtral 8x22B
+        runner = AutoConfigurator(
+            model=run.Config(MixtralConfig8x22B),
+            num_nodes=16,
+            seq_length=4096,
+            global_batch_size=2048,
+            tensor_parallel_sizes=[4],
+            pipeline_parallel_sizes=[1],
+            micro_batch_sizes=[1],
+            context_parallel_sizes=[1],
+            expert_parallel_sizes=[1, 2],
+            min_model_parallel_size=4,
+            max_model_parallel_size=8,
+            data_paths="/",
+            path_to_logs="/",
+        )
+
+        _, configs = generate_configs(runner)
+
+        mbs = [1, 1]
+        for run_name, config, mb in zip(configs.keys(), configs.values(), mbs):
+            assert config.data.micro_batch_size == mb
+            assert config.data.seq_length == 4096
+            assert config.data.global_batch_size == 2048
+
+        assert len(configs) == 2, f"{len(configs)} configurations were generated but 2 were expected."
+
+        auto_configs = get_auto_configs(configs)
+        assert auto_configs[0] == [
+            4,
+            1,
+            1,
+            1,
+            1,
+        ], f"[4, 1, 1, 1, 1] is expected configuration output but got {auto_configs[0]}."
+
+        assert auto_configs[1] == [
+            4,
+            1,
+            1,
+            2,
+            1,
+        ], f"[4, 1, 1, 2, 1] is expected configuration output but got {auto_configs[1]}."
+
+    def test_gemma_model(self):
+        # Gemma 7B
+        runner = AutoConfigurator(
+            model=run.Config(GemmaConfig7B),
+            num_nodes=16,
+            seq_length=8192,
+            global_batch_size=2048,
+            tensor_parallel_sizes=[2],
+            pipeline_parallel_sizes=[2],
+            micro_batch_sizes=[1, 2],
+            context_parallel_sizes=[1],
+            expert_parallel_sizes=[1],
+            min_model_parallel_size=4,
+            max_model_parallel_size=8,
+            data_paths="/",
+            path_to_logs="/",
+        )
+
+        _, configs = generate_configs(runner)
+
+        mbs = [1, 2]
+        for run_name, config, mb in zip(configs.keys(), configs.values(), mbs):
+            assert config.data.micro_batch_size == mb
+            assert config.data.seq_length == 8192
+            assert config.data.global_batch_size == 2048
+
+        assert len(configs) == 2, f"{len(configs)} configurations were generated but 2 were expected."
+
+        auto_configs = get_auto_configs(configs)
+        assert auto_configs[0] == [
+            2,
+            2,
+            1,
+            1,
+            1,
+        ], f"[2, 2, 1, 1, 1] is expected configuration output but got {auto_configs[0]}."
+
+        assert auto_configs[1] == [
+            2,
+            2,
+            1,
+            1,
+            2,
+        ], f"[2, 2, 1, 1, 2] is expected configuration output but got {auto_configs[1]}."
+
+    def test_nemotron_model(self):
+        # Nemotron3 8B
+        runner = AutoConfigurator(
+            model=run.Config(Nemotron3Config8B),
+            num_nodes=16,
+            seq_length=4096,
+            global_batch_size=2048,
+            tensor_parallel_sizes=[1],
+            pipeline_parallel_sizes=[4],
+            micro_batch_sizes=[1, 2],
+            context_parallel_sizes=[1],
+            expert_parallel_sizes=[1],
+            min_model_parallel_size=4,
+            max_model_parallel_size=8,
+            data_paths="/",
+            path_to_logs="/",
+        )
+
+        _, configs = generate_configs(runner)
+
+        mbs = [1, 2]
+        for run_name, config, mb in zip(configs.keys(), configs.values(), mbs):
+            assert config.data.micro_batch_size == mb
+            assert config.data.seq_length == 4096
+            assert config.data.global_batch_size == 2048
+
+        assert len(configs) == 2, f"{len(configs)} configurations were generated but 2 were expected."
+
+        auto_configs = get_auto_configs(configs)
+        assert auto_configs[0] == [
+            1,
+            4,
+            1,
+            1,
+            1,
+        ], f"[1, 4, 1, 1, 1] is expected configuration output but got {auto_configs[0]}."
+
+        assert auto_configs[1] == [
+            1,
+            4,
+            1,
+            1,
+            2,
+        ], f"[1, 4, 1, 1, 2] is expected configuration output but got {auto_configs[1]}."
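
For readers orienting themselves in the auto-configurator patch above, a minimal end-to-end sketch of how the new API is driven (mirroring the tests) might look like the following; the data path, log directory, and node count below are illustrative placeholders rather than values taken from the patch:

    import nemo_run as run

    from nemo.collections.llm import GPTConfig126M
    from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs

    # Describe the candidate search space; "auto" lets the heuristics pick values.
    runner = AutoConfigurator(
        model=run.Config(GPTConfig126M),
        num_nodes=8,                                   # placeholder cluster size
        data_paths=["/data/my_corpus_text_document"],  # placeholder dataset prefix
        path_to_logs="/results/autoconf",              # placeholder log directory
        tensor_parallel_sizes=[1, 2],
        pipeline_parallel_sizes=[1],
        micro_batch_sizes=[1, 2],
        max_steps_per_run=50,
    )

    # generate_configs returns the base config plus a dict of Partial(pretrain, ...) runs.
    base_config, configs = generate_configs(runner)
    for name, partial in configs.items():
        print(name)  # one entry per valid (TP, PP, CP, EP, MBS) combination

Each entry wraps pretrain with the generated trainer, data, and optimizer settings, which is what the tests above assert on.
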
From f666682e28db5b66457b7e0485ea425e1a78bbeb Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Sun, 8 Sep 2024 08:52:50 -0700 Subject: [PATCH 120/664] fix mixtraltopk (#10366) Signed-off-by: Alexandros Koumparoulis Co-authored-by: Marc Romeyn --- nemo/collections/llm/gpt/model/mixtral.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index b0f40a2fc785..49c8f12f914d 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -59,7 +59,7 @@ class MixtralConfig(GPTConfig): moe_aux_loss_coeff: float = 0.01 moe_expert_capacity_factor: float = 1.0 moe_pad_expert_input_to_capacity: bool = True - moe_router_topk: int = 1 + moe_router_topk: int = 2 moe_router_pre_softmax: bool = True moe_token_dispatcher_type: str = "alltoall" @@ -104,7 +104,7 @@ class MixtralConfig8x7B(MixtralConfig): @dataclass class MixtralConfig8x22B(MixtralConfig): """ - Config for Mixtral-8x7B model + Config for Mixtral-8x22B model Official announcement: https://mistral.ai/news/mixtral-8x22b/ """ @@ -114,9 +114,6 @@ class MixtralConfig8x22B(MixtralConfig): ffn_hidden_size: int = 16384 max_position_embeddings: int = 4096 seq_length: int = 4096 - # MoE - num_moe_experts: int = 8 - moe_router_topk: int = 2 class MixtralModel(GPTModel): From e1f375ee54faee192405cf05a057dfb2cc8e2fec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 10:35:21 -0700 Subject: [PATCH 121/664] ci: Fix release tag (#10367) Signed-off-by: Oliver Koenig --- .github/workflows/release.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index af09fa241c59..30033a80e6c7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -34,11 +34,12 @@ jobs: PAYLOAD=$(jq \ -n \ -c \ + --arg TAG_NAME "v${VERSION}" \ --arg CI_COMMIT_BRANCH "${{ inputs.branch }}" \ --arg NAME "$NAME" \ --arg BODY "$CHANGELOG" \ '{ - "tag_name": $CI_COMMIT_BRANCH, + "tag_name": $TAG_NAME, "target_commitish": $CI_COMMIT_BRANCH, "name": $NAME, "body": $BODY, From a26ed2f4034f3bf10e08ac60f3558ea53b1ae710 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Sun, 8 Sep 2024 10:37:55 -0700 Subject: [PATCH 122/664] Akoumparouli/nemo ux tokenizer fix (#10351) * save tokenizer to disk Signed-off-by: Alexandros Koumparoulis * Track Hf tokenizer assets Signed-off-by: Alexandros Koumparoulis * raise exception if dst file exists Signed-off-by: Alexandros Koumparoulis * minor Signed-off-by: Alexandros Koumparoulis * remove print Signed-off-by: Alexandros Koumparoulis * add tokenizercontext Signed-off-by: Alexandros Koumparoulis * Add TokenizerContext Signed-off-by: Alexandros Koumparoulis * restore tokenizer from separate dir Signed-off-by: Alexandros Koumparoulis * update artifact __init__.py Signed-off-by: Alexandros Koumparoulis * TokenizerContext connector Signed-off-by: Alexandros Koumparoulis * bugix on_import_ckpt Signed-off-by: Alexandros Koumparoulis * rm code Signed-off-by: Alexandros Koumparoulis * Drop tokenizercontext Signed-off-by: Alexandros Koumparoulis * drop tokenizer load from tokenizercontext Signed-off-by: Alexandros Koumparoulis * undo Signed-off-by: Alexandros Koumparoulis * undo Signed-off-by: Alexandros Koumparoulis * Move to util function Signed-off-by: Alexandros 
Koumparoulis * use save_hf_tokenizer_assets Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * add tokenizer restoration in resume.py Signed-off-by: Alexandros Koumparoulis * bot fixes Signed-off-by: Alexandros Koumparoulis * rm Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * wrap tokenizer restoration in try/catch Signed-off-by: Alexandros Koumparoulis * load_artifacts Signed-off-by: Alexandros Koumparoulis * param fix Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * more fix Signed-off-by: Alexandros Koumparoulis * lazy import tensorboard Signed-off-by: Alexandros Koumparoulis * move code out of file context manager Signed-off-by: Alexandros Koumparoulis * Allow skippable artifacts Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * rebase fix Signed-off-by: Alexandros Koumparoulis * checkpoint structure change update Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- .../common/parts/perf_metrics_utils.py | 2 +- nemo/collections/llm/gpt/model/baichuan.py | 2 +- nemo/collections/llm/gpt/model/chatglm.py | 2 +- nemo/collections/llm/gpt/model/gemma.py | 2 +- nemo/collections/llm/gpt/model/llama.py | 2 +- nemo/collections/llm/gpt/model/mistral.py | 2 +- nemo/collections/llm/gpt/model/mixtral.py | 2 +- nemo/collections/llm/gpt/model/nemotron.py | 2 +- nemo/collections/llm/gpt/model/qwen2.py | 2 +- nemo/collections/llm/gpt/model/starcoder.py | 2 +- nemo/collections/llm/gpt/model/starcoder2.py | 2 +- nemo/collections/llm/tokenizer.py | 23 +++++++++- nemo/lightning/io/artifact/__init__.py | 4 +- nemo/lightning/io/artifact/base.py | 1 + nemo/lightning/io/artifact/file.py | 43 +++++++++++++++++- nemo/lightning/io/connector.py | 12 ++++- nemo/lightning/io/mixin.py | 44 +++++++++++-------- nemo/lightning/pytorch/strategies/utils.py | 2 + nemo/lightning/resume.py | 32 ++++++++++++++ 19 files changed, 147 insertions(+), 36 deletions(-) diff --git a/nemo/collections/common/parts/perf_metrics_utils.py b/nemo/collections/common/parts/perf_metrics_utils.py index 41273797e035..1633b1343340 100644 --- a/nemo/collections/common/parts/perf_metrics_utils.py +++ b/nemo/collections/common/parts/perf_metrics_utils.py @@ -2,7 +2,6 @@ import os from typing import List -from tensorboard.backend.event_processing import event_accumulator from nemo.utils import logging @@ -27,6 +26,7 @@ def read_tb_log(path: str, summary_name: str) -> List: Returns: summary_list: list, the values in the read summary list, formatted as a list. 
""" + from tensorboard.backend.event_processing import event_accumulator files = glob.glob(f"{path}/events*tfevents*") files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x))) diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py index b60c0430b8be..19a04a65a026 100644 --- a/nemo/collections/llm/gpt/model/baichuan.py +++ b/nemo/collections/llm/gpt/model/baichuan.py @@ -106,7 +106,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self), trust_remote_code=True) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self)), trust_remote_code=True) @property def config(self) -> Baichuan2Config: diff --git a/nemo/collections/llm/gpt/model/chatglm.py b/nemo/collections/llm/gpt/model/chatglm.py index 3b6453b2b891..162b42501d11 100644 --- a/nemo/collections/llm/gpt/model/chatglm.py +++ b/nemo/collections/llm/gpt/model/chatglm.py @@ -113,7 +113,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self), trust_remote_code=True) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self)), trust_remote_code=True) @property def config(self) -> ChatGLMConfig: diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index 753d75165197..e28d4409437b 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -134,7 +134,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> GemmaConfig: diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 2c76b2fdd976..59d697f2f6b7 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -251,7 +251,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> LlamaConfig: diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index 73e6a34fd7c2..a6415769112a 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -142,7 +142,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> MistralConfig7B: diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index 49c8f12f914d..bc255ae8fb87 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -168,7 +168,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return 
AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> MixtralConfig8x7B | MixtralConfig8x22B: diff --git a/nemo/collections/llm/gpt/model/nemotron.py b/nemo/collections/llm/gpt/model/nemotron.py index 44f10c0bee60..c8a8b5abee4b 100644 --- a/nemo/collections/llm/gpt/model/nemotron.py +++ b/nemo/collections/llm/gpt/model/nemotron.py @@ -173,7 +173,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> NemotronConfig: diff --git a/nemo/collections/llm/gpt/model/qwen2.py b/nemo/collections/llm/gpt/model/qwen2.py index 643bdda3ba8d..09ed910bac4c 100644 --- a/nemo/collections/llm/gpt/model/qwen2.py +++ b/nemo/collections/llm/gpt/model/qwen2.py @@ -141,7 +141,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self), trust_remote_code=True) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self)), trust_remote_code=True) @property def config(self) -> Qwen2Config: diff --git a/nemo/collections/llm/gpt/model/starcoder.py b/nemo/collections/llm/gpt/model/starcoder.py index 15deb0ba2191..7cfdec4bce29 100644 --- a/nemo/collections/llm/gpt/model/starcoder.py +++ b/nemo/collections/llm/gpt/model/starcoder.py @@ -120,7 +120,7 @@ def convert_state(self, source, target): @property def tokenizer(self) -> "AutoTokenizer": - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> StarcoderConfig: diff --git a/nemo/collections/llm/gpt/model/starcoder2.py b/nemo/collections/llm/gpt/model/starcoder2.py index c49af006c6f5..3256ccd9e248 100644 --- a/nemo/collections/llm/gpt/model/starcoder2.py +++ b/nemo/collections/llm/gpt/model/starcoder2.py @@ -144,7 +144,7 @@ def convert_state(self, source, target): @property def tokenizer(self) -> "AutoTokenizer": - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> Starcoder2Config: diff --git a/nemo/collections/llm/tokenizer.py b/nemo/collections/llm/tokenizer.py index 77320c4b9c02..ef8cc53db7e5 100644 --- a/nemo/collections/llm/tokenizer.py +++ b/nemo/collections/llm/tokenizer.py @@ -12,12 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo.lightning.io.artifact import FileArtifact +from nemo.lightning.io.artifact import DirOrStringArtifact, FileArtifact from nemo.lightning.io.mixin import track_io __all__ = [] + +def extract_name(cls): + return str(cls).split('.')[-1].rstrip('>').rstrip("'") + + try: + # Track HF tokenizers + from transformers import AutoTokenizer as HfAutoTokenizer + from transformers.models.llama.tokenization_llama import LlamaTokenizer + from transformers.models.llama.tokenization_llama_fast import LlamaTokenizerFast + + for cls in [HfAutoTokenizer, LlamaTokenizer, LlamaTokenizerFast]: + track_io( + cls, + artifacts=[ + FileArtifact(attr_name, required=False) + for attr_name in ['vocab_file', 'merges_file', 'tokenizer_file', 'name_or_path'] + ], + ) + __all__.append(extract_name(cls)) + from nemo.collections.common.tokenizers import AutoTokenizer track_io( @@ -25,6 +45,7 @@ artifacts=[ FileArtifact("vocab_file", required=False), FileArtifact("merges_file", required=False), + DirOrStringArtifact("pretrained_model_name", required=False), ], ) __all__.append("AutoTokenizer") diff --git a/nemo/lightning/io/artifact/__init__.py b/nemo/lightning/io/artifact/__init__.py index 572bd37c0be8..50f77f968a07 100644 --- a/nemo/lightning/io/artifact/__init__.py +++ b/nemo/lightning/io/artifact/__init__.py @@ -1,4 +1,4 @@ from nemo.lightning.io.artifact.base import Artifact -from nemo.lightning.io.artifact.file import FileArtifact, PathArtifact +from nemo.lightning.io.artifact.file import DirArtifact, DirOrStringArtifact, FileArtifact, PathArtifact -__all__ = ["Artifact", "FileArtifact", "PathArtifact"] +__all__ = ["Artifact", "FileArtifact", "PathArtifact", "DirArtifact", "DirOrStringArtifact"] diff --git a/nemo/lightning/io/artifact/base.py b/nemo/lightning/io/artifact/base.py index a997df42f843..ec451de9753b 100644 --- a/nemo/lightning/io/artifact/base.py +++ b/nemo/lightning/io/artifact/base.py @@ -9,6 +9,7 @@ class Artifact(ABC, Generic[ValueT]): def __init__(self, attr: str, required: bool = True): self.attr = attr self.required = required + self.skip = False @abstractmethod def dump(self, value: ValueT, absolute_dir: Path, relative_dir: Path) -> ValueT: diff --git a/nemo/lightning/io/artifact/file.py b/nemo/lightning/io/artifact/file.py index 76bd0c6003a6..12b94be81030 100644 --- a/nemo/lightning/io/artifact/file.py +++ b/nemo/lightning/io/artifact/file.py @@ -1,3 +1,4 @@ +import os import shutil from pathlib import Path from typing import Union @@ -23,8 +24,46 @@ def load(self, path: str) -> str: return path +def pathize(s): + if not isinstance(s, Path): + return Path(s) + return s + + def copy_file(src: Union[Path, str], path: Union[Path, str], relative_dst: Union[Path, str]): - relative_path = Path(relative_dst) / Path(src).name - output = Path(path) / relative_path + relative_path = pathize(relative_dst) / pathize(src).name + output = pathize(path) / relative_path + if output.exists(): + raise FileExistsError(f"Dst file already exists {str(output)}") shutil.copy2(src, output) return relative_path + + +class DirArtifact(Artifact[str]): + def dump(self, value: str, absolute_dir: Path, relative_dir: Path) -> str: + value = pathize(value) + absolute_dir = pathize(absolute_dir) + relative_dir = pathize(relative_dir) + if not value.is_dir(): + return value + + relative_dir = relative_dir / value.name + os.makedirs(str(absolute_dir / relative_dir), exist_ok=True) + for file in value.iterdir(): + copy_file(file, absolute_dir, relative_dir) + return str(relative_dir) + + def load(self, path: str) -> str: + 
return path + + +class DirOrStringArtifact(DirArtifact): + def dump(self, value: str, absolute_dir: Path, relative_dir: Path) -> str: + if not pathize(value).exists(): + # This is Artifact is just a string. + self.skip = True + return value + return super().dump(value, absolute_dir, relative_dir) + + def load(self, path: str) -> str: + return path diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 512f3bc4f12e..48222a4bd04d 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -237,5 +237,13 @@ def local_path(self, base_path: Optional[Path] = None) -> Path: def on_import_ckpt(self, model: pl.LightningModule): if hasattr(self, "tokenizer"): model.tokenizer = self.tokenizer - if hasattr(model, "__io__"): - model.__io__.tokenizer = self.tokenizer + if hasattr(model, "__io__") and hasattr(self.tokenizer, '__io__'): + model.__io__.tokenizer = self.tokenizer.__io__ + + def save_hf_tokenizer_assets(self, tokenizer_name_or_path, save_path="/tmp/nemo_tokenizer"): + from transformers import AutoTokenizer + + tok = AutoTokenizer.from_pretrained(tokenizer_name_or_path) + # Save tokenizer assets to save_path. + tok.save_pretrained(save_path) + return save_path diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index f8abc97dc7fc..36fb36bfcb34 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -520,6 +520,9 @@ def _io_path_elements_fn(x): def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "."): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): + # Allow optional artifacts + if artifact.skip: + continue current_val = getattr(cfg, artifact.attr) if current_val is None: if artifact.required: @@ -539,7 +542,12 @@ def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: P def _artifact_transform_load(cfg: fdl.Config, path: Path): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): + if artifact.skip: + continue current_val = getattr(cfg, artifact.attr) + # __init__ arguments can be None + if current_val is None: + continue ## replace local path with absolute one new_val = str(Path(path) / current_val) setattr(cfg, artifact.attr, new_val) @@ -589,31 +597,31 @@ def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] = ## add IO functionality to custom objects present in the json file with open(_path) as f: j = json.load(f) - for obj, val in j["objects"].items(): - clss = ".".join([val["type"]["module"], val["type"]["name"]]) - if subpath and "paths" in val: - if all(map(lambda p: subpath not in p, val["paths"])): - continue + for obj, val in j.get("objects", {}).items(): + clss = ".".join([val["type"]["module"], val["type"]["name"]]) + if subpath and "paths" in val: + if all(map(lambda p: subpath not in p, val["paths"])): + continue - if not serialization.find_node_traverser(locate(clss)): - track_io(locate(clss)) + if not serialization.find_node_traverser(locate(clss)): + track_io(locate(clss)) with open(_path, "rb") as f: json_config = json.loads(f.read()) - root_key = None - for obj, val in json_config["objects"].items(): - if "paths" in val and subpath in val["paths"]: - root_key = obj - break + root_key = None + for obj, val in json_config.get("objects", {}).items(): + if "paths" in val and subpath in val["paths"]: + root_key = obj + break - if subpath and not root_key: - logging.warning(f"Could not find {subpath} for {output_type} in {_path}") + if subpath and not root_key: + 
logging.warning(f"Could not find {subpath} for {output_type} in {_path}") - if root_key: - json_config["root"]["key"] = root_key + if root_key: + json_config["root"]["key"] = root_key - config = serialization.Deserialization(json_config).result - _artifact_transform_load(config, path) + config = serialization.Deserialization(json_config).result + _artifact_transform_load(config, path) return fdl.build(config) diff --git a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py index 64345a378257..a7f0e7339def 100644 --- a/nemo/lightning/pytorch/strategies/utils.py +++ b/nemo/lightning/pytorch/strategies/utils.py @@ -42,6 +42,8 @@ class RestoreConfig: adapter_path: Optional[str] = None load_model_state: bool = True load_optim_state: bool = False + # eg tokenizer, etc. + load_artifacts: bool = True def setup_parallel_ranks(strategy: pl.strategies.Strategy): diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py index c8cefb4dd8d3..cd889a190be9 100644 --- a/nemo/lightning/resume.py +++ b/nemo/lightning/resume.py @@ -34,6 +34,20 @@ BasePath = PosixPath +def _try_restore_tokenizer(model, ckpt_path): + from nemo.lightning.io import load_context + + try: + tokenizer = load_context(ckpt_path, "model.tokenizer") + model.tokenizer = tokenizer + model.__io__.tokenizer = tokenizer.__io__ + except: + # Ignore if the ckpt doesn't have a tokenizer. + pass + finally: + return model + + @dataclass(kw_only=True) class AutoResume: """Class that handles the logic for setting checkpoint paths and restoring from @@ -79,6 +93,11 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None): if trainer_ckpt_path: trainer.ckpt_path = trainer_ckpt_path trainer.checkpoint_callback.last_model_path = trainer_ckpt_path + # Load artifacts + if getattr(self.restore_config, 'load_artifacts', False): + context_path = self.get_context_path(model) + model = _try_restore_tokenizer(model, context_path) + elif self.restore_config: new_path = self._try_import_model( model=model, @@ -215,6 +234,19 @@ def _find_trainer_ckpt_path(self) -> Optional[Path]: return checkpoint + def get_context_path(self, model: Optional[io.ConnectorMixin] = None) -> Optional[Path]: + checkpoint = None + app_state = AppState() + app_state.restore = self.resume_if_exists + if self.resume_if_exists: + checkpoint = self._find_trainer_ckpt_path() + + if checkpoint: + maybe_model_weights_path = Path(checkpoint) / "context" + if os.path.isdir(maybe_model_weights_path): + checkpoint = maybe_model_weights_path + return checkpoint + def get_trainer_ckpt_path(self, model: Optional[io.ConnectorMixin] = None) -> Optional[Path]: checkpoint = None app_state = AppState() From dd63de120397f7334cb16aecc188a4366899e653 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Sun, 8 Sep 2024 10:49:52 -0700 Subject: [PATCH 123/664] Add option to resume from specific path in AutoResume (#10373) * Add option to resume from specific path in AutoResume Signed-off-by: Hemil Desai * Fix path Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai --- nemo/lightning/resume.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py index cd889a190be9..45b73213698d 100644 --- a/nemo/lightning/resume.py +++ b/nemo/lightning/resume.py @@ -57,7 +57,8 @@ class AutoResume: restore_config (Optional[RestoreConfig]): Optional config for selectively restoring specific parts like model weights, optimizer states, etc. 
If the config contains a path from HF or another non-NeMo checkpoint format, the checkpoint will be automatically converted to a NeMo compatible format. resume_from_folder or the run's log_dir takes precedence over restore_config. - resume_from_directory (str): Path to the checkpointing directory to restore from. Defaults to /checkpoints + resume_from_directory (str): Path to the checkpointing directory to restore from. + resume_from_path (str): Path to a specific checkpoint to restore from. adapter_path (str): Path to any adapter checkpoints. resume_if_exists (bool): Whether this experiment is resuming from a previous run. If True, it sets trainer._checkpoint_connector._ckpt_path so that the trainer should @@ -75,6 +76,7 @@ class AutoResume: restore_config: Optional[RestoreConfig] = None resume_from_directory: Optional[str] = None + resume_from_path: Optional[str] = None adapter_path: Optional[str] = None resume_if_exists: bool = False resume_past_end: bool = False @@ -248,6 +250,10 @@ def get_context_path(self, model: Optional[io.ConnectorMixin] = None) -> Optiona return checkpoint def get_trainer_ckpt_path(self, model: Optional[io.ConnectorMixin] = None) -> Optional[Path]: + if self.resume_from_path: + maybe_model_weights_path = self.get_model_weights_path(self.resume_from_path) + return maybe_model_weights_path if os.path.isdir(maybe_model_weights_path) else self.resume_from_path + checkpoint = None app_state = AppState() app_state.restore = self.resume_if_exists From 6f1c414074ce1b87278520a6865f770d1af6b161 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 10:59:08 -0700 Subject: [PATCH 124/664] ci: Cleanup of release-freeze automation (#10392) Signed-off-by: Oliver Koenig --- .github/workflows/release-freeze.yml | 31 ---------------------------- 1 file changed, 31 deletions(-) diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index f8d037271f36..eb27cbf9f1f8 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -27,15 +27,6 @@ jobs: fetch-depth: 0 fetch-tags: true ref: main - - - name: Get Previous tag - id: previous-tag - # git for-each-ref --sort=-creatordate --format '%(refname)' refs/tags ==> refs/tags/vX.Y.Z in descending order of date - # awk 'FNR == 2 {print substr($1, 11, length($1))}') ==> Selects the 2nd tag from the list, then strips the /refs/tags/ part of the tag - # set-output name=tag_name:: ==> Takes the clean tag vX.Y.Z and sets it to steps.previous_tag.outputs.tag_name - run: | - TAG=$(git for-each-ref --sort=-creatordate --format '%(refname)' refs/tags | awk 'FNR == 2 {print substr($1, 11, length($1))}') - echo "tag-name=$TAG" >> "$GITHUB_OUTPUT" - name: Get release branch ref id: release-branch @@ -56,28 +47,6 @@ jobs: cd ${{ github.run_id }} sed -i 's/^ARG MCORE_TAG=.*$/ARG MCORE_TAG=${{ inputs.mcore_version }}/' Dockerfile.ci - - name: Build Changelog - id: build-changelog - uses: mikepenz/release-changelog-builder-action@v3.3.1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - # Configuration file is setup with filters for domains - # owner:repo must point to current repo - # fromTag: Auto resolved from historical tag order (previous tag compared to current tag) - # toTag: Current tag reference - configuration: ".github/workflows/config/changelog-config.json" - owner: ${{ github.repository_owner }} - repo: ${{ github.event.repository.name }} - ignorePreReleases: "false" - failOnError: "false" - fromTag: ${{ 
steps.previous-tag.outputs.tag-name }} - toTag: main - - - name: Append Changelog - run: | - echo "${{ steps.build-changelog.outputs.changelog }}" - - name: Create Release PR uses: peter-evans/create-pull-request@v6 id: create-pull-request From ab82b56a0779c02c21a23bc938e515b0fd7ebace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 11:03:42 -0700 Subject: [PATCH 125/664] ci: Toggle pre-release (#10394) * ci: Toggle pre-release Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/workflows/release-freeze.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index eb27cbf9f1f8..4151f8bf7de1 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -7,6 +7,11 @@ on: description: 'MAJOR.MINOR.PATCH[rcN] (Example: 2.0.0rc1, or 2.1.0)' required: true type: string + is_prelease: + description: Whether to keep and bump the pre-release label + required: false + default: false + type: boolean mcore_version: description: 'Version of MCore to use (must be a valid git ref)' required: true @@ -34,7 +39,12 @@ jobs: cd ${{ github.run_id }} VERSION=$(python -c 'import nemo; print(nemo.__version__)') - echo "Release version r$VERSION" > version + + if [[ "${{ inputs.is_prelease }}" == "false" ]]; then + sed -i '/^PRE_RELEASE/c\PRE_RELEASE = '\''' nemo/package_info.py + fi + + echo "Release version r$VERSION" > version echo "version=$VERSION" >> "$GITHUB_OUTPUT" - name: Pin branch name in Notebooks From bcf7e0f8109f17344d297b9925b55fddb02dad5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 11:08:21 -0700 Subject: [PATCH 126/664] ci: Toggle pre-release (#10395) Signed-off-by: Oliver Koenig --- .github/workflows/release-freeze.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index 4151f8bf7de1..547ff6b047a8 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -37,13 +37,13 @@ jobs: id: release-branch run: | cd ${{ github.run_id }} - - VERSION=$(python -c 'import nemo; print(nemo.__version__)') - + if [[ "${{ inputs.is_prelease }}" == "false" ]]; then sed -i '/^PRE_RELEASE/c\PRE_RELEASE = '\''' nemo/package_info.py fi + VERSION=$(python -c 'import nemo; print(nemo.__version__)') + echo "Release version r$VERSION" > version echo "version=$VERSION" >> "$GITHUB_OUTPUT" From 21cb9491eaca063f1e5120e41d2d057d2833ca4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 11:11:42 -0700 Subject: [PATCH 127/664] ci: Toggle pre-release (#10396) Signed-off-by: Oliver Koenig --- .github/workflows/release-freeze.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index 547ff6b047a8..ee4889dbe4a8 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -39,7 +39,7 @@ jobs: cd ${{ github.run_id }} if [[ "${{ inputs.is_prelease }}" == "false" ]]; then - sed -i '/^PRE_RELEASE/c\PRE_RELEASE = '\''' nemo/package_info.py + sed "/^PRE_RELEASE/c\PRE_RELEASE = ''" nemo/package_info.py fi VERSION=$(python -c 'import nemo; print(nemo.__version__)') From 30385aa6abd29c2636d4adcd2416cccc6467c9f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= 
Date: Sun, 8 Sep 2024 11:15:57 -0700 Subject: [PATCH 128/664] ci: Automate pre-release (#10397) Signed-off-by: Oliver Koenig --- .github/workflows/release-freeze.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index ee4889dbe4a8..90a72c205b2b 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -39,7 +39,7 @@ jobs: cd ${{ github.run_id }} if [[ "${{ inputs.is_prelease }}" == "false" ]]; then - sed "/^PRE_RELEASE/c\PRE_RELEASE = ''" nemo/package_info.py + sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" nemo/package_info.py fi VERSION=$(python -c 'import nemo; print(nemo.__version__)') From 2404c4e9cce0c77e06b6dc6f8f191932db33f6b9 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Sun, 8 Sep 2024 11:38:11 -0700 Subject: [PATCH 129/664] Akoumparouli/nemo ux validate dataset asset accessibility (#10309) * Add validate_dataset_asset_accessibility Signed-off-by: Alexandros Koumparoulis * Add CI tests for validate_dataset_asset_accessibility Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * fix Signed-off-by: Alexandros Koumparoulis * fix for zipped lists Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * fix Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/collections/llm/gpt/data/pre_training.py | 63 +++++++++++++++++++ .../llm/gpt/data/test_pre_training_data.py | 34 ++++++++++ 2 files changed, 97 insertions(+) diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py index ccb2d21729ed..534922efe3a3 100644 --- a/nemo/collections/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -13,6 +13,7 @@ # limitations under the License. import logging +import os import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional @@ -34,6 +35,66 @@ from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec +def is_number_tryexcept(s): + """Returns True if string is a number.""" + if s is None: + return False + try: + float(s) + return True + except ValueError: + return False + + +def is_zipped_list(paths): + # ["30", "path/to/dataset_1_prefix", "70", "path/to/dataset_2_prefix"] + even = paths[::2] + if len(even) == 0: + return False + is_num = list(map(is_number_tryexcept, even)) + if any(is_num): + assert all(is_num), "Got malformatted zipped list" + return is_num[0] + + +def validate_dataset_asset_accessibility(paths): + if paths is None: + raise ValueError("Expected path to have a value.") + + if isinstance(paths, tuple) or isinstance(paths, list): + if is_zipped_list(paths): + # remove weights from paths. + paths = paths[1::2] + for p in paths: + validate_dataset_asset_accessibility(p) + return + elif isinstance(paths, dict): + for p in paths.values(): + validate_dataset_asset_accessibility(p) + return + + if not isinstance(paths, str) and not isisntance(paths, Path): + raise ValueError("Expected path to be of string or Path type.") + + path = Path(paths) + suffices = ('.bin', '.idx') + if path.is_dir(): + if not os.access(path, os.R_OK): + raise PermissionError(f"Expected {str(path)} to be readable.") + # Will let the downstream class confirm contents are ok. 
+ return + if path.exists(): + if not os.access(path, os.R_OK): + raise PermissionError(f"Expected {str(path)} to be readable.") + return + for suffix in suffices: + file_path = Path(str(path) + suffix) + if not file_path.exists(): + raise FileNotFoundError(f"Expected {str(file_path)} to exist.") + if not os.access(file_path, os.R_OK): + raise PermissionError(f"Expected {str(file_path)} to be readable.") + + class PreTrainingDataModule(pl.LightningDataModule, IOMixin): """PyTorch Lightning-compatible data module for pre-training GPT-style models. @@ -100,6 +161,8 @@ def __init__( from megatron.core.datasets.utils import get_blend_from_list + validate_dataset_asset_accessibility(paths) + build_kwargs = {} if isinstance(paths, dict): if split is not None: diff --git a/tests/collections/llm/gpt/data/test_pre_training_data.py b/tests/collections/llm/gpt/data/test_pre_training_data.py index 31a7b51cdf53..24dacc7bf33c 100644 --- a/tests/collections/llm/gpt/data/test_pre_training_data.py +++ b/tests/collections/llm/gpt/data/test_pre_training_data.py @@ -78,3 +78,37 @@ def test_multiple_data_distributions(tokenizer, trainer): ## this should succeed data.setup(stage="dummy") + + +def test_validate_dataset_asset_accessibility_file_does_not_exist(tokenizer, trainer): + raised_exception = False + try: + data = PreTrainingDataModule( + paths=["/this/path/should/not/exist/"], + seq_length=512, + micro_batch_size=2, + global_batch_size=2, + tokenizer=tokenizer, + ) + data.trainer = trainer + except FileNotFoundError: + raised_exception = True + + assert raised_exception == True, "Expected to raise a FileNotFoundError" + + +def test_validate_dataset_asset_accessibility_file_is_none(tokenizer, trainer): + raised_exception = False + try: + data = PreTrainingDataModule( + paths=None, + seq_length=512, + micro_batch_size=2, + global_batch_size=2, + tokenizer=tokenizer, + ) + data.trainer = trainer + except ValueError: + raised_exception = True + + assert raised_exception == True, "Expected to raise a ValueError" From 9921e6cf197b67604a87bb25bedd0d80ec49fb1b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 8 Sep 2024 12:08:01 -0700 Subject: [PATCH 130/664] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let'?= =?UTF-8?q?s=20bump=20NeMo=20`2.1.0rc0`=20!=20(#10399)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: ko3n1g <16716991+ko3n1g@users.noreply.github.com> --- nemo/package_info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/package_info.py b/nemo/package_info.py index 1cd6ef729936..a60316270d57 100644 --- a/nemo/package_info.py +++ b/nemo/package_info.py @@ -14,9 +14,9 @@ MAJOR = 2 -MINOR = 0 +MINOR = 1 PATCH = 0 -PRE_RELEASE = 'rc2' +PRE_RELEASE = 'rc0' # Use the following formatting: (major, minor, patch, pre-release) VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) From f6cd74bae09ba4fbb6fd1e7f38bd9558156523a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 12:13:39 -0700 Subject: [PATCH 131/664] ci: Update baseline (#10400) Signed-off-by: Oliver Koenig --- .github/workflows/config/.secrets.baseline | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/config/.secrets.baseline b/.github/workflows/config/.secrets.baseline index 2bf4e372565c..4a56aaad3c58 100644 --- 
a/.github/workflows/config/.secrets.baseline +++ b/.github/workflows/config/.secrets.baseline @@ -123,6 +123,15 @@ } ], "results": { + ".github/workflows/cicd-main.yml": [ + { + "type": "Base64 High Entropy String", + "filename": ".github/workflows/cicd-main.yml", + "hashed_secret": "593951c440200143335452427205ae7c8580d463", + "is_verified": false, + "line_number": 1503 + } + ], "docs/source/nlp/question_answering.rst": [ { "type": "Hex High Entropy String", @@ -2074,5 +2083,5 @@ } ] }, - "generated_at": "2024-09-04T00:45:39Z" + "generated_at": "2024-09-08T19:00:15Z" } From 94c5fd8346ce547c8873e2d4f0b0df4d7cf8dd17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 12:34:59 -0700 Subject: [PATCH 132/664] ci(chore): Minor change (#10401) Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index dd74e050a533..cfb45bca0a52 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -19,6 +19,7 @@ on: - 'main' - 'r**' types: [ labeled ] + workflow_dispatch: inputs: test_to_run: From 41502ffcfdbe6e70a4d1658eb326e07465c19ee2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 12:35:51 -0700 Subject: [PATCH 133/664] ci: Swap merge/cherry-pick order (#10389) Signed-off-by: Oliver Koenig --- .../workflows/cherry-pick-release-commit.yml | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 3c82269cb9a6..1b26dd543a5b 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -3,22 +3,43 @@ name: Create PR to main with cherry-pick from release on: pull_request_target: branches: - - 'r*.*.*' + - 'main' types: ["closed"] - + jobs: + analyse-labels: + runs-on: ubuntu-latest + outputs: + branches: ${{ steps.main.outputs.versions }} + steps: + - name: main + id: main + run: | + labels='${{ toJSON(github.event.pull_request.labels.*.name) }}' + versions=$(echo "$labels" | grep -oE '[0-9]+\.[0-9]+(\.[0-9]+)?([a-zA-Z]+[0-9]*)?') + + versions=$(jq -ncR '[inputs]' <<< "$versions") + + echo "versions=$versions" | tee -a "$GITHUB_OUTPUT" + cherry-pick-release-commit: name: Cherry-pick release commit runs-on: ubuntu-latest + needs: analyse-labels + strategy: + matrix: + branch: ${{ fromJSON(needs.analyse-labels.outputs.branches) }} steps: + - name: Checkout uses: actions/checkout@v3 with: fetch-depth: 0 + - name: github-cherry-pick-action v1.0.3 uses: carloscastrojumo/github-cherry-pick-action@bb0869df47c27be4ae4c7a2d93d22827aa5a0054 with: - branch: main + branch: ${{ matrix.branch }} labels: | cherry-pick reviewers: | From 19382ebf6dad2b85caca3215cb9d649de9525f51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 12:41:50 -0700 Subject: [PATCH 134/664] ci: Fix release tag (#10402) Signed-off-by: Oliver Koenig From 73a8ef8a5bddd06bbcc279ff9f533b5a96dda61b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 12:44:33 -0700 Subject: [PATCH 135/664] Ko3n1g/ci/fix release workflow 2 (#10403) * ci: Improve release workflow Signed-off-by: Oliver Koenig * ci: Fix cherry-picking Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .../workflows/cherry-pick-release-commit.yml | 2 +- .github/workflows/release-freeze.yml | 23 
+++++-------------- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 1b26dd543a5b..15f6a2fd6890 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -39,7 +39,7 @@ jobs: - name: github-cherry-pick-action v1.0.3 uses: carloscastrojumo/github-cherry-pick-action@bb0869df47c27be4ae4c7a2d93d22827aa5a0054 with: - branch: ${{ matrix.branch }} + branch: r${{ matrix.branch }} labels: | cherry-pick reviewers: | diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index 90a72c205b2b..7f8cd3dad8f5 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -32,6 +32,7 @@ jobs: fetch-depth: 0 fetch-tags: true ref: main + token: ${{ secrets.PAT }} - name: Get release branch ref id: release-branch @@ -47,6 +48,9 @@ jobs: echo "Release version r$VERSION" > version echo "version=$VERSION" >> "$GITHUB_OUTPUT" + git switch --force-create r$VERSION origin/main + git push -u origin r$VERSION --force + - name: Pin branch name in Notebooks run: | cd ${{ github.run_id }} @@ -62,7 +66,8 @@ jobs: id: create-pull-request with: path: ${{ github.run_id }} - branch: r${{ steps.release-branch.outputs.version }} + base: r${{ steps.release-branch.outputs.version }} + branch: ci/release-r${{ steps.release-branch.outputs.version }} title: 'Release `${{ steps.release-branch.outputs.version }}`' body: | 🚀 PR to release NeMo `${{ steps.release-branch.outputs.version }}`. @@ -80,22 +85,6 @@ jobs: assignees: okoenig labels: 'Run CICD' - - name: Add Summary comment - uses: peter-evans/create-or-update-comment@v4 - with: - issue-number: ${{ steps.create-pull-request.outputs.pull-request-number }} - body: | - # Highlights - __ - - - name: Add Changelog comment - uses: peter-evans/create-or-update-comment@v4 - with: - issue-number: ${{ steps.create-pull-request.outputs.pull-request-number }} - body: | - # Detailed Changelogs - ${{ steps.build-changelog.outputs.changelog }} - bump-next-version: runs-on: ubuntu-latest needs: [create-release-branch] From a4f95f11a27af5fcaa3ba89912a24174c77cbc21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 12:49:48 -0700 Subject: [PATCH 136/664] ci: Send Slack alert on failed cherry pick (#10404) Signed-off-by: Oliver Koenig --- .../workflows/cherry-pick-release-commit.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 15f6a2fd6890..5bdae8da250a 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -44,6 +44,25 @@ jobs: cherry-pick reviewers: | ${{ github.event.pull_request.user.login }} + + - name: Send Slack message on failure + if: failure() + run: | + URL=https://github.com/NVIDIA/NeMo/pull/${{ github.event.number }} + + MESSAGE='{ + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|#'${{ github.event.number }}'> failed" + } + } + ] + }' + + curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_WEBHOOK }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file From 0e5e5d5282ed9cd2bc887ed23a961cb7fdea6516 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 
2024 12:58:05 -0700 Subject: [PATCH 137/664] ci: Allow concurrent docker system prune (#10405) Signed-off-by: Oliver Koenig --- .github/workflows/_test_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 8c61c767b4f1..3e2d63285ec4 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -43,7 +43,7 @@ jobs: steps: - name: Docker system cleanup run: | - docker system prune -a --filter "until=48h" --force + docker system prune -a --filter "until=48h" --force || true - name: Docker pull image run: | From 46e908e82faff6d8987e8333092888d074555f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 13:00:01 -0700 Subject: [PATCH 138/664] ci: Use PAT for cherry-picking (#10406) Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 5bdae8da250a..c92e7b05c149 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -26,6 +26,8 @@ jobs: name: Cherry-pick release commit runs-on: ubuntu-latest needs: analyse-labels + environment: + name: main strategy: matrix: branch: ${{ fromJSON(needs.analyse-labels.outputs.branches) }} @@ -35,6 +37,7 @@ jobs: uses: actions/checkout@v3 with: fetch-depth: 0 + token: ${{ secrets.PAT }} - name: github-cherry-pick-action v1.0.3 uses: carloscastrojumo/github-cherry-pick-action@bb0869df47c27be4ae4c7a2d93d22827aa5a0054 From 9f9bf4d7926155e867f04a467f0895bd14a48f20 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Sun, 8 Sep 2024 16:37:16 -0500 Subject: [PATCH 139/664] Alit/mamba ux cicd (#10370) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add mamba init * more ssm * add 370m * add hybrid * fix issue * integrate model and tokenizer config for ssm * add all mamba configs * modify state re pattern * revert gpt stuff * remove SSM class and training script * Apply isort and black reformatting Signed-off-by: JRD971000 * remove faulty export * add script to test * Apply isort and black reformatting Signed-off-by: JRD971000 * some recent fixes * Apply isort and black reformatting Signed-off-by: JRD971000 * test script tp/pp1 * Apply isort and black reformatting Signed-off-by: JRD971000 * add cicd * include MLM mamba dist ckpt commit * add license head and address more comments * Apply isort and black reformatting Signed-off-by: JRD971000 * add guard * remove guard from TransformerConfig * update scripts * Apply isort and black reformatting Signed-off-by: JRD971000 --------- Signed-off-by: JRD971000 Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: Ali Taghibakhshi Co-authored-by: JRD971000 Co-authored-by: oliver könig --- .github/workflows/cicd-main.yml | 36 +++++ Dockerfile.ci | 4 +- .../llm/gpt/model/megatron_ssm_finetuning.py | 124 +++++++++++++++++ .../llm/gpt/model/megatron_ssm_pretraining.py | 129 ++++++++++++++++++ 4 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 tests/collections/llm/gpt/model/megatron_ssm_finetuning.py create mode 100644 tests/collections/llm/gpt/model/megatron_ssm_pretraining.py diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 
cfb45bca0a52..0945ac36bc7b 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4966,6 +4966,40 @@ jobs: AFTER_SCRIPT: | rm -rf examples/llm/gpt_pretrain_results rm -rf examples/llm/gpt_index_mappings + + L2_NeMo_2_SSM_Pretraining: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt/model/megatron_ssm_pretraining.py \ + --devices 1 \ + --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain \ + --max-steps 10 \ + --data-path /home/TestData/nlp/megatron_mamba/toy_ssm_dataset/legal_pile_text_document + + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain + + L2_NeMo_2_SSM_Finetuning: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt/model/megatron_ssm_finetuning.py \ + --devices 1 \ + --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft \ + --max-steps 10 \ + --model-path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt + + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft Nemo_CICD_Test: needs: @@ -5084,6 +5118,8 @@ jobs: #- OPTIONAL_L2_Stable_Diffusion_Training - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - L2_NeMo_2_GPT_DDP_Param_Parity_check + - L2_NeMo_2_SSM_Pretraining + - L2_NeMo_2_SSM_Finetuning if: always() runs-on: ubuntu-latest steps: diff --git a/Dockerfile.ci b/Dockerfile.ci index 33490a6d9079..51ad4663131e 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -38,7 +38,9 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMU_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.15.0 -ARG MCORE_TAG=3396356ab4ca83cc4c4d3272530b142a1702606e + +ARG MCORE_TAG=01945b98d1ea3a2acb5e8301e181a328104f4856 + ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ diff --git a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py new file mode 100644 index 000000000000..4b748c298105 --- /dev/null +++ b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py @@ -0,0 +1,124 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +## NOTE: This script is present for github-actions testing only. +## There are no guarantees that this script is up-to-date with latest NeMo. 
+ +import argparse + +import torch +from megatron.core.optimizer import OptimizerConfig + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import _setup +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.lightning import NeMoLogger +from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule + + +def get_args(): + parser = argparse.ArgumentParser(description='Train a small GPT model using NeMo 2.0') + parser.add_argument('--devices', type=int, help="Number of devices to use for training") + parser.add_argument('--max-steps', type=int, help="Number of steps to train for") + parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to") + parser.add_argument('--model-path', type=str, help="Path to model checkpoint") + parser.add_argument( + '--tokenizer-model-path', type=str, default=None, help="Path to tokenizer model, defaults to None" + ) + return parser.parse_args() + + +if __name__ == "__main__": + + args = get_args() + + # Checkpoint callback setup + checkpoint_callback = nl.ModelCheckpoint( + every_n_train_steps=10, + dirpath=args.experiment_dir, + ) + + trainer = nl.Trainer( + devices=args.devices, + max_steps=args.max_steps, + accelerator="gpu", + strategy=nl.MegatronStrategy( + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + tensor_model_parallel_size=1, + ), + plugins=nl.MegatronMixedPrecision( + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + log_every_n_steps=1, + limit_val_batches=5, + val_check_interval=10, + num_sanity_val_steps=0, + ) + + opt_config = OptimizerConfig( + optimizer='adam', + lr=1e-5, + min_lr=1e-5, + use_distributed_optimizer=False, + clip_grad=1.0, + bf16=True, + ) + + optim = MegatronOptimizerModule(config=opt_config) + model_config = llm.BaseMambaConfig130M() + model_config.tokenizer_model_path = args.tokenizer_model_path + + tokenizer = get_nmt_tokenizer( + library=model_config.tokenizer_library, + model_name=model_config.tokenizer_name, + tokenizer_model=model_config.tokenizer_model_path, + use_fast=True, + ) + + model = llm.GPTModel(model_config, optim=optim, tokenizer=tokenizer) + + ckpt_path = model.import_ckpt( + path="pytorch://" + args.model_path, + model_config=model_config, + ) + + nemo_logger = NeMoLogger( + dir=args.experiment_dir, + ) + + data = llm.SquadDataModule( + seq_length=512, + micro_batch_size=2, + global_batch_size=4, + tokenizer=model.tokenizer, + num_workers=0, + pad_to_max_length=True, + ) + + app_state = _setup( + model=model, + data=data, + resume=None, + trainer=trainer, + log=nemo_logger, + optim=optim, + tokenizer=tokenizer, + model_transform=None, + ) + + trainer.fit(model, data, ckpt_path=ckpt_path) diff --git a/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py b/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py new file mode 100644 index 000000000000..52daa21c2279 --- /dev/null +++ b/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py @@ -0,0 +1,129 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +## NOTE: This script is present for github-actions testing only. +## There are no guarantees that this script is up-to-date with latest NeMo. + +import argparse +import torch +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.loggers import TensorBoardLogger +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import train +from nemo.collections.llm.gpt.data import PreTrainingDataModule +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.lightning import NeMoLogger +from nemo.lightning.pytorch.callbacks import ModelCheckpoint +from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule + + +def get_args(): + parser = argparse.ArgumentParser(description='Train a Mamba model using NeMo 2.0') + parser.add_argument('--devices', type=int, help="Number of devices to use for training") + parser.add_argument('--max-steps', type=int, help="Number of steps to train for") + parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to") + parser.add_argument('--data-path', type=str, help="Path to data file") + parser.add_argument('--tokenizer-path', type=str, default=None, help="Path to tokenizer model") + + return parser.parse_args() + + +if __name__ == '__main__': + + args = get_args() + + seq_length = 512 + + tokenizer = get_nmt_tokenizer( + "huggingface", + "EleutherAI/gpt-neox-20b", + tokenizer_model=None, + use_fast=True, + ) + data = PreTrainingDataModule( + paths=args.data_path, + seq_length=seq_length, + micro_batch_size=2, + global_batch_size=16, + seed=1234, + tokenizer=tokenizer, + ) + ssm_config = llm.SSMConfig( + hybrid_override_pattern="M-M*", + num_layers=4, + hidden_size=1024, + ffn_hidden_size=1024, + num_attention_heads=4, + seq_length=seq_length, + init_method_std=0.02, + hidden_dropout=0.0, + attention_dropout=0.0, + layernorm_epsilon=1e-5, + make_vocab_size_divisible_by=16, + ) + model = llm.GPTModel(ssm_config, tokenizer=data.tokenizer) + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + ) + checkpoint_callback = ModelCheckpoint( + every_n_train_steps=10, + dirpath=args.experiment_dir, + ) + callbacks = [checkpoint_callback] + + loggers = [] + tensorboard_logger = TensorBoardLogger( + save_dir='dummy', ## NOTE: this gets overwritten by default + ) + loggers.append(tensorboard_logger) + + opt_config = OptimizerConfig( + optimizer='adam', + lr=6e-4, + min_lr=6e-5, + clip_grad=1.0, + use_distributed_optimizer=False, + bf16=True, + ) + opt = MegatronOptimizerModule(config=opt_config) + + trainer = nl.Trainer( + devices=args.devices, + max_steps=args.max_steps, + accelerator="gpu", + strategy=strategy, + logger=loggers, + callbacks=callbacks, + log_every_n_steps=1, + limit_val_batches=2, + plugins=nl.MegatronMixedPrecision( + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + ) + + nemo_logger = NeMoLogger( + dir=args.experiment_dir, + ) + + train( + model=model, + data=data, + trainer=trainer, + log=nemo_logger, + tokenizer='data', + optim=opt, + 
) From a95f3a237e3e84ddec0e9a5d553a059b060ba25a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 14:45:13 -0700 Subject: [PATCH 140/664] ci: Allow default token to write workflows (#10407) Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index c92e7b05c149..a2da00788d10 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -26,8 +26,8 @@ jobs: name: Cherry-pick release commit runs-on: ubuntu-latest needs: analyse-labels - environment: - name: main + permissions: + actions: write strategy: matrix: branch: ${{ fromJSON(needs.analyse-labels.outputs.branches) }} @@ -37,7 +37,6 @@ jobs: uses: actions/checkout@v3 with: fetch-depth: 0 - token: ${{ secrets.PAT }} - name: github-cherry-pick-action v1.0.3 uses: carloscastrojumo/github-cherry-pick-action@bb0869df47c27be4ae4c7a2d93d22827aa5a0054 From 4bf8101b07a3d73b5489b8eeeb5902261fd4be54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 14:51:25 -0700 Subject: [PATCH 141/664] ci: More permissions for cherry-pick automation (#10409) Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index a2da00788d10..e9af30de0f6c 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -28,6 +28,8 @@ jobs: needs: analyse-labels permissions: actions: write + pull-requests: write + contents: write strategy: matrix: branch: ${{ fromJSON(needs.analyse-labels.outputs.branches) }} From 0a4066295d4a069b4e196a5d97211059a9e18e5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 15:15:39 -0700 Subject: [PATCH 142/664] ci: Overhaul cherry-pick workflow (#10410) Signed-off-by: Oliver Koenig --- .../workflows/cherry-pick-release-commit.yml | 143 +++++++++++------- 1 file changed, 87 insertions(+), 56 deletions(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index e9af30de0f6c..c1d136c6eddd 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -1,72 +1,103 @@ name: Create PR to main with cherry-pick from release on: - pull_request_target: + push: branches: - - 'main' - types: ["closed"] - -jobs: - analyse-labels: - runs-on: ubuntu-latest - outputs: - branches: ${{ steps.main.outputs.versions }} - steps: - - name: main - id: main - run: | - labels='${{ toJSON(github.event.pull_request.labels.*.name) }}' - versions=$(echo "$labels" | grep -oE '[0-9]+\.[0-9]+(\.[0-9]+)?([a-zA-Z]+[0-9]*)?') - - versions=$(jq -ncR '[inputs]' <<< "$versions") - - echo "versions=$versions" | tee -a "$GITHUB_OUTPUT" + - main - cherry-pick-release-commit: - name: Cherry-pick release commit +jobs: + main: runs-on: ubuntu-latest - needs: analyse-labels - permissions: - actions: write - pull-requests: write - contents: write - strategy: - matrix: - branch: ${{ fromJSON(needs.analyse-labels.outputs.branches) }} + environment: + name: main steps: - - name: Checkout uses: actions/checkout@v3 with: fetch-depth: 0 + token: ${{ secrets.PAT }} - - name: 
github-cherry-pick-action v1.0.3 - uses: carloscastrojumo/github-cherry-pick-action@bb0869df47c27be4ae4c7a2d93d22827aa5a0054 - with: - branch: r${{ matrix.branch }} - labels: | - cherry-pick - reviewers: | - ${{ github.event.pull_request.user.login }} - - - name: Send Slack message on failure - if: failure() + + - name: Cherry pick + env: + GH_TOKEN: ${{ secrets.PAT }} run: | - URL=https://github.com/NVIDIA/NeMo/pull/${{ github.event.number }} - - MESSAGE='{ - "blocks": [ - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|#'${{ github.event.number }}'> failed" - } - } - ] - }' - - curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_WEBHOOK }} + SHA=$(git rev-list --no-merges -n 1 HEAD) + MESSAGE=$(git log -n 1 --pretty=format:%s $SHA) + PR_ID=$(echo $MESSAGE | awk -F'#' '{print $2}' | awk -F')' '{print $1}' ) + + PR=$(curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GH_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/NVIDIA/NeMo/pulls/$PR_ID) + + LABELS=$(echo -E $PR | jq '.labels | [.[].name] | join(",")' | tr -d '"') + + TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'r[^,]*') + + if [[ $TARGET_BRANCHES == '' ]]; then + echo Nothing to cherry-pick + exit 0 + fi + + echo $TARGET_BRANCHES | while read -r RELEASE_BRANCH ; do + TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$RELEASE_BRANCH)" != "" ]] && echo true || echo false) + + if [[ "$TARGET_BRANCH_EXISTS_OK" == "false" ]]; then + echo Release branch does not yet exist, will not cherry-pick + continue + fi + + ( + git fetch origin $RELEASE_BRANCH:$RELEASE_BRANCH + git switch --force-create cherry-pick-$MR_ID-$RELEASE_BRANCH $RELEASE_BRANCH + git cherry-pick $SHA + git push -u origin --force cherry-pick-$MR_ID-$RELEASE_BRANCH + git checkout ${CI_DEFAULT_BRANCH:-main} + ) + + CHERRYPICK_SUCCESSFUL=$? 
+ + if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then + curl \ + --header "PRIVATE-TOKEN: $PROJECT_ACCESS_TOKEN_MCORE" \ + --url https://${GITLAB_ENDPOINT}/api/v4/projects/141257/merge_requests \ + -d "source_branch=cherry-pick-$MR_ID-$RELEASE_BRANCH" \ + -d "target_branch=$RELEASE_BRANCH" \ + -d "title=Cherry-pick $PR_ID into $RELEASE_BRANCH" \ + -d "labels=cherry-pick" + + curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GH_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/NVIDIA/NeMo/pulls \ + -d '{"title":"Cherry-pick '$PR_ID' into '$RELEASE_BRANCH'","head":"'cherry-pick-$PR_ID-$RELEASE_BRANCH'","base":"'$RELEASE_BRANCH'"}' + + else + URL=https://github.com/NVIDIA/NeMo/pull/${{ github.event.number }} + + MESSAGE='{ + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|#'${{ github.event.number }}'> failed" + } + } + ] + }' + + curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_WEBHOOK }} + + fi + + done + + env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file From 0d0e724883ed3514d92413e56403b7f594cc0e84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 15:17:53 -0700 Subject: [PATCH 143/664] ci: Ignore failures on cherry-picking (#10411) Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index c1d136c6eddd..c8b21e2919c3 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -22,6 +22,8 @@ jobs: env: GH_TOKEN: ${{ secrets.PAT }} run: | + set -x + SHA=$(git rev-list --no-merges -n 1 HEAD) MESSAGE=$(git log -n 1 --pretty=format:%s $SHA) PR_ID=$(echo $MESSAGE | awk -F'#' '{print $2}' | awk -F')' '{print $1}' ) From 52c7f2aecfb8bfb7b5fac7cd707fb4c78aa60da4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 15:19:53 -0700 Subject: [PATCH 144/664] ci: Minor change (#10412) Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 0945ac36bc7b..5207c101f631 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -70,6 +70,7 @@ jobs: docker container prune --filter "until=24h" --force docker image prune -a --filter "until=24h" --force + cicd-test-container-setup: needs: [cicd-cluster-clean, pre-flight] runs-on: self-hosted-azure-builder From 7d2779221f9142f3271d6c3df81e242a5fd57a3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 15:21:43 -0700 Subject: [PATCH 145/664] ci: Fix cherry-pick config (#10413) Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index c8b21e2919c3..6948c8ddcdb8 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -22,7 +22,10 @@ jobs: env: GH_TOKEN: ${{ secrets.PAT }} run: | - set -x + set -x + + git config --global user.email "nemo-bot@nvidia.com" + git config --global user.name "NeMo Bot" SHA=$(git 
rev-list --no-merges -n 1 HEAD) MESSAGE=$(git log -n 1 --pretty=format:%s $SHA) From 91863d23913495db6eb919dd6e387e0430d47cb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 15:22:40 -0700 Subject: [PATCH 146/664] ci: Minor change (#10414) Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 5207c101f631..0945ac36bc7b 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -70,7 +70,6 @@ jobs: docker container prune --filter "until=24h" --force docker image prune -a --filter "until=24h" --force - cicd-test-container-setup: needs: [cicd-cluster-clean, pre-flight] runs-on: self-hosted-azure-builder From 48fab9dd52721992b226eda51e5c003b36633bea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 15:23:54 -0700 Subject: [PATCH 147/664] ci: Minor change (#10415) Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 0945ac36bc7b..d73e7b7cece1 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. name: "CICD NeMo" - on: pull_request: branches: From 573d91096fd724fd990e864ae78d4325878c6107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 15:27:05 -0700 Subject: [PATCH 148/664] ci: Remove dead code (#10416) Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 6948c8ddcdb8..924b20c02588 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -65,14 +65,6 @@ jobs: CHERRYPICK_SUCCESSFUL=$? 
if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then - curl \ - --header "PRIVATE-TOKEN: $PROJECT_ACCESS_TOKEN_MCORE" \ - --url https://${GITLAB_ENDPOINT}/api/v4/projects/141257/merge_requests \ - -d "source_branch=cherry-pick-$MR_ID-$RELEASE_BRANCH" \ - -d "target_branch=$RELEASE_BRANCH" \ - -d "title=Cherry-pick $PR_ID into $RELEASE_BRANCH" \ - -d "labels=cherry-pick" - curl -L \ -X POST \ -H "Accept: application/vnd.github+json" \ From 14c3d4a369d8d481b5cfe0965c2337a2170f8d9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 15:30:34 -0700 Subject: [PATCH 149/664] Ko3n1g/ci/test cherry picking 2 (#10417) * ci: Cherrypick continue on error Signed-off-by: Oliver Koenig * ci: Fix cherry pick branch Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 924b20c02588..cb90e1562d19 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -23,6 +23,7 @@ jobs: GH_TOKEN: ${{ secrets.PAT }} run: | set -x + set +e git config --global user.email "nemo-bot@nvidia.com" git config --global user.name "NeMo Bot" @@ -71,7 +72,7 @@ jobs: -H "Authorization: Bearer $GH_TOKEN" \ -H "X-GitHub-Api-Version: 2022-11-28" \ https://api.github.com/repos/NVIDIA/NeMo/pulls \ - -d '{"title":"Cherry-pick '$PR_ID' into '$RELEASE_BRANCH'","head":"'cherry-pick-$PR_ID-$RELEASE_BRANCH'","base":"'$RELEASE_BRANCH'"}' + -d '{"title":"Cherry-pick '$PR_ID' into '$RELEASE_BRANCH'","head":"cherry-pick-'$PR_ID'-'$RELEASE_BRANCH'","base":"'$RELEASE_BRANCH'"}' else URL=https://github.com/NVIDIA/NeMo/pull/${{ github.event.number }} From aab78f0a397c38268bdb813f8f6779e7989f2902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 15:35:06 -0700 Subject: [PATCH 150/664] ci: Small test (#10419) Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index d73e7b7cece1..2d30dec37054 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -5212,3 +5212,4 @@ jobs: - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} run: | exit 1 + From b7ee0b89a7c7d0885a509f22700c83899cfc75d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 8 Sep 2024 15:37:45 -0700 Subject: [PATCH 151/664] ci: Small fix (#10420) Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index cb90e1562d19..0b753d59a826 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -57,9 +57,9 @@ jobs: ( git fetch origin $RELEASE_BRANCH:$RELEASE_BRANCH - git switch --force-create cherry-pick-$MR_ID-$RELEASE_BRANCH $RELEASE_BRANCH + git switch --force-create cherry-pick-$PR_ID-$RELEASE_BRANCH $RELEASE_BRANCH git cherry-pick $SHA - git push -u origin --force cherry-pick-$MR_ID-$RELEASE_BRANCH + git push -u origin --force cherry-pick-$PR_ID-$RELEASE_BRANCH git checkout ${CI_DEFAULT_BRANCH:-main} ) From d12fbbd3db952ea60d48cc567cbc632c02040bba Mon Sep 17 00:00:00 2001 From: 
Marc Romeyn Date: Mon, 9 Sep 2024 02:11:38 +0200 Subject: [PATCH 152/664] [NeMo-UX] Integrating CLI (#10300) * Adding nemo-run to requirements Signed-off-by: Marc Romeyn * Updating nemo-run entrypoint inside setup.py Signed-off-by: Marc Romeyn * Remove nemo-run from requirements until we have a pypi package Signed-off-by: Marc Romeyn * Update entrypoint naming Signed-off-by: Marc Romeyn * Setting up cli recipe for llama3-8b Signed-off-by: Marc Romeyn * Move AutoTokenizer import inline for starcoder Signed-off-by: Marc Romeyn * Move AutoTokenizer import inline for starcoder2 Signed-off-by: Marc Romeyn * Use target for factories inside llama3_8b Signed-off-by: Marc Romeyn * Update other recipes Signed-off-by: Marc Romeyn * Fix some bugs in the recipes Signed-off-by: Marc Romeyn * Adding some examples Signed-off-by: Marc Romeyn * Adding repl example Signed-off-by: Marc Romeyn * Starting to add a notebook example as well Signed-off-by: Marc Romeyn * Fix wrong imports Signed-off-by: Marc Romeyn * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply isort and black reformatting Signed-off-by: pre-commit-ci[bot] * Fix wrong imports Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn * Fix typo + add script with default executor Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn * Add nemo-run to Dockerfile.ci Signed-off-by: Marc Romeyn * Adding copyright to recipes Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn * Adding guides to recipes dir Signed-off-by: Marc Romeyn * Adding hatchling to Dockerfile.ci Signed-off-by: Marc Romeyn * Move install to different line Signed-off-by: Marc Romeyn * fix install Signed-off-by: Hemil Desai * Move llama3_pretraining to scripts for now Signed-off-by: Marc Romeyn * Remove img folder & use images from release instead Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn * Updating default of num_nodes in all recipes Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn * Adding tests for all recipes Signed-off-by: Marc Romeijn * ddAing docstrings Signed-off-by: Marc Romeijn * Apply isort and black reformatting Signed-off-by: marcromeyn * Fix failing tests inside test_mixtral_8x7b_64k Signed-off-by: Marc Romeijn * Rename fabric to _fabric to avoid name collision with package fabric Signed-off-by: Alexandros Koumparoulis * add rename comment Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Marc Romeyn Signed-off-by: pre-commit-ci[bot] Signed-off-by: marcromeyn Signed-off-by: Hemil Desai Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] Co-authored-by: marcromeyn Co-authored-by: Hemil Desai Co-authored-by: Alexandros Koumparoulis Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --- Dockerfile.ci | 4 +- examples/llm/pretrain/README.md | 72 ++ examples/llm/pretrain/custom_recipe.py | 44 ++ examples/llm/pretrain/default_executor.py | 106 +++ examples/llm/pretrain/pretrain.ipynb | 737 ++++++++++++++++++ nemo/collections/llm/api.py | 32 +- nemo/collections/llm/gpt/model/starcoder.py | 3 +- nemo/collections/llm/gpt/model/starcoder2.py | 3 +- 
nemo/collections/llm/recipes/ADD-RECIPE.md | 100 +++ nemo/collections/llm/recipes/README.md | 46 ++ nemo/collections/llm/recipes/__init__.py | 15 + nemo/collections/llm/recipes/llama3_70b.py | 249 ++++-- .../collections/llm/recipes/llama3_70b_16k.py | 186 ++++- .../collections/llm/recipes/llama3_70b_64k.py | 191 ++++- nemo/collections/llm/recipes/llama3_8b.py | 246 ++++-- nemo/collections/llm/recipes/llama3_8b_16k.py | 184 ++++- nemo/collections/llm/recipes/llama3_8b_64k.py | 186 ++++- nemo/collections/llm/recipes/log/__init__.py | 13 + nemo/collections/llm/recipes/log/default.py | 31 +- nemo/collections/llm/recipes/mistral.py | 255 +++++- nemo/collections/llm/recipes/mixtral_8x22b.py | 215 ++++- nemo/collections/llm/recipes/mixtral_8x3b.py | 200 ++++- .../llm/recipes/mixtral_8x3b_16k.py | 189 +++-- .../llm/recipes/mixtral_8x3b_64k.py | 192 +++-- nemo/collections/llm/recipes/mixtral_8x7b.py | 212 ++++- .../llm/recipes/mixtral_8x7b_16k.py | 190 +++-- .../llm/recipes/mixtral_8x7b_64k.py | 196 +++-- .../collections/llm/recipes/optim/__init__.py | 13 + nemo/collections/llm/recipes/optim/adam.py | 26 +- .../llm/recipes/precision/__init__.py | 13 + .../llm/recipes/precision/mixed_precision.py | 27 +- requirements/requirements.txt | 1 - .../run => scripts/llm}/llama3_pretraining.py | 0 setup.py | 2 +- tests/collections/llm/recipes/__init__.py | 0 .../llm/recipes/test_llama3_70b.py | 113 +++ .../llm/recipes/test_llama3_70b_16k.py | 93 +++ .../llm/recipes/test_llama3_70b_64k.py | 99 +++ .../collections/llm/recipes/test_llama3_8b.py | 120 +++ .../llm/recipes/test_llama3_8b_16k.py | 93 +++ .../llm/recipes/test_llama3_8b_64k.py | 93 +++ tests/collections/llm/recipes/test_mistral.py | 101 +++ .../llm/recipes/test_mixtral_8x22b.py | 118 +++ .../llm/recipes/test_mixtral_8x3b.py | 110 +++ .../llm/recipes/test_mixtral_8x3b_16k.py | 98 +++ .../llm/recipes/test_mixtral_8x3b_64k.py | 98 +++ .../llm/recipes/test_mixtral_8x7b.py | 112 +++ .../llm/recipes/test_mixtral_8x7b_16k.py | 104 +++ .../llm/recipes/test_mixtral_8x7b_64k.py | 98 +++ .../lightning/{fabric => _fabric}/__init__.py | 5 + .../{fabric => _fabric}/test_conversion.py | 0 51 files changed, 4997 insertions(+), 637 deletions(-) create mode 100644 examples/llm/pretrain/README.md create mode 100644 examples/llm/pretrain/custom_recipe.py create mode 100644 examples/llm/pretrain/default_executor.py create mode 100644 examples/llm/pretrain/pretrain.ipynb create mode 100644 nemo/collections/llm/recipes/ADD-RECIPE.md create mode 100644 nemo/collections/llm/recipes/README.md rename {examples/llm/run => scripts/llm}/llama3_pretraining.py (100%) create mode 100644 tests/collections/llm/recipes/__init__.py create mode 100644 tests/collections/llm/recipes/test_llama3_70b.py create mode 100644 tests/collections/llm/recipes/test_llama3_70b_16k.py create mode 100644 tests/collections/llm/recipes/test_llama3_70b_64k.py create mode 100644 tests/collections/llm/recipes/test_llama3_8b.py create mode 100644 tests/collections/llm/recipes/test_llama3_8b_16k.py create mode 100644 tests/collections/llm/recipes/test_llama3_8b_64k.py create mode 100644 tests/collections/llm/recipes/test_mistral.py create mode 100644 tests/collections/llm/recipes/test_mixtral_8x22b.py create mode 100644 tests/collections/llm/recipes/test_mixtral_8x3b.py create mode 100644 tests/collections/llm/recipes/test_mixtral_8x3b_16k.py create mode 100644 tests/collections/llm/recipes/test_mixtral_8x3b_64k.py create mode 100644 tests/collections/llm/recipes/test_mixtral_8x7b.py create mode 100644 
tests/collections/llm/recipes/test_mixtral_8x7b_16k.py create mode 100644 tests/collections/llm/recipes/test_mixtral_8x7b_64k.py rename tests/lightning/{fabric => _fabric}/__init__.py (80%) rename tests/lightning/{fabric => _fabric}/test_conversion.py (100%) diff --git a/Dockerfile.ci b/Dockerfile.ci index 51ad4663131e..7e3ba798d62e 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -18,14 +18,14 @@ ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3 FROM ${BASE_IMAGE} -ENV TRANSFORMERS_OFFLINE=0 +ENV TRANSFORMERS_OFFLINE=0 ENV HYDRA_FULL_ERROR=1 ENV PYTHONUNBUFFERED=1 # APT packages RUN <<"EOF" bash -ex apt-get update -apt-get install -y bc libsox-fmt-all -y +apt-get install -y bc libsox-fmt-all -y apt-get clean EOF diff --git a/examples/llm/pretrain/README.md b/examples/llm/pretrain/README.md new file mode 100644 index 000000000000..c9bb7331f972 --- /dev/null +++ b/examples/llm/pretrain/README.md @@ -0,0 +1,72 @@ +# Pre-training + +### Listing the available recipes for pretraining + +```bash +nemorun llm pretrain --help +``` + +![recipe-listing](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/list-recipes.png) + + +### Run pre-training with a default recipe + +```bash +nemorun llm pretrain --factory llama3_8b +``` + +![llama3_70b](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b.png) + +We can also call the factory function with custom parameters: + +```bash +nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" +``` + +![llama3_70b-128-nodes](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b_128nodes.png) + + +The CLI allows you to overwrite any parameter. For example, to run the recipe with 2000 steps: + +```bash +nemorun llm pretrain --factory llama3_70b trainer.max_steps=2000 +``` + +The syntax of the CLI is the same as the Python code. Which is great but in some cases you might want to inspect & edit a recipe interactively. An easy way to do this using the cli is the use the `--repl` flag. + +```bash +nemorun llm pretrain --factory llama3_70b --repl +``` + +![repl](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/repl.gif) + +We can also trigger a run from a jupyter notebook, see [pretrain.ipynb](pretrain.ipynb) for an example. This allows visualizes all configs in a structured format. See for instance the `llama3_8b` recipe: + +![llama3_8b_visualization](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_8b_config.svg) + + +### Create and run a custom recipe + +We can create a script that contains a custom recipe. See [custom_recipe.py](custom_recipe.py) for an example. + +Note that we end the script with a call to `run.cli.main()`, which uses the same syntax as the CLI but allows us to provide specific defaults. We still can overwrite any parameter using the syntax `param=value`. We can set nested parameters using dotted notation, e.g. `trainer.max_steps=2000`. + +When running the custom_recipe.py file, it will execute the `custom_llama3_8b` recipe by default. However, you can select different recipes or modify parameters using the following methods: + +1. To select the `custom_llama3_70b` recipe: + ```bash + python custom_recipe.py --factory custom_llama3_70b + ``` + This will automatically call the `custom_llama3_70b` function defined in the script. + +2. To overwrite any parameter: + ```bash + python custom_recipe.py trainer.max_steps=2000 + ``` + +3. 
You can even apply transformations when triggering the CLI as if it's Python code: + ```bash + python custom_recipe.py "trainer.max_steps=*2" + ``` + +These options provide flexibility in customizing your pretraining recipe directly from the command line. \ No newline at end of file diff --git a/examples/llm/pretrain/custom_recipe.py b/examples/llm/pretrain/custom_recipe.py new file mode 100644 index 000000000000..a522a1a8e1f5 --- /dev/null +++ b/examples/llm/pretrain/custom_recipe.py @@ -0,0 +1,44 @@ +import nemo_run as run + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama3_8b, llama3_70b + + +def custom_llama3_8b(): + pretrain = llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8) + + pretrain.trainer.val_check_interval = 400 + pretrain.log.ckpt.save_top_k = -1 + pretrain.log.ckpt.every_n_train_steps = 400 + + pretrain.trainer.max_steps = 1000 + + return pretrain + + +def custom_llama3_70b(): + pretrain = llama3_70b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8) + + pretrain.trainer.val_check_interval = 400 + pretrain.log.ckpt.save_top_k = -1 + pretrain.log.ckpt.every_n_train_steps = 400 + + pretrain.trainer.max_steps = 1000 + + return pretrain + + +if __name__ == "__main__": + # When running this file, it will run the `custom_llama3_8b` recipe + + # To select the `custom_llama3_70b` recipe, use the following command: + # python custom_recipe.py --factory custom_llama3_70b + # This will automatically call the custom_llama3_70b that's defined above + + # Note that any parameter can be overwritten by using the following syntax: + # python custom_recipe.py trainer.max_steps=2000 + + # You can even apply transformations when triggering the CLI as if it's python code + # python custom_recipe.py "trainer.max_steps*=2" + + run.cli.main(llm.pretrain, default_factory=custom_llama3_8b) diff --git a/examples/llm/pretrain/default_executor.py b/examples/llm/pretrain/default_executor.py new file mode 100644 index 000000000000..2668d312f2b8 --- /dev/null +++ b/examples/llm/pretrain/default_executor.py @@ -0,0 +1,106 @@ +from typing import Optional +import nemo_run as run +from nemo.collections import llm + + +def local_executor_torchrun(devices: int = 2) -> run.LocalExecutor: + env_vars = { + "TRANSFORMERS_OFFLINE": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", + "NVTE_ASYNC_AMAX_REDUCTION": "1", + "NVTE_FUSED_ATTN": "0", + } + + executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) + + return executor + + +def slurm_executor( + user: str, + host: str, + remote_job_dir: str, + account: str, + partition: str, + nodes: int, + devices: int, + time: str = "01:00:00", + custom_mounts: Optional[list[str]] = None, + custom_env_vars: Optional[dict[str, str]] = None, + container_image: str = "nvcr.io/nvidia/nemo:dev", + retries: int = 0, +) -> run.SlurmExecutor: + if not (user and host and remote_job_dir and account and partition and nodes and devices): + raise RuntimeError( + "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this function." 
+ ) + + mounts = [] + if custom_mounts: + mounts.extend(custom_mounts) + + env_vars = { + "TRANSFORMERS_OFFLINE": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", + "NVTE_ASYNC_AMAX_REDUCTION": "1", + "NVTE_FUSED_ATTN": "0", + } + if custom_env_vars: + env_vars |= custom_env_vars + + executor = run.SlurmExecutor( + account=account, + partition=partition, + tunnel=run.SSHTunnel( + user=user, + host=host, + job_dir=remote_job_dir, + ), + nodes=nodes, + ntasks_per_node=devices, + gpus_per_node=devices, + mem="0", + exclusive=True, + gres="gpu:8", + packager=run.GitArchivePackager(subpath="examples/llm/run"), + ) + + executor.container_image = container_image + executor.container_mounts = mounts + executor.env_vars = env_vars + executor.retries = retries + executor.time = time + + return executor + + +def my_slurm_executor(): + # TODO: Set your custom parameters for the Slurm Executor. + return slurm_executor( + user="", + host="", + remote_job_dir="", + account="", + partition="", + nodes=1, + devices=2, + ) + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_executor=local_executor_torchrun) + + # This will re-expose the pretrain entrypoint with your custom local executor as default. + + # To run, for instance, the llama3_8b recipe, use the following command: + # python default_executor.py --factory llama3_8b + + # To run with any overrides, use the following command: + # python default_executor.py --factory llama3_8b trainer.max_steps=2000 + + # To use your custom Slurm executor, use the following command: + # python default_executor.py --executor my_slurm_executor --factory llama3_8b diff --git a/examples/llm/pretrain/pretrain.ipynb b/examples/llm/pretrain/pretrain.ipynb new file mode 100644 index 000000000000..194741a9da9f --- /dev/null +++ b/examples/llm/pretrain/pretrain.ipynb @@ -0,0 +1,737 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Trigger a run from a notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-08-29 17:14:25 nemo_logging:349] /Users/romeyn/base/code/.venv/lib/python3.10/site-packages/megatron/core/optimizer/__init__.py:18: UserWarning: Transformer Engine and Apex are not installed. Falling back to Torch optimizers.\n", + " warnings.warn(\n", + " \n", + "[NeMo W 2024-08-29 17:14:25 nemo_logging:349] /Users/romeyn/base/code/.venv/lib/python3.10/site-packages/megatron/core/optimizer/clip_grads.py:31: UserWarning: Transformer Engine and Apex are not installed. 
Falling back to local implementations of multi_tensor_applier, multi_tensor_l2norm, and multi_tensor_scale\n", + " warnings.warn(\n", + " \n" + ] + } + ], + "source": [ + "import nemo_run as run\n", + "from nemo.collections import llm\n", + "from nemo.collections.llm.recipes import llama3_8b\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "2\n", + "\n", + "\n", + "Config:\n", + " Llama3Config8B\n", + "\n", + "\n", + "no arguments\n", + "\n", + "\n", + "\n", + "1\n", + "\n", + "\n", + "Config:\n", + " LlamaModel\n", + "\n", + "\n", + "config\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "1:c--2:c\n", + "\n", + "\n", + "\n", + "\n", + "0\n", + "\n", + "\n", + "Partial:\n", + " pretrain\n", + "\n", + "\n", + "model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "data\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "trainer\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "log\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "resume\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "optim\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "0:c--1:c\n", + "\n", + "\n", + "\n", + "\n", + "3\n", + "\n", + "\n", + "Config:\n", + " MockDataModule\n", + "\n", + "\n", + "seq_length\n", + "\n", + "8192\n", + "\n", + "\n", + "micro_batch_size\n", + "\n", + "1\n", + "\n", + "\n", + "global_batch_size\n", + "\n", + "512\n", + "\n", + "\n", + "\n", + "0:c--3:c\n", + "\n", + "\n", + "\n", + "\n", + "4\n", + "\n", + "\n", + "Config:\n", + " Trainer\n", + "\n", + "\n", + "accelerator\n", + "\n", + "'gpu'\n", + "\n", + "\n", + "accumulate_grad_batches\n", + "\n", + "1\n", + "\n", + "\n", + "callbacks\n", + "\n", + "\n", + "\n", + "list\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "0\n", + "\n", + "\n", + "devices\n", + "\n", + "8\n", + "\n", + "\n", + "gradient_clip_val\n", + "\n", + "1.0\n", + "\n", + "\n", + "limit_test_batches\n", + "\n", + "50\n", + "\n", + "\n", + "limit_val_batches\n", + "\n", + "32\n", + "\n", + "\n", + "log_every_n_steps\n", + "\n", + "10\n", + "\n", + "\n", + "max_steps\n", + "\n", + "1168251\n", + "\n", + "\n", + "num_nodes\n", + "\n", + "1\n", + "\n", + "\n", + "plugins\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "strategy\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "use_distributed_sampler\n", + "\n", + "False\n", + "\n", + "\n", + "val_check_interval\n", + "\n", + "2000\n", + "\n", + "\n", + "\n", + "0:c--4:c\n", + "\n", + "\n", + "\n", + "\n", + "9\n", + "\n", + "\n", + "Config:\n", + " NeMoLogger\n", + "\n", + "\n", + "name\n", + "\n", + "'default'\n", + "\n", + "\n", + "dir\n", + "\n", + "None\n", + "\n", + "\n", + "ckpt\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "tensorboard\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "wandb\n", + "\n", + "None\n", + "\n", + "\n", + "\n", + "0:c--9:c\n", + "\n", + "\n", + "\n", + "\n", + "12\n", + "\n", + "\n", + "Config:\n", + " AutoResume\n", + "\n", + "\n", + "resume_if_exists\n", + "\n", + "True\n", + "\n", + "\n", + "resume_ignore_no_checkpoint\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "0:c--12:c\n", + "\n", + "\n", + "\n", + "\n", + "13\n", + "\n", + "\n", + "Config:\n", + " MegatronOptimizerModule\n", + "\n", + "\n", + "config\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "lr_scheduler\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "0:c--13:c\n", + "\n", + "\n", + "\n", + "\n", + "5\n", + "\n", + "\n", + "Config:\n", + " TimingCallback\n", + "\n", + "\n", + 
"no arguments\n", + "\n", + "\n", + "\n", + "4:c--5:c\n", + "\n", + "\n", + "\n", + "\n", + "6\n", + "\n", + "\n", + "Config:\n", + " MegatronMixedPrecision\n", + "\n", + "\n", + "precision\n", + "\n", + "'bf16-mixed'\n", + "\n", + "\n", + "params_dtype\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "pipeline_dtype\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "autocast_enabled\n", + "\n", + "False\n", + "\n", + "\n", + "grad_reduce_in_fp32\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "4:c--6:c\n", + "\n", + "\n", + "\n", + "\n", + "8\n", + "\n", + "\n", + "Config:\n", + " MegatronStrategy\n", + "\n", + "\n", + "tensor_model_parallel_size\n", + "\n", + "1\n", + "\n", + "\n", + "pipeline_model_parallel_size\n", + "\n", + "1\n", + "\n", + "\n", + "virtual_pipeline_model_parallel_size\n", + "\n", + "None\n", + "\n", + "\n", + "context_parallel_size\n", + "\n", + "2\n", + "\n", + "\n", + "sequence_parallel\n", + "\n", + "False\n", + "\n", + "\n", + "ckpt_include_optimizer\n", + "\n", + "True\n", + "\n", + "\n", + "pipeline_dtype\n", + "\n", + "None\n", + "\n", + "\n", + "ckpt_async_save\n", + "\n", + "True\n", + "\n", + "\n", + "ckpt_parallel_load\n", + "\n", + "True\n", + "\n", + "\n", + "gradient_as_bucket_view\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "4:c--8:c\n", + "\n", + "\n", + "\n", + "\n", + "7\n", + "\n", + "\n", + "dtype\n", + "\n", + "torch.bfloat16\n", + "\n", + "\n", + "\n", + "6:c--7:c\n", + "\n", + "\n", + "\n", + "\n", + "6:c--7:c\n", + "\n", + "\n", + "\n", + "\n", + "10\n", + "\n", + "\n", + "Config:\n", + " ModelCheckpoint\n", + "\n", + "\n", + "save_last\n", + "\n", + "True\n", + "\n", + "\n", + "save_top_k\n", + "\n", + "10\n", + "\n", + "\n", + "every_n_train_steps\n", + "\n", + "200\n", + "\n", + "\n", + "save_best_model\n", + "\n", + "False\n", + "\n", + "\n", + "filename\n", + "\n", + "'{model_name}--{val_loss:.2f}-{step}-{consumed_samples}'\n", + "\n", + "\n", + "\n", + "9:c--10:c\n", + "\n", + "\n", + "\n", + "\n", + "11\n", + "\n", + "\n", + "Config:\n", + " TensorBoardLogger\n", + "\n", + "\n", + "save_dir\n", + "\n", + "'tb_logs'\n", + "\n", + "\n", + "name\n", + "\n", + "'default'\n", + "\n", + "\n", + "\n", + "9:c--11:c\n", + "\n", + "\n", + "\n", + "\n", + "14\n", + "\n", + "\n", + "Config:\n", + " OptimizerConfig\n", + "\n", + "\n", + "optimizer\n", + "\n", + "'adam'\n", + "\n", + "\n", + "lr\n", + "\n", + "0.0003\n", + "\n", + "\n", + "weight_decay\n", + "\n", + "0.1\n", + "\n", + "\n", + "bf16\n", + "\n", + "True\n", + "\n", + "\n", + "adam_beta1\n", + "\n", + "0.9\n", + "\n", + "\n", + "adam_beta2\n", + "\n", + "0.95\n", + "\n", + "\n", + "adam_eps\n", + "\n", + "1e-05\n", + "\n", + "\n", + "use_distributed_optimizer\n", + "\n", + "True\n", + "\n", + "\n", + "overlap_grad_reduce\n", + "\n", + "True\n", + "\n", + "\n", + "overlap_param_gather\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "13:c--14:c\n", + "\n", + "\n", + "\n", + "\n", + "15\n", + "\n", + "\n", + "Config:\n", + " CosineAnnealingScheduler\n", + "\n", + "\n", + "warmup_steps\n", + "\n", + "2000\n", + "\n", + "\n", + "constant_steps\n", + "\n", + "0\n", + "\n", + "\n", + "min_lr\n", + "\n", + "2.9999999999999997e-05\n", + "\n", + "\n", + "\n", + "13:c--15:c\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + ")]>,\n", + " data=,\n", + " trainer=],\n", + " devices=8,\n", + " gradient_clip_val=1.0,\n", + " limit_test_batches=50,\n", + " limit_val_batches=32,\n", + " log_every_n_steps=10,\n", + " max_steps=1168251,\n", + " num_nodes=1,\n", + " plugins=,\n", + " 
strategy=,\n", + " use_distributed_sampler=False,\n", + " val_check_interval=2000)]>,\n", + " log=,\n", + " tensorboard=,\n", + " wandb=None)]>,\n", + " resume=,\n", + " optim=,\n", + " lr_scheduler=)]>)]>" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pretrain = llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8)\n", + "\n", + "pretrain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index d330b42d08c4..847b87131925 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -18,10 +18,10 @@ from pathlib import Path from typing import Any, Callable, Optional, Union +import nemo_run as run import pytorch_lightning as pl from typing_extensions import Annotated -from nemo.collections.llm.utils import Config, task from nemo.lightning import AutoResume, NeMoLogger, OptimizerModule, Trainer, io from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform from nemo.utils import logging @@ -29,13 +29,13 @@ TokenizerType = Any -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def train( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, - resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, tokenizer: Optional[TokenizerType] = None, model_transform: Optional[Union[PEFT, ModelTransform, Callable]] = None, @@ -87,13 +87,13 @@ def train( return app_state.exp_dir -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def pretrain( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, - resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, ) -> Path: """ @@ -135,13 +135,13 @@ def pretrain( ) -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def finetune( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, - resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, peft: Optional[Union[PEFT, ModelTransform, Callable]] = None, ) -> Path: @@ -186,13 +186,13 @@ def finetune( ) -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def validate( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = 
None, - resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, tokenizer: Optional[TokenizerType] = None, model_transform: Optional[Union[PEFT, ModelTransform, Callable]] = None, @@ -311,7 +311,7 @@ def store_args_to_json(triton_http_address, triton_port, triton_request_timeout, json.dump(args_dict, f) -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def deploy( nemo_checkpoint: Path = None, model_type: str = "llama", @@ -400,7 +400,7 @@ def deploy( nm.stop() -@task(name="import", namespace="llm") +@run.cli.entrypoint(name="import", namespace="llm") def import_ckpt( model: pl.LightningModule, source: str, @@ -414,7 +414,7 @@ def load_connector_from_trainer_ckpt(path: Path, target: str) -> io.ModelConnect return io.load_context(path).model.exporter(target, path) -@task(name="export", namespace="llm") +@run.cli.entrypoint(name="export", namespace="llm") def export_ckpt( path: Path, target: str, diff --git a/nemo/collections/llm/gpt/model/starcoder.py b/nemo/collections/llm/gpt/model/starcoder.py index 7cfdec4bce29..e7cc3f411710 100644 --- a/nemo/collections/llm/gpt/model/starcoder.py +++ b/nemo/collections/llm/gpt/model/starcoder.py @@ -19,7 +19,6 @@ import torch.nn.functional as F from torch import nn -from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config from nemo.lightning import OptimizerModule, io, teardown @@ -120,6 +119,8 @@ def convert_state(self, source, target): @property def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property diff --git a/nemo/collections/llm/gpt/model/starcoder2.py b/nemo/collections/llm/gpt/model/starcoder2.py index 3256ccd9e248..57b8d3635ade 100644 --- a/nemo/collections/llm/gpt/model/starcoder2.py +++ b/nemo/collections/llm/gpt/model/starcoder2.py @@ -20,7 +20,6 @@ import torch.nn.functional as F from torch import nn -from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config from nemo.lightning import OptimizerModule, io, teardown @@ -144,6 +143,8 @@ def convert_state(self, source, target): @property def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property diff --git a/nemo/collections/llm/recipes/ADD-RECIPE.md b/nemo/collections/llm/recipes/ADD-RECIPE.md new file mode 100644 index 000000000000..c506374e3784 --- /dev/null +++ b/nemo/collections/llm/recipes/ADD-RECIPE.md @@ -0,0 +1,100 @@ +# How to Add a New Recipe + +This guide explains the process of adding a new recipe to the NeMo LLM collection. + +## Step 1: Create a New Python File + +Create a new Python file in the `nemo/collections/llm/recipes/` directory. Name it according to the model and its specific configuration, e.g., `my_new_model_12b.py`. 
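+
+For orientation, the header of such a file might look roughly like the sketch below. This is only an illustration: the import list is copied from the existing llama3 recipes in this directory, and `my_new_model_12b.py` is simply the example name from Step 1.
+
+```python
+# Hypothetical header for my_new_model_12b.py, modeled on the llama3 recipes.
+import nemo_run as run
+import pytorch_lightning as pl
+import torch
+
+from nemo import lightning as nl
+from nemo.collections.llm.api import finetune, pretrain
+from nemo.collections.llm.gpt.data.mock import MockDataModule
+```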
+ +## Step 2: Define the Model Configuration + +Create a function called `model` to define the model configuration: + +```python +NAME = "my_new_model_12b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + return run.Config(YourModel, config=run.Config(YourModelConfig)) +``` + +## Step 3: Define the Trainer Configuration + +Create a function called `trainer` to set up the trainer: + +```python +def trainer( + num_nodes: int = 1, + num_gpus_per_node: int = 8, + # Add other parameters as needed +) -> run.Config[nl.Trainer]: + strategy = run.Config( + nl.MegatronStrategy, + # Define your parallelism strategy here + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + devices=num_gpus_per_node, + num_nodes=num_nodes, + # Add other trainer configurations + ) + return trainer +``` + +## Step 4: Define the Recipe Configuration + +Create a function called `pretrain_recipe` or `finetune_recipe` to define the recipe configuration: + +```python +from nemo.collections.llm import pretrain + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + # Add other parameters as needed +) -> run.Config[nl.PretrainRecipe]: + return run.Config( + nl.PretrainRecipe, + model=model(), + trainer=trainer(), + # Add other recipe configurations + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) +``` + +```python +from nemo.collections.llm import finetune + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + # Add other parameters as needed +) -> run.Config[nl.FinetuneRecipe]: + return run.Config( + nl.FinetuneRecipe, + model=model(), + trainer=trainer(), + # Add other recipe configurations + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) +``` + + +## Step 5: Import the recipe in the __init__.py file + +Import the recipe in the [__init__.py](__init__.py) file in the same directory: + +```python +from .my_new_model_12b import pretrain_recipe, finetune_recipe +``` + + +## Step 6: Add tests for the recipe + +Add tests for the recipe in the [tests](../../../../tests/collections/llm/recipes) directory. You can use [test_llama3_8b.py](../../../../tests/collections/llm/recipes/test_llama3_8b.py) as an example. diff --git a/nemo/collections/llm/recipes/README.md b/nemo/collections/llm/recipes/README.md new file mode 100644 index 000000000000..a3cf715acffb --- /dev/null +++ b/nemo/collections/llm/recipes/README.md @@ -0,0 +1,46 @@ +# NeMo LLM Recipes + +This directory contains recipes for pre-training and fine-tuning large language models (LLMs) using NeMo. + +A recipe in NeMo is a Python file that defines a complete configuration for training or fine-tuning an LLM. Each recipe typically includes: + +1. Model configuration: Defines the architecture and hyperparameters of the LLM. +2. Training configuration: Specifies settings for the PyTorch Lightning Trainer, including distributed training strategies. +3. Data configuration: Sets up the data pipeline, including batch sizes and sequence lengths. +4. Optimization configuration: Defines the optimizer and learning rate schedule. +5. Logging and checkpointing configuration: Specifies how to save model checkpoints and log training progress. 
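+
+As a rough illustration (a sketch for orientation, not a training script), these five pieces correspond to attributes on the recipe object. The example below assumes the `llama3_8b` recipe defined in this directory:
+
+```python
+from nemo.collections.llm.recipes import llama3_8b
+
+# Build the default pre-training recipe, then inspect its components.
+recipe = llama3_8b.pretrain_recipe(name="demo", num_nodes=1, num_gpus_per_node=8)
+
+recipe.model    # 1. model configuration (LlamaModel + Llama3Config8B)
+recipe.trainer  # 2. trainer configuration (MegatronStrategy, precision, parallelism)
+recipe.data     # 3. data configuration (MockDataModule: sequence length, batch sizes)
+recipe.optim    # 4. optimization configuration (distributed Adam + cosine annealing)
+recipe.log      # 5. logging and checkpointing configuration (NeMoLogger, ModelCheckpoint)
+```
+
+Any of these attributes can be overridden before a run is launched, as shown in the pre-training examples linked below.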
+
+Recipes are designed to be modular and extensible, allowing users to easily customize settings for their specific use cases.
+
+## Usage
+
+### Command Line Interface
+
+You can use these recipes via the NeMo CLI:
+
+```bash
+nemorun llm <task> --factory <recipe_name>
+```
+Where:
+- `<task>` is either `pretrain` or `finetune`
+- `<recipe_name>` is the name of the recipe (e.g. `llama3_8b`)
+
+For example:
+```bash
+nemorun llm pretrain --factory llama3_8b
+```
+
+
+### Customizing Parameters
+
+You can override any parameter in the recipe:
+
+```bash
+nemorun llm pretrain --factory llama3_8b trainer.max_steps=2000
+```
+
+For more details around running recipes, see [pre-train](../../../../examples/llm/pretrain/README.md).
+
+## Adding a New Recipe
+
+See [ADD-RECIPE.md](ADD-RECIPE.md) for instructions on how to add a new recipe.
\ No newline at end of file
diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py
index 950ca6db7ac6..ec44d1c19864 100644
--- a/nemo/collections/llm/recipes/__init__.py
+++ b/nemo/collections/llm/recipes/__init__.py
@@ -1,3 +1,18 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
 from nemo.collections.llm.recipes import (
     llama3_8b,
     llama3_8b_16k,
diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py
index cbf6b5e2e7a1..96c94fd6eeba 100644
--- a/nemo/collections/llm/recipes/llama3_70b.py
+++ b/nemo/collections/llm/recipes/llama3_70b.py
@@ -1,8 +1,23 @@
-from typing import Callable, Optional
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from typing import Optional + +import nemo_run as run import pytorch_lightning as pl import torch -from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -13,32 +28,77 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 -from nemo.collections.llm.utils import Config, Partial from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "llama3_70b" -def model() -> Config[pl.LightningModule]: - return Config(LlamaModel, config=Config(Llama3Config70B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 70B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 70B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_70b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(LlamaModel, config=run.Config(Llama3Config70B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, + tensor_parallelism: int = 4, + pipeline_parallelism: int = 4, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 5, + context_parallelism: int = 2, + sequence_parallelism: bool = True, num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Llama3 70B model. + + This function sets up the distributed training strategy optimized for the large 70B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_70b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. 
+ """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -51,7 +111,7 @@ def trainer( ckpt_parallel_load=True, ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -62,7 +122,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -71,42 +131,89 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 70B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_70b + $ nemo llm pretrain --factory "llama3_70b(num_nodes=4, name='my_70b_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_70b_pretrain", num_nodes=4) + >>> print(recipe) + + Note: + This recipe is optimized for the large 70B model and requires significant computational resources. + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=2, - sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) +@run.cli.factory(target=pretrain, name=NAME + "_performance") def pretrain_recipe_performance( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - """'pretrain_recipe_performance' turns on performance optimizations that cannot be enabled by default - due to being model specific or lacking sufficent support. For better compatibility please use - the default 'pretrain_recipe()' above.""" - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Llama3 70B model. 
+ + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory "llama3_70b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="llama3_70b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) recipe.trainer.callbacks.append( - Config( + run.Config( MegatronCommOverlapCallback, tp_comm_overlap=True, tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, @@ -118,18 +225,66 @@ def pretrain_recipe_performance( return recipe -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure automatic resumption from a Hugging Face checkpoint for Llama3 70B model. + + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/meta-llama/Meta-Llama-3-70B + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + + Note: + This is particularly useful for fine-tuning scenarios where you want to + start from the pre-trained Llama3 70B model. + """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-70B"), + restore_config=run.Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-70B"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 70B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning of the large model. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_70b + $ nemo llm finetune --factory "llama3_70b(num_nodes=4, name='my_70b_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_70b_finetune", num_nodes=4) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. 
Be aware that fine-tuning a 70B model + requires substantial computational resources. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) recipe.resume = hf_resume() - recipe.peft = Config(LoRA) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_70b_16k.py b/nemo/collections/llm/recipes/llama3_70b_16k.py index 87826661606f..3798088ff722 100644 --- a/nemo/collections/llm/recipes/llama3_70b_16k.py +++ b/nemo/collections/llm/recipes/llama3_70b_16k.py @@ -1,57 +1,81 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_70b -from nemo.collections.llm.utils import Config, Partial -from nemo.utils.exp_manager import TimingCallback NAME = "llama3_70b_16k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = llama3_70b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 70B model configuration with 16k sequence length. - model = llama3_70b.model() - model.config.seq_length = 16384 + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 70B model with 16k sequence length. - trainer = llama3_70b.trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=4, - sequence_parallelism=True, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_70b_16k ... - data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = llama3_70b.model() + model_config.config.seq_length = 16384 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Llama3 70B model with 16k sequence length. 
+ This function sets up the distributed training strategy optimized for the large 70B model with longer sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = llama3_70b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - model = llama3_70b.model() - model.config.seq_length = 16384 + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_70b_16k ... - trainer = llama3_70b.trainer( + Python API usage: + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size and longer sequence length efficiently. + """ + return llama3_70b.trainer( tensor_parallelism=2, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, @@ -60,13 +84,93 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 70B model with 16k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_70b_16k + $ nemo llm pretrain --factory "llama3_70b_16k(num_nodes=4, name='my_70b_16k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_70b_16k_pretrain", num_nodes=4) + >>> print(recipe) + + Note: + This recipe is optimized for the large 70B model with longer sequences (16k). + It requires significant computational resources. + """ + recipe = llama3_70b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 70B model with 16k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. 
+ num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_70b_16k + $ nemo llm finetune --factory "llama3_70b_16k(num_nodes=4, name='my_70b_16k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_70b_16k_finetune", num_nodes=4) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning the large 70B model with longer sequences (16k). + It uses the SQuAD dataset adapted for 16k sequence length. Be aware that this configuration + requires substantial computational resources. + """ + recipe = llama3_70b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_70b_64k.py b/nemo/collections/llm/recipes/llama3_70b_64k.py index 5185e6b2ec45..353bdd659947 100644 --- a/nemo/collections/llm/recipes/llama3_70b_64k.py +++ b/nemo/collections/llm/recipes/llama3_70b_64k.py @@ -1,72 +1,179 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_70b -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "llama3_70b_64k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = llama3_70b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 70B model configuration with 64k sequence length. - model = llama3_70b.model() - model.config.seq_length = 65536 + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 70B model with 64k sequence length. - trainer = llama3_70b.trainer( - tensor_parallelism=8, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=8, - sequence_parallelism=True, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_70b_64k ... 
- data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = llama3_70b.model() + model_config.config.seq_length = 65536 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 32, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Llama3 70B model with 64k sequence length. + This function sets up the distributed training strategy optimized for the large 70B model with long sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = llama3_70b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - model = llama3_70b.model() - model.config.seq_length = 65536 + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_70b_64k ... - trainer = llama3_70b.trainer( - tensor_parallelism=2, + Python API usage: + >>> trainer_config = trainer(num_nodes=32, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size and long sequence length efficiently. + It requires a significant amount of computational resources. + """ + return llama3_70b.trainer( + tensor_parallelism=8, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=5, - context_parallelism=2, + context_parallelism=8, sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 32, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 70B model with 64k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_70b_64k + $ nemo llm pretrain --factory "llama3_70b_64k(num_nodes=32, name='my_70b_64k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_70b_64k_pretrain", num_nodes=32) + >>> print(recipe) + + Note: + This recipe is optimized for the large 70B model with long sequences (64k). + It requires extensive computational resources due to the model size and extended sequence length. 
+ """ + recipe = llama3_70b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 32, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 70B model with 64k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_70b_64k + $ nemo llm finetune --factory "llama3_70b_64k(num_nodes=32, name='my_70b_64k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_70b_64k_finetune", num_nodes=32) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning the large 70B model with long sequences (64k). + It uses the SQuAD dataset adapted for 64k sequence length. Be aware that this configuration + requires extensive computational resources due to the model size and extended sequence length. + """ + recipe = llama3_70b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 17d4e8b168b3..8b2ea2969273 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -1,5 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + from typing import Callable, Optional +import nemo_run as run import pytorch_lightning as pl import torch from pytorch_lightning.callbacks.callback import Callback @@ -12,31 +28,77 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin -from nemo.collections.llm.utils import Config, Partial +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b" -def model() -> Config[pl.LightningModule]: - return Config(LlamaModel, config=Config(Llama3Config8B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 8B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 8B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_8b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(LlamaModel, config=run.Config(Llama3Config8B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Llama3 8B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_8b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -49,7 +111,7 @@ def trainer( ckpt_parallel_load=True, ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -60,7 +122,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -69,42 +131,93 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_8b + $ nemo llm pretrain --factory "llama3_8b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_8b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=1, - pipeline_parallelism=1, - pipeline_parallelism_type=None, - virtual_pipeline_parallelism=None, - context_parallelism=2, - sequence_parallelism=False, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) +@run.cli.factory(target=pretrain, name=NAME + "_optimized") def pretrain_recipe_performance( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - """'pretrain_recipe_performance' turns on performance optimizations that cannot be enabled by default - due to being model specific or lacking sufficent support. For better compatibility please use - the default 'pretrain_recipe()' above.""" - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Llama3 8B model. 
+ + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory llama3_8b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="llama3_8b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) recipe.trainer.callbacks.append( - Config( + run.Config( MegatronCommOverlapCallback, tp_comm_overlap=False, ) @@ -112,18 +225,61 @@ def pretrain_recipe_performance( return recipe -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """Configure automatic resumption from a Hugging Face checkpoint. + + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/meta-llama/Meta-Llama-3-8B + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-8B"), + restore_config=run.Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-8B"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 8B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_8b + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_8b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
+ """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) recipe.resume = hf_resume() - recipe.peft = Config(LoRA) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py index 27762777c622..bd02f1975864 100644 --- a/nemo/collections/llm/recipes/llama3_8b_16k.py +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -1,57 +1,81 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_8b -from nemo.collections.llm.utils import Config, Partial -from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b_16k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = llama3_8b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 8B model configuration with 16k sequence length. - model = llama3_8b.model() - model.config.seq_length = 16384 + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 8B model with 16k sequence length. - trainer = llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=2, - sequence_parallelism=True, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_8b_16k ... - data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = llama3_8b.model() + model_config.config.seq_length = 16384 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Llama3 8B model with 16k sequence length. + This function sets up the distributed training strategy optimized for longer sequences. 
-def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = llama3_8b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_8b_16k ... - model = llama3_8b.model() - model.config.seq_length = 16384 + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) - trainer = llama3_8b.trainer( + Note: + This configuration uses increased parallelism to handle the longer sequence length efficiently. + """ + return llama3_8b.trainer( tensor_parallelism=2, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, @@ -60,13 +84,91 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 8B model with 16k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_8b_16k + $ nemo llm pretrain --factory "llama3_8b_16k(num_nodes=2, name='my_16k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_8b_16k_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for handling longer sequences (16k) compared to the standard 8k version. + """ + recipe = llama3_8b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 8B model with 16k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_8b_16k + $ nemo llm finetune --factory "llama3_8b_16k(num_nodes=2, name='my_16k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_8b_16k_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with longer sequences (16k) compared to the standard 8k version. + It uses the SQuAD dataset adapted for 16k sequence length. + """ + recipe = llama3_8b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py index 90001c6189a0..e5845e4530ca 100644 --- a/nemo/collections/llm/recipes/llama3_8b_64k.py +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -1,57 +1,81 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_8b -from nemo.collections.llm.utils import Config, Partial -from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b_64k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = llama3_8b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 8B model configuration with 64k sequence length. - model = llama3_8b.model() - model.config.seq_length = 65536 + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 8B model with 64k sequence length. - trainer = llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=4, - sequence_parallelism=True, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_8b_64k ... 
- data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = llama3_8b.model() + model_config.config.seq_length = 65536 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Llama3 8B model with 64k sequence length. + This function sets up the distributed training strategy optimized for long sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = llama3_8b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_8b_64k ... - model = llama3_8b.model() - model.config.seq_length = 65536 + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) - trainer = llama3_8b.trainer( + Note: + This configuration uses significantly increased parallelism to handle the long sequence length efficiently. + """ + return llama3_8b.trainer( tensor_parallelism=2, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, @@ -60,13 +84,93 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 8B model with 64k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_8b_64k + $ nemo llm pretrain --factory "llama3_8b_64k(num_nodes=2, name='my_64k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_8b_64k_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for handling long sequences (64k) compared to the standard 8k version. + It requires significant computational resources due to the extended sequence length. 
+ """ + recipe = llama3_8b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 8B model with 64k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_8b_64k + $ nemo llm finetune --factory "llama3_8b_64k(num_nodes=2, name='my_64k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_8b_64k_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with long sequences (64k) compared to the standard 8k version. + It uses the SQuAD dataset adapted for 64k sequence length. Be aware that this configuration requires + substantial computational resources due to the extended sequence length. + """ + recipe = llama3_8b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/log/__init__.py b/nemo/collections/llm/recipes/log/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/collections/llm/recipes/log/__init__.py +++ b/nemo/collections/llm/recipes/log/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py index 94e595bdb811..b59d549726c6 100644 --- a/nemo/collections/llm/recipes/log/default.py +++ b/nemo/collections/llm/recipes/log/default.py @@ -1,9 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + from typing import Optional +from nemo_run import Config, cli from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger from nemo import lightning as nl -from nemo.collections.llm.utils import Config def tensorboard_logger(name: str, save_dir: str = "tb_logs") -> Config[TensorBoardLogger]: @@ -24,9 +39,10 @@ def wandb_logger(project: str, name: str, entity: Optional[str] = None) -> Confi return cfg +@cli.factory(is_target_default=True) def default_log( - ckpt_dir: str, - name: str, + dir: Optional[str] = None, + name: str = "default", tensorboard_logger: Optional[Config[TensorBoardLogger]] = None, wandb_logger: Optional[Config[WandbLogger]] = None, ) -> Config[nl.NeMoLogger]: @@ -44,13 +60,14 @@ def default_log( name=name, tensorboard=tensorboard_logger, wandb=wandb_logger, - dir=ckpt_dir, + dir=dir, ) -def default_resume() -> Config[nl.AutoResume]: +@cli.factory(is_target_default=True) +def default_resume(resume_if_exists=True, resume_ignore_no_checkpoint=True) -> Config[nl.AutoResume]: return Config( nl.AutoResume, - resume_if_exists=True, - resume_ignore_no_checkpoint=True, + resume_if_exists=resume_if_exists, + resume_ignore_no_checkpoint=resume_ignore_no_checkpoint, ) diff --git a/nemo/collections/llm/recipes/mistral.py b/nemo/collections/llm/recipes/mistral.py index c504340348fe..902e7623afd2 100644 --- a/nemo/collections/llm/recipes/mistral.py +++ b/nemo/collections/llm/recipes/mistral.py @@ -1,61 +1,242 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Optional + +import nemo_run as run import pytorch_lightning as pl +import torch +from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel -from nemo.collections.llm.peft.api import gpt_lora -from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.utils import Partial, factory +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.utils.exp_manager import TimingCallback NAME = "mistral" -@factory(name=NAME) -def model() -> pl.LightningModule: - return MistralModel(MistralConfig7B()) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mistral 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mistral 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mistral ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MistralModel, config=run.Config(MistralConfig7B)) + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 100, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mistral 7B model. -@factory(name=NAME) -def trainer(devices=8) -> nl.Trainer: - strategy = nl.MegatronStrategy(tensor_model_parallel_size=2) + This function sets up the distributed training strategy and other training parameters. - return nl.Trainer( - devices=devices, - max_steps=100, + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mistral ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_include_optimizer=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ) + + trainer = run.Config( + nl.Trainer, accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + gradient_clip_val=1.0, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), strategy=strategy, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + use_distributed_sampler=False, + val_check_interval=2000, ) + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mistral 7B model. -@factory(name=NAME + "_hf") -def hf_resume() -> nl.AutoResume: - return nl.AutoResume(restore_config=nl.RestoreConfig(path="hf://mistralai/Mistral-7B-v0.3")) + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. -@factory(name=NAME, for_task="llm.pretrain") -def pretrain_recipe() -> Partial: - return Partial( - pretrain, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=distributed_fused_adam_with_cosine_annealing(), + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mistral + $ nemo llm pretrain --factory "mistral(num_nodes=2, name='my_mistral_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mistral_pretrain", num_nodes=2) + >>> print(recipe) + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + tensor_parallelism=1, + pipeline_parallelism=1, + pipeline_parallelism_type=None, + virtual_pipeline_parallelism=None, + context_parallelism=2, + sequence_parallelism=False, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), ) -@factory(name=NAME, for_task="llm.finetune") -def finetune_recipe() -> Partial: - return Partial( - finetune, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=distributed_fused_adam_with_cosine_annealing(), - peft=gpt_lora, - resume=hf_resume, +@run.cli.factory(name=NAME + "_hf") +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure automatic resumption from a Hugging Face checkpoint for Mistral 7B model. 
+ + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/mistralai/Mistral-7B-v0.3 + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + + Note: + This is particularly useful for fine-tuning scenarios where you want to + start from the pre-trained Mistral 7B model. + """ + return run.Config( + nl.AutoResume, restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mistral-7B-v0.3") ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mistral 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mistral + $ nemo llm finetune --factory "mistral(num_nodes=2, name='my_mistral_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mistral_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) + recipe.resume = hf_resume() + recipe.peft = run.Config(LoRA) + recipe.data = run.Config(SquadDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1) + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 209a5926a008..2320c89dfd2c 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -1,7 +1,24 @@ -from typing import Callable, Optional +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -12,31 +29,76 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x22b" -def model() -> Config[pl.LightningModule]: - return Config(MixtralModel, config=Config(MixtralConfig8x22B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x22B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x22B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x22b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MixtralModel, config=run.Config(MixtralConfig8x22B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, - expert_parallelism: int, - num_nodes: int = 1, + tensor_parallelism: int = 8, + pipeline_parallelism: int = 8, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 7, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + expert_parallelism: int = 1, + num_nodes: int = 8, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x22B model. + + This function sets up the distributed training strategy optimized for the large Mixtral 8x22B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + expert_parallelism (int): Degree of expert parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x22b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=8, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. 
+ """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -48,9 +110,14 @@ def trainer( gradient_as_bucket_view=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -61,7 +128,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=run.Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -70,43 +137,107 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x22B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x22b + $ nemo llm pretrain --factory "mixtral_8x22b(num_nodes=2, name='my_mixtral_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_pretrain", num_nodes=2) + >>> print(recipe) + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=8, - pipeline_parallelism=8, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=7, - context_parallelism=1, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, callbacks=[run.Config(TimingCallback)] ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure automatic resumption from a Hugging Face checkpoint for Mixtral 8x22B model. + + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/mistralai/Mixtral-8x22B-v0.1 + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + + Note: + This is particularly useful for fine-tuning scenarios where you want to + start from the pre-trained Mixtral 8x22B model. 
+ """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x22B-v0.1"), + restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x22B-v0.1"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x22B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x22b + $ nemo llm finetune --factory "mixtral_8x22b(num_nodes=2, name='my_mixtral_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) recipe.resume = hf_resume() - recipe.peft = Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b.py b/nemo/collections/llm/recipes/mixtral_8x3b.py index 7dc8170e13e3..14318bea9e5a 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b.py @@ -1,5 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + from typing import Callable, Optional +import nemo_run as run import pytorch_lightning as pl import torch from pytorch_lightning.callbacks.callback import Callback @@ -12,31 +28,74 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin -from nemo.collections.llm.utils import Config, Partial +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x3b" -def model() -> Config[pl.LightningModule]: - return Config(MixtralModel, config=Config(MixtralConfig8x3B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x3B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x3b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MixtralModel, config=run.Config(MixtralConfig8x3B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, - expert_parallelism: int, + tensor_parallelism: int = 4, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + expert_parallelism: int = 1, num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x3B model. + + This function sets up the distributed training strategy optimized for the Mixtral 8x3B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + expert_parallelism (int): Degree of expert parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x3b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=8) + >>> print(trainer_config) + """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -50,7 +109,7 @@ def trainer( ckpt_parallel_load=True, ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -61,7 +120,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -70,43 +129,108 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x3B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): Function to use for pre-training (default: nemo.collections.llm.api.pretrain). + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x3b + $ nemo llm pretrain --factory "mixtral_8x3b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x3b_pretrain", num_nodes=2) + >>> print(recipe) + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=4, - pipeline_parallelism=1, - pipeline_parallelism_type=None, - virtual_pipeline_parallelism=None, - context_parallelism=1, - sequence_parallelism=True, - expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure the Hugging Face model resuming for Mixtral 8x3B model. + + This function sets up the configuration for resuming training from a Hugging Face model. + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from a Hugging Face model. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory "mixtral_8x3b(resume=hf_resume())" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x3b_finetune", num_nodes=2) + >>> recipe.resume = hf_resume() + >>> print(recipe) + """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), + restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x3B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x3b + $ nemo llm finetune --factory "mixtral_8x3b(num_nodes=2, name='my_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x3b_finetune", num_nodes=2) + >>> print(recipe) + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) + recipe.resume = hf_resume() - recipe.peft = Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py b/nemo/collections/llm/recipes/mixtral_8x3b_16k.py index dbf27f86415c..287ac331ee65 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b_16k.py @@ -1,61 +1,82 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x3b -from nemo.collections.llm.utils import Config, Partial -from nemo.utils.exp_manager import TimingCallback - NAME = "mixtral_8x3b_16k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = mixtral_8x3b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x3B model configuration with 16k sequence length. - model = mixtral_8x3b.model() - model.config.seq_length = 16384 - model.config.max_position_embeddings = 16384 + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model with 16k sequence length. - trainer = mixtral_8x3b.trainer( - tensor_parallelism=2, - pipeline_parallelism=2, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=2, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x3b_16k ... - data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = mixtral_8x3b.model() + model_config.config.seq_length = 16384 + model_config.config.max_position_embeddings = 16384 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x3B model with 16k sequence length. + This function sets up the distributed training strategy optimized for longer sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = mixtral_8x3b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x3b_16k ... - model = mixtral_8x3b.model() - model.config.seq_length = 16384 - model.config.max_position_embeddings = 16384 + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) - trainer = mixtral_8x3b.trainer( + Note: + This configuration uses increased parallelism to handle the longer sequence length efficiently. 
+ """ + return mixtral_8x3b.trainer( tensor_parallelism=2, pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, @@ -65,13 +86,91 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x3B model with 16k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x3b_16k + $ nemo llm pretrain --factory "mixtral_8x3b_16k(num_nodes=2, name='my_16k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x3b_16k_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for handling longer sequences (16k) compared to the standard version. + """ + recipe = mixtral_8x3b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x3B model with 16k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x3b_16k + $ nemo llm finetune --factory "mixtral_8x3b_16k(num_nodes=2, name='my_16k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x3b_16k_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with longer sequences (16k) compared to the standard version. + It uses the SQuAD dataset adapted for 16k sequence length. 
+ """ + recipe = mixtral_8x3b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py b/nemo/collections/llm/recipes/mixtral_8x3b_64k.py index b2a7724b35a9..98cf2f4f9e7b 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b_64k.py @@ -1,62 +1,84 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x3b -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x3b_64k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = mixtral_8x3b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x3B model configuration with 64k sequence length. - model = mixtral_8x3b.model() - model.config.seq_length = 65536 - model.config.max_position_embeddings = 65536 + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model with 64k sequence length. - trainer = mixtral_8x3b.trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=4, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x3b_64k ... - data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = mixtral_8x3b.model() + model_config.config.seq_length = 65536 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 8, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x3B model with 64k sequence length. + This function sets up the distributed training strategy optimized for long sequences. 
-def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = mixtral_8x3b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. - model = mixtral_8x3b.model() - model.config.seq_length = 65536 - model.config.max_position_embeddings = 65536 + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - trainer = mixtral_8x3b.trainer( - tensor_parallelism=2, - pipeline_parallelism=2, + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x3b_64k ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=8, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses significantly increased parallelism to handle the long sequence length efficiently. + """ + return mixtral_8x3b.trainer( + tensor_parallelism=4, + pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=8, context_parallelism=4, @@ -64,13 +86,93 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x3B model with 64k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x3b_64k + $ nemo llm pretrain --factory "mixtral_8x3b_64k(num_nodes=8, name='my_64k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x3b_64k_pretrain", num_nodes=8) + >>> print(recipe) + + Note: + This recipe is optimized for handling long sequences (64k) compared to the standard version. + It requires significant computational resources due to the extended sequence length. + """ + recipe = mixtral_8x3b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x3B model with 64k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 64k sequence length. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x3b_64k + $ nemo llm finetune --factory "mixtral_8x3b_64k(num_nodes=8, name='my_64k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x3b_64k_finetune", num_nodes=8) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with long sequences (64k) compared to the standard version. + It uses the SQuAD dataset adapted for 64k sequence length. Be aware that this configuration requires + substantial computational resources due to the extended sequence length. + """ + recipe = mixtral_8x3b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index bacbfcab4e2d..21c9ef572a68 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -1,7 +1,24 @@ -from typing import Callable, Optional +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -12,31 +29,73 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x7b" -def model() -> Config[pl.LightningModule]: - return Config(MixtralModel, config=Config(MixtralConfig8x7B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x7b ... 
+ + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MixtralModel, config=run.Config(MixtralConfig8x7B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, - expert_parallelism: int, - num_nodes: int = 1, + tensor_parallelism: int = 8, + pipeline_parallelism: int = 2, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + expert_parallelism: int = 1, + num_nodes: int = 2, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x7B model. + + This function sets up the distributed training strategy optimized for the Mixtral 8x7B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + expert_parallelism (int): Degree of expert parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -48,9 +107,14 @@ def trainer( gradient_as_bucket_view=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -61,7 +125,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=run.Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -70,43 +134,107 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 2, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x7b + $ nemo llm pretrain --factory "mixtral_8x7b(num_nodes=2, name='my_mixtral_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x7b_pretrain", num_nodes=2) + >>> print(recipe) + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=8, - pipeline_parallelism=2, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=None, - context_parallelism=1, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, callbacks=[run.Config(TimingCallback)] ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure automatic resumption from a Hugging Face checkpoint for Mixtral 8x7B model. + + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/mistralai/Mixtral-8x7B-v0.1 + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + + Note: + This is particularly useful for fine-tuning scenarios where you want to + start from the pre-trained Mixtral 8x7B model. + """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), + restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x7b + $ nemo llm finetune --factory "mixtral_8x7b(num_nodes=2, name='my_mixtral_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) recipe.resume = hf_resume() - recipe.peft = Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py index 0542f22836d6..4b5fd07a69e9 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py @@ -1,76 +1,174 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x7b -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x7b_16k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = mixtral_8x7b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x7B model configuration with 16k sequence length. - model = mixtral_8x7b.model() - model.config.seq_length = 16384 - model.config.max_position_embeddings = 16384 + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x7B model with 16k sequence length. - trainer = mixtral_8x7b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=4, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x7b_16k ... 
- data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = mixtral_8x7b.model() + model_config.config.seq_length = 16384 + model_config.config.max_position_embeddings = 16384 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x7B model with 16k sequence length. + This function sets up the distributed training strategy optimized for longer sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = mixtral_8x7b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - model = mixtral_8x7b.model() - model.config.seq_length = 16384 - model.config.max_position_embeddings = 16384 + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x7b_16k ... - trainer = mixtral_8x7b.trainer( + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses increased parallelism to handle the longer sequence length efficiently. + """ + return mixtral_8x7b.trainer( tensor_parallelism=2, - pipeline_parallelism=2, + pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=8, - context_parallelism=1, + context_parallelism=4, sequence_parallelism=True, - expert_parallelism=8, + expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x7B model with 16k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. 
+ + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x7b_16k + $ nemo llm pretrain --factory "mixtral_8x7b_16k(num_nodes=2, name='my_16k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x7b_16k_pretrain", num_nodes=2) + >>> print(recipe) + """ + recipe = mixtral_8x7b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x7B model with 16k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x7b_16k + $ nemo llm finetune --factory "mixtral_8x7b_16k(num_nodes=2, name='my_16k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x7b_16k_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. + """ + recipe = mixtral_8x7b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py index 4fb8de98063e..6a1f76961325 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py @@ -1,76 +1,180 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x7b -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x7b_64k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = mixtral_8x7b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x7B model configuration with 64k sequence length. - model = mixtral_8x7b.model() - model.config.seq_length = 65536 - model.config.max_position_embeddings = 65536 + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x7B model with 64k sequence length. - trainer = mixtral_8x7b.trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=4, - context_parallelism=8, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x7b_64k ... - data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = mixtral_8x7b.model() + model_config.config.seq_length = 65536 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 16, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x7B model with 64k sequence length. + This function sets up the distributed training strategy optimized for very long sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = mixtral_8x7b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - model = mixtral_8x7b.model() - model.config.seq_length = 65536 - model.config.max_position_embeddings = 65536 + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x7b_64k ... - trainer = mixtral_8x7b.trainer( - tensor_parallelism=2, + Python API usage: + >>> trainer_config = trainer(num_nodes=16, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses significantly increased parallelism to handle the long sequence length efficiently. + It requires a substantial amount of computational resources. 
+ """ + return mixtral_8x7b.trainer( + tensor_parallelism=4, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=4, + virtual_pipeline_parallelism=4, + context_parallelism=8, sequence_parallelism=True, expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x7B model with 64k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x7b_64k + $ nemo llm pretrain --factory "mixtral_8x7b_64k(num_nodes=16, name='my_64k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x7b_64k_pretrain", num_nodes=16) + >>> print(recipe) + + Note: + This recipe is optimized for handling long sequences (64k) compared to the standard version. + It requires extensive computational resources due to the model size and extended sequence length. + """ + recipe = mixtral_8x7b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x7B model with 64k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x7b_64k + $ nemo llm finetune --factory "mixtral_8x7b_64k(num_nodes=16, name='my_64k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x7b_64k_finetune", num_nodes=16) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with long sequences (64k) compared to the standard version. + It uses the SQuAD dataset adapted for 64k sequence length. Be aware that this configuration requires + substantial computational resources due to the model size and extended sequence length. 
+ """ + recipe = mixtral_8x7b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/optim/__init__.py b/nemo/collections/llm/recipes/optim/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/collections/llm/recipes/optim/__init__.py +++ b/nemo/collections/llm/recipes/optim/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py index d38bbc09d8e6..77472d8a3755 100644 --- a/nemo/collections/llm/recipes/optim/adam.py +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -1,11 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import nemo_run as run from megatron.core.optimizer import OptimizerConfig -from nemo.collections.llm.utils import Config from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule, OptimizerModule -def distributed_fused_adam_with_cosine_annealing(max_lr: float = 1e-4) -> Config[OptimizerModule]: - opt_cfg = Config( +@run.cli.factory +def distributed_fused_adam_with_cosine_annealing(max_lr: float = 1e-4) -> run.Config[OptimizerModule]: + opt_cfg = run.Config( OptimizerConfig, optimizer="adam", lr=max_lr, @@ -20,14 +36,14 @@ def distributed_fused_adam_with_cosine_annealing(max_lr: float = 1e-4) -> Config clip_grad=1.0, ) - sched = Config( + sched = run.Config( CosineAnnealingScheduler, warmup_steps=2000, constant_steps=0, min_lr=0.1 * max_lr, ) - return Config( + return run.Config( MegatronOptimizerModule, config=opt_cfg, lr_scheduler=sched, diff --git a/nemo/collections/llm/recipes/precision/__init__.py b/nemo/collections/llm/recipes/precision/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/collections/llm/recipes/precision/__init__.py +++ b/nemo/collections/llm/recipes/precision/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/recipes/precision/mixed_precision.py b/nemo/collections/llm/recipes/precision/mixed_precision.py index 6a9cb64404ce..3c0332a0b330 100644 --- a/nemo/collections/llm/recipes/precision/mixed_precision.py +++ b/nemo/collections/llm/recipes/precision/mixed_precision.py @@ -1,11 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import nemo_run as run import torch -from nemo.collections.llm.utils import Config from nemo.lightning.pytorch.plugins.mixed_precision import MegatronMixedPrecision -def bf16_mixed_plugin() -> Config[MegatronMixedPrecision]: - return Config( +@run.cli.factory +def bf16_mixed() -> run.Config[MegatronMixedPrecision]: + return run.Config( MegatronMixedPrecision, precision="bf16-mixed", params_dtype=torch.bfloat16, @@ -15,8 +31,9 @@ def bf16_mixed_plugin() -> Config[MegatronMixedPrecision]: ) -def fp16_mixed_plugin() -> Config[MegatronMixedPrecision]: - return Config( +@run.cli.factory +def fp16_mixed() -> run.Config[MegatronMixedPrecision]: + return run.Config( MegatronMixedPrecision, precision="16-mixed", params_dtype=torch.half, diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 3169d31dbeed..0bd6208f11c7 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,4 +1,3 @@ -fiddle huggingface_hub>=0.24 numba numpy>=1.22 diff --git a/examples/llm/run/llama3_pretraining.py b/scripts/llm/llama3_pretraining.py similarity index 100% rename from examples/llm/run/llama3_pretraining.py rename to scripts/llm/llama3_pretraining.py diff --git a/setup.py b/setup.py index 7787c0ba9603..f3aec7b2f465 100644 --- a/setup.py +++ b/setup.py @@ -273,7 +273,7 @@ def finalize_options(self): # Custom commands. 
cmdclass={'style': StyleCommand}, entry_points={ - "run.factories": [ + "nemo_run.cli": [ "llm = nemo.collections.llm", ], }, diff --git a/tests/collections/llm/recipes/__init__.py b/tests/collections/llm/recipes/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/collections/llm/recipes/test_llama3_70b.py b/tests/collections/llm/recipes/test_llama3_70b.py new file mode 100644 index 000000000000..4271dd4ef47c --- /dev/null +++ b/tests/collections/llm/recipes/test_llama3_70b.py @@ -0,0 +1,113 @@ +import nemo_run as run +import pytest +import torch + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.llama import Llama3Config70B, LlamaModel +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes import llama3_70b +from nemo.lightning import AutoResume, Trainer +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + + +class TestLlama3_70B: + @pytest.fixture(scope="class") + def recipe_module(self): + return llama3_70b + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == LlamaModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == Llama3Config70B + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 1 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.context_parallel_size == 2 + assert trainer_config.strategy.sequence_parallel is True + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == LlamaModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 8192 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == LlamaModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 8192 + assert 
recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + assert isinstance(recipe.peft, run.Config) + assert recipe.peft.__fn_or_cls__ == LoRA + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(1, 8), (2, 4), (4, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_pretrain_recipe_performance(self, recipe_module): + recipe = recipe_module.pretrain_recipe_performance( + name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8 + ) + assert any( + isinstance(cb, run.Config) and cb.__fn_or_cls__ == MegatronCommOverlapCallback + for cb in recipe.trainer.callbacks + ) + + def test_hf_resume(self, recipe_module): + resume_config = recipe_module.hf_resume() + assert isinstance(resume_config, run.Config) + assert resume_config.__fn_or_cls__ == AutoResume + assert isinstance(resume_config.restore_config, run.Config) + assert resume_config.restore_config.path == "hf://meta-llama/Meta-Llama-3-70B" + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer( + tensor_parallelism=8, pipeline_parallelism=2, context_parallelism=4, sequence_parallelism=False + ) + assert trainer_config.strategy.tensor_model_parallel_size == 8 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 + assert trainer_config.strategy.context_parallel_size == 4 + assert trainer_config.strategy.sequence_parallel is False + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + llama_config = model_config.config + assert llama_config.num_layers == 80 + assert llama_config.hidden_size == 8192 + assert llama_config.num_attention_heads == 64 + assert llama_config.seq_length == 8192 diff --git a/tests/collections/llm/recipes/test_llama3_70b_16k.py b/tests/collections/llm/recipes/test_llama3_70b_16k.py new file mode 100644 index 000000000000..0aa482b0c905 --- /dev/null +++ b/tests/collections/llm/recipes/test_llama3_70b_16k.py @@ -0,0 +1,93 @@ +import nemo_run as run +import pytest +import torch + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.llama import Llama3Config70B, LlamaModel +from nemo.collections.llm.recipes import llama3_70b_16k +from nemo.lightning import Trainer + + +class TestLlama3_70B_16k: + @pytest.fixture(scope="class") + def recipe_module(self): + return llama3_70b_16k + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == LlamaModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == Llama3Config70B + assert model_config.config.seq_length == 16384 + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 2 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert 
trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.context_parallel_size == 2 + assert trainer_config.strategy.sequence_parallel is True + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == LlamaModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 16384 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == LlamaModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 16384 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(2, 8), (4, 4), (8, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer() + assert trainer_config.strategy.tensor_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.context_parallel_size == 2 + assert trainer_config.strategy.sequence_parallel is True + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + llama_config = model_config.config + assert llama_config.num_layers == 80 + assert llama_config.hidden_size == 8192 + assert llama_config.num_attention_heads == 64 + assert llama_config.seq_length == 16384 diff --git a/tests/collections/llm/recipes/test_llama3_70b_64k.py b/tests/collections/llm/recipes/test_llama3_70b_64k.py new file mode 100644 index 000000000000..f344e4541350 --- /dev/null +++ b/tests/collections/llm/recipes/test_llama3_70b_64k.py @@ -0,0 +1,99 @@ +import nemo_run as run +import pytest +import torch + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.llama import Llama3Config70B, LlamaModel +from nemo.collections.llm.recipes import llama3_70b_64k +from nemo.lightning import Trainer +from 
nemo.utils.exp_manager import TimingCallback + + +class TestLlama3_70B_64k: + @pytest.fixture(scope="class") + def recipe_module(self): + return llama3_70b_64k + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == LlamaModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == Llama3Config70B + assert model_config.config.seq_length == 65536 + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 32 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 8 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.context_parallel_size == 8 + assert trainer_config.strategy.sequence_parallel is True + + # Check for TimingCallback + assert any( + isinstance(cb, run.Config) and cb.__fn_or_cls__ == TimingCallback for cb in trainer_config.callbacks + ) + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == LlamaModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 65536 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == LlamaModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 65536 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(32, 8), (64, 4), (128, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer() + assert trainer_config.strategy.tensor_model_parallel_size == 8 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.context_parallel_size == 8 + 
assert trainer_config.strategy.sequence_parallel is True + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + llama_config = model_config.config + assert llama_config.num_layers == 80 + assert llama_config.hidden_size == 8192 + assert llama_config.num_attention_heads == 64 + assert llama_config.seq_length == 65536 diff --git a/tests/collections/llm/recipes/test_llama3_8b.py b/tests/collections/llm/recipes/test_llama3_8b.py new file mode 100644 index 000000000000..2ad22aedf863 --- /dev/null +++ b/tests/collections/llm/recipes/test_llama3_8b.py @@ -0,0 +1,120 @@ +import nemo_run as run +import pytest + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes import llama3_8b +from nemo.lightning import AutoResume, Trainer + + +class TestLlama3_8B: + @pytest.fixture(scope="class") + def recipe_module(self): + return llama3_8b + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == LlamaModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == Llama3Config8B + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 1 + assert trainer_config.max_steps == 1168251 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 1 + assert trainer_config.strategy.pipeline_model_parallel_size == 1 + assert trainer_config.strategy.pipeline_dtype is None + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None + assert trainer_config.strategy.context_parallel_size == 2 + assert trainer_config.strategy.sequence_parallel is False + assert trainer_config.strategy.gradient_as_bucket_view is True + assert trainer_config.strategy.ckpt_async_save is True + assert trainer_config.strategy.ckpt_parallel_load is True + + # Check other trainer configurations + assert trainer_config.accumulate_grad_batches == 1 + assert trainer_config.limit_test_batches == 50 + assert trainer_config.limit_val_batches == 32 + assert trainer_config.log_every_n_steps == 10 + assert trainer_config.use_distributed_sampler is False + assert trainer_config.val_check_interval == 2000 + + # Check plugins + assert isinstance(trainer_config.plugins, run.Config) + assert trainer_config.plugins.__fn_or_cls__.__name__ == "MegatronMixedPrecision" + + def test_hf_resume(self, recipe_module): + resume_config = recipe_module.hf_resume() + assert isinstance(resume_config, run.Config) + assert resume_config.__fn_or_cls__ == AutoResume + assert isinstance(resume_config.restore_config, run.Config) + assert resume_config.restore_config.path == "hf://meta-llama/Meta-Llama-3-8B" + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == 
pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == LlamaModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 8192 + assert recipe.data.global_batch_size == 512 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == LlamaModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 8192 + assert recipe.data.global_batch_size == 512 + assert isinstance(recipe.peft, run.Config) + assert recipe.peft.__fn_or_cls__ == LoRA + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(1, 8), (2, 4), (4, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_pretrain_recipe_performance(self, recipe_module): + recipe = recipe_module.pretrain_recipe_performance( + name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8 + ) + assert any(cb.__fn_or_cls__.__name__ == "MegatronCommOverlapCallback" for cb in recipe.trainer.callbacks) + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer( + tensor_parallelism=2, pipeline_parallelism=2, context_parallelism=4, sequence_parallelism=True + ) + assert trainer_config.strategy.tensor_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 + assert trainer_config.strategy.context_parallel_size == 4 + assert trainer_config.strategy.sequence_parallel is True + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + llama_config = model_config.config + assert llama_config.num_layers == 32 + assert llama_config.hidden_size == 4096 + assert llama_config.num_attention_heads == 32 diff --git a/tests/collections/llm/recipes/test_llama3_8b_16k.py b/tests/collections/llm/recipes/test_llama3_8b_16k.py new file mode 100644 index 000000000000..6362ef1e4728 --- /dev/null +++ b/tests/collections/llm/recipes/test_llama3_8b_16k.py @@ -0,0 +1,93 @@ +import nemo_run as run +import pytest +import torch + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel +from nemo.collections.llm.recipes import llama3_8b_16k +from nemo.lightning import Trainer + + +class TestLlama3_8B_16k: + @pytest.fixture(scope="class") + def recipe_module(self): + return llama3_8b_16k + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == LlamaModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == Llama3Config8B + assert model_config.config.seq_length == 
16384 + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 1 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.context_parallel_size == 2 + assert trainer_config.strategy.sequence_parallel is True + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == LlamaModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 16384 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == LlamaModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 16384 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(1, 8), (2, 4), (4, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer() + assert trainer_config.strategy.tensor_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.context_parallel_size == 2 + assert trainer_config.strategy.sequence_parallel is True + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + llama_config = model_config.config + assert llama_config.num_layers == 32 + assert llama_config.hidden_size == 4096 + assert llama_config.num_attention_heads == 32 + assert llama_config.seq_length == 16384 diff --git a/tests/collections/llm/recipes/test_llama3_8b_64k.py b/tests/collections/llm/recipes/test_llama3_8b_64k.py new file mode 100644 index 000000000000..51a876822314 --- /dev/null +++ b/tests/collections/llm/recipes/test_llama3_8b_64k.py @@ -0,0 +1,93 @@ +import nemo_run as run 
+import pytest +import torch + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel +from nemo.collections.llm.recipes import llama3_8b_64k +from nemo.lightning import Trainer + + +class TestLlama3_8B_64k: + @pytest.fixture(scope="class") + def recipe_module(self): + return llama3_8b_64k + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == LlamaModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == Llama3Config8B + assert model_config.config.seq_length == 65536 + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 1 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.context_parallel_size == 4 + assert trainer_config.strategy.sequence_parallel is True + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == LlamaModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 65536 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == LlamaModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 65536 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(1, 8), (2, 4), (4, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer() + assert trainer_config.strategy.tensor_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert 
trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.context_parallel_size == 4 + assert trainer_config.strategy.sequence_parallel is True + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + llama_config = model_config.config + assert llama_config.num_layers == 32 + assert llama_config.hidden_size == 4096 + assert llama_config.num_attention_heads == 32 + assert llama_config.seq_length == 65536 diff --git a/tests/collections/llm/recipes/test_mistral.py b/tests/collections/llm/recipes/test_mistral.py new file mode 100644 index 000000000000..fb64c8fe17cc --- /dev/null +++ b/tests/collections/llm/recipes/test_mistral.py @@ -0,0 +1,101 @@ +import nemo_run as run +import pytest + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes import mistral +from nemo.lightning import AutoResume, Trainer + + +class TestMistral: + @pytest.fixture(scope="class") + def recipe_module(self): + return mistral + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == MistralModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == MistralConfig7B + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 1 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 1 + assert trainer_config.strategy.pipeline_model_parallel_size == 1 + assert trainer_config.strategy.pipeline_dtype is None + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None + assert trainer_config.strategy.context_parallel_size == 2 + assert trainer_config.strategy.sequence_parallel is False + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MistralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 4096 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MistralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert 
recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 4096 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + assert isinstance(recipe.peft, run.Config) + assert recipe.peft.__fn_or_cls__ == LoRA + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(1, 8), (2, 4), (4, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_hf_resume(self, recipe_module): + resume_config = recipe_module.hf_resume() + assert isinstance(resume_config, run.Config) + assert resume_config.__fn_or_cls__ == AutoResume + assert isinstance(resume_config.restore_config, run.Config) + assert resume_config.restore_config.path == "hf://mistralai/Mistral-7B-v0.3" + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer( + tensor_parallelism=2, pipeline_parallelism=2, context_parallelism=4, sequence_parallelism=True + ) + assert trainer_config.strategy.tensor_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 + assert trainer_config.strategy.context_parallel_size == 4 + assert trainer_config.strategy.sequence_parallel is True + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + mistral_config = model_config.config + assert mistral_config.num_layers == 32 + assert mistral_config.hidden_size == 4096 + assert mistral_config.num_attention_heads == 32 + assert mistral_config.seq_length == 32768 diff --git a/tests/collections/llm/recipes/test_mixtral_8x22b.py b/tests/collections/llm/recipes/test_mixtral_8x22b.py new file mode 100644 index 000000000000..f2891408c6d6 --- /dev/null +++ b/tests/collections/llm/recipes/test_mixtral_8x22b.py @@ -0,0 +1,118 @@ +import nemo_run as run +import pytest +import torch +from megatron.core.distributed import DistributedDataParallelConfig + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x22B, MixtralModel +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes import mixtral_8x22b +from nemo.lightning import AutoResume, Trainer + + +class TestMixtral8x22B: + @pytest.fixture(scope="class") + def recipe_module(self): + return mixtral_8x22b + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == MixtralModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == MixtralConfig8x22B + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 8 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 8 + assert 
trainer_config.strategy.pipeline_model_parallel_size == 8 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 7 + assert trainer_config.strategy.context_parallel_size == 1 + assert trainer_config.strategy.sequence_parallel is True + assert trainer_config.strategy.expert_model_parallel_size == 1 + + # Check DDP configuration + assert isinstance(trainer_config.strategy.ddp, run.Config) + assert trainer_config.strategy.ddp.__fn_or_cls__ == DistributedDataParallelConfig + assert trainer_config.strategy.ddp.check_for_nan_in_grad is True + assert trainer_config.strategy.ddp.grad_reduce_in_fp32 is True + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 8192 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 8192 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + assert isinstance(recipe.peft, run.Config) + assert recipe.peft.__fn_or_cls__ == LoRA + assert recipe.peft.target_modules == ['linear_qkv', 'linear_proj'] + assert recipe.peft.dim == 32 + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(8, 8), (16, 4), (32, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_hf_resume(self, recipe_module): + resume_config = recipe_module.hf_resume() + assert isinstance(resume_config, run.Config) + assert resume_config.__fn_or_cls__ == AutoResume + assert isinstance(resume_config.restore_config, run.Config) + assert resume_config.restore_config.path == "hf://mistralai/Mixtral-8x22B-v0.1" + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer( + tensor_parallelism=4, + pipeline_parallelism=4, + context_parallelism=2, + sequence_parallelism=False, + expert_parallelism=2, + ) + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.context_parallel_size == 2 + assert trainer_config.strategy.sequence_parallel is False + assert trainer_config.strategy.expert_model_parallel_size == 2 + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + mixtral_config = model_config.config + assert mixtral_config.num_layers == 56 + assert 
mixtral_config.hidden_size == 6144 + assert mixtral_config.num_attention_heads == 48 + assert mixtral_config.seq_length == 4096 + assert mixtral_config.num_moe_experts == 8 diff --git a/tests/collections/llm/recipes/test_mixtral_8x3b.py b/tests/collections/llm/recipes/test_mixtral_8x3b.py new file mode 100644 index 000000000000..949246c54c2a --- /dev/null +++ b/tests/collections/llm/recipes/test_mixtral_8x3b.py @@ -0,0 +1,110 @@ +import nemo_run as run +import pytest + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x3B, MixtralModel +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes import mixtral_8x3b +from nemo.lightning import AutoResume, Trainer + + +class TestMixtral8x3B: + @pytest.fixture(scope="class") + def recipe_module(self): + return mixtral_8x3b + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == MixtralModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == MixtralConfig8x3B + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 1 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 1 + assert trainer_config.strategy.pipeline_dtype is None + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None + assert trainer_config.strategy.context_parallel_size == 1 + assert trainer_config.strategy.sequence_parallel is True + assert trainer_config.strategy.expert_model_parallel_size == 1 + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 8192 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 8192 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + assert isinstance(recipe.peft, run.Config) + assert recipe.peft.__fn_or_cls__ == LoRA + assert recipe.peft.target_modules == 
['linear_qkv', 'linear_proj'] + assert recipe.peft.dim == 32 + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(1, 8), (2, 4), (4, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_hf_resume(self, recipe_module): + resume_config = recipe_module.hf_resume() + assert isinstance(resume_config, run.Config) + assert resume_config.__fn_or_cls__ == AutoResume + assert isinstance(resume_config.restore_config, run.Config) + assert resume_config.restore_config.path == "hf://mistralai/Mixtral-8x7B-v0.1" + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer( + tensor_parallelism=8, + pipeline_parallelism=2, + context_parallelism=4, + sequence_parallelism=False, + expert_parallelism=2, + ) + assert trainer_config.strategy.tensor_model_parallel_size == 8 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 + assert trainer_config.strategy.context_parallel_size == 4 + assert trainer_config.strategy.sequence_parallel is False + assert trainer_config.strategy.expert_model_parallel_size == 2 + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + mixtral_config = model_config.config + assert mixtral_config.num_layers == 32 + assert mixtral_config.hidden_size == 2560 + assert mixtral_config.num_attention_heads == 32 + assert mixtral_config.seq_length == 4096 + assert mixtral_config.num_moe_experts == 8 diff --git a/tests/collections/llm/recipes/test_mixtral_8x3b_16k.py b/tests/collections/llm/recipes/test_mixtral_8x3b_16k.py new file mode 100644 index 000000000000..0e75e132f70b --- /dev/null +++ b/tests/collections/llm/recipes/test_mixtral_8x3b_16k.py @@ -0,0 +1,98 @@ +import nemo_run as run +import pytest +import torch + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x3B, MixtralModel +from nemo.collections.llm.recipes import mixtral_8x3b_16k +from nemo.lightning import Trainer + + +class TestMixtral8x3B_16k: + @pytest.fixture(scope="class") + def recipe_module(self): + return mixtral_8x3b_16k + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == MixtralModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == MixtralConfig8x3B + assert model_config.config.seq_length == 16384 + assert model_config.config.max_position_embeddings == 16384 + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 1 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 + assert 
trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 + assert trainer_config.strategy.context_parallel_size == 2 + assert trainer_config.strategy.sequence_parallel is True + assert trainer_config.strategy.expert_model_parallel_size == 1 + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 16384 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 16384 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(1, 8), (2, 4), (4, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer() + assert trainer_config.strategy.tensor_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 + assert trainer_config.strategy.context_parallel_size == 2 + assert trainer_config.strategy.sequence_parallel is True + assert trainer_config.strategy.expert_model_parallel_size == 1 + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + mixtral_config = model_config.config + assert mixtral_config.num_layers == 32 + assert mixtral_config.hidden_size == 2560 + assert mixtral_config.num_attention_heads == 32 + assert mixtral_config.seq_length == 16384 + assert mixtral_config.max_position_embeddings == 16384 + assert mixtral_config.num_moe_experts == 8 diff --git a/tests/collections/llm/recipes/test_mixtral_8x3b_64k.py b/tests/collections/llm/recipes/test_mixtral_8x3b_64k.py new file mode 100644 index 000000000000..1627d55358d9 --- /dev/null +++ b/tests/collections/llm/recipes/test_mixtral_8x3b_64k.py @@ -0,0 +1,98 @@ +import nemo_run as run +import pytest +import torch + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x3B, MixtralModel +from nemo.collections.llm.recipes import mixtral_8x3b_64k +from 
nemo.lightning import Trainer + + +class TestMixtral8x3B_64k: + @pytest.fixture(scope="class") + def recipe_module(self): + return mixtral_8x3b_64k + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == MixtralModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == MixtralConfig8x3B + assert model_config.config.seq_length == 65536 + assert model_config.config.max_position_embeddings == 4096 + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 8 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 + assert trainer_config.strategy.context_parallel_size == 4 + assert trainer_config.strategy.sequence_parallel is True + assert trainer_config.strategy.expert_model_parallel_size == 1 + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 65536 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 65536 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(32, 8), (64, 4), (128, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer() + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 + assert trainer_config.strategy.context_parallel_size == 4 + assert 
trainer_config.strategy.sequence_parallel is True + assert trainer_config.strategy.expert_model_parallel_size == 1 + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + mixtral_config = model_config.config + assert mixtral_config.num_layers == 32 + assert mixtral_config.hidden_size == 2560 + assert mixtral_config.num_attention_heads == 32 + assert mixtral_config.seq_length == 65536 + assert mixtral_config.max_position_embeddings == 4096 + assert mixtral_config.num_moe_experts == 8 diff --git a/tests/collections/llm/recipes/test_mixtral_8x7b.py b/tests/collections/llm/recipes/test_mixtral_8x7b.py new file mode 100644 index 000000000000..ff8e2ee0724e --- /dev/null +++ b/tests/collections/llm/recipes/test_mixtral_8x7b.py @@ -0,0 +1,112 @@ +import nemo_run as run +import pytest +import torch +from megatron.core.distributed import DistributedDataParallelConfig + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x7B, MixtralModel +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes import mixtral_8x7b +from nemo.lightning import AutoResume, Trainer + + +class TestMixtral8x7B: + @pytest.fixture(scope="class") + def recipe_module(self): + return mixtral_8x7b + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == MixtralModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == MixtralConfig8x7B + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 2 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 8 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None + assert trainer_config.strategy.context_parallel_size == 1 + assert trainer_config.strategy.sequence_parallel is True + assert trainer_config.strategy.expert_model_parallel_size == 1 + + # Check DDP configuration + assert isinstance(trainer_config.strategy.ddp, run.Config) + assert trainer_config.strategy.ddp.__fn_or_cls__ == DistributedDataParallelConfig + assert trainer_config.strategy.ddp.check_for_nan_in_grad is True + assert trainer_config.strategy.ddp.grad_reduce_in_fp32 is True + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 8192 + assert recipe.data.global_batch_size == 512 + assert 
recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 8192 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + assert isinstance(recipe.peft, run.Config) + assert recipe.peft.__fn_or_cls__ == LoRA + assert recipe.peft.target_modules == ['linear_qkv', 'linear_proj'] + assert recipe.peft.dim == 32 + + def test_hf_resume(self, recipe_module): + resume_config = recipe_module.hf_resume() + assert isinstance(resume_config, run.Config) + assert resume_config.__fn_or_cls__ == AutoResume + assert isinstance(resume_config.restore_config, run.Config) + assert resume_config.restore_config.path == "hf://mistralai/Mixtral-8x7B-v0.1" + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer( + tensor_parallelism=4, + pipeline_parallelism=4, + context_parallelism=2, + sequence_parallelism=False, + expert_parallelism=2, + ) + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.context_parallel_size == 2 + assert trainer_config.strategy.sequence_parallel is False + assert trainer_config.strategy.expert_model_parallel_size == 2 + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + mixtral_config = model_config.config + assert mixtral_config.num_layers == 32 + assert mixtral_config.hidden_size == 4096 + assert mixtral_config.num_attention_heads == 32 + assert mixtral_config.seq_length == 4096 + assert mixtral_config.num_moe_experts == 8 diff --git a/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py b/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py new file mode 100644 index 000000000000..a7c68362c057 --- /dev/null +++ b/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py @@ -0,0 +1,104 @@ +import nemo_run as run +import pytest +import torch + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x7B, MixtralModel +from nemo.collections.llm.recipes import mixtral_8x7b_16k +from nemo.lightning import Trainer +from nemo.utils.exp_manager import TimingCallback + + +class TestMixtral8x7B_16k: + @pytest.fixture(scope="class") + def recipe_module(self): + return mixtral_8x7b_16k + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == MixtralModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == MixtralConfig8x7B + assert model_config.config.seq_length == 16384 + assert model_config.config.max_position_embeddings == 16384 + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == 
"gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 2 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 + assert trainer_config.strategy.context_parallel_size == 4 + assert trainer_config.strategy.sequence_parallel is True + assert trainer_config.strategy.expert_model_parallel_size == 1 + + # Check for TimingCallback + assert any( + isinstance(cb, run.Config) and cb.__fn_or_cls__ == TimingCallback for cb in trainer_config.callbacks + ) + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 16384 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 16384 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(2, 8), (4, 4), (8, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer() + assert trainer_config.strategy.tensor_model_parallel_size == 2 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 + assert trainer_config.strategy.context_parallel_size == 4 + assert trainer_config.strategy.sequence_parallel is True + assert trainer_config.strategy.expert_model_parallel_size == 1 + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + mixtral_config = model_config.config + assert mixtral_config.num_layers == 32 + assert mixtral_config.hidden_size == 4096 + assert mixtral_config.num_attention_heads == 32 + assert mixtral_config.seq_length == 16384 + assert mixtral_config.max_position_embeddings == 16384 + assert mixtral_config.num_moe_experts == 8 diff --git a/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py 
b/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py new file mode 100644 index 000000000000..d7220d072634 --- /dev/null +++ b/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py @@ -0,0 +1,98 @@ +import nemo_run as run +import pytest +import torch + +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x7B, MixtralModel +from nemo.collections.llm.recipes import mixtral_8x7b_64k +from nemo.lightning import Trainer + + +class TestMixtral8x7B_64k: + @pytest.fixture(scope="class") + def recipe_module(self): + return mixtral_8x7b_64k + + def test_model(self, recipe_module): + model_config = recipe_module.model() + assert isinstance(model_config, run.Config) + assert model_config.__fn_or_cls__ == MixtralModel + assert isinstance(model_config.config, run.Config) + assert model_config.config.__fn_or_cls__ == MixtralConfig8x7B + assert model_config.config.seq_length == 65536 + assert model_config.config.max_position_embeddings == 4096 + + def test_trainer(self, recipe_module): + trainer_config = recipe_module.trainer() + assert isinstance(trainer_config, run.Config) + assert trainer_config.__fn_or_cls__ == Trainer + assert trainer_config.accelerator == "gpu" + assert trainer_config.devices == 8 + assert trainer_config.num_nodes == 16 + + # Check strategy configuration + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 4 + assert trainer_config.strategy.context_parallel_size == 8 + assert trainer_config.strategy.sequence_parallel is True + assert trainer_config.strategy.expert_model_parallel_size == 1 + + def test_pretrain_recipe(self, recipe_module): + recipe = recipe_module.pretrain_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == pretrain + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == MockDataModule + assert recipe.data.seq_length == 65536 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + def test_finetune_recipe(self, recipe_module): + recipe = recipe_module.finetune_recipe() + assert isinstance(recipe, run.Partial) + assert recipe.__fn_or_cls__ == finetune + assert isinstance(recipe.model, run.Config) + assert recipe.model.__fn_or_cls__ == MixtralModel + assert isinstance(recipe.trainer, run.Config) + assert recipe.trainer.__fn_or_cls__ == Trainer + assert isinstance(recipe.data, run.Config) + assert recipe.data.__fn_or_cls__ == SquadDataModule + assert recipe.data.seq_length == 65536 + assert recipe.data.global_batch_size == 512 + assert recipe.data.micro_batch_size == 1 + + @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(16, 8), (32, 4), (64, 2)]) + def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): + recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, 
num_gpus_per_node=num_gpus_per_node) + assert recipe.trainer.num_nodes == num_nodes + assert recipe.trainer.devices == num_gpus_per_node + + def test_trainer_parallelism_options(self, recipe_module): + trainer_config = recipe_module.trainer() + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 4 + assert trainer_config.strategy.context_parallel_size == 8 + assert trainer_config.strategy.sequence_parallel is True + assert trainer_config.strategy.expert_model_parallel_size == 1 + + def test_model_config_parameters(self, recipe_module): + model_config = recipe_module.model() + mixtral_config = model_config.config + assert mixtral_config.num_layers == 32 + assert mixtral_config.hidden_size == 4096 + assert mixtral_config.num_attention_heads == 32 + assert mixtral_config.seq_length == 65536 + assert mixtral_config.max_position_embeddings == 4096 + assert mixtral_config.num_moe_experts == 8 diff --git a/tests/lightning/fabric/__init__.py b/tests/lightning/_fabric/__init__.py similarity index 80% rename from tests/lightning/fabric/__init__.py rename to tests/lightning/_fabric/__init__.py index d9155f923f18..d2c20dd967b2 100644 --- a/tests/lightning/fabric/__init__.py +++ b/tests/lightning/_fabric/__init__.py @@ -11,3 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + +# Directory name is "_fabric" instead of "fabric" to avoid +# a name collision with the "fabric" package when running scripts +# in the parent directory.
diff --git a/tests/lightning/fabric/test_conversion.py b/tests/lightning/_fabric/test_conversion.py similarity index 100% rename from tests/lightning/fabric/test_conversion.py rename to tests/lightning/_fabric/test_conversion.py From fb39fad9d68346cbd994da201e570294c9ec854d Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Mon, 9 Sep 2024 08:48:49 -0700 Subject: [PATCH 153/664] [Nemo Unit Tests] Split GPU unit tests (#10380) * Split GPU unit tests * Make L0_Unit_Tests_GPU_Lightning optional since flaky * Add time for GPU_Core test * Add time for GPU_Audio test --- .github/workflows/cicd-main.yml | 166 +++++++++++++++++++++++++++----- 1 file changed, 143 insertions(+), 23 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 2d30dec37054..fcb5c1743a21 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -122,112 +122,219 @@ jobs: ' ### \'\' - L0_Unit_Tests_GPU: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 60 - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true + # L0: GPU unit tests + L0_Unit_Tests_GPU_ASR: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + TIMEOUT: 20 + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_Audio: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Audio') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + TIMEOUT: 20 + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_Common: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_LLM: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_Multimodal: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Multimodal') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_NLP: + needs: [cicd-test-container-setup] + uses: 
./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_TTS: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_TTS') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --with_downloads + + OPTIONAL_L0_Unit_Tests_GPU_Core: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Core') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + TIMEOUT: 20 + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/core -m "not pleasefixme" --with_downloads + IS_OPTIONAL: true + + L0_Unit_Tests_GPU_Hydra: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Hydra') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --with_downloads + + OPTIONAL_L0_Unit_Tests_GPU_Lightning: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --with_downloads + IS_OPTIONAL: true + + L0_Unit_Tests_GPU_Others: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Others') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads \ + --ignore=tests/collections/asr \ + --ignore=tests/collections/audio \ + --ignore=tests/collections/common \ + --ignore=tests/collections/llm \ + --ignore=tests/collections/multimodal \ + --ignore=tests/collections/nlp \ + --ignore=tests/collections/tts \ + --ignore=tests/core \ + --ignore=tests/core_ptl \ + --ignore=tests/hydra \ + --ignore=tests/lightning \ + --ignore=tests/utils # L0: CPU unit tests L0_Unit_Tests_CPU_ASR: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu TIMEOUT: 20 SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Audio: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 
'L0_Unit_Tests_CPU_Audio') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Common: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_LLM: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Multimodal: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Multimodal') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_NLP: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_TTS: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_TTS') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Core: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Core') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/core tests/core_ptl -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Hydra: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Hydra') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --cpu 
--with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Lightning: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true - L0_Unit_Tests_CPU_Ohers: + L0_Unit_Tests_CPU_Others: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Others') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | @@ -5004,7 +5111,19 @@ jobs: needs: - gpu-test - cicd-test-container-setup - - L0_Unit_Tests_GPU + + - L0_Unit_Tests_GPU_ASR + - L0_Unit_Tests_GPU_Audio + - L0_Unit_Tests_GPU_Common + - L0_Unit_Tests_GPU_LLM + - L0_Unit_Tests_GPU_Multimodal + - L0_Unit_Tests_GPU_NLP + - L0_Unit_Tests_GPU_TTS + #- OPTIONAL_L0_Unit_Tests_GPU_Core + - L0_Unit_Tests_GPU_Hydra + #- OPTIONAL_L0_Unit_Tests_GPU_Lightning + - L0_Unit_Tests_GPU_Others + - L0_Unit_Tests_CPU_ASR - L0_Unit_Tests_CPU_Audio - L0_Unit_Tests_CPU_Common @@ -5015,7 +5134,8 @@ jobs: - L0_Unit_Tests_CPU_Core - L0_Unit_Tests_CPU_Hydra - L0_Unit_Tests_CPU_Lightning - - L0_Unit_Tests_CPU_Ohers + - L0_Unit_Tests_CPU_Others + - L2_Community_LLM_Checkpoints_tests_Bert - L2_Community_LLM_Checkpoints_tests_Mamba2 - L2_Community_LLM_Checkpoints_tests_Llama From dc61f7a011bceb302a4e8f5ba9bde36d35cc6dd7 Mon Sep 17 00:00:00 2001 From: Paul Gibbons <87940629+paul-gibbons@users.noreply.github.com> Date: Mon, 9 Sep 2024 09:46:35 -0700 Subject: [PATCH 154/664] Support Energon as dataloader in NeVA (#10305) * energon support draft Signed-off-by: paul-gibbons * removing yi, not in PRs scope Signed-off-by: paul-gibbons * remove mp_spawn + add energon req Signed-off-by: paul-gibbons * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply isort and black reformatting Signed-off-by: paul-gibbons * removing seq packing Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * fixing encode_sft Signed-off-by: paul-gibbons * fix use_energon Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * adding docstrings + addressing comments + CodeQL fixes Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * addressing comments Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * add yi_34b Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * adding assertion for energon + peft, not supported due to dataloader being saved with dist_ckpt Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * energon usage warning Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons --------- Signed-off-by: paul-gibbons Signed-off-by: paul-gibbons Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: paul-gibbons Co-authored-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> --- .../multimodal_llm/neva/conf/neva_config.yaml | 17 + 
.../multimodal_llm/neva/neva_finetune.py | 2 - .../multimodal_llm/neva/neva_peft.py | 2 - .../multimodal_llm/neva/neva_pretrain.py | 4 - .../multimodal/data/neva/conversation.py | 2 + .../multimodal/data/neva/neva_dataset.py | 132 +++-- .../data/neva/neva_energon_dataset.py | 506 ++++++++++++++++++ .../models/multimodal_llm/neva/neva_model.py | 222 +++++++- nemo/collections/multimodal/parts/utils.py | 1 + .../common/text_generation_strategy.py | 2 + requirements/requirements_multimodal.txt | 1 + 11 files changed, 825 insertions(+), 66 deletions(-) create mode 100644 nemo/collections/multimodal/data/neva/neva_energon_dataset.py diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml index 89e61a8b917c..0464d85b5480 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml @@ -210,6 +210,23 @@ model: image_folder: null image_aspect_ratio: 'square' +energon: + use_energon: False + data: + __module__: megatron.energon + __class__: Metadataset + splits: + # Train dataset, the datasets will be mixed according to their weights + train: + datasets: + - weight: 1.0 + path: null + val: + datasets: + - weight: 1.0 + path: null + + # Nsys profiling options nsys_profile: enabled: False diff --git a/examples/multimodal/multimodal_llm/neva/neva_finetune.py b/examples/multimodal/multimodal_llm/neva/neva_finetune.py index e94308ad89f3..1796a87bac9e 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_finetune.py +++ b/examples/multimodal/multimodal_llm/neva/neva_finetune.py @@ -22,8 +22,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="neva_finetune") def main(cfg) -> None: diff --git a/examples/multimodal/multimodal_llm/neva/neva_peft.py b/examples/multimodal/multimodal_llm/neva/neva_peft.py index 2c0e1bc41ac2..0960dd260ad4 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_peft.py +++ b/examples/multimodal/multimodal_llm/neva/neva_peft.py @@ -23,8 +23,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="neva_peft") def main(cfg) -> None: diff --git a/examples/multimodal/multimodal_llm/neva/neva_pretrain.py b/examples/multimodal/multimodal_llm/neva/neva_pretrain.py index 26e0dc294185..8aae9f2d655a 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_pretrain.py +++ b/examples/multimodal/multimodal_llm/neva/neva_pretrain.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- -import torch.multiprocessing as mp from omegaconf.omegaconf import OmegaConf from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import MegatronNevaModel @@ -22,8 +20,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="neva_config") def main(cfg) -> None: diff --git a/nemo/collections/multimodal/data/neva/conversation.py b/nemo/collections/multimodal/data/neva/conversation.py index 89f1ab24f0a9..4bd4443e46f5 100644 --- a/nemo/collections/multimodal/data/neva/conversation.py +++ b/nemo/collections/multimodal/data/neva/conversation.py @@ -34,6 +34,7 @@ DEFAULT_IM_START_TOKEN["llama_3"] = "<|reserved_special_token_4|>" DEFAULT_IM_END_TOKEN["llama_3"] = "<|reserved_special_token_5|>" + DEFAULT_VID_START_TOKEN = "" DEFAULT_VID_END_TOKEN = "" TIME_TOKEN_TEMPLATE = "" @@ -507,6 +508,7 @@ def dict(self): sep2=DEFAULT_EOS_TOKEN, ) + default_conversation = conv_vicuna_v1 conv_templates = { "default": conv_vicuna_v0, diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 37f57ff21bba..f46b75e7b472 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -40,6 +40,7 @@ DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IMAGE_TOKEN, DEFAULT_LABELS_TOKEN, + DEFAULT_PAD_TOKEN, DEFAULT_VID_END_TOKEN, DEFAULT_VID_START_TOKEN, DEFAULT_VIDEO_TOKEN, @@ -353,8 +354,14 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in if use_plain: assert default_token in conversation[0]['value'] conversation[0]['value'] = default_token - for turn in conversation: - turn["value"] = turn["value"].replace(default_token, replace_token) + if multimodal_cfg["conv_template"] == "interleaved": + # directly replace the default_token in the conversation, + # since we don't use the conversation template + updated_conversation = conversation.replace(default_token, replace_token) + source['conversations'] = updated_conversation + else: + for turn in conversation: + turn["value"] = turn["value"].replace(default_token, replace_token) return sources @@ -791,6 +798,52 @@ def preprocess_v1( ) +def preprocess_interleaved_prompt( + sources: dict, + tokenizer, + cfg, +) -> Dict: + """tokenize the interleaved prompt and mask the text part of the prompt""" + conversations = [] + for source in sources: + conversations.append(source['conversations']) + add_extra_token = cfg.get("add_extra_token") + tokens = tokenize( + texts=conversations, + tokenizer=tokenizer, + context_length=cfg.get("context_length"), + add_extra_token=add_extra_token, + ) + + model_type = cfg['model_type'] + image_patch_token = DEFAULT_IMAGE_PATCH_TOKEN[model_type] + image_start_token = DEFAULT_IM_START_TOKEN[model_type] + image_end_token = DEFAULT_IM_END_TOKEN[model_type] + DEFAULT_TOKENS = [image_patch_token, image_start_token, image_end_token, DEFAULT_PAD_TOKEN] + img_patch_id, img_start_id, img_end_id, pad_id = get_tokens_ids(tokenizer, DEFAULT_TOKENS) + tokens[tokens == img_patch_id] = 0 # DEFAULT_IMAGE_PATCH_TOKEN + + labels = tokens.clone().detach() + + # Mask labels change for interleaved prompt + labels[labels == img_start_id] = IGNORE_INDEX + labels[labels == img_end_id] = IGNORE_INDEX + labels[labels == 0] = IGNORE_INDEX + labels[labels == pad_id] = IGNORE_INDEX + + if add_extra_token: + tokens = tokens[:, :-1].contiguous() + labels = labels[:, 1:].contiguous() + 
else: + labels = torch.roll(labels, shifts=-1, dims=-1) + labels[:, -1] = IGNORE_INDEX + + return dict( + tokens=tokens, + labels=labels, + ) + + def preprocess_nvgpt( sources: dict, tokenizer, @@ -1075,6 +1128,29 @@ def preprocess_plain( ) +def preprocess_conversations(self, sources): + if self.conv_template in ["nvgpt", "nv_steerlm"]: + return preprocess_nvgpt(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "nv_dpo": + return preprocess_nv_dpo(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "v1": + return preprocess_v1(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "llama_2": + return preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "llama_3": + return preprocess_llama_3(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "mistral": + return preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg, is_mistral=True) + elif self.conv_template == "yi_34b": + return preprocess_yi_34b(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "plain": + return preprocess_plain(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "interleaved": + return preprocess_interleaved_prompt(sources, self.tokenizer, self.multimodal_cfg) + else: + raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") + + class LazySupervisedDataset(Dataset): """Dataset for supervised fine-tuning.""" @@ -1215,57 +1291,7 @@ def expand2square(pil_img, background_color): media_tensors = torch.tensor([]) sources = copy.deepcopy(sources) - if self.conv_template in ["nvgpt", "nv_steerlm"]: - data_dict = preprocess_nvgpt( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "nv_dpo": - data_dict = preprocess_nv_dpo( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "v1": - data_dict = preprocess_v1( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "llama_2": - data_dict = preprocess_llama_2( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "llama_3": - data_dict = preprocess_llama_3( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "mistral": - data_dict = preprocess_llama_2( - sources, - self.tokenizer, - self.multimodal_cfg, - is_mistral=True, - ) - elif self.conv_template == "plain": - data_dict = preprocess_plain( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "yi_34b": - data_dict = preprocess_yi_34b( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - else: - raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") + data_dict = preprocess_conversations(self, sources) if isinstance(i, int): data_dict = dict(tokens=data_dict["tokens"][0], labels=data_dict["labels"][0]) diff --git a/nemo/collections/multimodal/data/neva/neva_energon_dataset.py b/nemo/collections/multimodal/data/neva/neva_energon_dataset.py new file mode 100644 index 000000000000..a83e616f248f --- /dev/null +++ b/nemo/collections/multimodal/data/neva/neva_energon_dataset.py @@ -0,0 +1,506 @@ +import dataclasses +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import torch +from einops import rearrange +from megatron.energon import ( + Batch, + CaptioningSample, + DefaultTaskEncoder, + InterleavedSample, + 
SimilarityInterleavedSample, + VQASample, + batch_pad_stack, +) +from PIL import Image + +from nemo.collections.multimodal.data.neva.neva_dataset import ( + DEFAULT_IMAGE_TOKEN, + preprocess_conversations, + preprocess_interleaved_prompt, + preprocess_llama_2, + preprocess_llama_3, + preprocess_multimodal, + preprocess_nv_dpo, + preprocess_nvgpt, + preprocess_plain, + preprocess_v1, + preprocess_yi_34b, + process_image, +) + + +# Type for intermediate batch, after batch() +@dataclass +class ImageTaskSample: + __key__: str + __subflavor__: str + conversations: List[dict] + image: Optional[Union[str, List[str], torch.Tensor]] = None + video: Optional[Union[str, List[str]]] = None + + tokens: Optional[torch.Tensor] = None + labels: Optional[torch.Tensor] = None + attention_mask: Optional[torch.Tensor] = None + loss_mask: Optional[torch.Tensor] = None + position_ids: Optional[torch.Tensor] = None + + +# Typing for the resulting batch data after encode_batch() +@dataclass +class ImageTaskBatch(Batch): + tokens: torch.Tensor + labels: torch.Tensor + attention_mask: torch.Tensor + loss_mask: torch.Tensor + position_ids: torch.Tensor + media: Optional[torch.Tensor] = None + + +# Required for energon, https://nvidia.github.io/Megatron-Energon/task_encoders.html +class TaskEncoder(DefaultTaskEncoder[VQASample, InterleavedSample, ImageTaskBatch, dict]): + """A task encoder for data samples for captioning, pretraining, sft and interleaved multimodal tasks. + It defines how the data is processed after it is loaded from the dataset. + Currently, it supports captioning, pretraining, sft and interleaved multimodal tasks and datasets. + """ + + def __init__(self, tokenizer, image_processor, multimodal_cfg: dict, data_cfg: dict): + super().__init__(batch_type=ImageTaskBatch) + self.tokenizer = tokenizer + self.image_processor = image_processor + self.multimodal_cfg = multimodal_cfg + self.data_cfg = data_cfg + self.conv_template = multimodal_cfg["conv_template"] + self.max_num_images = 6 + self.image_following_text_only = False + self.caption_prompts = [ + "Generate a short caption of the image.", + "Describe the image concisely.", + "Provide a brief description of the given image.", + ] + self.prompt_index = 0 + + def encode_sample( + self, + sample: Union[ImageTaskSample, CaptioningSample, VQASample, InterleavedSample, SimilarityInterleavedSample], + ) -> dict: + if isinstance(sample, InterleavedSample): + return self.encode_interleaved(sample) + elif isinstance(sample, VQASample): + return self.encode_pretrain(sample) + elif isinstance(sample, CaptioningSample): + return self.encode_captioning(sample) + elif isinstance(sample, SimilarityInterleavedSample) and self.conv_template == "interleaved": + return self.encode_similarity_interleaved(sample) + else: + return self.encode_sft(sample) + + def encode_captioning(self, sample: CaptioningSample) -> dict: + """Preprocessing function for datasets like COCO, containing image-caption pairs. + See Energon codebase for more details on CaptioningSample.
+ https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/captioning.py + """ + processed_image = self.process_images(sample.image) + + prompt = f"\n{self.caption_prompts[self.prompt_index]}\n" + self.prompt_index = (self.prompt_index + 1) % len(self.caption_prompts) + + caption = sample.caption.strip() + + conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": caption}] + + processed_sample = {"conversations": conversation, "image": processed_image} + + if self.multimodal_cfg['is_multimodal']: + cur_token_len = self.calculate_token_length(processed_sample["image"]) + processed_sample = preprocess_multimodal( + [processed_sample], self.multimodal_cfg, cur_token_len, use_plain=(self.conv_template == "plain") + )[0] + + processed = preprocess_conversations(self, [processed_sample]) + tokens = processed["tokens"] + labels = processed["labels"] + attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels) + + return ImageTaskSample( + __key__=sample.__key__, + __subflavor__=sample.__subflavor__, + conversations=conversation, + image=processed_sample["image"], + tokens=tokens.squeeze(0), + labels=labels.squeeze(0), + attention_mask=attention_mask.squeeze(0), + loss_mask=loss_mask.squeeze(0), + position_ids=position_ids, + ) + + def encode_pretrain(self, sample: VQASample) -> dict: + """Preprocessing function for datasets like LlaVA-Pretrain, multimodal synthesized conversation from the image-caption pairs. + See Energon codebase for more details on VQASample. + https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/vqa.py + """ + conversations = [{"from": "human", "value": sample.context}, {"from": "gpt", "value": sample.answers}] + processed_sample = {"conversations": conversations} + + if self.multimodal_cfg['is_multimodal']: + if hasattr(sample, 'image') and sample.image is not None: + processed_sample["image"] = self.process_images(sample.image) + cur_token_len = self.calculate_token_length(processed_sample["image"]) + processed_sample = preprocess_multimodal( + [processed_sample], self.multimodal_cfg, cur_token_len, use_plain=(self.conv_template == "plain") + )[0] + + processed = preprocess_conversations(self, [processed_sample]) + tokens = processed["tokens"] + labels = processed["labels"] + attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels) + + return ImageTaskSample( + __key__=sample.__key__, + __subflavor__=sample.__subflavor__, + conversations=conversations, + image=processed_sample.get("image"), + video=processed_sample.get("video"), + tokens=tokens.squeeze(0), + labels=labels.squeeze(0), + attention_mask=attention_mask.squeeze(0), + loss_mask=loss_mask.squeeze(0), + position_ids=position_ids, + ) + + def encode_sft(self, sample: Union[ImageTaskSample, VQASample, InterleavedSample]) -> dict: + """Preprocessing function for datasets like LLaVA-Instruct, conversational multimodal instruction-following data. + See Energon codebase for more details on VQASample. 
+ https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/vqa.py + """ + conversations = sample.texts if hasattr(sample, 'texts') else sample.conversations + processed_sample = {"conversations": conversations} + image_present = False + + if self.multimodal_cfg['is_multimodal']: + image_present = False + if hasattr(sample, 'image') and sample.image is not None: + processed_sample["image"] = self.process_images(sample.image) + image_present = True + elif hasattr(sample, 'images') and sample.images: + processed_sample["image"] = self.process_images(sample.images[0]) + image_present = True + elif hasattr(sample, 'video') and sample.video: + # Implement video processing if needed + pass + + if image_present: + processed_sample = preprocess_multimodal( + [processed_sample], + self.multimodal_cfg, + self.calculate_token_length(processed_sample["image"]), + use_plain=(self.conv_template == "plain"), + )[0] + + processed = preprocess_conversations(self, [processed_sample]) + tokens = processed["tokens"] + labels = processed["labels"] + attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels) + + if not image_present: + processed_sample["image"] = torch.zeros( + 1, 3, self.multimodal_cfg["crop_size"][0], self.multimodal_cfg["crop_size"][1] + ) + + return ImageTaskSample( + __key__=sample.__key__, + __subflavor__=sample.__subflavor__, + conversations=conversations, + # rewrite image so it creates tensor of zeros if not present + image=processed_sample.get("image", torch.tensor([])), + tokens=tokens.squeeze(0), + labels=labels.squeeze(0), + attention_mask=attention_mask.squeeze(0), + loss_mask=loss_mask.squeeze(0), + position_ids=position_ids, + ) + + def encode_similarity_interleaved(self, sample: SimilarityInterleavedSample) -> dict: + """Preprocessing function for datasets like MMC4, where text and images are interleaved via a similarity matrix or matched_text_indices. + See Energon codebase for more details on SimilarityInterleavedSample. + https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/similarity_interleaved.py + """ + # 4 fields: sample.images, sample.texts, sample.similarity_matrix, sample.matched_text_index + images, sentence_ixs = [], [] + for sample_image, sim_vec in zip(sample.images, sample.matched_text_indices): + images.append(sample_image) + sentence_ixs.append(sim_vec) + + # constrain max num images + max_num_images = self.max_num_images + if len(images) > max_num_images: + images = images[:max_num_images] + sentence_ixs = sentence_ixs[:max_num_images] + + images = [images[i] for i in np.argsort(sentence_ixs)] + + for ix in sentence_ixs: + sample.texts[ix] = f"{DEFAULT_IMAGE_TOKEN} {sample.texts[ix]}" + + if self.image_following_text_only: + # use pad token to divide sentence pieces + text = self.tokenizer.pad_id.join(sample.texts) + else: + text = " ".join(sample.texts) + + text = text.replace(" ", "").replace(" ", "") + text = f"{text}{self.tokenizer.eos_id}" + + if len(images) > 0: + processed_images = self.process_images(images) + else: + processed_images = None + + # check the case where the last token is the image token. 
+ if text.endswith(DEFAULT_IMAGE_TOKEN): + text = text[: -len(DEFAULT_IMAGE_TOKEN)] + + n_im_patch = text.count(DEFAULT_IMAGE_TOKEN) + processed_images = processed_images[:n_im_patch] + assert len(processed_images) == n_im_patch + + processed_sample = {"conversations": text, "image": processed_images} + + if self.multimodal_cfg['is_multimodal']: + if images: + cur_token_len = self.calculate_token_length(processed_sample["image"]) + processed_sample = preprocess_multimodal( + [processed_sample], self.multimodal_cfg, cur_token_len, use_plain=(self.conv_template == "plain") + )[0] + + processed = preprocess_conversations(self, [processed_sample]) + + tokens = processed["tokens"] + labels = processed["labels"] + attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels) + + # pad images + if images: + processed_sample["image"] = self.pad_images(processed_sample["image"], self.max_num_images) + else: + # add extra dummy images + processed_sample["image"] = torch.zeros( + self.max_num_images, 3, self.multimodal_cfg["crop_size"][0], self.multimodal_cfg["crop_size"][1] + ) + + return ImageTaskSample( + __key__=sample.__key__, + __subflavor__=sample.__subflavor__, + conversations=processed_sample["conversations"], + image=processed_sample["image"], + tokens=tokens.squeeze(0), + labels=labels.squeeze(0), + attention_mask=attention_mask.squeeze(0), + loss_mask=loss_mask.squeeze(0), + position_ids=position_ids, + ) + + def encode_interleaved(self, sample: InterleavedSample) -> dict: + """Preprocessing function for datasets like OBELISC, where text and images are strictly interleaved. + See Energon codebase for more details on SimilarityInterleavedSample. + https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/interleaved.py + """ + interleaved_text = [] + images = [] + for item in sample.sequence: + if isinstance(item, str): + interleaved_text.append(item) + elif isinstance(item, torch.Tensor) or isinstance(item, Image.Image): + interleaved_text.append(DEFAULT_IMAGE_TOKEN) + images.append(item) + else: + raise ValueError(f"Unsupported type in interleaved sequence: {type(item)}") + + # constrain max num images + max_num_images = self.max_num_images + + n_im_patch = interleaved_text.count(DEFAULT_IMAGE_TOKEN) + if n_im_patch > max_num_images: + interleaved_text, kept_image_indices = self.remove_excess_image_tokens(interleaved_text, max_num_images) + images = [images[i] for i in kept_image_indices] + + if len(images) > max_num_images: + images = images[:max_num_images] + + if len(images) > 0: + processed_images = self.process_images(images) + else: + processed_images = None + + combined_text = ' '.join(interleaved_text) + + processed_sample = {"conversations": combined_text, "image": processed_images} + + if self.multimodal_cfg['is_multimodal']: + if images: + cur_token_len = self.calculate_token_length(processed_sample["image"]) + processed_sample = preprocess_multimodal( + [processed_sample], self.multimodal_cfg, cur_token_len, use_plain=(self.conv_template == "plain") + )[0] + + processed = preprocess_conversations(self, [processed_sample]) + + tokens = processed["tokens"] + labels = processed["labels"] + + attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels) + + # pad images + if images: + processed_sample["image"] = self.pad_images(processed_sample["image"], self.max_num_images) + else: + processed_sample["image"] = torch.zeros( + self.max_num_images, 3, self.multimodal_cfg["crop_size"][0], 
self.multimodal_cfg["crop_size"][1] + ) + + return ImageTaskSample( + __key__=sample.__key__, + __subflavor__=sample.__subflavor__, + conversations=processed_sample["conversations"], + image=processed_sample["image"], + tokens=tokens.squeeze(0), + labels=labels.squeeze(0), + attention_mask=attention_mask.squeeze(0), + loss_mask=loss_mask.squeeze(0), + position_ids=position_ids, + ) + + def remove_excess_image_tokens(self, interleaved_text, max_num_images): + if interleaved_text[-1] == DEFAULT_IMAGE_TOKEN: + interleaved_text = interleaved_text[:-1] + + image_indices = [i for i, token in enumerate(interleaved_text) if token == DEFAULT_IMAGE_TOKEN] + + if len(image_indices) <= max_num_images: + return interleaved_text, list(range(len(image_indices))) + + # we keep the images that are close to the text tokens + importance = [] + for i, img_idx in enumerate(image_indices): + has_text_before = img_idx > 0 and interleaved_text[img_idx - 1] != DEFAULT_IMAGE_TOKEN + has_text_after = ( + img_idx < len(interleaved_text) - 1 and interleaved_text[img_idx + 1] != DEFAULT_IMAGE_TOKEN + ) + + if has_text_before and has_text_after: + importance.append((0, img_idx)) # highest importance + elif has_text_before or has_text_after: + importance.append((1, img_idx)) + else: + importance.append((2, img_idx)) + + importance.sort(key=lambda x: (x[0], x[1])) + kept_indices = {idx for _, idx in importance[:max_num_images]} + + # update idx to map correctly to the original images array + kept_image_indices = [image_indices.index(i) for i in kept_indices if i in image_indices] + + new_interleaved_text = [ + token for i, token in enumerate(interleaved_text) if token != DEFAULT_IMAGE_TOKEN or i in kept_indices + ] + + return new_interleaved_text, kept_image_indices + + def process_images(self, images): + if not isinstance(images, list): + images = [images] + processed_images = [] + for image in images: + image = process_image(self.image_processor, image, self.multimodal_cfg['image_aspect_ratio']) + processed_images.append(image) + return torch.stack(processed_images) # make it always 4D, otherwise has problem when len(images) > 1 + + def pad_images(self, images, max_num_images): + if len(images) < max_num_images: + pad_size = max_num_images - len(images) + padded_images = torch.cat([images, torch.zeros(pad_size, *images.size()[1:])], dim=0) + return padded_images + return images + + def batch(self, samples: List[ImageTaskSample]) -> ImageTaskBatch: + """Pads and stacks the samples in the batch.""" + batch = ImageTaskBatch( + tokens=batch_pad_stack([s.tokens for s in samples]), + labels=batch_pad_stack([s.labels for s in samples]), + attention_mask=batch_pad_stack([s.attention_mask for s in samples]), + loss_mask=batch_pad_stack([s.loss_mask for s in samples]), + position_ids=batch_pad_stack([s.position_ids for s in samples]), + media=( + torch.stack([s.image for s in samples if s.image is not None]) + if self.multimodal_cfg['is_multimodal'] + else None + ), + ) + + # TODO: cleanup, this is following logic in neva_dataset when we rearrange media tensor + if batch.media.shape[1] == 1: + batch.media = rearrange(batch.media, "b T c h w -> b T 1 c h w") + else: + batch.media = rearrange(batch.media, "b T c h w -> b T 1 c h w") + + return batch + + def preprocess_conversations(self, sources): + if self.conv_template == "nvgpt": + return preprocess_nvgpt(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "nv_dpo": + return preprocess_nv_dpo(sources, self.tokenizer, self.multimodal_cfg) + elif 
self.conv_template == "v1": + return preprocess_v1(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "llama_2": + return preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "llama_3": + return preprocess_llama_3(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "mistral": + return preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg, is_mistral=True) + elif self.conv_template == "yi_34b": + return preprocess_yi_34b(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "plain": + return preprocess_plain(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "interleaved": + return preprocess_interleaved_prompt(sources, self.tokenizer, self.multimodal_cfg) + else: + raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") + + def encode_batch(self, batch: ImageTaskBatch) -> dict: + raw = dataclasses.asdict(batch) + return raw + + def calculate_token_length(self, media_tensor): + if len(media_tensor.shape) == 4: + height = media_tensor.shape[2] + width = media_tensor.shape[3] + else: + raise ValueError("Media tensor must be 4-dimensional") + patch_dim = self.multimodal_cfg['patch_dim'] + height_num_patches = height // patch_dim + width_num_patches = width // patch_dim + if self.multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': + height_num_patches = (height_num_patches + 1) // 2 * 2 + width_num_patches = (width_num_patches + 1) // 2 * 2 + + return height_num_patches * width_num_patches + + def get_masks_and_position_ids(self, tokens, labels): + from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids + + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + data=tokens, + eod_token=self.tokenizer.eos_id, + eod_mask_loss=self.data_cfg.get("eod_mask_loss", False), + reset_attention_mask=False, + reset_position_ids=False, + ) + + loss_mask[labels == -1] = 0.0 + tokens[tokens == -1] = 0 + labels[labels == -1] = 0 + + return attention_mask, loss_mask, position_ids diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 6218332c2bde..07bc4f3960d3 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -21,7 +21,7 @@ import torch import torch.nn.functional as F from einops import rearrange, reduce, repeat -from omegaconf import DictConfig, ListConfig +from omegaconf import DictConfig, ListConfig, OmegaConf from pkg_resources import packaging from pytorch_lightning.trainer.trainer import Trainer from transformers import CLIPVisionModel, SiglipVisionModel @@ -69,6 +69,25 @@ from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +try: + from megatron.energon import ( + LimitDataset, + RepeatDataset, + WorkerConfig, + get_loader, + get_savable_loader, + get_train_dataset, + get_val_datasets, + ) + + from nemo.collections.multimodal.data.neva.neva_energon_dataset import TaskEncoder + + HAVE_ENERGON = True + +except (ImportError, ModuleNotFoundError): + + HAVE_ENERGON = False + try: from megatron.core import InferenceParams, dist_checkpointing, parallel_state, tensor_parallel from megatron.core.dist_checkpointing.dict_utils import dict_list_map_inplace @@ -1226,10 +1245,22 @@ def setup(self, stage=None): else: # TODO: 
consider adding a ModelPT guard to check if model is being restored. # allowing restored models to optionally setup datasets - self.build_train_valid_test_datasets() - self.setup_training_data(self.cfg.data) - self.setup_validation_data(self.cfg.data) - self.setup_test_data(self.cfg.data) + + if self.cfg.get('energon', {}).get('use_energon', False): + if not HAVE_ENERGON: + raise ImportError( + "Megatron-Energon was not found. Please see the Energon README for installation instructions: https://github.com/NVIDIA/Megatron-Energon?tab=readme-ov-file#installation." + ) + assert not self.use_peft, "NeMo does not currently support the combination of Energon and PEFT." + logging.info( + "You are now using an experimental implementation of Megatron-Energon, https://github.com/NVIDIA/Megatron-Energon, for your NeVA dataloader. Further updates to Energon support in NeMo will be done in NeMo 2.0 implementation." + ) + self.build_train_valid_test_datasets_energon() + else: + self.build_train_valid_test_datasets() + self.setup_training_data(self.cfg.data) + self.setup_validation_data(self.cfg.data) + self.setup_test_data(self.cfg.data) # when using pipeline model parallel the final stage need to initialize word embeddings if parallel_state.get_pipeline_model_parallel_world_size() > 1: @@ -1435,6 +1466,144 @@ def build_pretraining_data_loader( persistent_workers=True if self.cfg.data.num_workers > 0 else False, ) + def datasets_provider(self, worker_config=None): + """Create multimodal train, validation and test datasets.""" + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + micro_batch_size = self.cfg.micro_batch_size + else: + micro_batch_size = self.cfg.global_batch_size // parallel_state.get_data_parallel_world_size() + + dname = OmegaConf.to_container(self.cfg.energon.data, resolve=True) + + image_processor = ( + self.model.module.image_processor if hasattr(self.model, "module") else self.model.image_processor + ) + + add_extra_token = 1 + if getattr(self.cfg, 'no_seqlen_plus_one_input_tokens', False): + add_extra_token = 0 + + multimodal_cfg = dict( + is_multimodal=self.cfg.data.is_multimodal, + sep_image_conv_front=self.cfg.data.sep_image_conv_front, + model_type=self.cfg.mm_cfg.llm.get("model_type", "nvgpt"), + conv_template=self.cfg.data.get("conv_template", "nvgpt"), + patch_dim=self.cfg.mm_cfg.vision_encoder.patch_dim, + crop_size=self.cfg.mm_cfg.vision_encoder.get("crop_size", (336, 336)), + image_folder=self.cfg.data.get('image_folder', None), + video_folder=self.cfg.data.get('video_folder', None), + image_aspect_ratio=self.cfg.data.image_aspect_ratio, + use_im_start_end=getattr(self.cfg.mm_cfg, 'use_im_start_end', False), + image_processor=image_processor, + add_extra_token=add_extra_token, + context_length=self.cfg.encoder_seq_length, + media_type=self.cfg.data.get('media_type', 'image'), + num_frames=self.cfg.data.get('num_frames', -1), + use_lita=getattr(self.cfg.mm_cfg, 'use_lita', False), + lita=getattr(self.cfg.mm_cfg, 'lita', {}), + mm_mlp_adapter_type=self.cfg.mm_cfg.get('mm_mlp_adapter_type', 'linear'), + ) + + data_cfg = dict( + splice_single_frame=self.cfg.data.get('splice_single_frame', None), + num_frames=self.cfg.data.get('num_frames', -1), + sep_token_between_frames=self.cfg.data.get('sep_token_between_frames', False), + ) + + train_dataset = get_train_dataset( + dname, + batch_size=micro_batch_size, + task_encoder=TaskEncoder( + tokenizer=self.tokenizer, + image_processor=image_processor, + multimodal_cfg=multimodal_cfg, + data_cfg=data_cfg, + ), + 
worker_config=worker_config, + virtual_epoch_length=1000, + max_samples_per_sequence=100, + shuffle_buffer_size=100, + image_decode="pil", + ) + + val_datasets = get_val_datasets( + dname, + batch_size=micro_batch_size, + # This is the total number over all workers + task_encoder=TaskEncoder( + tokenizer=self.tokenizer, + image_processor=image_processor, + multimodal_cfg=multimodal_cfg, + data_cfg=data_cfg, + ), + worker_config=worker_config, + image_decode="pil", + ) + + val_datasets_without_source_datasets = [ + # Limit the dataset to eval_iters * num_microbatches + LimitDataset( + # Repeat the inner dataset in case it's too short + RepeatDataset(val_ds, worker_config=worker_config), + length=self.cfg.micro_batch_size * self.trainer.limit_val_batches, + worker_config=worker_config, + reset_after_epoch=True, + ) + for val_ds, _src_ds in val_datasets + ] + + return train_dataset, val_datasets_without_source_datasets, None + + # energon dataset builder + def build_train_valid_test_datasets_energon(self): + """Builds train and validation dataloaders using Megatron-Energon""" + rank = parallel_state.get_data_parallel_rank() + world_size = parallel_state.get_data_parallel_world_size() + data_parallel_group = parallel_state.get_data_parallel_group() + worker_debug_path = None + worker_log_level = 0 + logging.info( + f" Multimodal train dataloader initializing with rank {rank} world_size {world_size} data_parallel_group {data_parallel_group} ****** " + ) + + worker_config = WorkerConfig( + rank=rank, + world_size=world_size, + num_workers=1, + data_parallel_group=data_parallel_group, + worker_debug_path=worker_debug_path, + worker_log_level=worker_log_level, + ) + train_ds, valid_ds1, test_ds = self.datasets_provider(worker_config) + train_dataloader = get_savable_loader(train_ds, worker_config=worker_config) + + # Restore energon train dataloader state if we are resuming training + restore = os.path.exists(self.trainer.ckpt_path) if self.trainer.ckpt_path else False + if restore: + replica_id = ( + parallel_state.get_pipeline_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_context_parallel_rank(), + ) + sharded_state_dict = { + 'dataloader_state': ShardedObject( + data=None, + key='dataloader_state', + global_shape=[parallel_state.get_data_parallel_world_size()], + global_offset=[parallel_state.get_data_parallel_rank()], + replica_id=replica_id, + ) + } + state_dict = dist_checkpointing.load(sharded_state_dict, self.trainer.ckpt_path) + train_dataloader.restore_state_rank(state_dict['dataloader_state']) + logging.info(f"Restored dataset state from {self.trainer.ckpt_path}") + + valid_dataloader = [get_loader(valid_ds, worker_config=worker_config) for valid_ds in valid_ds1] + # valid_dataloader = get_loader(valid_ds1, worker_config=worker_config) + self._train_dl = train_dataloader + self._validation_dl = valid_dataloader + return self._train_dl, self._validation_dl + @classmethod def list_available_models(cls) -> Optional[PretrainedModelInfo]: """ @@ -1512,6 +1681,49 @@ def on_load_checkpoint(self, checkpoint) -> None: self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) parallel_state.set_virtual_pipeline_model_parallel_rank(0) + def on_save_checkpoint(self, checkpoint) -> None: + """LightningModule hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-save-checkpoint + """ + + # Neva supports Megatron Energon dataloader, this requires saving the dataloader state on each data parallel 
group + def should_save_dataloader_state(): + if self._train_dl is None: + return False + if not hasattr(self._train_dl, "save_state"): + return False + first_rank = ( + parallel_state.is_pipeline_first_stage(ignore_virtual=True) + and parallel_state.get_tensor_model_parallel_rank() == 0 + ) + return first_rank + + def save_dataloader_state(): + train_dataloader_state_dict = self._train_dl.save_state_rank() + checkpoint['dataloader_state'] = ShardedObject( + data=train_dataloader_state_dict, + key='dataloader_state', + global_shape=[parallel_state.get_data_parallel_world_size()], + global_offset=[parallel_state.get_data_parallel_rank()], + ) + + # Save energon train dataloader state if conditions are met + if self.cfg.get('energon', False) and should_save_dataloader_state(): + save_dataloader_state() + + # mcore uses distributed checkpointing + # FSDP supports the legacy checkpointing or torch-FSDP-native sharded checkpointing + if self.mcore_gpt and not self.use_fsdp: + checkpoint['sharded_state_dict'] = self.sharded_state_dict() + + # legacy checkpointing for interleaved + else: + if isinstance(self.model, list): + for i in range(len(self.model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + checkpoint[f'model{i}'] = self.model[i].module.state_dict_for_save_checkpoint() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + def sharded_state_dict(self, prefix: str = ''): if self.use_peft: return None diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index ea8053398a88..6ba2e8ca91f9 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -149,6 +149,7 @@ def load_nemo_model_weights(nemo_path, sharded_state_dict=None): checkpoint = dist_checkpointing.load( sharded_state_dict=checkpoint, checkpoint_dir=tmp_model_weights_dir, + strict=dist_checkpointing.validation.StrictHandling.LOG_UNEXPECTED, ) state_dict = checkpoint["state_dict"] diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 09f265ed2521..c6b7aac04a55 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -531,6 +531,7 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents ) # HARDCODED FOR NOW data_dict = preprocess_llama_3(sources, tokenizer, multimodal_cfg) + elif multimodal_cfg["conv_template"] == "mistral": record = { 'conversations': [ @@ -552,6 +553,7 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents ) # HARDCODED FOR NOW data_dict = preprocess_llama_2(sources, tokenizer, multimodal_cfg, is_mistral=True) + elif multimodal_cfg["conv_template"] == "v1": record = { 'conversations': [ diff --git a/requirements/requirements_multimodal.txt b/requirements/requirements_multimodal.txt index b7e6119fd7b7..8b56c3974a25 100644 --- a/requirements/requirements_multimodal.txt +++ b/requirements/requirements_multimodal.txt @@ -5,6 +5,7 @@ diffusers>=0.19.3 einops_exts imageio kornia +megatron-energon nerfacc>=0.5.3 open_clip_torch==2.24.0 PyMCubes From 8e3d65dcfe3504689bfc5b6e67d502780aacad41 Mon Sep 17 00:00:00 2001 From: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Date: Mon, 9 Sep 2024 22:31:01 +0530 Subject: [PATCH
155/664] 24.07 perf numbers (#10253) Signed-off-by: Malay Nagda Co-authored-by: Sangkug Lym --- .../source/performance/performance_summary.md | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/source/performance/performance_summary.md b/docs/source/performance/performance_summary.md index eca42f2d0695..98dae2dc0a78 100644 --- a/docs/source/performance/performance_summary.md +++ b/docs/source/performance/performance_summary.md @@ -11,18 +11,18 @@ | Model | #-GPUs | GBS | MBS | Sequence Length| TP | PP | CP | VP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. time to train in days (10T tokens, 1K GPUs)*** | | ----- | ------ | --- | --- | ---------------| -- | -- | -- | -- | ------------------ | ----------------------- | ------------------------------------------------------ | -| GPT3-5B | 64 | 2048 | 4 | 2048 | 1 | 1 | 1 | 1 | 23574 | 770 | ***5*** | -| GPT3-20B | 64 | 256 | 2 | 2048 | 2 | 1 | 1 | 1 | 5894 | 755 | ***19*** | -| GPT3-175B | 128 | 256 | 1 | 2048 | 4 | 8 | 1 | 6 | 745 | 802 | **152** | -| GPT3-175B | 512 | 2048 | 2 | 2048 | 4 | 8 | 1 | 6 | 832 | [895](https://mlcommons.org/benchmarks/training/) | **136** | -| LLAMA2-7B | 8 | 128 | 1 | 4096 | 1 | 1 | 1 | 1 | 16634 | 767 | ***7*** | +| GPT3-5B | 64 | 2048 | 4 | 2048 | 1 | 1 | 1 | 1 | 23406 | 765 | ***5*** | +| GPT3-20B | 64 | 256 | 2 | 2048 | 2 | 1 | 1 | 1 | 5851 | 750 | ***19*** | +| GPT3-175B | 128 | 256 | 1 | 2048 | 4 | 8 | 1 | 6 | 716 | 771 | **158** | +| GPT3-175B | 512 | 2048 | 2 | 2048 | 4 | 8 | 1 | 6 | 825 | [888](https://mlcommons.org/benchmarks/training/) | **137** | +| LLAMA2-7B | 8 | 128 | 1 | 4096 | 1 | 1 | 1 | 1 | 16934 | 780 | ***7*** | | LLAMA2-13B | 16 | 128 | 1 | 4096 | 1 | 4 | 1 | 10 | 8715 | 760 | ***13*** | -| LLAMA2-70B | 64 | 128 | 1 | 4096 | 4 | 4 | 1 | 20 | 1717 | 763 | ***66*** | +| LLAMA2-70B | 64 | 128 | 1 | 4096 | 4 | 4 | 1 | 20 | 1728 | 768 | ***65*** | | Nemotron-8B | 64 | 256 | 4 | 4096 | 2 | 1 | 1 | 1 | 12507 | 643 | ***9*** | -| Nemotron-22B | 64 | 256 | 2 | 4096 | 2 | 4 | 1 | 10 | 4289 | 559 | ***26*** | -| Nemotron-340B | 128 | 32 | 1 | 4096 | 8 | 8 | 1 | 12 | 328 | 691 | ***344*** | -| LLAMA3-8B | 8 | 128 | 1 | 8192 | 1 | 1 | 2 | 1 | 11883 | 688 | ***10*** | -| LLAMA3-70B | 64 | 128 | 1 | 8192 | 4 | 4 | 2 | 5 | 1549 | 746 | ***73*** | +| Nemotron-22B | 64 | 256 | 2 | 4096 | 2 | 4 | 1 | 10 | 4312 | 562 | ***26*** | +| Nemotron-340B | 128 | 32 | 1 | 4096 | 8 | 8 | 1 | 12 | 326 | 686 | ***347*** | +| LLAMA3-8B | 8 | 128 | 1 | 8192 | 1 | 1 | 2 | 1 | 12273 | 711 | ***9*** | +| LLAMA3-70B | 64 | 128 | 1 | 8192 | 4 | 4 | 2 | 5 | 1524 | 734 | ***74*** | ### Finetuning @@ -34,9 +34,9 @@ | Model | Task | #-GPUs | GBS | MBS | Packed Sequence Length | TP | PP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. 
time to finetune in mins (10M tokens)*** | | ----- | ---- | --- | --- | --- | --------------- | -- | -- | ------------------ | ----------------------- | -------------------------------------------------- | -| LLAMA2-7B | SFT | 8 | 32 | 1 | 4096 | 1 | 1 | 17617 | 702 | ***1.2*** | +| LLAMA2-7B | SFT | 8 | 32 | 1 | 4096 | 1 | 1 | 16891 | 673 | ***1.2*** | | LLAMA2-13B | SFT | 8 | 32 | 1 | 4096 | 1 | 4 | 10176 | 787 | ***2.0*** | -| LLAMA2-70B | SFT | 16 | 32 | 1 | 4096 | 4 | 4 | 1812 | 747 | ***5.7*** | -| LLAMA2-7B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 25206 | 673 | ***0.8*** | -| LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14760 | 764 | ***1.4*** | +| LLAMA2-70B | SFT | 16 | 32 | 1 | 4096 | 4 | 4 | 1816 | 749 | ***5.7*** | +| LLAMA2-7B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 24824 | 663 | ***0.8*** | +| LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14629 | 757 | ***1.4*** | | LLAMA2-70B | LoRA | 8 | 32 | 1 | 4096 | 2 | 4 | 2621 | 722 | ***7.9*** | From ba7962eba4736bd091333d97541d8201a54e3de7 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Mon, 9 Sep 2024 12:04:31 -0500 Subject: [PATCH 156/664] remove scripts (#10427) --- .../llm/megatron_ssm_finetuning.py | 110 --------------- .../llm/megatron_ssm_pretraining.py | 129 ------------------ 2 files changed, 239 deletions(-) delete mode 100644 tests/collections/llm/megatron_ssm_finetuning.py delete mode 100644 tests/collections/llm/megatron_ssm_pretraining.py diff --git a/tests/collections/llm/megatron_ssm_finetuning.py b/tests/collections/llm/megatron_ssm_finetuning.py deleted file mode 100644 index 187384e15dcd..000000000000 --- a/tests/collections/llm/megatron_ssm_finetuning.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -## NOTE: This script is present for github-actions testing only. -## There are no guarantees that this script is up-to-date with latest NeMo. 
- -import argparse -import torch -from megatron.core.optimizer import OptimizerConfig -from nemo import lightning as nl -from nemo.collections import llm -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule - - -def get_args(): - parser = argparse.ArgumentParser(description='Train a small GPT model using NeMo 2.0') - parser.add_argument('--devices', type=int, help="Number of devices to use for training") - parser.add_argument('--max-steps', type=int, help="Number of steps to train for") - parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to") - parser.add_argument('--model-path', type=str, help="Path to model checkpoint") - parser.add_argument( - '--tokenizer-model-path', type=str, default=None, help="Path to tokenizer model, defaults to None" - ) - return parser.parse_args() - - -if __name__ == "__main__": - - args = get_args() - - # Checkpoint callback setup - checkpoint_callback = nl.ModelCheckpoint( - save_best_model=True, - save_last=False, - monitor="reduced_train_loss", - save_top_k=1, - every_n_train_steps=10, - enable_nemo_ckpt_io=False, - dirpath=args.experiment_dir, - ) - - trainer = nl.Trainer( - devices=args.devices, - max_steps=args.max_steps, - accelerator="gpu", - strategy=nl.MegatronStrategy( - ckpt_load_optimizer=False, - ckpt_save_optimizer=False, - tensor_model_parallel_size=1, - ), - plugins=nl.MegatronMixedPrecision( - precision="bf16-mixed", - params_dtype=torch.bfloat16, - ), - callbacks=[checkpoint_callback], - log_every_n_steps=1, - limit_val_batches=5, - val_check_interval=10, - num_sanity_val_steps=0, - ) - - opt_config = OptimizerConfig( - optimizer='adam', - lr=1e-5, - min_lr=1e-5, - use_distributed_optimizer=False, - clip_grad=1.0, - bf16=True, - ) - - optim = MegatronOptimizerModule(config=opt_config) - model_config = llm.BaseMambaConfig130m() - model_config.tokenizer_model_path = args.tokenizer_model_path - - tokenizer = get_nmt_tokenizer( - library=model_config.tokenizer_library, - model_name=model_config.tokenizer_name, - tokenizer_model=model_config.tokenizer_model_path, - use_fast=True, - ) - - model = llm.GPTModel(model_config, optim=optim, tokenizer=tokenizer) - - ckpt_path = model.import_ckpt( - path="pytorch://" + args.model_path, - model_config=model_config, - ) - - data = llm.SquadDataModule( - seq_length=512, - micro_batch_size=2, - global_batch_size=4, - tokenizer=model.tokenizer, - num_workers=0, - pad_to_max_length=True, - ) - - trainer.fit(model, data, ckpt_path=ckpt_path) diff --git a/tests/collections/llm/megatron_ssm_pretraining.py b/tests/collections/llm/megatron_ssm_pretraining.py deleted file mode 100644 index ed7e551cba7b..000000000000 --- a/tests/collections/llm/megatron_ssm_pretraining.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -## NOTE: This script is present for github-actions testing only. -## There are no guarantees that this script is up-to-date with latest NeMo. - -import argparse -import torch -from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import TensorBoardLogger -from nemo import lightning as nl -from nemo.collections import llm -from nemo.collections.llm.api import train -from nemo.collections.llm.gpt.data import PreTrainingDataModule -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.lightning import NeMoLogger -from nemo.lightning.pytorch.callbacks import ModelCheckpoint -from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule - - -def get_args(): - parser = argparse.ArgumentParser(description='Train a Mamba model using NeMo 2.0') - parser.add_argument('--devices', type=int, help="Number of devices to use for training") - parser.add_argument('--max-steps', type=int, help="Number of steps to train for") - parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to") - parser.add_argument('--data-path', type=str, help="Path to data file") - parser.add_argument('--tokenizer-path', type=str, default=None, help="Path to tokenizer model") - - return parser.parse_args() - - -if __name__ == '__main__': - - args = get_args() - - seq_length = 512 - - tokenizer = get_nmt_tokenizer( - "huggingface", - "EleutherAI/gpt-neox-20b", - tokenizer_model=None, - use_fast=True, - ) - data = PreTrainingDataModule( - paths=args.data_path, - seq_length=seq_length, - micro_batch_size=2, - global_batch_size=16, - seed=1234, - tokenizer=tokenizer, - ) - ssm_config = llm.SSMConfig( - hybrid_override_pattern="M-M*", - num_layers=4, - hidden_size=1024, - ffn_hidden_size=1024, - num_attention_heads=4, - seq_length=seq_length, - init_method_std=0.02, - hidden_dropout=0.0, - attention_dropout=0.0, - layernorm_epsilon=1e-5, - make_vocab_size_divisible_by=16, - ) - model = llm.GPTModel(ssm_config, tokenizer=data.tokenizer) - strategy = nl.MegatronStrategy( - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, - ) - checkpoint_callback = ModelCheckpoint( - every_n_train_steps=10, - enable_nemo_ckpt_io=False, - ) - callbacks = [checkpoint_callback] - - loggers = [] - tensorboard_logger = TensorBoardLogger( - save_dir='dummy', ## NOTE: this gets overwritten by default - ) - loggers.append(tensorboard_logger) - - opt_config = OptimizerConfig( - optimizer='adam', - lr=6e-4, - min_lr=6e-5, - clip_grad=1.0, - use_distributed_optimizer=False, - bf16=True, - ) - opt = MegatronOptimizerModule(config=opt_config) - - trainer = nl.Trainer( - devices=args.devices, - max_steps=args.max_steps, - accelerator="gpu", - strategy=strategy, - logger=loggers, - callbacks=callbacks, - log_every_n_steps=1, - limit_val_batches=2, - plugins=nl.MegatronMixedPrecision( - precision="bf16-mixed", - params_dtype=torch.bfloat16, - ), - ) - - nemo_logger = NeMoLogger( - dir=args.experiment_dir, - ) - - train( - model=model, - data=data, - trainer=trainer, - log=nemo_logger, - tokenizer='data', - optim=opt, - ) From 176c54fdce07863553a8294b8f88e95c0613358b Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 9 Sep 2024 10:12:29 -0700 Subject: [PATCH 157/664] Neva update to NeMo 2.0 (#10292) * add some placeholders Signed-off-by: yaoyu-33 * few updates Signed-off-by: yaoyu-33 * update Signed-off-by: yaoyu-33 * fix logging Signed-off-by: yaoyu-33 * add neva mock dataset 
Signed-off-by: yaoyu-33 * update neva conversation Signed-off-by: yaoyu-33 * neva data update Signed-off-by: yaoyu-33 * fixes to conversation Signed-off-by: yaoyu-33 * data save Signed-off-by: yaoyu-33 * Update paths and fix init Signed-off-by: yaoyu-33 * some fixes Signed-off-by: yaoyu-33 * fix combined embeddings logic Signed-off-by: yaoyu-33 * revert debug code Signed-off-by: yaoyu-33 * Fix mock dataset for neva Signed-off-by: yaoyu-33 * Fix dataset part and add llava Signed-off-by: yaoyu-33 * Fix and updates Signed-off-by: yaoyu-33 * Fix and updates Signed-off-by: yaoyu-33 * Fix and updates Signed-off-by: yaoyu-33 * Add checkpoint saving dev test script Signed-off-by: yaoyu-33 * Fix and updates for neva pretraining Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Fix and updates for neva finetuning Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Fix minor issues in neva 2.0 upgrade Signed-off-by: yaoyu-33 * Update init Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * update examples Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Formatting Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix formatting Signed-off-by: yaoyu-33 * Add license and fix formatting Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Fix formatting Signed-off-by: yaoyu-33 * Fix examples Signed-off-by: yaoyu-33 * Fix neva api file Signed-off-by: yaoyu-33 * Update tokens Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Update arg apis Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Update model and pretrain script Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * finetune and inference fixes Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * address comments Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix Signed-off-by: yaoyu-33 * Fix dataset resuming Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * address comments Signed-off-by: yaoyu-33 * clean up Signed-off-by: yaoyu-33 * update loss mask class Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix Signed-off-by: yaoyu-33 * fix args Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * address comments Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * remove examples Signed-off-by: yaoyu-33 * remove factory Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 --- .../clip/megatron_clip_models.py | 28 +- nemo/collections/vlm/__init__.py | 41 + nemo/collections/vlm/neva/__init__.py | 0 nemo/collections/vlm/neva/data/__init__.py | 29 + nemo/collections/vlm/neva/data/api.py | 29 + nemo/collections/vlm/neva/data/config.py | 47 ++ .../collections/vlm/neva/data/conversation.py | 677 +++++++++++++++++ nemo/collections/vlm/neva/data/lazy.py | 612 +++++++++++++++ nemo/collections/vlm/neva/data/mock.py | 179 +++++ .../vlm/neva/data/multimodal_tokens.py | 52 ++ nemo/collections/vlm/neva/model/__init__.py | 34 + nemo/collections/vlm/neva/model/api.py | 31 + nemo/collections/vlm/neva/model/base.py | 700 ++++++++++++++++++ nemo/collections/vlm/neva/model/llava.py | 
342 +++++++++ nemo/lightning/_strategy_lib.py | 3 - nemo/lightning/megatron_parallel.py | 17 + 16 files changed, 2805 insertions(+), 16 deletions(-) create mode 100644 nemo/collections/vlm/__init__.py create mode 100644 nemo/collections/vlm/neva/__init__.py create mode 100644 nemo/collections/vlm/neva/data/__init__.py create mode 100644 nemo/collections/vlm/neva/data/api.py create mode 100644 nemo/collections/vlm/neva/data/config.py create mode 100644 nemo/collections/vlm/neva/data/conversation.py create mode 100644 nemo/collections/vlm/neva/data/lazy.py create mode 100644 nemo/collections/vlm/neva/data/mock.py create mode 100644 nemo/collections/vlm/neva/data/multimodal_tokens.py create mode 100644 nemo/collections/vlm/neva/model/__init__.py create mode 100644 nemo/collections/vlm/neva/model/api.py create mode 100644 nemo/collections/vlm/neva/model/base.py create mode 100644 nemo/collections/vlm/neva/model/llava.py diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py index 393acdef35de..07c6f25662c4 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py @@ -424,24 +424,26 @@ def __init__(self, *args, **kwargs): # TODO (yuya): need to handle post_process correctly in order to enable PP self.output_dim = kwargs.pop('output_dim') super().__init__(*args, **kwargs) - self.final_layernorm = TENorm( - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - self.head = torch.nn.Linear( - self.config.hidden_size, - self.output_dim, - bias=False, - ) + if self.post_process: + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + self.head = torch.nn.Linear( + self.config.hidden_size, + self.output_dim, + bias=False, + ) def forward(self, x): x = super().forward( x, ) - x = self.final_layernorm(x) - x = x[:, 0] - x = self.head(x) + if self.post_process: + x = self.final_layernorm(x) + x = x[:, 0] + x = self.head(x) return x diff --git a/nemo/collections/vlm/__init__.py b/nemo/collections/vlm/__init__.py new file mode 100644 index 000000000000..2aeeae299a7d --- /dev/null +++ b/nemo/collections/vlm/__init__.py @@ -0,0 +1,41 @@ +from nemo.collections.vlm.neva.data import ( + DataConfig, + ImageDataConfig, + ImageToken, + MockDataModule, + MultiModalToken, + NevaLazyDataModule, + VideoDataConfig, + VideoToken, +) +from nemo.collections.vlm.neva.model import ( + CLIPViTConfig, + HFCLIPVisionConfig, + Llava1_5Config7B, + Llava1_5Config13B, + LlavaConfig, + LlavaModel, + MultimodalProjectorConfig, + NevaConfig, + NevaModel, +) + +__all__ = [ + "MockDataModule", + "NevaLazyDataModule", + "DataConfig", + "ImageDataConfig", + "VideoDataConfig", + "MultiModalToken", + "ImageToken", + "VideoToken", + "CLIPViTConfig", + "HFCLIPVisionConfig", + "MultimodalProjectorConfig", + "NevaConfig", + "NevaModel", + "LlavaConfig", + "Llava1_5Config7B", + "Llava1_5Config13B", + "LlavaModel", +] diff --git a/nemo/collections/vlm/neva/__init__.py b/nemo/collections/vlm/neva/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/vlm/neva/data/__init__.py b/nemo/collections/vlm/neva/data/__init__.py new file mode 100644 index 000000000000..bbd502e21c80 --- /dev/null +++ 
b/nemo/collections/vlm/neva/data/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig, VideoDataConfig +from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule +from nemo.collections.vlm.neva.data.mock import MockDataModule +from nemo.collections.vlm.neva.data.multimodal_tokens import ImageToken, MultiModalToken, VideoToken + +__all__ = [ + "NevaLazyDataModule", + "MockDataModule", + "DataConfig", + "ImageDataConfig", + "VideoDataConfig", + "MultiModalToken", + "ImageToken", + "VideoToken", +] diff --git a/nemo/collections/vlm/neva/data/api.py b/nemo/collections/vlm/neva/data/api.py new file mode 100644 index 000000000000..c2e51e033d8a --- /dev/null +++ b/nemo/collections/vlm/neva/data/api.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytorch_lightning as pl + +from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule +from nemo.collections.vlm.neva.data.mock import MockDataModule + + +def mock() -> pl.LightningDataModule: + return MockDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) + + +def lazy() -> pl.LightningDataModule: + return NevaLazyDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) + + +__all__ = ["mock", "lazy"] diff --git a/nemo/collections/vlm/neva/data/config.py b/nemo/collections/vlm/neva/data/config.py new file mode 100644 index 000000000000..3b22d5a493b3 --- /dev/null +++ b/nemo/collections/vlm/neva/data/config.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
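For reference, a minimal sketch of how the `mock()` / `lazy()` factories added in `data/api.py` above are meant to be consumed. The Trainer wiring is an assumption for illustration and is not part of this patch; note also that `lazy()` as defined does not pass `paths`, which `NevaLazyDataModule` requires, so only the mock path is exercised here.

    import pytorch_lightning as pl

    from nemo.collections.vlm.neva.data.api import mock

    # MockDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2)
    datamodule = mock()

    trainer = pl.Trainer(max_steps=10)             # hypothetical trainer settings
    # trainer.fit(model, datamodule=datamodule)    # `model` would be a NevaModel instance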
+ +from dataclasses import dataclass +from typing import Optional + +from .multimodal_tokens import ImageToken, MultiModalToken, VideoToken + + +@dataclass +class DataConfig: + media_type: str # currently supported: image or video + media_token: MultiModalToken + conv_template: str = "v1" # check `nemo/collections/multimodal/data/neva/conversation.py` + reset_position_ids: bool = False # Option to reset the position IDs in the dataset at an interval + reset_attention_mask: bool = False # Option to reset the attention mask from the dataset + eod_mask_loss: bool = False # Option to enable the EOD mask loss + + +@dataclass +class ImageDataConfig(DataConfig): + media_type: str = "image" + media_token: MultiModalToken = ImageToken + image_folder: Optional[str] = None + image_process_mode: str = 'pad' + + +@dataclass +class VideoDataConfig(DataConfig): + media_type: str = "video" + media_token: MultiModalToken = VideoToken + splice_single_frame: Optional[str] = None + # 'first', 'middle', 'last' will represent video as first / middle / last frame only, all other frames discarded. + num_frames: int = 8 # Selects the number of frames to use from the video + sep_token_between_frames: bool = False # TODO: Allow usage of separator tokens between frames + video_folder: Optional[str] = None diff --git a/nemo/collections/vlm/neva/data/conversation.py b/nemo/collections/vlm/neva/data/conversation.py new file mode 100644 index 000000000000..22c435cb1fd2 --- /dev/null +++ b/nemo/collections/vlm/neva/data/conversation.py @@ -0,0 +1,677 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import dataclasses +import re +from collections import defaultdict +from enum import Enum, auto +from io import BytesIO +from typing import Any, List, Optional, Union + +from PIL import Image +from transformers import AutoTokenizer + + +class SeparatorStyle(Enum): + """Different separator style.""" + + SINGLE = auto() + TWO = auto() + MPT = auto() + PLAIN = auto() + CHATML = auto() + LLAMA_2 = auto() + LLAMA_3 = auto() + MISTRAL = auto() + NVGPT = auto() + QWEN = auto() + GEMMA = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + + system: Optional[str] + roles: tuple[str, str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle = SeparatorStyle.SINGLE + sep: str = "###" + sep2: str = None + version: str = "Unknown" + + tokenizer_name_or_path: Any = None + stop_str: Union[str, List[str]] = None + stop_token_ids: List[int] = None + + skip_next: bool = False + + def process_prompt_with_images(self, messages): + # Process messages to handle potential image tokens. 
+ return messages + + def process_chat_template(self, tokenizer_name_or_path, messages): + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) + if self.system is None: + chat = [] + else: + chat = [{"role": "system", "content": self.system}] + for role, message in messages: + chat.append({"role": role.lower(), "content": message}) + ret = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) + return ret + + def get_prompt(self): + messages = self.messages + messages = self.process_prompt_with_images(messages) + + if self.sep_style == SeparatorStyle.SINGLE: + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + self.sep + else: + ret += role + ":" + + elif self.sep_style == SeparatorStyle.TWO: + """ + A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {{ user_message_1 }} ASSISTANT: {{ model_answer_1 }}USER: {{ user_message_2 }} + """ + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + + elif self.sep_style == SeparatorStyle.MISTRAL and self.version == "vila": + """ + [INST] {{ user_message_1 }} [/INST]{{ model_answer_1 }}[INST] {{ user_message_2 }} [/INST] + """ + wrap_sys = lambda msg: f"{msg}" + ("\n" if msg else "") + wrap_inst = lambda msg: f"[INST] {msg} [/INST]" + ret = "" + + for i, (role, message) in enumerate(messages): + if i == 0: + assert message, "first message should not be none" + assert role == self.roles[0], "first message should come from user" + if message: + if type(message) is tuple: + message, _, _ = message + if i == 0: + message = wrap_sys(self.system) + message + if i % 2 == 0: + message = wrap_inst(message) + ret += self.sep + message + else: + ret += message + self.sep2 + else: + ret += "" + + elif self.sep_style == SeparatorStyle.LLAMA_2: + """ + [INST] <> + You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. 
+ <> + + {{ user_message_1 }} [/INST] {{ model_answer_1 }} [INST] {{ user_message_2 }} [/INST] + """ + tokenizer_name_or_path = self.tokenizer_name_or_path or "meta-llama/Llama-2-7b-chat-hf" + ret = self.process_chat_template(tokenizer_name_or_path, messages) + + elif self.sep_style == SeparatorStyle.LLAMA_3: + """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + + {{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|> + + {{ user_message_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + + {{ model_answer_1 }}<|eot_id|><|start_header_id|>user<|end_header_id|> + + {{ user_message_2 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + """ + tokenizer_name_or_path = self.tokenizer_name_or_path or "meta-llama/Meta-Llama-3-8B-Instruct" + ret = self.process_chat_template(tokenizer_name_or_path, messages) + + elif self.sep_style == SeparatorStyle.NVGPT: + ret = self.sep2 + self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + '\n' + message + '\n' + self.sep + else: + ret += role + '\n' + + elif self.sep_style == SeparatorStyle.PLAIN: + seps = [self.sep, self.sep2] + ret = self.system + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += message + seps[i % 2] + else: + ret += "" + + elif self.sep_style == SeparatorStyle.MISTRAL: + """ + NOT tested in NeMo! + """ + tokenizer_name_or_path = self.tokenizer_name_or_path or "mistralai/Mistral-7B-Instruct-v0.2" + ret = self.process_chat_template(tokenizer_name_or_path, messages) + + elif self.sep_style == SeparatorStyle.CHATML: + """ + NOT tested in NeMo! + """ + ret = "" if self.system == "" else self.system + self.sep + "\n" + for role, message in messages: + if message: + if type(message) is tuple: + message, images = message + message = "" * len(images) + message + ret += role + "\n" + message + self.sep + "\n" + else: + ret += role + "\n" + return ret + + elif self.sep_style == SeparatorStyle.MPT: + """ + NOT tested in NeMo! + """ + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + + elif self.sep_style == SeparatorStyle.GEMMA: + """ + NOT tested in NeMo! + """ + ret = "" + for i, (role, message) in enumerate(messages): + assert role == self.roles[i % 2], "Conversation should alternate user/assistant/user/assistant/..." 
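To make the `SeparatorStyle.TWO` branch above concrete, a prompt for the vicuna-v1 template (registered later in this file as `conv_templates["v1"]`) would be assembled roughly as follows; the turns and the `<image>` placeholder string are illustrative assumptions:

    from nemo.collections.vlm.neva.data.conversation import conv_templates

    conv = conv_templates["v1"].copy()  # roles ("USER", "ASSISTANT"), SeparatorStyle.TWO
    conv.append_message(conv.roles[0], "<image>\nWhat is shown in this image?")
    conv.append_message(conv.roles[1], "A golden retriever playing in the snow.")
    prompt = conv.get_prompt()
    # -> "<system prompt> USER: <image>\n... ASSISTANT: A golden retriever playing in the snow." + sep2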
+ if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + return ret + + def append_message(self, role, message): + self.messages.append([role, message]) + + def process_image(self, image, image_process_mode, return_pil=False, image_format="PNG"): + if image_process_mode == "Pad": + + def expand2square(pil_img, background_color=(122, 116, 104)): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square(image) + elif image_process_mode in ["Default", "Crop"]: + pass + elif image_process_mode == "Resize": + image = image.resize((336, 336)) + else: + raise ValueError(f"Invalid image_process_mode: {image_process_mode}") + + if type(image) is not Image.Image: + image = Image.open(image).convert("RGB") + + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 1008, 672 + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + if return_pil: + return image + else: + buffered = BytesIO() + image.save(buffered, format=image_format) + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + return img_b64_str + + def get_images(self, return_pil=False, return_path=False): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + if type(image) != list: + image = [image] + for img in image: + if not return_path: + img = self.process_image(img, image_process_mode, return_pil=return_pil) + images.append(img) + return images + + def to_gradio_chatbot(self): + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + if type(image) != list: + image = [image] + if len(image) == 1: + msg = "\n" + msg.replace("", "").strip() + else: + msg = re.sub(r"()\n(?=)", r"\1 ", msg) + for img in image: + img_b64_str = self.process_image(img, "Default", return_pil=False, image_format="JPEG") + img_str = f'' + msg = msg.replace("", img_str, 1).strip() + if len(msg) > 0: + ret.append([msg, None]) + else: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + version=self.version, + ) + + def dict(self): + if len(self.get_images()) > 0: + return { + "system": self.system, + "roles": self.roles, + "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages], + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + return { + "system": self.system, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + + +# Conversation Template for NVGPT +conv_nvgpt = Conversation( + 
system="""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\n""", + roles=("User", "Assistant"), + version="nvgpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.NVGPT, + sep="", + sep2=f"System\n", +) + +conv_nv_dpo = Conversation( + system="\n", + roles=("User", "Assistant"), + version="nv_dpo", + messages=[], + offset=0, + sep_style=SeparatorStyle.NVGPT, + sep="", + sep2=f"System\n", +) + +conv_vicuna_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=[ + ["Human", "What are the key differences between renewable and non-renewable energy sources?"], + [ + "Assistant", + "Renewable energy sources are those that can be replenished naturally in a relatively " + "short amount of time, such as solar, wind, hydro, geothermal, and biomass. " + "Non-renewable energy sources, on the other hand, are finite and will eventually be " + "depleted, such as coal, oil, and natural gas. Here are some key differences between " + "renewable and non-renewable energy sources:\n" + "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable " + "energy sources are finite and will eventually run out.\n" + "2. Environmental impact: Renewable energy sources have a much lower environmental impact " + "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, " + "and other negative effects.\n" + "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically " + "have lower operational costs than non-renewable sources.\n" + "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote " + "locations than non-renewable sources.\n" + "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different " + "situations and needs, while non-renewable sources are more rigid and inflexible.\n" + "6. Sustainability: Renewable energy sources are more sustainable over the long term, while " + "non-renewable sources are not, and their depletion can lead to economic and social instability.\n", + ], + ], + offset=2, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_vicuna_v1 = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=[], + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", + stop_str="", +) + +conv_llama_2 = Conversation( + system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.""", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_llava_llama_2 = Conversation( + system="You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_llava_llama_3 = Conversation( + system="You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", + roles=("user", "assistant"), + version="llama_v3", + messages=[], + offset=0, + sep="<|eot_id|>", + sep_style=SeparatorStyle.LLAMA_3, + tokenizer_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct", + stop_str="<|eot_id|>", +) + +conv_mistral_instruct = Conversation( + system="", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_llava_llama_2_simple = Conversation( + system="Answer the questions about the visual content that the user provides.", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_llava_llama_2_mmtag = Conversation( + system="Answer the questions about the visual content that the user provides." + "The visual content will be provided with the following format: visual content.", + roles=("USER", "ASSISTANT"), + version="llama_v2_mmtag", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_mpt = Conversation( + system="""<|im_start|>system +A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_qwen = Conversation( + system="""<|im_start|>system +You are a helpful assistant.""", + roles=("<|im_start|>user", "<|im_start|>assistant"), + version="qwen", + messages=[], + offset=0, + sep_style=SeparatorStyle.CHATML, + sep="<|im_end|>", +) + +conv_gemma_instruct = Conversation( + system="", + roles=("user\n", "model\n"), + version="gemma", + messages=[], + offset=0, + sep_style=SeparatorStyle.GEMMA, + sep="\n", +) + +conv_llava_plain = Conversation( + system="", + roles=("", ""), + messages=[], + offset=0, + sep_style=SeparatorStyle.PLAIN, + sep="", + sep2="\n", + stop_str="\n", +) + +conv_llava_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=[], + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_llava_v0_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." 
+ "The visual content will be provided with the following format: visual content.", + roles=("Human", "Assistant"), + messages=[], + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", + version="v0_mmtag", +) + +conv_llava_v1 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=[], + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", +) + +conv_llava_v1_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." + "The visual content will be provided with the following format: visual content.", + roles=("USER", "ASSISTANT"), + messages=[], + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", + version="v1_mmtag", +) + +conv_mistral_vila = Conversation( + system=None, + roles=("USER", "ASSISTANT"), + version="vila", + messages=[], + offset=0, + sep_style=SeparatorStyle.MISTRAL, + sep="", + sep2="", + stop_str="", +) + +conv_mistral_orca = Conversation( + system="""<|im_start|>system +You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_mistral_zephyr = Conversation( + system="""<|system|> +You are a helpful AI assistant.""", + roles=("<|user|>\n", "<|assistant|>\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="", +) + +conv_mistral_direct = Conversation( + system="""<|im_start|>system +Answer the questions.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_chatml_direct = Conversation( + system="""<|im_start|>system +Answer the questions.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +default_conversation = conv_vicuna_v1 +conv_templates = { + "default": conv_vicuna_v1, + "v0": conv_vicuna_v0, + "v1": conv_vicuna_v1, + "vicuna_v1": conv_vicuna_v1, + "llama_2": conv_llama_2, + "mistral_instruct": conv_mistral_instruct, + "mistral_orca": conv_mistral_orca, + "mistral_zephyr": conv_mistral_zephyr, + "mistral_direct": conv_mistral_direct, + "mistral": conv_mistral_vila, + "plain": conv_llava_plain, + "v0_plain": conv_llava_plain, + "chatml_direct": conv_chatml_direct, + "llava_v0": conv_llava_v0, + "llava_v0_mmtag": conv_llava_v0_mmtag, + "llava_v1": conv_llava_v1, + "llava_v1_mmtag": conv_llava_v1_mmtag, + "llava_llama_2": conv_llava_llama_2, + "llava_llama_3": conv_llava_llama_3, + "llava_llama_2_simple": conv_llava_llama_2_simple, + "llava_llama_2_mmtag": conv_llava_llama_2_mmtag, + "llava_mistral_instruct": conv_mistral_instruct, + "mpt": conv_mpt, + "qwen_1_5": conv_qwen, + "gemma_instruct": conv_gemma_instruct, + "nvgpt": conv_nvgpt, + "nv_steerlm": conv_nvgpt, + "nv_dpo": conv_nv_dpo, +} + +if __name__ == "__main__": + print(default_conversation.get_prompt()) diff --git a/nemo/collections/vlm/neva/data/lazy.py 
b/nemo/collections/vlm/neva/data/lazy.py new file mode 100644 index 000000000000..ca1179e24033 --- /dev/null +++ b/nemo/collections/vlm/neva/data/lazy.py @@ -0,0 +1,612 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING, Optional + +import pytorch_lightning as pl +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch.utils import data +from torch.utils.data import DataLoader + +from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig +from nemo.collections.vlm.neva.data.conversation import conv_templates as supported_conv_templates +from nemo.lightning.pytorch.plugins import MegatronDataSampler + +if TYPE_CHECKING: + pass + +import json +import logging +import os +import re +import tarfile +from typing import Any, Dict, List, Sequence + +import decord +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from torch.utils.data import Dataset, default_collate +from transformers import CLIPImageProcessor, SiglipImageProcessor + +from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.collections.vlm.neva.data.multimodal_tokens import IGNORE_INDEX, SPECIAL_TOKEN_MAP + + +class TarOrFolderImageLoader: + """ + A class for loading images from a tar archive or a regular folder. + + This class provides functionality to open and read images from either a tar archive + (.tar file) or a standard directory with image files. It builds an index of images + if the source is a tar archive for efficient access. + + Attributes: + image_folder (str): The path to the tar archive or image folder. + tar_index (dict): A dictionary that maps file names to their tarfile member + objects if the image source is a tar archive. + + Methods: + __init__(self, image_folder): Initializes the loader with the specified image folder. + build_index(self): Builds an index of image file names and their corresponding + tarfile member objects for a tar archive. + open_image(self, file_name): Opens and returns an image by its file name. The image + is returned as an RGB PIL Image object. + """ + + def __init__(self, image_folder): + self.image_folder = image_folder + self.tar_index = {} + if self.image_folder.endswith('.tar'): + self.build_index() + + def build_index(self): + with tarfile.open(self.image_folder, 'r') as tar: + for member in tar.getmembers(): + self.tar_index[member.name] = member + + def open_image(self, file_name): + if self.image_folder.endswith('.tar'): + with tarfile.open(self.image_folder, 'r') as tar: + member = self.tar_index.get(file_name) + if member: + f = tar.extractfile(member) + return Image.open(f).convert('RGB') + else: + return Image.open(os.path.join(self.image_folder, file_name)).convert('RGB') + return None + + +class TarOrFolderVideoLoader: + """ + A class for loading videos from a tar archive or a regular folder. 
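A short usage sketch for the `TarOrFolderImageLoader` defined above; the archive path and file name are placeholders:

    loader = TarOrFolderImageLoader("/data/llava/images.tar")  # a .tar archive or a plain image directory
    img = loader.open_image("0001.jpg")                        # RGB PIL.Image, or None if the tar member is missing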
+ + This class provides functionality to open and read videos from either a tar archive + (.tar file) or a standard directory with video files. It builds an index of videos + if the source is a tar archive for efficient access. + + Attributes: + video_folder (str): The path to the tar archive or video folder. + data_config (dict): A dictionary of configuration options for video decoding to frames + tar_index (dict): A dictionary that maps file names to their tarfile member + objects if the video source is a tar archive. + + Methods: + __init__(self, video_folder): Initializes the loader with the specified video folder. + build_index(self): Builds an index of image file names and their corresponding + tarfile member objects for a tar archive. + open_video(self, file_name): Opens and returns an video by its file name. The video + is returned as a list of RGB PIL Image objects. + flatten_frames(self, cap): Converts decord VideoReader video object to list of frame + images based on data config information. + """ + + def __init__(self, video_folder, data_config): + self.video_folder = video_folder + self.data_config = data_config + self.tar_index = {} + if self.video_folder.endswith('.tar'): + self.build_index() + + def build_index(self): + with tarfile.open(self.video_folder, 'r') as tar: + for member in tar.getmembers(): + self.tar_index[member.name] = member + + def open_video(self, file_name): + if self.video_folder.endswith('.tar'): + with tarfile.open(self.video_folder, 'r') as tar: + member = self.tar_index.get(file_name) + if member: + f = tar.extractfile(member) + cap = decord.VideoReader(f) + return self.flatten_frames(cap) + else: + # decord.bridge.set_bridge("torch") + cap = decord.VideoReader(os.path.join(self.video_folder, file_name)) + return self.flatten_frames(cap) + return None + + def flatten_frames(self, cap): + if self.data_config.splice_single_frame == 'first': + frame = cap[0].asnumpy() + return Image.fromarray(frame).convert('RGB') + elif self.data_config.splice_single_frame == 'middle': + frame = cap[len(cap) // 2].asnumpy() + return Image.fromarray(frame).convert('RGB') + elif self.data_config.splice_single_frame == 'last': + frame = cap[-1].asnumpy() + return Image.fromarray(frame).convert('RGB') + else: + if self.data_config.num_frames == -1: + frames = [] + for frame in cap: + rgb_frame = frame.asnumpy() + img = Image.fromarray(rgb_frame).convert('RGB') + frames.append(img) + return frames + else: + num_frames = min(len(cap), self.data_config.num_frames) + indices = np.linspace(0, len(cap) - 1, num_frames, dtype=int) + frames = [Image.fromarray(cap[i].asnumpy()).convert('RGB') for i in indices] + while len(frames) < self.data_config.num_frames: + frames.append(frames[-1]) + return frames + + +def process_image(processor, image, image_process_mode="square"): # this needs to be merged with conv's process image + if isinstance(processor, CLIPImageProcessor) or isinstance(processor, SiglipImageProcessor): + # image processor from HF + if image_process_mode == 'keep': + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 448, 224 + shortest_edge = int(min(max_len / aspect_ratio, min_len)) + image = processor.preprocess( + image, return_tensors='pt', do_center_crop=False, size={"shortest_edge": shortest_edge} + )['pixel_values'][0] + elif image_process_mode == 'pad': + + def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = 
Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean)) + image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + else: + image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + else: + assert image_process_mode == 'square', 'NeMo image transform with setting `image_process_mode` to `square`.' + image = processor(image) + return image + + +def tokenize_special_token(prompt, tokenizer, special_token_map=None): + """ + Tokenizes a given prompt with special handling for multiple special tokens. + + This function splits the prompt at special tokens, tokenizes each chunk separately, + and then reassembles the chunks with the corresponding special token inserted in place of the placeholders. + + Parameters: + prompt (str): The input prompt containing text and special token placeholders. + tokenizer: The tokenizer object used to tokenize the prompt chunks. + special_token_map (list, optional): A list containing tuples of special token strings + and their corresponding token indices. Defaults to SPECIAL_TOKEN_MAP. + + Returns: + torch.Tensor: A tensor of token IDs representing the tokenized prompt with special tokens. + """ + + # Use the default special token map if none is provided + if special_token_map is None: + special_token_map = SPECIAL_TOKEN_MAP + + # Create a mapping of special tokens to their indices + special_token_dict = {token: index for token, index in special_token_map} + + # Split the prompt into chunks and track special tokens + regex_pattern = '(' + '|'.join(re.escape(token) for token in special_token_dict.keys()) + ')' + chunks = re.split(regex_pattern, prompt) + + # Tokenize each chunk and replace special tokens with their indices + tokenized_chunks = [] + for chunk in chunks: + if chunk in special_token_dict: + tokenized_chunks.append(special_token_dict[chunk]) + elif len(chunk) > 0: + tokenized_chunks.extend(tokenizer(chunk, add_special_tokens=False).input_ids) + + return torch.tensor(tokenized_chunks, dtype=torch.long) + + +def find_pattern_indices(template, pattern, search_start_index=0, allow_first_token_mismatch=False): + template_len = len(template) + pattern_len = len(pattern) + for i in range(search_start_index, template_len - pattern_len + 1): + match = template[i : i + pattern_len] == pattern + if torch.all(match) or (allow_first_token_mismatch and torch.all(match[1:])): + return i, i + pattern_len + return -1, -1 + + +class LazySupervisedDataset(Dataset): + + def __init__( + self, + data_path, + data_config, + tokenizer, + image_processor, + ): + super().__init__() + if data_path is not None: + with open(data_path, "r") as file: + list_data_dict = json.load(file) + else: + list_data_dict = [] + + logging.warning("Formatting inputs...Skip in lazy mode") + self.data_config = data_config + self.tokenizer = tokenizer + self.image_processor = image_processor + + self.conv_template = data_config.conv_template + self.conv = supported_conv_templates[self.conv_template] + self.image_process_mode = data_config.image_process_mode + self.list_data_dict = list_data_dict + + image_folder = getattr(data_config, "image_folder", None) + video_folder = getattr(data_config, "video_folder", None) + + self.image_loader = 
TarOrFolderImageLoader(image_folder) if image_folder else None + self.video_loader = TarOrFolderVideoLoader(video_folder, data_config) if video_folder else None + + def __len__(self): + return len(self.list_data_dict) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + source = self.list_data_dict[i] + conversations = self._apply_prompt_templates(source, use_plain=self.conv_template == "plain") + tokens, labels = self._tokenize_and_label(conversations) + + media_tensors = self._process_images(source) + data_dict = dict( + image=media_tensors, + tokens=tokens, + labels=labels, + ) + return data_dict + + def _process_images(self, source): + media_tensors = torch.tensor([]) + if 'image' in source: + if not isinstance(source['image'], list): + source['image'] = [source['image']] + + images = [] + for image_file in source['image']: + image = self.image_loader.open_image(image_file) + if image is None: + logging.warning(f"Image {image_file} could not be found!") + image = process_image(self.image_processor, image, self.image_process_mode) + images.append(image) + + if images: + media_tensors = torch.stack(images) + return media_tensors + + def _apply_prompt_templates(self, source, use_plain=False): + conv = self.conv + + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + source = source['conversations'] + if roles[source[0]["from"]] != conv.roles[0]: + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{j}" + conv.append_message(role, sentence["value"]) + + if use_plain: + assert len(conv.messages) == 2, "Plain template requires image-caption pairs." + assert "" in conv.messages[0][1] + conv.messages[0][1] = "" + + return conv.get_prompt() + + def _tokenize_and_label(self, conversations): + tokens = tokenize_special_token(conversations, self.tokenizer) + labels = torch.ones_like(tokens) * IGNORE_INDEX + search_start_index = 0 + for i in range(1, len(self.conv.messages), 2): + stop_str = getattr(self.conv, "stop_str", None) + assert ( + stop_str is not None + ), "If `stop_str` is not provided, issues might occur in labeling the answer tokens." 
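To illustrate the labeling flow used by `_tokenize_and_label` above, here is a small sketch of `tokenize_special_token` and `find_pattern_indices`. The tokenizer checkpoint is the same `llava-hf/llava-1.5-7b-hf` fallback used elsewhere in this patch, and the `<image>` placeholder is assumed to resolve to `ImageToken.token_index` via `SPECIAL_TOKEN_MAP`:

    from transformers import AutoTokenizer

    from nemo.collections.vlm.neva.data.lazy import find_pattern_indices, tokenize_special_token
    from nemo.collections.vlm.neva.data.multimodal_tokens import IGNORE_INDEX

    tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-1.5-7b-hf")

    prompt = "USER: <image>\nWhat is shown? ASSISTANT: A dog."  # illustrative single-turn prompt
    tokens = tokenize_special_token(prompt, tokenizer)           # the image placeholder becomes a negative token id

    # Only the assistant answer is supervised; every other position stays at IGNORE_INDEX.
    answer = tokenizer.encode("A dog.", add_special_tokens=False, return_tensors="pt")[0]
    start, end = find_pattern_indices(tokens, answer)
    labels = tokens.clone().fill_(IGNORE_INDEX)
    labels[start:end] = tokens[start:end]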
+ answer_tokens = self.tokenizer.encode( + self.conv.messages[i][1] + ("" if stop_str is None else stop_str), + add_special_tokens=False, + return_tensors="pt", + )[0] + answer_start, answer_end = find_pattern_indices(tokens, answer_tokens, search_start_index) + labels[answer_start:answer_end] = tokens[answer_start:answer_end] + search_start_index = answer_end + tokens = tokens[:-1] + labels = labels[1:] + return tokens, labels + + def _get_crop_size(self): + if isinstance(self.image_processor, CLIPImageProcessor): + return [self.image_processor.crop_size['height'], self.image_processor.crop_size['width']] + else: + raise NotImplementedError + + +class NevaDataset(LazySupervisedDataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, + data_path, + data_config, + tokenizer, + image_processor, + ): + + if data_path.endswith(".json"): + super().__init__(data_path, data_config, tokenizer, image_processor) + + elif data_path.endswith(".jsonl"): + super().__init__(None, data_config, tokenizer, image_processor) + logging.warning("Loading image inputs from SteerLM Dataset...") + if data_config.media_type == 'image': + image_folder = data_config.image_folder + for line in open(data_path, "r"): + record = json.loads(line) + + # This currently supports only a single image + # search for tag + + record['image'] = [] + for turn in record['conversations']: + matches = re.finditer('', "", turn['value']) + + self.list_data_dict.append(record) + + else: + raise ValueError(f"Formatting of {data_path} is not supported in Neva.") + + def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + data_config = self.data_config + packed_sequence = "cu_seqlens" in instances[0] + max_len = max(instance['tokens'].shape[0] for instance in instances) + for instance in instances: + pad_len = max_len - instance['tokens'].shape[0] + instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0) + instance['labels'] = F.pad(instance['labels'], (0, pad_len), 'constant', IGNORE_INDEX) + if packed_sequence and instance["cu_seqlens"][-1] != max_len: + instance["cu_seqlens"] = torch.cat((instance["cu_seqlens"], torch.IntTensor([max_len])), 0) + + if packed_sequence: + max_len_cu = max(instance['cu_seqlens'].shape[0] for instance in instances) + max_len_image = max(instance['image'].shape[0] for instance in instances) + for instance in instances: + pad_len_cu = max_len_cu - instance['cu_seqlens'].shape[0] + instance['cu_seqlens'] = F.pad(instance['cu_seqlens'], (0, pad_len_cu), 'constant', max_len) + + x = instance['image'] + num_pad = max_len_image - x.shape[0] + pad_tensor = torch.zeros(num_pad, *x.shape[1:], dtype=x.dtype, device=x.device) + instance['image'] = torch.cat((x, pad_tensor), dim=0) + + media_type = data_config.media_type + if media_type == 'image': + media = [instance.pop('image') for instance in instances] + media = torch.cat(media, dim=0) + if media.size(0) == 0: + media = None + elif media_type == 'video': + media = [instance.pop('video', None) for instance in instances] + else: + raise ValueError(f"Unsupported media type {media_type}") + + batch = default_collate(instances) + tokenizer = self.tokenizer + + tokens = batch['tokens'] + labels = batch['labels'] + + if packed_sequence: + cu_seqlens = batch["cu_seqlens"] + position_ids = [] + for cu_seqlen in cu_seqlens: + position_ids.append([]) + for ind in range(0, len(cu_seqlen) - 1): + seqlen = cu_seqlen[ind + 1] - cu_seqlen[ind] + position_ids[-1].extend(list(range(seqlen))) + position_ids = 
torch.LongTensor(position_ids) + loss_mask = torch.ones(tokens.size(), dtype=torch.float, device=tokens.device) + attention_mask = torch.ones(tokens.size(), dtype=torch.long, device=tokens.device) + else: + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + data=tokens, + eod_token=tokenizer.eos_token_id, + eod_mask_loss=data_config.eod_mask_loss, + reset_attention_mask=data_config.reset_attention_mask, + reset_position_ids=data_config.reset_position_ids, + ) + + loss_mask[labels < 0] = 0.0 + + batch = { + 'tokens': tokens, + 'labels': labels, + 'attention_mask': attention_mask, + 'loss_mask': loss_mask, + 'position_ids': position_ids, + 'media': media, + } + if packed_sequence: + batch["cu_seqlens"] = cu_seqlens + return batch + + +class NevaLazyDataModule(pl.LightningDataModule): + def __init__( + self, + paths: str | List[str], + weights: Optional[List[float]] = None, + data_config: Optional[DataConfig] = ImageDataConfig, + seq_length: int = 2048, + tokenizer: Optional = None, + image_processor: Optional = None, + micro_batch_size: int = 4, + global_batch_size: int = 8, + num_train_samples: int = 10_000, + num_val_samples: int = 10_000, + num_test_samples: int = 10_000, + num_workers: int = 8, + pin_memory: bool = True, + persistent_workers: bool = False, + use_packed_sequence: bool = False, + seed: int = 1234, + ) -> None: + super().__init__() + if not isinstance(paths, (list, tuple)): + paths = [paths] + if weights is not None: + assert len(weights) == len(paths) + if len(weights) == 1: + # weights must be None if there is only one dataset + weights = None + + self.paths = paths + self.weights = weights + self.data_config = data_config + self.seq_length = seq_length + self.tokenizer = tokenizer + self.image_processor = image_processor + self.num_train_samples = num_train_samples + self.num_val_samples = num_val_samples + self.num_test_samples = num_test_samples + self.num_workers = num_workers + self.pin_memory = pin_memory + self.persistent_workers = persistent_workers + self.seed = seed + self.use_packed_sequence = use_packed_sequence + self.init_global_step = 0 + + if tokenizer is None or image_processor is None: + logging.warning(f"Processor and tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.") + from transformers import AutoProcessor + + processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + self.tokenizer = tokenizer or processor.tokenizer + self.image_processor = image_processor or processor.image_processor + + self.data_sampler = MegatronDataSampler( + seq_len=self.seq_length, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + dataloader_type="cyclic", + ) + + def setup(self, stage: str = "") -> None: + assert len(self.paths) == 1, "not yet support blend dataset in Neva 2.0!" 
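A minimal construction sketch for `NevaLazyDataModule` as defined above. The JSON and image-folder paths are placeholders, and the tokenizer/image processor fall back to the `llava-hf/llava-1.5-7b-hf` processor when not supplied:

    from nemo.collections.vlm.neva.data import ImageDataConfig, NevaLazyDataModule

    data_config = ImageDataConfig(image_folder="/data/llava/images", conv_template="v1")
    datamodule = NevaLazyDataModule(
        paths="/data/llava/train.json",   # LLaVA-style conversations JSON (placeholder path)
        data_config=data_config,
        seq_length=2048,
        micro_batch_size=2,
        global_batch_size=8,
    )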
+ if self.use_packed_sequence: + pass # TODO + else: + # TODO: + # rng = torch.Generator().manual_seed(self.seed) + # train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size], generator=rng) + self._train_ds = NevaDataset(self.paths[0], self.data_config, self.tokenizer, self.image_processor) + self._validation_ds = NevaDataset(self.paths[0], self.data_config, self.tokenizer, self.image_processor) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + return self._create_dataloader(self._train_ds) + + def val_dataloader(self) -> EVAL_DATALOADERS: + return self._create_dataloader(self._validation_ds) + + def test_dataloader(self) -> EVAL_DATALOADERS: + return self._create_dataloader(self._test_ds) + + def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + self.init_global_step = self.trainer.global_step + self.data_sampler.init_global_step = self.init_global_step + return DataLoader( + dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=getattr(dataset, 'collate_fn', data.dataloader.default_collate), + **kwargs, + ) + + def state_dict(self) -> Dict[str, Any]: + """Called when saving a checkpoint, implement to generate and save datamodule state. + + Returns: + A dictionary containing datamodule state. + + """ + consumed_samples = self.data_sampler.compute_consumed_samples(self.trainer.global_step - self.init_global_step) + return {'consumed_samples': consumed_samples} + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + """Called when loading a checkpoint, implement to reload datamodule state given datamodule stat + + Args: + state_dict: the datamodule state returned by ``state_dict``. + + """ + try: + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + except ModuleNotFoundError: + from nemo.lightning.apex_utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + consumed_samples = state_dict['consumed_samples'] + self.data_sampler.init_consumed_samples = consumed_samples + self.data_sampler.prev_consumed_samples = consumed_samples + self.if_first_step = 1 + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is not None: + num_microbatch_calculator = _GLOBAL_NUM_MICROBATCHES_CALCULATOR # noqa: SLF001 + + num_microbatch_calculator.update( + consumed_samples=consumed_samples, + consistency_check=False, + ) diff --git a/nemo/collections/vlm/neva/data/mock.py b/nemo/collections/vlm/neva/data/mock.py new file mode 100644 index 000000000000..ac4bc56a068c --- /dev/null +++ b/nemo/collections/vlm/neva/data/mock.py @@ -0,0 +1,179 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
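As a quick smoke test for the mock data module defined below in this file, a single sample can be inspected directly; the sizes are illustrative and the processor download is the same `llava-hf/llava-1.5-7b-hf` fallback used above:

    from nemo.collections.vlm.neva.data.mock import MockDataModule

    dm = MockDataModule(seq_length=512, micro_batch_size=1, global_batch_size=1, num_workers=0)
    dm.setup()
    sample = dm._train_ds[0]
    # keys: media [3, H, W] float32, tokens/labels [512] int64, loss_mask, position_ids;
    # token position 2 is forced to IMAGE_TOKEN_INDEX so the image-insertion path is exercised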
+ +from typing import Dict, List, Optional + +import numpy as np +import pytorch_lightning as pl +import torch +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch.utils import data +from torch.utils.data import DataLoader, Dataset + +from nemo.collections.vlm.neva.data.multimodal_tokens import IMAGE_TOKEN_INDEX +from nemo.lightning.pytorch.plugins import MegatronDataSampler + + +class MockDataModule(pl.LightningDataModule): + def __init__( + self, + seq_length: int = 2048, + tokenizer: Optional = None, + image_processor: Optional = None, + micro_batch_size: int = 4, + global_batch_size: int = 8, + rampup_batch_size: Optional[List[int]] = None, + num_train_samples: int = 10_000, + num_val_samples: int = 10_000, + num_test_samples: int = 10_000, + num_workers: int = 8, + pin_memory: bool = True, + persistent_workers: bool = False, + ): + super().__init__() + self.seq_length = seq_length + self.num_train_samples = num_train_samples + self.num_val_samples = num_val_samples + self.num_test_samples = num_test_samples + self.num_workers = num_workers + self.pin_memory = pin_memory + self.persistent_workers = persistent_workers + + if tokenizer is None or image_processor is None: + from transformers import AutoProcessor + + processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + self.tokenizer = tokenizer or processor.tokenizer + self.image_processor = image_processor or processor.image_processor + self.data_sampler = MegatronDataSampler( + seq_len=self.seq_length, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + rampup_batch_size=rampup_batch_size, + ) + + def setup(self, stage: str = "") -> None: + self._train_ds = _MockNevaDataset( + self.tokenizer, self.image_processor, "train", self.num_train_samples, self.seq_length + ) + self._validation_ds = _MockNevaDataset( + self.tokenizer, self.image_processor, "valid", self.num_val_samples, self.seq_length + ) + self._test_ds = _MockNevaDataset( + self.tokenizer, self.image_processor, "test", self.num_test_samples, self.seq_length + ) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + if not hasattr(self, "_train_ds"): + self.setup() + return self._create_dataloader(self._train_ds) + + def val_dataloader(self) -> EVAL_DATALOADERS: + if not hasattr(self, "_validation_ds"): + self.setup() + return self._create_dataloader(self._validation_ds) + + def test_dataloader(self) -> EVAL_DATALOADERS: + if not hasattr(self, "_test_ds"): + self.setup() + return self._create_dataloader(self._test_ds) + + def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + return DataLoader( + dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=dataset.collate_fn, + **kwargs, + ) + + +class _MockNevaDataset(Dataset): + def __init__( + self, + tokenizer, + image_processor, + name: str, + num_samples: int, + seq_length: int, + seed: int = 42, + ) -> None: + super().__init__() + self.name = name + self.seq_length = seq_length + + self.vocab_size = tokenizer.vocab_size + + crop_size = image_processor.crop_size + self.image_height, self.image_width = crop_size["height"], crop_size["width"] + + self.length = num_samples + self.seed = seed + + self.loss_mask = torch.ones(self.seq_length, dtype=torch.float) + self.position_ids = torch.arange(self.seq_length, dtype=torch.int64) + + def __len__(self) -> int: + return self.length + + def _get_text(self, idx: int) -> np.ndarray: + np_gen = 
np.random.default_rng(seed=(self.seed + idx)) + return np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64) + + def __getitem__(self, idx) -> Dict[str, torch.Tensor]: + # Generate data of the expected size and datatype (based on GPTDataset). + np_gen = np.random.default_rng(seed=(self.seed + idx)) + tokens = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length + 1], dtype=np.int64)) + tokens[2] = IMAGE_TOKEN_INDEX # ImageToken token index + labels = tokens.clone() + images = torch.from_numpy(np_gen.random(size=[3, self.image_height, self.image_width], dtype=np.float32)) + tokens = tokens[:-1] + labels = labels[1:] + return { + "media": images, + "tokens": tokens, + "labels": labels, + "loss_mask": self.loss_mask, + "position_ids": self.position_ids, + } + + def _collate_fn(self, batch): + """ + A default implementation of a collation function. + Users should override this method to define custom data loaders. + """ + collated_batch = data.dataloader.default_collate(batch) + collated_batch["attention_mask"] = None + return collated_batch + + def collate_fn(self, batch): + """Method that user pass as functor to DataLoader. + + The method optionally performs neural type checking and add types to the outputs. + + Please note, subclasses of Dataset should not implement `input_types`. + + # Usage: + dataloader = torch.utils.data.DataLoader( + ...., + collate_fn=dataset.collate_fn, + .... + ) + + Returns + ------- + Collated batch, with or without types. + """ + return self._collate_fn(batch) diff --git a/nemo/collections/vlm/neva/data/multimodal_tokens.py b/nemo/collections/vlm/neva/data/multimodal_tokens.py new file mode 100644 index 000000000000..8c4dcadad63c --- /dev/null +++ b/nemo/collections/vlm/neva/data/multimodal_tokens.py @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Callable, Optional + + +@dataclass +class MultiModalToken: + """ + Base class for multimodal tokens representing different media types. + """ + + token_str: str + token_index: int + media_type: str + use_start_end: bool + encoder_fn: Optional[Callable] = None + + +@dataclass +class ImageToken(MultiModalToken): + token_str: str = "" + token_index: int = -200 + media_type: str = "image" + use_start_end: bool = False + + +@dataclass +class VideoToken(MultiModalToken): + token_str: str = "