build: Bump PyT to 25.01 #11973

Open · wants to merge 31 commits into main from ko3n1g/build/bump-pyt-25.01

Commits (31)
6ce1e54  f (ko3n1g, Jan 28, 2025)
dab2722  f (ko3n1g, Jan 28, 2025)
643f5d7  try to fix checkpoint loading (cuichenx, Jan 22, 2025)
2d9806c  update ci checkpoint (cuichenx, Jan 23, 2025)
6ef2487  fix ckpt loading for nemo1 and t5 tests (cuichenx, Jan 23, 2025)
b58bb6a  fix (ko3n1g, Feb 8, 2025)
ef84593  f (ko3n1g, Feb 8, 2025)
ff4e38c  f (ko3n1g, Feb 8, 2025)
86fcf5b  fix (akoumpa, Feb 9, 2025)
14e3005  f (ko3n1g, Feb 11, 2025)
433c402  bump (ko3n1g, Feb 11, 2025)
538a0d6  Rename neva datamodule (#12121) (yaoyu-33, Feb 11, 2025)
87893ed  tests: Run FSDP2 on dual-gpu (#12145) (ko3n1g, Feb 11, 2025)
e821f70  fix cmd (cuichenx, Feb 11, 2025)
7ad884b  Merge branch 'main' into ko3n1g/build/bump-pyt-25.01 (ko3n1g, Feb 12, 2025)
1bedc02  Revert "build: Force re-install VCS dependencies (#12155)" (#12163) (chtruong814, Feb 13, 2025)
91fdc3e  fix tests (dimapihtar, Feb 13, 2025)
23a928d  Apply isort and black reformatting (dimapihtar, Feb 13, 2025)
74b4550  fix style (dimapihtar, Feb 13, 2025)
2282eb3  Apply isort and black reformatting (dimapihtar, Feb 13, 2025)
ab27f24  fix style (dimapihtar, Feb 13, 2025)
80b91e0  remove unsed variable (dimapihtar, Feb 13, 2025)
c12f3cd  Fix nsys callback tests (#12177) (hemildesai, Feb 14, 2025)
b7253e9  Fixes for bumping pyt to 25.01 (#12165) (maanug-nv, Feb 14, 2025)
5b24dda  Use pip --no-deps --force-reinstall when building the test container … (chtruong814, Feb 14, 2025)
f28e055  Set weights_only=False in torch.load in EMA callback and AdapterMixin… (chtruong814, Feb 15, 2025)
7b0cf1e  Fix nemo-run stdin exception (#12197) (maanug-nv, Feb 15, 2025)
2a6b221  Merge remote-tracking branch 'origin/main' into ko3n1g/build/bump-pyt… (chtruong814, Feb 15, 2025)
9eddd67  Add legacy_ckpt arg to scripts/llm/gpt_distillation.py (chtruong814, Feb 15, 2025)
b68a1e0  Merge remote-tracking branch 'origin/ko3n1g/build/bump-pyt-25.01' int… (chtruong814, Feb 15, 2025)
1144d78  Revert "Do not have pytest capture output in lightning unit tests" (#… (maanug-nv, Feb 15, 2025)
77 changes: 47 additions & 30 deletions .github/workflows/cicd-main.yml

Large diffs are not rendered by default.

19 changes: 4 additions & 15 deletions Dockerfile.ci
@@ -45,19 +45,6 @@ apt-get install -y bc libsox-fmt-all -y
 apt-get clean
 EOF
 
-ARG MAX_JOBS
-ARG TE_TAG
-ARG TE_REPO
-RUN --mount=type=bind,from=nemo-bump,source=/opt/NeMo/reinstall.sh,target=/opt/NeMo/reinstall.sh \
-    bash /opt/NeMo/reinstall.sh --library te --mode build && \
-    ls -al /opt/TransformerEngine || true
-
-ARG APEX_REPO
-ARG APEX_TAG
-RUN --mount=type=bind,from=nemo-bump,source=/opt/NeMo/reinstall.sh,target=/opt/NeMo/reinstall.sh \
-    bash /opt/NeMo/reinstall.sh --library apex --mode build && \
-    ls -al /opt/Apex || true
-
 ARG MLM_REPO
 ARG MLM_TAG
 RUN --mount=type=bind,from=nemo-bump,source=/opt/NeMo/reinstall.sh,target=/opt/NeMo/reinstall.sh \
@@ -74,7 +61,8 @@ RUN \
     --mount=type=bind,from=nemo-bump,source=/opt/NeMo/nemo/package_info.py,target=/tmp/NeMo/nemo/package_info.py \
     --mount=type=bind,from=nemo-bump,source=/opt/NeMo/nemo/__init__.py,target=/tmp/NeMo/nemo/__init__.py <<"EOF" bash -ex
 export NEMO_DIR=/tmp/NeMo
-bash /tmp/NeMo/reinstall.sh --library all --mode install
+bash /tmp/NeMo/reinstall.sh --library mcore --mode install
+bash /tmp/NeMo/reinstall.sh --library nemo --mode install
 rm -rf $NEMO_DIR || true
 EOF
 
@@ -83,7 +71,8 @@ ARG NEMO_REPO
 ARG NEMO_TAG
 RUN \
     --mount=type=bind,from=nemo-bump,source=/opt/NeMo/reinstall.sh,target=/tmp/reinstall.sh <<"EOF" bash -ex
-bash /tmp/reinstall.sh --library all --mode install
+bash /tmp/reinstall.sh --library mcore --mode install
+bash /tmp/reinstall.sh --library nemo --mode install
 
 # Copy into workspace
 cp -a /opt/NeMo/. /workspace/
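Note on this change: the dedicated build stages for Transformer Engine and Apex are dropped, presumably because the 25.01 NGC PyTorch base image already ships both, and the single `--library all` pass is split into explicit `mcore` and `nemo` installs. A hypothetical sanity check that the base image provides the stack the removed stages used to build (the distribution names are my assumption, not part of this PR):

import importlib.metadata as md

# Assumed distribution names; adjust to whatever the container actually ships.
for pkg in ("torch", "transformer_engine", "apex", "megatron-core"):
    try:
        print(f"{pkg}=={md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg} not found")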
4 changes: 2 additions & 2 deletions examples/nlp/machine_translation/enc_dec_nmt.py
@@ -94,8 +94,8 @@ class MTEncDecConfig(NemoConfig):
     name: Optional[str] = 'MTEncDec'
     do_training: bool = True
     do_testing: bool = False
-    model: MTEncDecModelConfig = field(default_factory=lambda: MTEncDecModelConfig())
-    trainer: Optional[TrainerConfig] = field(default_factory=lambda: TrainerConfig())
+    model: MTEncDecModelConfig = field(default_factory=MTEncDecModelConfig)
+    trainer: Optional[TrainerConfig] = field(default_factory=TrainerConfig)
     exp_manager: Optional[ExpManagerConfig] = field(
         default_factory=lambda: ExpManagerConfig(name='MTEncDec', files_to_copy=[])
     )
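Aside: `field(default_factory=MTEncDecModelConfig)` is equivalent to the lambda form it replaces; `default_factory` accepts any zero-argument callable, and passing the class itself is shorter and plays better with linters. A self-contained sketch with stand-in config classes:

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class TrainerConfig:  # stand-in for the real TrainerConfig
    devices: int = 1

@dataclass
class DemoConfig:
    # The class itself serves as the zero-argument factory.
    trainer: Optional[TrainerConfig] = field(default_factory=TrainerConfig)

a, b = DemoConfig(), DemoConfig()
assert a.trainer is not b.trainer  # each instance gets a fresh default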
4 changes: 1 addition & 3 deletions nemo/collections/common/callbacks/ema.py
@@ -121,8 +121,6 @@ def on_load_checkpoint(
     ) -> None:
         checkpoint_callback = trainer.checkpoint_callback
 
-        # use the connector as NeMo calls the connector directly in the exp_manager when restoring.
-        connector = trainer._checkpoint_connector
         # Replace connector._ckpt_path with below to avoid calling into lightning's protected API
         ckpt_path = trainer.ckpt_path
 
@@ -137,7 +135,7 @@ def on_load_checkpoint(
             return
         ema_path = ckpt_path.replace(ext, f'-EMA{ext}')
         if os.path.exists(ema_path):
-            ema_state_dict = torch.load(ema_path, map_location=torch.device('cpu'))
+            ema_state_dict = torch.load(ema_path, map_location=torch.device('cpu'), weights_only=False)
 
             checkpoint['optimizer_states'] = ema_state_dict['optimizer_states']
             del ema_state_dict
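Context for `weights_only=False`: PyTorch 2.6, which the 25.01 container line is expected to ship, flips the default of `torch.load` to `weights_only=True`, so checkpoints that pickle arbitrary Python objects (optimizer state, configs) can fail to load unless the caller opts out. A minimal illustration of the opt-out for a trusted checkpoint (the file path is a placeholder):

import torch

# EMA checkpoints store full optimizer state, not just plain tensors.
ckpt = {"optimizer_states": [{"step": 0, "lr": 1e-4}]}
torch.save(ckpt, "/tmp/demo-EMA.ckpt")

# Under PyTorch >= 2.6 the default weights_only=True restricts unpickling;
# trusted checkpoints opt out explicitly, as the EMA callback above now does.
state = torch.load("/tmp/demo-EMA.ckpt", map_location="cpu", weights_only=False)
print(state["optimizer_states"][0])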
6 changes: 6 additions & 0 deletions nemo/collections/llm/peft/api.py
@@ -36,6 +36,7 @@
 
 @factory
 def gpt_lora() -> PEFT:
+    """ """
     return LoRA()
 
 
@@ -75,6 +76,7 @@ def export_lora(
 def merge_lora(
     lora_checkpoint_path: str,
     output_path: str,
+    legacy_ckpt: bool = False,
 ) -> None:
     """
     Merges the LoRA adapter weights into the base model's weights.
@@ -101,6 +103,10 @@ def merge_lora(
         strategy=MegatronStrategy(ddp="pytorch", setup_optimizers=False, plugins=bf16_mixed()),
     )
 
+    # Load ckpt saved with TE < 1.14
+    if legacy_ckpt:
+        trainer.strategy.ckpt_load_strictness = False
+
     model, lora = _load_base_model_and_lora(lora_checkpoint_path)
     _setup_trainer_and_restore_model_and_adapter(Path(lora_checkpoint_path), trainer, model, lora)
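A hypothetical call of the extended API (paths are placeholders; the import assumes `merge_lora` is importable from the module shown above):

from nemo.collections.llm.peft.api import merge_lora

merge_lora(
    lora_checkpoint_path="/checkpoints/model-lora",    # placeholder path
    output_path="/checkpoints/model-lora-merged",      # placeholder path
    legacy_ckpt=True,  # relax load strictness for checkpoints saved with TE < 1.14
)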
34 changes: 24 additions & 10 deletions nemo/collections/nlp/models/nlp_model.py
@@ -67,7 +67,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None, no_lm_init=False):
         self.hidden_size = None
         self.bert_model = None
         vocab_file = None
-        nemo_file = None
         config_dict = None
         config_file = None
 
@@ -112,8 +111,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None, no_lm_init=False):
             self._save_restore_connector = NLPSaveRestoreConnector()
 
         if cfg.get('language_model') and not no_lm_init:
-            if cfg.get('language_model').get('nemo_file'):
-                nemo_file = self.register_artifact('language_model.nemo_file', cfg.language_model.nemo_file)
             if cfg.get('language_model').get('config'):
                 config_dict = OmegaConf.to_container(cfg.language_model.config)
             if cfg.get('language_model').get('config_file'):
@@ -184,16 +181,18 @@ def register_bert_model(self):
                         f.write(json.dumps(output_config, indent=2, sort_keys=True) + '\n')
                     self.register_artifact('language_model.config_file', encoder_config_src)  # for .nemo
                 else:
-                    # No defaults as this case can be any possible hyper-parameter combination of MegatronBert config
+                    # No defaults as this case can be any possible
+                    # hyper-parameter combination of MegatronBert config
                     logging.info(f'For {self.pretrained_model_name}, set the config_file in the YAML file')
             else:
                 logging.info(
-                    f'Registering MegatronBERT model config for {self.pretrained_model_name} is not yet supported. \
-                        Please override this method if needed.'
+                    f'Registering MegatronBERT model config for {self.pretrained_model_name} \
+                        is not yet supported. Please override this method if needed.'
                 )
         else:
             logging.info(
-                f'Registering BERT model config for {self.bert_model} is not yet supported. Please override this method if needed.'
+                f'Registering BERT model config for {self.bert_model} is not yet supported. \
+                    Please override this method if needed.'
             )
 
     def setup_tokenizer(self, cfg: DictConfig):
@@ -283,7 +282,8 @@ def _register_vocab_from_tokenizer(
             self.register_artifact(config_path=vocab_file_config_path, src=vocab_file_src)
         else:
             logging.info(
-                f'Registering tokenizer vocab for {self.tokenizer} is not yet supported. Please override this method if needed.'
+                f'Registering tokenizer vocab for {self.tokenizer} is not yet supported. \
+                    Please override this method if needed.'
             )
 
     @staticmethod
@@ -304,6 +304,7 @@ def output_module(self):
 
     @property
     def is_model_parallel_initialized(self):
+        """ """
         app_state = AppState()
         if app_state.model_parallel_group is not None:
             return True
@@ -420,7 +421,8 @@ def dummy():
             if hasattr(model, 'setup_transformer_engine_tp_groups'):
                 model.setup_transformer_engine_tp_groups()
 
-            # NMT models do not have a `tokenizer` attribute, they instead have an encoder_tokenizer and decoder_tokenizer attribute.
+            # NMT models do not have a `tokenizer` attribute,
+            # they instead have an encoder_tokenizer and decoder_tokenizer attribute.
             if hasattr(cfg, "tokenizer"):
                 if cfg.tokenizer.get("tokenizer_model") is not None:
                     model.register_artifact("tokenizer.tokenizer_model", cfg.tokenizer.tokenizer_model)
@@ -452,6 +454,7 @@ def dummy():
         return checkpoint
 
     def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
+        """ """
         # starting with transformers v4.31.0, buffer for position_ids is persistent=False
         if (
             self.bert_model is not None
@@ -464,7 +467,18 @@ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
             pos_id_keys = [x for x in state_dict.keys() if "position_ids" in x]
             for key in pos_id_keys:
                 del state_dict[key]
-        results = super(NLPModel, self).load_state_dict(state_dict, strict=strict)
+        try:
+            results = super(NLPModel, self).load_state_dict(state_dict, strict=strict)
+        except RuntimeError as e:
+            results = super(NLPModel, self).load_state_dict(state_dict, strict=False)
+            if all(s.endswith('_extra_state') for s in results.missing_keys):
+                logging.warning(
+                    f'Loading checkpoint created with Transformer Engine version lower than 1.13. \
+                        Missing layers {results.missing_keys} will be ignored.'
+                )
+            else:
+                raise e
+
         return results
 
     @classmethod
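The new fallback follows a general pattern: attempt a strict load, and on failure accept a non-strict load only when every missing key is a Transformer Engine `_extra_state` buffer that checkpoints written with TE < 1.13 simply do not contain. A self-contained sketch of that pattern, detached from NLPModel:

import logging
import torch

def load_with_te_fallback(module: torch.nn.Module, state_dict: dict):
    # Strict load first; tolerate only missing TE _extra_state buffers.
    try:
        return module.load_state_dict(state_dict, strict=True)
    except RuntimeError:
        result = module.load_state_dict(state_dict, strict=False)
        if result.missing_keys and all(
            k.endswith('_extra_state') for k in result.missing_keys
        ):
            logging.warning('Ignoring missing TE buffers: %s', result.missing_keys)
            return result
        raise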
7 changes: 4 additions & 3 deletions nemo/core/classes/mixins/adapter_mixins.py
@@ -402,7 +402,8 @@ def get_adapter_cfg(self, name: str):
     def set_accepted_adapter_types(self, adapter_types: List[Union[type, str]]) -> None:
         """
         The module with this mixin can define a list of adapter names that it will accept.
-        This method should be called in the modules init method and set the adapter names the module will expect to be added.
+        This method should be called in the modules init method and set the adapter names
+        the module will expect to be added.
 
         Args:
             adapter_types: A list of str paths that correspond to classes. The class paths will be instantiated to
@@ -579,7 +580,7 @@ def forward_single_enabled_adapter_(
         adapter_module: torch.nn.Module,
         *,
         adapter_name: str,
-        adapter_strategy: 'nemo.core.classes.mixins.adapter_mixin_strategies.AbstractAdapterStrategy',
+        adapter_strategy: 'nemo.core.classes.mixins.adapter_mixin_strategies.AbstractAdapterStrategy',  # noqa: F821
     ):
         """
         Perform the forward step of a single adapter module on some input data.
@@ -958,7 +959,7 @@ def load_adapters(self, filepath: str, name: str = None, map_location: str = Non
             map_location = 'cpu'
 
         # Load the state dict and extract the internal config
-        state_dict = torch.load(filepath, map_location=map_location)
+        state_dict = torch.load(filepath, map_location=map_location, weights_only=False)
         config = state_dict.pop('__cfg__')
 
         # Normalize the name to a list of names (exact match with the state dict)
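An alternative to `weights_only=False` worth noting (my suggestion, not part of this PR): since roughly PyTorch 2.4, known classes can be allowlisted for the safe unpickler instead of disabling it. Sketched below, guessing that the `__cfg__` entry is an omegaconf DictConfig; real adapter files may pickle further classes that would also need allowlisting:

import torch
from omegaconf import DictConfig  # assumption about what '__cfg__' holds

# Allowlist the class so the default weights_only=True unpickler accepts it.
torch.serialization.add_safe_globals([DictConfig])
state_dict = torch.load('adapters.pt', map_location='cpu')  # placeholder path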