Enable the export of only one decoder #1257

Merged (84 commits) on Oct 16, 2023
Commits (84)
41b8f98
ONNX export decoder model refactorization
echarlaix Aug 3, 2023
f91a018
fix style
echarlaix Aug 4, 2023
4ce5fbe
fix index
echarlaix Aug 4, 2023
552eebc
merge main in branch
echarlaix Sep 8, 2023
aa40ba4
Merge branch 'main' into refactorization-decoder-ort
echarlaix Sep 12, 2023
9fa05e4
fix IO bindings
echarlaix Sep 12, 2023
3a0d76a
format
echarlaix Sep 12, 2023
b0aa234
enable mpt support
echarlaix Sep 12, 2023
dfabefd
format
echarlaix Sep 12, 2023
35df7bd
add trust remote code
echarlaix Sep 13, 2023
469edc8
fix test
echarlaix Sep 13, 2023
77cc527
format
echarlaix Sep 13, 2023
4f72a7e
rm redundant
echarlaix Sep 13, 2023
599c31c
format
echarlaix Sep 13, 2023
dac2376
merge main in branch
echarlaix Sep 13, 2023
c13b645
fix
echarlaix Sep 13, 2023
0e83cd1
Merge branch 'main' into refactorization-decoder-ort
echarlaix Sep 14, 2023
1f81f0b
Merge branch 'main' into refactorization-decoder-ort
echarlaix Sep 14, 2023
a0d0802
fix quantization
echarlaix Sep 14, 2023
7f65ce1
add test
echarlaix Sep 14, 2023
2840b81
format
echarlaix Sep 14, 2023
5fa7b20
format
echarlaix Sep 14, 2023
8011982
fix optimization
echarlaix Sep 14, 2023
b643308
fix opitmization
echarlaix Sep 15, 2023
ca9ce30
fix compatibility with legacy models
echarlaix Sep 15, 2023
144753a
format
echarlaix Sep 15, 2023
4ee6167
fix legacy models
echarlaix Sep 15, 2023
f2d0f84
format
echarlaix Sep 15, 2023
3ff719a
fix style
echarlaix Sep 15, 2023
d794141
format
echarlaix Sep 15, 2023
a34a16e
add export to main_export
echarlaix Sep 15, 2023
dfe7e5e
add legacy to ONNX export
echarlaix Sep 18, 2023
8d102f7
fix test
echarlaix Sep 18, 2023
62b8974
fix
echarlaix Sep 18, 2023
b8e18c3
rm unused import
echarlaix Sep 18, 2023
819691e
patch model to fix causal lm generation
echarlaix Sep 18, 2023
e259670
rm commen
echarlaix Sep 18, 2023
2f26201
add no psot process
echarlaix Sep 18, 2023
bed73d4
merge main in branch
echarlaix Sep 18, 2023
6d8acb4
fix
echarlaix Sep 18, 2023
52c1745
remove bloom caching
echarlaix Sep 18, 2023
1e9ba7e
fix
echarlaix Sep 19, 2023
4b68caa
format
echarlaix Sep 19, 2023
e5fd9f8
fix dynamic axis for position ids
echarlaix Sep 19, 2023
addad92
fix external data
echarlaix Sep 19, 2023
2c063c0
format
echarlaix Sep 19, 2023
1b47093
test
echarlaix Sep 19, 2023
35caaf2
test
echarlaix Sep 19, 2023
725857b
add model patcher
echarlaix Sep 19, 2023
46b26b5
format
echarlaix Sep 19, 2023
33957af
fix
echarlaix Sep 19, 2023
c2ec382
fix bart model patcher
echarlaix Sep 19, 2023
d86bce6
format
echarlaix Sep 19, 2023
be836b5
format
echarlaix Sep 20, 2023
b05f599
fix model patcher for opt models
echarlaix Sep 20, 2023
26d97e8
fix format
echarlaix Sep 20, 2023
4b6c3ed
add tmp onnxruntime max version
echarlaix Sep 20, 2023
615a219
add test
echarlaix Sep 20, 2023
b3525f8
format
echarlaix Sep 20, 2023
e0e2bae
tmp fix onnxruntime max version
echarlaix Sep 20, 2023
cbc935f
format
echarlaix Sep 20, 2023
624d91d
add test
echarlaix Sep 20, 2023
c558450
fix ort docker
echarlaix Sep 20, 2023
e72526d
fix format
echarlaix Sep 20, 2023
7926999
merge main in branch
echarlaix Sep 22, 2023
44ef0f1
add test
echarlaix Sep 22, 2023
ed8e74f
fix bart model patcher
echarlaix Sep 25, 2023
c13a170
raise when unsupported model
echarlaix Sep 25, 2023
524b668
add cached file
echarlaix Sep 25, 2023
8951ddf
minor
echarlaix Oct 3, 2023
2491ef3
add position warning
echarlaix Oct 4, 2023
0ab6e61
fixes
echarlaix Oct 5, 2023
1a7d491
enable post process after export to remove tied weights
echarlaix Oct 5, 2023
cd8d4be
comment
echarlaix Oct 5, 2023
e6de5e7
remove test
echarlaix Oct 5, 2023
4a32f7a
fix test
echarlaix Oct 5, 2023
a51686e
modify model
echarlaix Oct 6, 2023
e2f8a3b
remove deprecated use_merged in test
echarlaix Oct 6, 2023
52ce2d7
Merge branch 'main' into refactorization-decoder-ort
echarlaix Oct 9, 2023
b76f43a
Add mistral model patcher
echarlaix Oct 9, 2023
5b3d445
fix test
echarlaix Oct 9, 2023
5406f95
add slow test
echarlaix Oct 9, 2023
52e0c69
add workflow
echarlaix Oct 9, 2023
8883323
fix
echarlaix Oct 9, 2023
6 changes: 6 additions & 0 deletions optimum/commands/export/onnx.py
@@ -217,6 +217,11 @@ def parse_args_onnx(parser):
default=DEFAULT_DUMMY_SHAPES["nb_points_per_image"],
help="For Segment Anything. It corresponds to the number of points per segmentation masks.",
)
optional_group.add_argument(
"--legacy",
action="store_true",
help="Export decoder-only models in two files (without and with past) instead of a single ONNX file.",
)

Review comment (Contributor): We could merge the no_position_ids and legacy arguments, as they correspond to the previous export behavior and no_position_ids is not in a release yet. WDYT?
Reply (echarlaix, author): Yes, good idea, will merge both.

# deprecated argument
parser.add_argument("--for-ort", action="store_true", help=argparse.SUPPRESS)
@@ -256,5 +261,6 @@ def run(self):
_variant=self.args.variant,
library_name=self.args.library_name,
no_position_ids=self.args.no_position_ids,
legacy=self.args.legacy,
**input_shapes,
)
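A hedged sketch of how the new flag surfaces through the Python API (model names and output paths are illustrative; legacy is the parameter added in this PR, the other arguments are standard main_export parameters):

```python
from optimum.exporters.onnx import main_export

# New default: the decoder is exported as a single ONNX file that serves both
# the first forward pass and subsequent passes that reuse the KV cache.
main_export("gpt2", output="gpt2_onnx", task="text-generation-with-past")

# Legacy behavior: two files, decoder_model.onnx and decoder_with_past_model.onnx.
main_export("gpt2", output="gpt2_onnx_legacy", task="text-generation-with-past", legacy=True)
```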
6 changes: 5 additions & 1 deletion optimum/exporters/onnx/__main__.py
@@ -69,6 +69,7 @@ def _get_submodels_and_onnx_configs(
fn_get_submodels: Optional[Callable] = None,
preprocessors: Optional[List[Any]] = None,
no_position_ids: bool = False,
legacy: bool = False,
):
is_stable_diffusion = "stable-diffusion" in task
if not custom_architecture:
@@ -106,7 +107,7 @@
):
models_and_onnx_configs = get_encoder_decoder_models_for_export(model, onnx_config)
elif task.startswith("text-generation") and not monolith:
models_and_onnx_configs = get_decoder_models_for_export(model, onnx_config)
models_and_onnx_configs = get_decoder_models_for_export(model, onnx_config, legacy=legacy)
elif model.config.model_type == "sam":
models_and_onnx_configs = get_sam_models_for_export(model, onnx_config)
else:
@@ -185,6 +186,7 @@ def main_export(
_variant: str = "default",
library_name: Optional[str] = None,
no_position_ids: bool = False,
legacy: bool = False,
**kwargs_shapes,
):
"""
@@ -425,6 +427,7 @@
preprocessors=preprocessors,
_variant=_variant,
no_position_ids=no_position_ids,
legacy=legacy,
)

if not is_stable_diffusion:
@@ -610,6 +613,7 @@ def main():
pad_token_id=args.pad_token_id,
for_ort=args.for_ort,
library_name=args.library_name,
legacy=args.legacy,
**input_shapes,
)

2 changes: 1 addition & 1 deletion optimum/exporters/onnx/base.py
@@ -585,7 +585,7 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
elif self.task == "feature-extraction":
common_outputs = OrderedDict({"last_hidden_state": {0: "batch_size"}})
else:
common_outputs = OrderedDict({"logits": {0: "batch_size"}})
common_outputs = OrderedDict({"logits": {0: "batch_size", 1: "sequence_length"}})
if self.use_past:
# When exporting decoder models with use_cache=True, both the decoder without past and with past have the KV cache as an output.
self.add_past_key_values(common_outputs, direction="outputs")
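A minimal sketch of what the changed axis mapping buys: with both axes of logits symbolic, one exported decoder graph can return logits for a full prompt at prefill time and for a single token during incremental decoding.

```python
from collections import OrderedDict

# Before: only the batch axis was dynamic, implicitly pinning the sequence axis.
before = OrderedDict({"logits": {0: "batch_size"}})
# After: (batch, prompt_len, vocab) at prefill and (batch, 1, vocab) at decode
# are both valid shapes for the same exported output.
after = OrderedDict({"logits": {0: "batch_size", 1: "sequence_length"}})
```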
7 changes: 2 additions & 5 deletions optimum/exporters/onnx/config.py
@@ -92,7 +92,7 @@ def __init__(
@property
def inputs(self) -> Dict[str, Dict[int, str]]:
if self.use_past_in_inputs:
common_inputs = {"input_ids": {0: "batch_size"}}
common_inputs = {"input_ids": {0: "batch_size", 1: "sequence_length"}}
self.add_past_key_values(common_inputs, direction="inputs")
common_inputs["attention_mask"] = {0: "batch_size", 1: "past_sequence_length + 1"}
else:
@@ -164,10 +164,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
# generating wrong position_ids in the model itself:
# https://github.com/huggingface/transformers/blob/v4.33.1/src/transformers/models/gpt2/modeling_gpt2.py#L802
if not self.no_position_ids and self.task == "text-generation":
if self.use_past_in_inputs:
common_inputs["position_ids"] = {0: "batch_size"}
else:
common_inputs["position_ids"] = {0: "batch_size", 1: "sequence_length"}
common_inputs["position_ids"] = {0: "batch_size", 1: "sequence_length"}

return common_inputs
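The same reasoning applies to position_ids: a single-file decoder sees both multi-token and single-token inputs, so axis 1 must stay symbolic. A hedged sketch of how a caller could build position_ids that continue from the cached past (illustrative helper, not optimum code):

```python
import torch

def make_position_ids(attention_mask: torch.Tensor, past_length: int) -> torch.Tensor:
    # Shape is (batch, seq_len) in both phases: seq_len is the prompt length at
    # prefill (past_length == 0) and 1 during incremental decoding.
    seq_len = attention_mask.shape[1] - past_length
    positions = torch.arange(past_length, past_length + seq_len)
    return positions.unsqueeze(0).expand(attention_mask.shape[0], -1)
```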

41 changes: 36 additions & 5 deletions optimum/exporters/onnx/model_configs.py
@@ -55,7 +55,15 @@
TextSeq2SeqOnnxConfig,
VisionOnnxConfig,
)
from .model_patcher import SAMModelPatcher, WavLMModelPatcher
from .model_patcher import (
BartModelPatcher,
BloomModelPatcher,
LlamaModelPatcher,
MPTModelPatcher,
OPTModelPatcher,
SAMModelPatcher,
WavLMModelPatcher,
)


if TYPE_CHECKING:
@@ -215,11 +223,21 @@ class OPTOnnxConfig(TextDecoderOnnxConfig):
DEFAULT_ONNX_OPSET = 13
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig

def patch_model_for_export(
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
) -> "ModelPatcher":
return OPTModelPatcher(self, model, model_kwargs=model_kwargs)


class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
DEFAULT_ONNX_OPSET = 13
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig

def patch_model_for_export(
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
) -> "ModelPatcher":
return LlamaModelPatcher(self, model, model_kwargs=model_kwargs)


class MPTOnnxConfig(TextDecoderOnnxConfig):
# MPT does not require position_ids input.
@@ -228,6 +246,11 @@ class MPTOnnxConfig(TextDecoderOnnxConfig):
num_attention_heads="n_heads", hidden_size="d_model", num_layers="n_layers"
)

def patch_model_for_export(
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
) -> "ModelPatcher":
return MPTModelPatcher(self, model, model_kwargs=model_kwargs)


class BloomOnnxConfig(TextDecoderOnnxConfig):
# Bloom does not require position_ids input.
@@ -261,6 +284,11 @@ def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], dire
1: decoder_sequence_name,
}

def patch_model_for_export(
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
) -> "ModelPatcher":
return BloomModelPatcher(self, model, model_kwargs=model_kwargs)


class GPTBigCodeOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
DUMMY_INPUT_GENERATOR_CLASSES = (
@@ -400,7 +428,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
return int_tensor


class BartOnnxConfig(TextSeq2SeqOnnxConfig):
class M2M100OnnxConfig(TextSeq2SeqOnnxConfig):
NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args(
encoder_num_layers="encoder_layers",
decoder_num_layers="decoder_layers",
@@ -524,11 +552,14 @@ def flatten_past_key_values(self, flattened_output, name, idx, t):
)


class MBartOnnxConfig(BartOnnxConfig):
pass
class BartOnnxConfig(M2M100OnnxConfig):
def patch_model_for_export(
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
) -> "ModelPatcher":
return BartModelPatcher(self, model, model_kwargs=model_kwargs)


class M2M100OnnxConfig(BartOnnxConfig):
class MBartOnnxConfig(BartOnnxConfig):
pass
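For context, a hedged sketch of how these patch_model_for_export overrides are consumed during export (the model and task are illustrative; TasksManager is optimum's exporter-config resolver):

```python
from transformers import AutoModelForCausalLM
from optimum.exporters.tasks import TasksManager

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
config_ctor = TasksManager.get_exporter_config_constructor(
    exporter="onnx", model=model, task="text-generation"
)
onnx_config = config_ctor(model.config, use_past=True)
with onnx_config.patch_model_for_export(model):
    # Inside the context, OPT's attention-mask helper is swapped for a
    # traceable version; torch.onnx.export would run here.
    pass
```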


113 changes: 113 additions & 0 deletions optimum/exporters/onnx/model_patcher.py
@@ -19,6 +19,8 @@

from transformers.utils import is_torch_available

from ...utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask


if is_torch_available():
import torch
@@ -342,3 +344,114 @@ def patched_forward(
return {"iou_scores": iou_predictions, "pred_masks": low_res_masks}

self.patched_forward = patched_forward


class BloomModelPatcher(ModelPatcher):
def __init__(
self,
config: "OnnxConfig",
model: Union["PreTrainedModel", "TFPreTrainedModel"],
model_kwargs: Optional[Dict[str, Any]] = None,
):
super().__init__(config, model, model_kwargs)

self.patch = self.real_config.task == "text-generation" and self.real_config.use_past
if self.patch:
self.orig_prepare_attn_mask = getattr(self._model.transformer, "_prepare_attn_mask")

def __enter__(self):
super().__enter__()
if self.patch:
setattr(self._model.transformer, "_prepare_attn_mask", _prepare_attn_mask)

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
if self.patch:
setattr(self._model.transformer, "_prepare_attn_mask", self.orig_prepare_attn_mask)


class LlamaModelPatcher(ModelPatcher):
def __init__(
self,
config: "OnnxConfig",
model: Union["PreTrainedModel", "TFPreTrainedModel"],
model_kwargs: Optional[Dict[str, Any]] = None,
):
super().__init__(config, model, model_kwargs)

self.patch = self.real_config.task == "text-generation" and self.real_config.use_past
if self.patch:
self.orig_prepare_attn_mask = getattr(self._model.model, "_prepare_decoder_attention_mask")

def __enter__(self):
super().__enter__()
if self.patch:
setattr(self._model.model, "_prepare_decoder_attention_mask", _prepare_decoder_attention_mask)

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
if self.patch:
setattr(self._model.model, "_prepare_decoder_attention_mask", self.orig_prepare_attn_mask)


class BartModelPatcher(Seq2SeqModelPatcher):
def __init__(
self,
config: "OnnxConfig",
model: Union["PreTrainedModel", "TFPreTrainedModel"],
model_kwargs: Optional[Dict[str, Any]] = None,
):
super().__init__(config, model, model_kwargs)

self.patch = self.real_config.task == "text-generation" and self.real_config.use_past
if self.patch:
self.orig_prepare_attn_mask = getattr(self._model.model.decoder, "_prepare_decoder_attention_mask")

def __enter__(self):
super().__enter__()
if self.patch:
setattr(self._model.model.decoder, "_prepare_decoder_attention_mask", _prepare_decoder_attention_mask)

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
if self.patch:
setattr(self._model.model.decoder, "_prepare_decoder_attention_mask", self.orig_prepare_attn_mask)


class OPTModelPatcher(ModelPatcher):
def __init__(
self,
config: "OnnxConfig",
model: Union["PreTrainedModel", "TFPreTrainedModel"],
model_kwargs: Optional[Dict[str, Any]] = None,
):
super().__init__(config, model, model_kwargs)
self.patch = self.real_config.task == "text-generation" and self.real_config.use_past
if self.patch:
self.orig_prepare_attn_mask = getattr(self._model.model.decoder, "_prepare_decoder_attention_mask")

def __enter__(self):
super().__enter__()
if self.patch:
setattr(self._model.model.decoder, "_prepare_decoder_attention_mask", _prepare_decoder_attention_mask)

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
if self.patch:
setattr(self._model.model.decoder, "_prepare_decoder_attention_mask", self.orig_prepare_attn_mask)


class MPTModelPatcher(BloomModelPatcher):
pass


class BlenderbotSmallModelPatcher(BartModelPatcher):
pass


class BlenderbotModelPatcher(BartModelPatcher):
pass


class PegasusModelPatcher(BartModelPatcher):
pass
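All of the patchers above share one mechanism: save the original attention-mask helper, install a traceable replacement on __enter__, and restore the original on __exit__. A self-contained sketch of that pattern:

```python
class AttributeSwap:
    """Temporarily replace obj.<name> with a substitute; a generic sketch of
    the save/swap/restore pattern the patchers above implement."""

    def __init__(self, obj, name, replacement):
        self.obj, self.name, self.replacement = obj, name, replacement

    def __enter__(self):
        self.original = getattr(self.obj, self.name)
        setattr(self.obj, self.name, self.replacement)
        return self.obj

    def __exit__(self, exc_type, exc_value, traceback):
        setattr(self.obj, self.name, self.original)
```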
54 changes: 34 additions & 20 deletions optimum/exporters/onnx/utils.py
@@ -29,6 +29,7 @@
logging,
)
from ...utils.import_utils import _diffusers_version
from ...utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask # noqa: F401
from ..tasks import TasksManager
from .constants import ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME, ONNX_ENCODER_NAME

@@ -158,15 +159,16 @@ def _get_submodels_for_export_stable_diffusion(


def _get_submodels_for_export_decoder(
model: Union["PreTrainedModel", "TFPreTrainedModel"], use_past: bool
model: Union["PreTrainedModel", "TFPreTrainedModel"],
use_past: bool,
legacy: bool = False,
) -> Dict[str, Union["PreTrainedModel", "TFPreTrainedModel"]]:
"""
Returns the decoder part of the model.
"""
models_for_export = {}
models_for_export = {ONNX_DECODER_NAME if legacy else "model": model}

models_for_export[ONNX_DECODER_NAME] = model
if use_past:
if legacy and use_past:
Review comment on lines +170 to +172 (Contributor): How about encoder-decoders when legacy=False?
Reply (echarlaix, author): This will impact only decoder models, as this modification is done in _get_submodels_for_export_decoder; encoder-decoder models will not be impacted (not sure if that answers your question).
Reply (Contributor): Yes, makes sense, thank you!
models_for_export[ONNX_DECODER_WITH_PAST_NAME] = model

return models_for_export
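A hedged summary of the mapping this function now returns, written as a checkable sketch (the constant values match optimum/exporters/onnx/constants.py):

```python
from optimum.exporters.onnx.constants import ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME

def expected_submodel_keys(use_past: bool, legacy: bool):
    # Mirrors _get_submodels_for_export_decoder: the same underlying model is
    # registered once (new default) or twice (legacy: without past + with past).
    keys = [ONNX_DECODER_NAME if legacy else "model"]
    if legacy and use_past:
        keys.append(ONNX_DECODER_WITH_PAST_NAME)
    return keys

assert expected_submodel_keys(use_past=True, legacy=False) == ["model"]
assert expected_submodel_keys(use_past=True, legacy=True) == ["decoder_model", "decoder_with_past_model"]
```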
@@ -226,6 +228,7 @@ def get_encoder_decoder_models_for_export(
def get_decoder_models_for_export(
model: Union["PreTrainedModel", "TFPreTrainedModel"],
config: "OnnxConfig",
legacy: bool = False,
) -> Dict[str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel"], "OnnxConfig"]]:
"""
Returns two versions of the decoder that can be used together to perform fast generation:
@@ -245,31 +248,42 @@
`Dict[str, Tuple[Union[PreTrainedModel, TFPreTrainedModel], OnnxConfig]]: A Dict containing the model and
onnx configs for the encoder and decoder parts of the model.
"""
models_for_export = _get_submodels_for_export_decoder(model, use_past=config.use_past)

models_for_export = _get_submodels_for_export_decoder(model, use_past=config.use_past, legacy=legacy)

onnx_kwargs = {"task": config.task, "float_dtype": config.float_dtype, "int_dtype": config.int_dtype}
if model.config.model_type.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS:
onnx_kwargs["no_position_ids"] = config.no_position_ids

onnx_config = config.__class__(
model.config,
use_past=config.use_past,
use_past_in_inputs=False,
**onnx_kwargs,
)
models_for_export[ONNX_DECODER_NAME] = (models_for_export[ONNX_DECODER_NAME], onnx_config)

if config.use_past:
onnx_config_with_past = config.__class__(
if legacy:
onnx_config = config.__class__(
model.config,
use_past=True,
use_past_in_inputs=True,
use_past=config.use_past,
use_past_in_inputs=False,
**onnx_kwargs,
)
models_for_export[ONNX_DECODER_WITH_PAST_NAME] = (
models_for_export[ONNX_DECODER_WITH_PAST_NAME],
onnx_config_with_past,
models_for_export[ONNX_DECODER_NAME] = (models_for_export[ONNX_DECODER_NAME], onnx_config)

if config.use_past:
onnx_config_with_past = config.__class__(
model.config,
use_past=True,
use_past_in_inputs=True,
**onnx_kwargs,
)
models_for_export[ONNX_DECODER_WITH_PAST_NAME] = (
models_for_export[ONNX_DECODER_WITH_PAST_NAME],
onnx_config_with_past,
)

else:
onnx_config = config.__class__(
model.config,
use_past=config.use_past,
use_past_in_inputs=config.use_past,
**onnx_kwargs,
)
models_for_export["model"] = (models_for_export["model"], onnx_config)

return models_for_export
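Finally, the consumer side of the new default, as a hedged end-to-end sketch (model name illustrative; ORTModelForCausalLM is optimum's ONNX Runtime wrapper and loads the single-file decoder this export path produces):

```python
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = ORTModelForCausalLM.from_pretrained("gpt2", export=True)  # single-decoder export

inputs = tokenizer("Hello, my dog is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=5)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```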
