diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 79d3d1e86fb..869d4818407 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -92,6 +92,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - SEW - SEW-D - Speech2Text +- SigLIP - SpeechT5 - Splinter - SqueezeBert diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 2b94306bc6c..b5db3feeffc 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -1081,6 +1081,31 @@ def patch_model_for_export( return CLIPModelPatcher(self, model, model_kwargs=model_kwargs) +class SiglipNormalizedConfig(CLIPNormalizedConfig): + pass + + +class SiglipOnnxConfig(CLIPOnnxConfig): + NORMALIZED_CONFIG_CLASS = SiglipNormalizedConfig + DEFAULT_ONNX_OPSET = 13 + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "input_ids": {0: "text_batch_size", 1: "sequence_length"}, + "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"}, + # NOTE: No attention_mask + } + + +class SiglipTextWithProjectionOnnxConfig(CLIPTextWithProjectionOnnxConfig): + pass + + +class SiglipTextOnnxConfig(CLIPTextOnnxConfig): + pass + + class UNetOnnxConfig(VisionOnnxConfig): ATOL_FOR_VALIDATION = 1e-3 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index c164cc7bb84..b02e6b392a4 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -1027,6 +1027,19 @@ class TasksManager: "audio-classification", onnx="SEWDOnnxConfig", ), + "siglip": supported_tasks_mapping( + "feature-extraction", + "zero-shot-image-classification", + onnx="SiglipOnnxConfig", + ), + "siglip-text-model": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipTextOnnxConfig", + ), + "siglip-text-with-projection": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipTextWithProjectionOnnxConfig", + ), "speech-to-text": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 8ab5a304d1b..f8e47ed91ae 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -218,6 +218,8 @@ class NormalizedConfigManager: 'owlvit', 'perceiver', 'roformer', + 'segformer', + 'siglip', 'squeezebert', 'table-transformer', """ diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 577f020c5bf..5b7719a921e 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -146,6 +146,7 @@ "roformer": "hf-internal-testing/tiny-random-RoFormerModel", "sam": "fxmarty/sam-vit-tiny-random", "segformer": "hf-internal-testing/tiny-random-SegformerModel", + "siglip": "HuggingFaceM4/tiny-random-siglip", "splinter": "hf-internal-testing/tiny-random-SplinterModel", "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "swin": "hf-internal-testing/tiny-random-SwinModel",