diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx
index cf83fbeaba2..195340bbd02 100644
--- a/docs/source/exporters/onnx/overview.mdx
+++ b/docs/source/exporters/onnx/overview.mdx
@@ -107,6 +107,8 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
 - UniSpeech SAT
 - Vision Encoder Decoder
 - Vit
+- VitMAE
+- VitMSN
 - Wav2Vec2
 - Wav2Vec2 Conformer
 - WavLM
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 47bfed12674..aced5b7d7f2 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -780,6 +780,18 @@ class PvtOnnxConfig(ViTOnnxConfig):
     DEFAULT_ONNX_OPSET = 11


+class VitMAEOnnxConfig(ViTOnnxConfig):
+    # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported.
+    # Support for this operator was added in version 14, try exporting with this version.
+    DEFAULT_ONNX_OPSET = 14
+
+
+class VitMSNOnnxConfig(ViTOnnxConfig):
+    # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported.
+    # Support for this operator was added in version 14, try exporting with this version.
+    DEFAULT_ONNX_OPSET = 14
+
+
 class Dinov2DummyInputGenerator(DummyVisionInputGenerator):
     def __init__(
         self,
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index 2231d66de08..ab6b9bd1d7e 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -1135,7 +1135,21 @@ class TasksManager:
             onnx="VisionEncoderDecoderOnnxConfig",
         ),
         "vit": supported_tasks_mapping(
-            "feature-extraction", "image-classification", "masked-im", onnx="ViTOnnxConfig"
+            "feature-extraction",
+            "image-classification",
+            "masked-im",
+            onnx="ViTOnnxConfig",
+        ),
+        "vit-mae": supported_tasks_mapping(
+            "feature-extraction",
+            "masked-im",
+            onnx="VitMAEOnnxConfig",
+        ),
+        "vit-msn": supported_tasks_mapping(
+            "feature-extraction",
+            "image-classification",
+            "masked-im",
+            onnx="VitMSNOnnxConfig",
         ),
         "vits": supported_tasks_mapping(
             "text-to-audio",
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index 5d388715e01..6983a4e0291 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -157,6 +157,7 @@
     "t5": "hf-internal-testing/tiny-random-t5",
     "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel",
     "vit": "hf-internal-testing/tiny-random-vit",
+    "vit-msn": "hf-internal-testing/tiny-random-ViTMSNForImageClassification",
     "vits": "echarlaix/tiny-random-vits",
     "yolos": "hf-internal-testing/tiny-random-YolosModel",
     "whisper": "openai/whisper-tiny.en",  # hf-internal-testing ones are broken
@@ -279,6 +280,7 @@
     "t5": "t5-small",
     "table-transformer": "microsoft/table-transformer-detection",
     "vit": "google/vit-base-patch16-224",
+    "vit-msn": "facebook/vit-msn-small",
     "yolos": "hustvl/yolos-tiny",
     "whisper": "openai/whisper-tiny.en",
     "hubert": "facebook/hubert-base-ls960",
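
Not part of the diff above: a minimal usage sketch of what this change enables, assuming the `vit-msn` config is registered as shown. It exports the `facebook/vit-msn-small` checkpoint referenced in `tests/exporters/exporters_utils.py` via Optimum's `main_export` helper; the output directory name is illustrative.

```python
# Minimal sketch (not part of the PR): export a ViTMSN checkpoint to ONNX
# using the newly registered "vit-msn" export config.
from optimum.exporters.onnx import main_export

main_export(
    "facebook/vit-msn-small",     # checkpoint listed in tests/exporters/exporters_utils.py
    output="vit_msn_onnx",        # hypothetical output directory
    task="image-classification",  # one of the tasks added for "vit-msn" in optimum/exporters/tasks.py
)
```

The export defaults to opset 14, matching `DEFAULT_ONNX_OPSET` in `VitMSNOnnxConfig`, since `aten::scaled_dot_product_attention` is only exportable from opset 14 onward.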