diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml
index 089300f7cd9..fec5c7e5b27 100644
--- a/.github/workflows/test_onnxruntime.yml
+++ b/.github/workflows/test_onnxruntime.yml
@@ -18,8 +18,10 @@ jobs:
       fail-fast: false
       matrix:
         transformers-version: ["latest"]
-        os: [ubuntu-20.04, windows-2019, macos-13]
+        os: [ubuntu-20.04, windows-2019, macos-15]
         include:
+          - transformers-version: "4.36.*"
+            os: ubuntu-20.04
           - transformers-version: "4.45.*"
             os: ubuntu-20.04
 
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index cc752779d30..6b92109b7b6 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -295,7 +295,7 @@ class Qwen2OnnxConfig(LlamaOnnxConfig):
 class GemmaOnnxConfig(LlamaOnnxConfig):
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator)
     DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator
-    pass
+    MIN_TRANSFORMERS_VERSION = version.parse("4.38.0")
 
 
 class GraniteOnnxConfig(LlamaOnnxConfig):
@@ -348,6 +348,8 @@ def patch_model_for_export(
 class MPTOnnxConfig(TextDecoderOnnxConfig):
     # MPT does not require position_ids input.
     DEFAULT_ONNX_OPSET = 13
+    # TODO: fix inference for transformers < v4.41 for beam_search > 1
+    MIN_TRANSFORMERS_VERSION = version.parse("4.41.0")
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(
         num_attention_heads="n_heads", hidden_size="d_model", num_layers="n_layers"
     )
diff --git a/setup.py b/setup.py
index 7ea0da56c29..29f97b604e0 100644
--- a/setup.py
+++ b/setup.py
@@ -54,7 +54,7 @@
         "datasets>=1.2.1",
         "evaluate",
         "protobuf>=3.20.1",
-        "transformers<4.47.0",
+        "transformers>=4.36,<4.47.0",
     ],
     "onnxruntime-gpu": [
         "onnx",
@@ -63,19 +63,19 @@
         "evaluate",
         "protobuf>=3.20.1",
         "accelerate",  # ORTTrainer requires it.
-        "transformers<4.47.0",
+        "transformers>=4.36,<4.47.0",
     ],
     "exporters": [
         "onnx",
         "onnxruntime",
         "timm",
-        "transformers<4.47.0",
+        "transformers>=4.36,<4.47.0",
     ],
     "exporters-gpu": [
         "onnx",
         "onnxruntime-gpu",
         "timm",
-        "transformers<4.47.0",
+        "transformers>=4.36,<4.47.0",
     ],
     "exporters-tf": [
         "tensorflow>=2.4,<=2.12.1",
@@ -86,7 +86,7 @@
         "h5py",
         "numpy<1.24.0",
         "datasets<=2.16",
-        "transformers>=4.26,<4.38",
+        "transformers>=4.36,<4.38",
     ],
     "diffusers": ["diffusers"],
     "intel": "optimum-intel>=1.18.0",
diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index a335e014478..84ac27029f9 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -2318,21 +2318,28 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin):
         "bloom",
         "codegen",
         "falcon",
-        "gemma",
         "gpt2",
         "gpt_bigcode",
         "gpt_neo",
         "gpt_neox",
         "gptj",
-        "granite",
         "llama",
         "mistral",
-        "mpt",
         "opt",
     ]
 
-    if check_if_transformers_greater("4.40"):
-        SUPPORTED_ARCHITECTURES.extend(["gemma", "phi3", "qwen2"])
+    if check_if_transformers_greater("4.37"):
+        SUPPORTED_ARCHITECTURES.append("qwen2")
+
+    if check_if_transformers_greater("4.38"):
+        SUPPORTED_ARCHITECTURES.append("gemma")
+
+    # TODO: fix "mpt" for which inference fails for transformers < v4.41
+    if check_if_transformers_greater("4.41"):
+        SUPPORTED_ARCHITECTURES.extend(["phi3", "mpt"])
+
+    if check_if_transformers_greater("4.45"):
+        SUPPORTED_ARCHITECTURES.append("granite")
 
     FULL_GRID = {
         "model_arch": SUPPORTED_ARCHITECTURES,
@@ -2445,7 +2452,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach
         transformers_model = AutoModelForCausalLM.from_pretrained(model_id)
         transformers_model = transformers_model.eval()
         tokenizer = get_preprocessor(model_id)
-        tokens = tokenizer("This is a sample output", return_tensors="pt")
+        tokens = tokenizer("This is a sample input", return_tensors="pt")
         position_ids = None
         if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS:
             input_shape = tokens["input_ids"].shape
@@ -2467,7 +2474,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach
         # Compare batched generation.
         tokenizer.pad_token_id = tokenizer.eos_token_id
         tokenizer.padding_side = "left"
-        tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True)
+        tokens = tokenizer(["This is", "This is a sample input"], return_tensors="pt", padding=True)
         onnx_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
         onnx_model.config.eos_token_id = None
@@ -4598,14 +4605,14 @@ def test_compare_with_and_without_past_key_values(self, model_arch: str):
         )
 
         self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv))
-        self.assertEqual(
-            outputs_model_with_pkv.shape[1],
-            self.GENERATION_LENGTH + 2 if model_arch == "whisper" else self.GENERATION_LENGTH + 1,
-        )
-        self.assertEqual(
-            outputs_model_without_pkv.shape[1],
-            self.GENERATION_LENGTH + 2 if model_arch == "whisper" else self.GENERATION_LENGTH + 1,
-        )
+
+        if model_arch == "whisper" and check_if_transformers_greater("4.43"):
+            gen_length = self.GENERATION_LENGTH + 2
+        else:
+            gen_length = self.GENERATION_LENGTH + 1
+
+        self.assertEqual(outputs_model_with_pkv.shape[1], gen_length)
+        self.assertEqual(outputs_model_without_pkv.shape[1], gen_length)
 
         self.GENERATION_LENGTH = generation_length
         if os.environ.get("TEST_LEVEL", 0) == "1":