From 3bea1a22acf0ca401fc8994056670f8e715d6cc7 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Thu, 21 Sep 2023 16:01:19 +0200
Subject: [PATCH 1/6] fix quantization for onnxruntime v1.16.0

---
 optimum/onnxruntime/quantization.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py
index 31140c5b747..730abae1dfb 100644
--- a/optimum/onnxruntime/quantization.py
+++ b/optimum/onnxruntime/quantization.py
@@ -279,6 +279,10 @@ def compute_ranges(self) -> Dict[str, Tuple[float, float]]:
             )
 
         LOGGER.info("Computing calibration ranges")
+
+        if parse(ort_version) >= Version("1.16.0"):
+            return self._calibrator.compute_data()
+
         return self._calibrator.compute_range()
 
     def quantize(
@@ -351,8 +355,13 @@ def quantize(
                 has_subgraphs = True
                 break
 
-        if quantization_config.is_static and has_subgraphs:
-            raise NotImplementedError("Static quantization is currently not supported for models with" " subgraphs.")
+        if has_subgraphs:
+            if quantization_config.is_static:
+                raise NotImplementedError("Static quantization is currently not supported for models with subgraphs.")
+            if parse(ort_version) >= Version("1.16.0"):
+                raise ValueError(
+                    "Onnxruntime version v1.16.0 not compatible with quantization for models with subgraphs, please downgrade to an earlier version."
+                )
 
         quantizer_factory = QDQQuantizer if use_qdq else ONNXQuantizer

From b343ce4bd7cf99de01594733ce31e71fd8c90f31 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Thu, 21 Sep 2023 16:06:49 +0200
Subject: [PATCH 2/6] Update optimum/onnxruntime/quantization.py

Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com>
---
 optimum/onnxruntime/quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py
index 730abae1dfb..08d7451b9f9 100644
--- a/optimum/onnxruntime/quantization.py
+++ b/optimum/onnxruntime/quantization.py
@@ -360,7 +360,7 @@ def quantize(
                 raise NotImplementedError("Static quantization is currently not supported for models with subgraphs.")
             if parse(ort_version) >= Version("1.16.0"):
                 raise ValueError(
-                    "Onnxruntime version v1.16.0 not compatible with quantization for models with subgraphs, please downgrade to an earlier version."
+                    "ONNX Runtime version v1.16.0 not compatible with quantization for models with subgraphs, please downgrade to 1.15.1 or upgrade to a higher version. Reference: https://github.com/microsoft/onnxruntime/pull/17651"
                 )
 
         quantizer_factory = QDQQuantizer if use_qdq else ONNXQuantizer
From ac550be1365d4e15a7edf117b0310ea7c5535265 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Thu, 21 Sep 2023 16:10:41 +0200
Subject: [PATCH 3/6] Update optimum/onnxruntime/quantization.py

Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com>
---
 optimum/onnxruntime/quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py
index 08d7451b9f9..9677fbd912c 100644
--- a/optimum/onnxruntime/quantization.py
+++ b/optimum/onnxruntime/quantization.py
@@ -358,7 +358,7 @@ def quantize(
         if has_subgraphs:
             if quantization_config.is_static:
                 raise NotImplementedError("Static quantization is currently not supported for models with subgraphs.")
-            if parse(ort_version) >= Version("1.16.0"):
+            if parse(ort_version) == Version("1.16.0"):
                 raise ValueError(
                     "ONNX Runtime version v1.16.0 not compatible with quantization for models with subgraphs, please downgrade to 1.15.1 or upgrade to a higher version. Reference: https://github.com/microsoft/onnxruntime/pull/17651"
                 )

From 4d6880e8f89aa2610307b375f5f7b8f34d9fdbc7 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Thu, 21 Sep 2023 16:32:01 +0200
Subject: [PATCH 4/6] Update optimum/onnxruntime/quantization.py

Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com>
---
 optimum/onnxruntime/quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py
index 9677fbd912c..12313008b2c 100644
--- a/optimum/onnxruntime/quantization.py
+++ b/optimum/onnxruntime/quantization.py
@@ -360,7 +360,7 @@ def quantize(
                 raise NotImplementedError("Static quantization is currently not supported for models with subgraphs.")
             if parse(ort_version) == Version("1.16.0"):
                 raise ValueError(
-                    "ONNX Runtime version v1.16.0 not compatible with quantization for models with subgraphs, please downgrade to 1.15.1 or upgrade to a higher version. Reference: https://github.com/microsoft/onnxruntime/pull/17651"
+                    "ONNX Runtime version v1.16.0 is not compatible with quantization for models with subgraphs, please downgrade to 1.15.1 or upgrade to a higher version. Reference: https://github.com/microsoft/onnxruntime/pull/17651"
                 )
From b20ffa83677fe44d6a6be859577d383142a93876 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Thu, 21 Sep 2023 16:44:15 +0200
Subject: [PATCH 5/6] skip test for ort v1.16.0

---
 tests/onnxruntime/test_quantization.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/onnxruntime/test_quantization.py b/tests/onnxruntime/test_quantization.py
index 111c7338808..aff1b51b534 100644
--- a/tests/onnxruntime/test_quantization.py
+++ b/tests/onnxruntime/test_quantization.py
@@ -19,7 +19,9 @@
 from pathlib import Path
 
 from onnx import load as onnx_load
+from onnxruntime import __version__ as ort_version
 from onnxruntime.quantization import QuantFormat, QuantizationMode, QuantType
+from packaging.version import Version, parse
 from parameterized import parameterized
 from transformers import AutoTokenizer
 
@@ -112,9 +114,9 @@ def test_dynamic_quantization(self, model_cls, model_name, expected_quantized_ma
         self.assertEqual(expected_quantized_matmuls, num_quantized_matmul)
         gc.collect()
 
+    @unittest.skipIf(parse(ort_version) == Version("1.16.0"), "not supported with this onnxruntime version")
     def test_dynamic_quantization_subgraphs(self):
         qconfig = AutoQuantizationConfig.avx512(is_static=False, per_channel=True)
-        # with tempfile.TemporaryDirectory() as tmp_dir:
         tmp_dir = tempfile.mkdtemp()
         output_dir = Path(tmp_dir)
         model = ORTModelForCausalLM.from_pretrained(

From 590f993ed63f6349620b68ee21857f9fcd33877e Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Thu, 21 Sep 2023 16:45:33 +0200
Subject: [PATCH 6/6] skip test

---
 tests/cli/test_cli.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py
index 7e94d58db7b..147c0bd258f 100644
--- a/tests/cli/test_cli.py
+++ b/tests/cli/test_cli.py
@@ -21,6 +21,9 @@
 import unittest
 from pathlib import Path
 
+from onnxruntime import __version__ as ort_version
+from packaging.version import Version, parse
+
 import optimum.commands
 
 
@@ -84,14 +87,22 @@ def test_quantize_commands(self):
         export_commands = [
             f"optimum-cli export onnx --model hf-internal-testing/tiny-random-BertModel {tempdir}/encoder",
             f"optimum-cli export onnx --model hf-internal-testing/tiny-random-gpt2 {tempdir}/decoder",
-            f"optimum-cli export onnx --model hf-internal-testing/tiny-random-t5 {tempdir}/encoder-decoder",
+            # f"optimum-cli export onnx --model hf-internal-testing/tiny-random-t5 {tempdir}/encoder-decoder",
         ]
         quantize_commands = [
             f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder --avx2 -o {tempdir}/quantized_encoder",
             f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/decoder --avx2 -o {tempdir}/quantized_decoder",
-            f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder-decoder --avx2 -o {tempdir}/quantized_encoder_decoder",
+            # f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder-decoder --avx2 -o {tempdir}/quantized_encoder_decoder",
         ]
 
+        if parse(ort_version) != Version("1.16.0"):
+            export_commands.append(
+                f"optimum-cli export onnx --model hf-internal-testing/tiny-random-t5 {tempdir}/encoder-decoder"
+            )
+            quantize_commands.append(
+                f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder-decoder --avx2 -o {tempdir}/quantized_encoder_decoder"
+            )
+
         for export, quantize in zip(export_commands, quantize_commands):
             subprocess.run(export, shell=True, check=True)
             subprocess.run(quantize, shell=True, check=True)