From 6fc369b53a50bb3d6147da82b839d3731406f13e Mon Sep 17 00:00:00 2001
From: Mohit Sharma
Date: Sat, 16 Sep 2023 15:26:15 +0000
Subject: [PATCH] updated config

---
 optimum/onnxruntime/configuration.py | 16 ++++++++++++----
 optimum/onnxruntime/quantization.py  |  7 +++++--
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/optimum/onnxruntime/configuration.py b/optimum/onnxruntime/configuration.py
index 8e8d0074e8a..2ddeb329187 100644
--- a/optimum/onnxruntime/configuration.py
+++ b/optimum/onnxruntime/configuration.py
@@ -267,6 +267,18 @@ class QuantizationConfig:
         qdq_op_type_per_channel_support_to_axis (`Dict[str, int]`):
             Set the channel axis for a specific operator type. Effective only when per channel quantization is
             supported and `per_channel` is set to True.
+        smooth_quant (`bool`, defaults to `False`):
+            Whether to apply the SmoothQuant algorithm before quantization to perform fake input channel
+            quantization.
+        smooth_quant_alpha (`float`, defaults to `0.5`):
+            Effective only when `smooth_quant` is set to `True`. Controls how the quantization difficulty is
+            split between weights and activations: a larger alpha migrates more of the difficulty to the
+            weights, which helps on models with more significant activation outliers.
+        smooth_quant_folding (`bool`, defaults to `True`):
+            Effective only when `smooth_quant` is set to `True`. Whether the Mul ops inserted by SmoothQuant
+            should be folded into the previous op when that op is foldable.
+        smooth_quant_op_types (`List[str]`, defaults to `[]`):
+            The operator types to apply SmoothQuant to.
     """
 
     is_static: bool
@@ -396,7 +408,6 @@ def arm64(
         nodes_to_quantize: Optional[List[str]] = None,
         nodes_to_exclude: Optional[List[str]] = None,
         operators_to_quantize: Optional[List[str]] = None,
-        smooth_quant_op_types: Optional[List[str]] = None,
     ):
         """
         Creates a [`~onnxruntime.QuantizationConfig`] fit for ARM64.
@@ -450,7 +461,6 @@ def avx2(
         nodes_to_quantize: Optional[List[str]] = None,
         nodes_to_exclude: Optional[List[str]] = None,
         operators_to_quantize: Optional[List[str]] = None,
-        smooth_quant_op_types: Optional[List[str]] = None,
     ) -> QuantizationConfig:
         """
         Creates a [`~onnxruntime.QuantizationConfig`] fit for CPU with AVX2 instruction set.
@@ -508,7 +518,6 @@ def avx512(
         nodes_to_quantize: Optional[List[str]] = None,
         nodes_to_exclude: Optional[List[str]] = None,
         operators_to_quantize: Optional[List[str]] = None,
-        smooth_quant_op_types: Optional[List[str]] = None,
     ) -> QuantizationConfig:
         """
         Creates a [`~onnxruntime.QuantizationConfig`] fit for CPU with AVX512 instruction set.
@@ -565,7 +574,6 @@ def avx512_vnni(
         nodes_to_quantize: Optional[List[str]] = None,
         nodes_to_exclude: Optional[List[str]] = None,
         operators_to_quantize: Optional[List[str]] = None,
-        smooth_quant_op_types: Optional[List[str]] = None,
     ) -> QuantizationConfig:
         """
         Creates a [`~onnxruntime.QuantizationConfig`] fit for CPU with AVX512-VNNI instruction set.
diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py
index 10e466728d8..0fa6e55f6a3 100644
--- a/optimum/onnxruntime/quantization.py
+++ b/optimum/onnxruntime/quantization.py
@@ -232,7 +232,9 @@ def apply_smooth_quant(
         try:
             importlib.import_module("neural_compressor.adaptor.ox_utils.smooth_quant")
         except Exception as e:
             logging.error(f"{e}.")
-            raise RuntimeError("neural-compressor is not correctly installed. Please check your environment.") from e
+            raise RuntimeError("neural-compressor is required to apply SmoothQuant. Please install the library.") from e
+
+        import copy
         import onnx
         from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant
@@ -242,13 +244,14 @@ def apply_smooth_quant(
         os.makedirs(save_dir, exist_ok=True)
 
         def inc_dataloader():
-            calibration_data_reader = ORTCalibrationDataReader(dataset, batch_size)
+            calibration_data_reader = ORTCalibrationDataReader(copy.deepcopy(dataset), batch_size)
             for data in calibration_data_reader:
                 yield data, None
 
         orig_nodes = [i.name for i in model.graph.node]
         dataloader = inc_dataloader()
         sq = ORTSmoothQuant(self.onnx_model_path.as_posix(), dataloader, quantization_config.reduce_range)
+        del dataloader
         model = sq.transform(
             quantization_config.smooth_quant_alpha,
             quantization_config.smooth_quant_folding,
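
A minimal usage sketch for reviewers, assuming the QuantizationConfig dataclass gains fields matching the options documented above; the model path, alpha value, and op types below are illustrative placeholders, not recommendations:

    from optimum.onnxruntime import ORTQuantizer
    from optimum.onnxruntime.configuration import AutoQuantizationConfig

    # SmoothQuant targets static (calibration-based) quantization, so start
    # from a static preset.
    qconfig = AutoQuantizationConfig.avx512_vnni(is_static=True, per_channel=False)

    # The hardware presets no longer accept smooth_quant_op_types (removed by
    # this patch), so the SmoothQuant fields are set on the config directly.
    qconfig.smooth_quant = True
    qconfig.smooth_quant_alpha = 0.7      # migrate more quantization difficulty to weights
    qconfig.smooth_quant_folding = True   # fold inserted Mul ops into foldable predecessors
    qconfig.smooth_quant_op_types = ["MatMul", "Gemm"]

    # The config is then passed to the quantizer together with calibration
    # data, following the existing static-quantization flow.
    quantizer = ORTQuantizer.from_pretrained("path/to/exported_onnx_model")

Dropping smooth_quant_op_types from the per-architecture factories keeps algorithm-specific knobs on the dataclass itself rather than duplicating them across every preset; the deepcopy in inc_dataloader likewise appears intended to shield the caller's calibration dataset from being consumed or mutated by the reader.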