Skip to content

Commit

Permalink
updated config
Browse files Browse the repository at this point in the history
  • Loading branch information
mht-sharma committed Sep 20, 2023
1 parent 2c6c09b commit 6fc369b
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 6 deletions.
16 changes: 12 additions & 4 deletions optimum/onnxruntime/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,18 @@ class QuantizationConfig:
qdq_op_type_per_channel_support_to_axis (`Dict[str, int]`):
Set the channel axis for a specific operator type. Effective only when per channel quantization is
supported and `per_channel` is set to True.
smooth_quant (`bool`, defaults to `False`):
If enabled, the SmoothQuant algorithm will be applied before quantization to perform
fake input channel quantization.
smooth_quant_alpha (`float`, defaults to `0.5`):
It only works if SmoothQuant is True. It controls the difficulty of weight
and activation quantization. A larger alpha value could be used on models with more significant
activation outliers to migrate more quantization difficulty to weights.
smooth_quant_folding (`bool`, defaults to `True`):
It only works if SmoothQuant is True. If enabled, Mul ops inserted during
SmoothQuant will be folded into the previous op if the previous op is foldable.
smooth_quant_op_types (`List[str]`, defaults to `[]`):
The op types to be smooth quantized.
"""

is_static: bool
Expand Down Expand Up @@ -396,7 +408,6 @@ def arm64(
nodes_to_quantize: Optional[List[str]] = None,
nodes_to_exclude: Optional[List[str]] = None,
operators_to_quantize: Optional[List[str]] = None,
smooth_quant_op_types: Optional[List[str]] = None,
):
"""
Creates a [`~onnxruntime.QuantizationConfig`] fit for ARM64.
Expand Down Expand Up @@ -450,7 +461,6 @@ def avx2(
nodes_to_quantize: Optional[List[str]] = None,
nodes_to_exclude: Optional[List[str]] = None,
operators_to_quantize: Optional[List[str]] = None,
smooth_quant_op_types: Optional[List[str]] = None,
) -> QuantizationConfig:
"""
Creates a [`~onnxruntime.QuantizationConfig`] fit for CPU with AVX2 instruction set.
Expand Down Expand Up @@ -508,7 +518,6 @@ def avx512(
nodes_to_quantize: Optional[List[str]] = None,
nodes_to_exclude: Optional[List[str]] = None,
operators_to_quantize: Optional[List[str]] = None,
smooth_quant_op_types: Optional[List[str]] = None,
) -> QuantizationConfig:
"""
Creates a [`~onnxruntime.QuantizationConfig`] fit for CPU with AVX512 instruction set.
Expand Down Expand Up @@ -565,7 +574,6 @@ def avx512_vnni(
nodes_to_quantize: Optional[List[str]] = None,
nodes_to_exclude: Optional[List[str]] = None,
operators_to_quantize: Optional[List[str]] = None,
smooth_quant_op_types: Optional[List[str]] = None,
) -> QuantizationConfig:
"""
Creates a [`~onnxruntime.QuantizationConfig`] fit for CPU with AVX512-VNNI instruction set.
Expand Down
7 changes: 5 additions & 2 deletions optimum/onnxruntime/quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,9 @@ def apply_smooth_quant(
importlib.import_module("neural_compressor.adaptor.ox_utils.smooth_quant")
except Exception as e:
logging.error(f"{e}.")
raise RuntimeError("neural-compressor is not correctly installed. Please check your environment.") from e
raise RuntimeError("Neural-compressor is required for SmoothQuant. Please install the library") from e

import copy

import onnx
from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant
Expand All @@ -242,13 +244,14 @@ def apply_smooth_quant(
os.makedirs(save_dir, exist_ok=True)

def inc_dataloader():
calibration_data_reader = ORTCalibrationDataReader(dataset, batch_size)
calibration_data_reader = ORTCalibrationDataReader(copy.deepcopy(dataset), batch_size)
for data in calibration_data_reader:
yield data, None

orig_nodes = [i.name for i in model.graph.node]
dataloader = inc_dataloader()
sq = ORTSmoothQuant(self.onnx_model_path.as_posix(), dataloader, quantization_config.reduce_range)
del dataloader
model = sq.transform(
quantization_config.smooth_quant_alpha,
quantization_config.smooth_quant_folding,
Expand Down

0 comments on commit 6fc369b

Please sign in to comment.