
Commit

fix comment
Signed-off-by: jiqing-feng <[email protected]>
jiqing-feng committed Dec 17, 2024
1 parent 19e7261 commit 7125fe9
Showing 1 changed file with 11 additions and 11 deletions.
22 changes: 11 additions & 11 deletions optimum/gptq/quantizer.py
@@ -90,9 +90,6 @@ def __init__(
         desc_act: bool = False,
         sym: bool = True,
         true_sequential: bool = True,
-        checkpoint_format: str = "gptq",
-        meta: Optional[Dict[str, any]] = None,
-        backend: Optional[str] = None,
         use_cuda_fp16: bool = False,
         model_seqlen: Optional[int] = None,
         block_name_to_quantize: Optional[str] = None,
@@ -104,6 +101,9 @@ def __init__(
         max_input_length: Optional[int] = None,
         cache_block_outputs: Optional[bool] = True,
         modules_in_block_to_quantize: Optional[List[List[str]]] = None,
+        checkpoint_format: str = "gptq",
+        meta: Optional[Dict[str, any]] = None,
+        backend: Optional[str] = None,
         *args,
         **kwargs,
     ):
@@ -129,13 +129,6 @@ def __init__(
                 Whether to perform sequential quantization even within a single Transformer block.
                 Instead of quantizing the entire block at once, we perform layer-wise quantization.
                 As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers.
-            checkpoint_format (`str`, *optional*, defaults to `gptq`):
-                GPTQ weight format. `gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only.
-            meta (`Dict[str, any]`, *optional*):
-                Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta.
-                i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"]
-            backend (`str`, *optional*):
-                Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
             use_cuda_fp16 (`bool`, defaults to `False`):
                 Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
             model_seqlen (`Optional[int]`, defaults to `None`):
@@ -162,6 +155,13 @@ def __init__(
                 List list of module names to quantize in the block specified. This argument is useful to exclude certain linear modules from being quantized.
                 The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially.
                 If not set, we will quantize all linear layers. Example: `inside_layer_modules=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]`
+            checkpoint_format (`str`, *optional*, defaults to `gptq`):
+                GPTQ weight format. `gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only.
+            meta (`Dict[str, any]`, *optional*):
+                Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta.
+                i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"]
+            backend (`str`, *optional*):
+                Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
         """

         self.bits = bits
@@ -564,7 +564,7 @@ def store_input_hook(_, input, *args):
                raise ValueError(f"Module {module_name} was not found in model")

         torch.cuda.empty_cache()
-        if hasattr(torch, "xpu"):
+        if hasattr(torch, "xpu") and torch.xpu.is_available():
             torch.xpu.empty_cache()

         # Step 3: Quantize the blocks
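The final hunk above only tightens the XPU guard: hasattr(torch, "xpu") is true on any PyTorch build that ships the torch.xpu module, so the added torch.xpu.is_available() check keeps empty_cache() from being called on machines where the module exists but no XPU device is actually present. A standalone sketch of the same defensive pattern (hypothetical helper name, not part of the diff):

import torch

def empty_device_caches() -> None:
    """Release cached allocator blocks on whichever accelerators are present."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # Guard on both the attribute and runtime availability: older PyTorch builds
    # ship no torch.xpu module, and builds that do may still see no XPU device.
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()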

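The earlier hunks move checkpoint_format, meta, and backend (and their docstring entries) to the end of the __init__ signature, after all previously existing parameters, so callers that pass them by keyword are unaffected by the reordering. A minimal usage sketch, assuming the GPTQQuantizer entry point exported by optimum.gptq; the values below are illustrative, not defaults prescribed by this commit:

from optimum.gptq import GPTQQuantizer

# Only checkpoint_format, meta, and backend relate to this commit; the rest is
# an example configuration.
quantizer = GPTQQuantizer(
    bits=4,
    dataset="c4",
    sym=True,
    true_sequential=True,
    checkpoint_format="gptq",  # "gptq" (v1) is readable by both gptqmodel and auto-gptq
    meta={"quantizer": ["optimum:x.y.z"]},  # free-form metadata such as tooling versions
    backend="auto",  # gptqmodel kernel choice; auto-gptq accepts only None or "auto_trainable"
)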