diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py
index 20f207eb34d..b38fa35d8bf 100644
--- a/neural_compressor/transformers/models/modeling_auto.py
+++ b/neural_compressor/transformers/models/modeling_auto.py
@@ -70,6 +70,7 @@ def build_woq_model(model, quantization_config):
                 not getattr(quantization_config, "sym", False),
             )
             use_optimum_format = True
+            g_idx = hasattr(m, "g_idx") and m.g_idx is not None
             with init_empty_weights():
                 new_module = INCWeightOnlyLinear(
                     m.in_features,
@@ -80,7 +81,7 @@ def build_woq_model(model, quantization_config):
                     group_size=quantization_config.group_size,
                     zp=zp,
                     bias=m.bias is not None,
-                    g_idx=True,
+                    g_idx=g_idx,
                     use_optimum_format=use_optimum_format,
                 )
             set_module(model, n, new_module)
diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py
index df7785183c2..de939cf2652 100644
--- a/neural_compressor/transformers/quantization/utils.py
+++ b/neural_compressor/transformers/quantization/utils.py
@@ -206,14 +206,8 @@ def _replace_linear(
                     device=device,
                     use_optimum_format=getattr(module, "use_optimum_format", True),
                 )
-                if quantization_config.quant_method.value == "gptq":
-                    g_idx = getattr(
-                        module,
-                        "g_idx",
-                        torch.zeros(in_features, dtype=torch.int32).to(device),
-                    )
-                else:
-                    g_idx = None
+                # g_idx is only present when using the GPTQ quantization method
+                g_idx = module.g_idx if hasattr(module, "g_idx") else None
                 model._modules[name].set_scales_zps_gidx(
                     (
                         module.scales
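
For reviewers, a minimal sketch of the behavior this patch converges on: `g_idx` is now derived from the module itself rather than hardcoded to `True` (modeling_auto.py) or synthesized as a zero tensor (utils.py), so only GPTQ-style modules that actually carry a `g_idx` buffer forward one. `FakeQuantLinear` and `detect_g_idx` below are hypothetical stand-ins for illustration, not INC classes:

```python
import torch


class FakeQuantLinear(torch.nn.Module):
    """Hypothetical stand-in for a quantized linear module (not an INC class)."""

    def __init__(self, in_features: int, group_size: int, gptq: bool):
        super().__init__()
        self.in_features = in_features
        if gptq:
            # GPTQ-style modules carry a per-input-channel group index buffer.
            self.register_buffer(
                "g_idx", torch.arange(in_features, dtype=torch.int32) // group_size
            )


def detect_g_idx(module: torch.nn.Module):
    # The pattern used in utils.py: forward g_idx only if the module has one.
    return module.g_idx if hasattr(module, "g_idx") else None


gptq_mod = FakeQuantLinear(8, group_size=4, gptq=True)
rtn_mod = FakeQuantLinear(8, group_size=4, gptq=False)

assert detect_g_idx(gptq_mod) is not None  # GPTQ module: real g_idx tensor
assert detect_g_idx(rtn_mod) is None       # RTN and other methods: no g_idx

# The boolean form used in modeling_auto.py additionally guards against
# a g_idx attribute that exists but is set to None:
assert (hasattr(gptq_mod, "g_idx") and gptq_mod.g_idx is not None) is True
```

The same `hasattr` check covers both call sites; the previous code fabricated an all-zeros `g_idx` for GPTQ modules that lacked one, which misrepresented the group mapping instead of leaving it unset.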