This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Support INC layerwise quant #1623

Open · wants to merge 1 commit into base: main
@@ -25,6 +25,7 @@
 from datasets import load_dataset
 from neural_compressor import quantization
 from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
+from neural_compressor.utils.pytorch import load
 from neural_compressor.utils.utility import LazyImport
 from neural_compressor.config import PostTrainingQuantConfig
 from intel_extension_for_transformers.tools.utils import (
@@ -583,6 +584,10 @@ def default_calib_func(model):
     inc_model = quantization.fit(
         model, conf, calib_func=calib_func, calib_dataloader=calib_dataloader
     )
+    if config.layer_wise:
+        inc_model.save("./tmp")
+        inc_model = load("./tmp", model, weight_only=True, layer_wise=True)
+        return inc_model.eval()
     inc_model.eval()

     if device == "xpu" or device == torch.device("xpu"):
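
For orientation, here is a minimal sketch of the save-then-reload round trip this hunk adds, seen from the caller's side. The `approach` and `recipes` settings below follow INC's documented layer-wise recipe but are assumptions, not values taken from this PR; the model and calibration function are placeholders:

```python
import torch
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig
from neural_compressor.utils.pytorch import load

# Placeholder model; the real caller passes the HF model being quantized.
model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU())

def calib_func(m):
    # Placeholder calibration pass over a few random batches.
    for _ in range(4):
        m(torch.randn(1, 64))

conf = PostTrainingQuantConfig(
    approach="weight_only",
    recipes={"layer_wise_quant": True},  # assumed recipe key for layer-wise mode
)
inc_model = quantization.fit(model, conf, calib_func=calib_func)

# Layer-wise results are materialized on disk rather than held fully in
# memory, hence the save/reload round trip before returning an eval model.
inc_model.save("./tmp")
qmodel = load("./tmp", model, weight_only=True, layer_wise=True)
qmodel.eval()
```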
@@ -74,6 +74,7 @@
 from accelerate import init_empty_weights
 from huggingface_hub import hf_hub_download
 from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
+from neural_compressor.adaptor.torch_utils.layer_wise_quant import load_empty_model
 from neural_compressor.model.torch_model import PyTorchFXModel
 from threading import Thread
 from transformers.configuration_utils import PretrainedConfig
@@ -778,13 +779,16 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]:
             if quantization_config.quant_method.value in ["teq", "awq"]
             else False
         )
-        model = cls.ORIG_MODEL.from_pretrained(
-            pretrained_model_name_or_path,
-            *model_args,
-            config=config,
-            **kwargs,
-        )
-        model.config.update({"low_cpu_mem_usage": True})
+        if quantization_config.layer_wise:
+            model = load_empty_model(pretrained_model_name_or_path, torchscript=True)
+        else:
+            model = cls.ORIG_MODEL.from_pretrained(
+                pretrained_model_name_or_path,
+                *model_args,
+                config=config,
+                **kwargs,
+            )
+            model.config.update({"low_cpu_mem_usage": True})
         model.eval()

         if use_xpu:
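
The `layer_wise` branch sidesteps loading full fp32 weights by building an empty model shell. A small sketch of that behavior, assuming a Hugging Face checkpoint id (the checkpoint below is a placeholder, not one used by this PR):

```python
# load_empty_model builds the architecture with meta-device parameters,
# so no fp32 weight storage is allocated up front; layer-wise quantization
# later streams each layer's weights from the checkpoint on demand.
from neural_compressor.adaptor.torch_utils.layer_wise_quant import load_empty_model

# Placeholder checkpoint; any HF causal-LM id or local path should work.
shell = load_empty_model("facebook/opt-125m", torchscript=True)
print(next(shell.parameters()).device)  # expected: meta (no real storage)
```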