From c633259b67cde7901c1c8531049ed73e26f2816a Mon Sep 17 00:00:00 2001 From: Benjamin Date: Mon, 22 Apr 2024 10:36:05 -0400 Subject: [PATCH 01/10] Compressed lifecycle implementation (INT8 only) --- .../quantization/lifecycle/__init__.py | 1 + .../quantization/lifecycle/apply.py | 24 ++++++- .../quantization/lifecycle/compressed.py | 65 +++++++++++++++++++ .../quantization/lifecycle/forward.py | 24 ++++--- .../quantization/quant_config.py | 23 +++++++ tests/quantization/lifecycle/test_apply.py | 42 ++++++++++-- 6 files changed, 162 insertions(+), 17 deletions(-) create mode 100644 src/compressed_tensors/quantization/lifecycle/compressed.py diff --git a/src/compressed_tensors/quantization/lifecycle/__init__.py b/src/compressed_tensors/quantization/lifecycle/__init__.py index 9504597b..0fa15b64 100644 --- a/src/compressed_tensors/quantization/lifecycle/__init__.py +++ b/src/compressed_tensors/quantization/lifecycle/__init__.py @@ -19,4 +19,5 @@ from .forward import * from .frozen import * from .initialize import * +from .compressed import * from .apply import * diff --git a/src/compressed_tensors/quantization/lifecycle/apply.py b/src/compressed_tensors/quantization/lifecycle/apply.py index 08cb42f9..119c97fe 100644 --- a/src/compressed_tensors/quantization/lifecycle/apply.py +++ b/src/compressed_tensors/quantization/lifecycle/apply.py @@ -19,6 +19,9 @@ from compressed_tensors.quantization.lifecycle.calibration import ( set_module_for_calibration, ) +from compressed_tensors.quantization.lifecycle.compressed import ( + compress_quantized_weights, +) from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quantization from compressed_tensors.quantization.lifecycle.initialize import ( initialize_module_for_quantization, @@ -71,13 +74,20 @@ def apply_quantization_status(model: Module, status: QuantizationStatus): :param model: model to apply quantization to :param status: status to update the module to """ - if status >= QuantizationStatus.INITIALIZED: + current_status = _infer_status(model) + + if status >= QuantizationStatus.INITIALIZED > current_status: model.apply(initialize_module_for_quantization) - if status >= QuantizationStatus.CALIBRATION: + + if current_status < status >= QuantizationStatus.CALIBRATION > current_status: model.apply(set_module_for_calibration) - if status >= QuantizationStatus.FROZEN: + + if current_status < status >= QuantizationStatus.FROZEN > current_status: model.apply(freeze_module_quantization) + if current_status < status >= QuantizationStatus.COMPRESSED > current_status: + model.apply(compress_quantized_weights) + def _find_first_name_or_class_match( name: str, @@ -103,3 +113,11 @@ def _find_first_match(value: str, targets: Iterable[str]) -> Optional[str]: elif target == value: return target return None + + +def _infer_status(model: Module) -> Optional[QuantizationStatus]: + for module in model.modules(): + status = getattr(module, "quantization_status", None) + if status is not None: + return status + return None diff --git a/src/compressed_tensors/quantization/lifecycle/compressed.py b/src/compressed_tensors/quantization/lifecycle/compressed.py new file mode 100644 index 00000000..4d7ef830 --- /dev/null +++ b/src/compressed_tensors/quantization/lifecycle/compressed.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging + +import torch +from compressed_tensors.quantization.lifecycle.forward import quantize +from compressed_tensors.quantization.quant_config import QuantizationStatus +from torch.nn import Module + + +__all__ = [ + "compress_quantized_weights", +] + + +_LOGGER = logging.getLogger(__name__) + + +def compress_quantized_weights(module: Module): + """ + Quantizes the module weight representation to use fewer bits in memory + + apply to full model with `model.apply(compress_quantized)` + + :param module: module to compress to quantized representation + """ + scheme = getattr(module, "quantization_scheme", None) + if not scheme or not scheme.weights: + # no quantization scheme or weights not quantized, nothing to do + return + + weight = getattr(module, "weight", None) + scale = getattr(module, "weight_scale", None) + zero_point = getattr(module, "weight_zero_point", None) + + if weight is None or scale is None or zero_point is None: + # no weight, scale, or ZP, nothing to do + + # TODO: Should we mark as compressed anyway here to maintain consistent + # status throughout the model? + return + + module.weight.requires_grad = False # cannot use auto grad after compression + module.weight.data = quantize( + x=weight, + scale=scale, + zero_point=zero_point, + args=scheme.weights, + dtype=torch.int8, + ) + + module.quantization_status = QuantizationStatus.COMPRESSED diff --git a/src/compressed_tensors/quantization/lifecycle/forward.py b/src/compressed_tensors/quantization/lifecycle/forward.py index 48b93e02..335b658a 100644 --- a/src/compressed_tensors/quantization/lifecycle/forward.py +++ b/src/compressed_tensors/quantization/lifecycle/forward.py @@ -13,6 +13,7 @@ # limitations under the License. 
from functools import wraps +from typing import Optional import torch from compressed_tensors.quantization.quant_args import QuantizationArgs @@ -29,10 +30,14 @@ def quantize( x: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor, - q_min: torch.Tensor, - q_max: torch.Tensor, + args: QuantizationArgs, + dtype: Optional[torch.dtype] = None, ) -> torch.Tensor: - return torch.clamp( + bit_range = 2**args.num_bits + q_min = torch.tensor(bit_range / 2 - 1, device=x.device) + q_max = torch.tensor(-bit_range / 2, device=x.device) + + quantized_value = torch.clamp( torch.round( x / scale + zero_point, ), @@ -40,6 +45,11 @@ def quantize( q_max, ) + if dtype is not None: + quantized_value = quantized_value.to(dtype) + + return quantized_value + @torch.no_grad() def dequantize( @@ -57,12 +67,8 @@ def fake_quantize( zero_point: torch.Tensor, args: QuantizationArgs, ) -> torch.Tensor: - bit_range = 2**args.num_bits - max_q = torch.tensor(bit_range / 2 - 1, device=x.device) - min_q = torch.tensor(-bit_range / 2, device=x.device) - Q = torch.zeros_like(x) - Q = quantize(x, scale, zero_point, min_q, max_q) - return dequantize(Q, scale, zero_point) + x_quant = quantize(x, scale, zero_point, args) + return dequantize(x_quant, scale, zero_point) def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme): diff --git a/src/compressed_tensors/quantization/quant_config.py b/src/compressed_tensors/quantization/quant_config.py index a62a79bd..ad55b70e 100644 --- a/src/compressed_tensors/quantization/quant_config.py +++ b/src/compressed_tensors/quantization/quant_config.py @@ -60,10 +60,33 @@ def lifecycle_order(cls) -> List["QuantizationStatus"]: return def __ge__(self, other): + if other is None: + return True if not isinstance(other, self.__class__): raise NotImplementedError return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other) + def __gt__(self, other): + if other is None: + return True + if not isinstance(other, self.__class__): + raise NotImplementedError + return LIFECYCLE_ORDER.index(self) > LIFECYCLE_ORDER.index(other) + + def __lt__(self, other): + if other is None: + return False + if not isinstance(other, self.__class__): + raise NotImplementedError + return LIFECYCLE_ORDER.index(self) < LIFECYCLE_ORDER.index(other) + + def __le__(self, other): + if other is None: + return False + if not isinstance(other, self.__class__): + raise NotImplementedError + return LIFECYCLE_ORDER.index(self) <= LIFECYCLE_ORDER.index(other) + LIFECYCLE_ORDER = [ QuantizationStatus.INITIALIZED, diff --git a/tests/quantization/lifecycle/test_apply.py b/tests/quantization/lifecycle/test_apply.py index 6a3d17af..c658f040 100644 --- a/tests/quantization/lifecycle/test_apply.py +++ b/tests/quantization/lifecycle/test_apply.py @@ -12,8 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Optional -from compressed_tensors.quantization.lifecycle import apply_quantization_config +import torch +from compressed_tensors.quantization.lifecycle import ( + apply_quantization_config, + apply_quantization_status, +) from compressed_tensors.quantization.quant_config import ( QuantizationConfig, QuantizationStatus, @@ -22,7 +27,7 @@ def test_apply_quantization_config_tinyllama(): - quant_config = get_sample_tinyllama_quant_config() + quant_config = get_sample_tinyllama_quant_config(status="calibration") model = get_tinyllama_model() # check that model is not already quantized @@ -55,6 +60,23 @@ def test_apply_quantization_config_tinyllama(): assert num_embeddings == 1 assert num_rotary_embeddings == 22 + # test quantization compression + # sample forward pass to fill scales, zps + model(torch.zeros((1, 1), dtype=int), torch.zeros((1, 1), dtype=int)) + apply_quantization_status(model, QuantizationStatus.COMPRESSED) + for name, module in model.named_modules(): + if name in quant_config.ignore: + continue + module_type = module.__class__.__name__ + if module_type == "Linear": + _test_layer_quantization_status( + module, + inputs=True, + weights=True, + expected_status=QuantizationStatus.COMPRESSED, + expected_dtype=torch.int8, + ) + def test_serialize_config_tinyllama(): quant_config = get_sample_tinyllama_quant_config() @@ -81,11 +103,19 @@ def test_serialize_config_tinyllama(): assert serialized_config.global_compression_ratio < 8.0 -def _test_layer_quantization_status(module, inputs: bool, weights: bool): +def _test_layer_quantization_status( + module, + inputs: bool, + weights: bool, + expected_status: Optional[QuantizationStatus] = None, + expected_dtype: Optional[torch.dtype] = None, +): # check if quantization is applied at all (true if inputs or weights targeted) quantized = inputs or weights assert hasattr(module, "quantization_scheme") == quantized assert hasattr(module, "quantization_status") == quantized + if expected_status is not None: + assert module.quantization_status is expected_status # check inputs matches expected assert hasattr(module, "input_scale") == inputs @@ -94,6 +124,8 @@ def _test_layer_quantization_status(module, inputs: bool, weights: bool): # check weights matches expected assert hasattr(module, "weight_scale") == weights assert hasattr(module, "weight_zero_point") == weights + if weights and expected_dtype is not None: + assert module.weight.dtype is expected_dtype def get_tinyllama_model(): @@ -102,11 +134,11 @@ def get_tinyllama_model(): ) -def get_sample_tinyllama_quant_config(): +def get_sample_tinyllama_quant_config(status: str = "frozen"): config_dict = { "quant_method": "sparseml", "format": "fakequant", - "quantization_status": "frozen", + "quantization_status": status, "global_compression_ratio": None, "config_groups": { "group_1": { From 88c39dc17d1a444b2d0b9423770daae2dce67664 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Tue, 23 Apr 2024 15:01:09 -0400 Subject: [PATCH 02/10] Apply suggestions from code review --- src/compressed_tensors/quantization/lifecycle/compressed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compressed_tensors/quantization/lifecycle/compressed.py b/src/compressed_tensors/quantization/lifecycle/compressed.py index 4d7ef830..7a8aa7fd 100644 --- a/src/compressed_tensors/quantization/lifecycle/compressed.py +++ b/src/compressed_tensors/quantization/lifecycle/compressed.py @@ -33,7 +33,7 @@ def compress_quantized_weights(module: Module): """ Quantizes the module weight 
representation to use fewer bits in memory
 
-    apply to full model with `model.apply(compress_quantized)`
+    apply to full model with `model.apply(compress_quantized_weights)`
 
     :param module: module to compress to quantized representation
     """

From e2cbd1bee0dda2d3e2e1a5ef7a08817a6040a4ac Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Thu, 25 Apr 2024 20:13:07 +0000
Subject: [PATCH 03/10] small fixes for runtime

---
 src/compressed_tensors/compressors/sparse_bitmask.py |  3 ++-
 .../quantization/lifecycle/forward.py                |  4 ++--
 src/compressed_tensors/quantization/quant_config.py  | 10 ++++++++++
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/compressed_tensors/compressors/sparse_bitmask.py b/src/compressed_tensors/compressors/sparse_bitmask.py
index dec359c3..b53ca7d5 100644
--- a/src/compressed_tensors/compressors/sparse_bitmask.py
+++ b/src/compressed_tensors/compressors/sparse_bitmask.py
@@ -76,7 +76,8 @@ def decompress(
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a bitmask compressed state dict located at path_to_model_or_tensors
-        and returns a generator for sequentially decompressing back to a dense state dict
+        and returns a generator for sequentially decompressing back to a dense state
+        dict
 
         :param model_path: path to compressed safetensors model (directory with
             one or more safetensors files) or compressed tensors file
diff --git a/src/compressed_tensors/quantization/lifecycle/forward.py b/src/compressed_tensors/quantization/lifecycle/forward.py
index 8fab5eab..7d49613a 100644
--- a/src/compressed_tensors/quantization/lifecycle/forward.py
+++ b/src/compressed_tensors/quantization/lifecycle/forward.py
@@ -34,8 +34,8 @@ def quantize(
     dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
     bit_range = 2**args.num_bits
-    q_min = torch.tensor(bit_range / 2 - 1, device=x.device)
-    q_max = torch.tensor(-bit_range / 2, device=x.device)
+    q_max = torch.tensor(bit_range / 2 - 1, device=x.device)
+    q_min = torch.tensor(-bit_range / 2, device=x.device)
 
     quantized_value = torch.clamp(
         torch.round(
diff --git a/src/compressed_tensors/quantization/quant_config.py b/src/compressed_tensors/quantization/quant_config.py
index 3f81f546..5c3e1cc0 100644
--- a/src/compressed_tensors/quantization/quant_config.py
+++ b/src/compressed_tensors/quantization/quant_config.py
@@ -185,10 +185,20 @@ def from_pretrained(model: Module) -> "QuantizationConfig":
             group_name = "group_" + str(idx)
             config_groups[group_name] = scheme
 
+        # TODO: this is incorrect in compressed mode, since we are overwriting the
+        # original weight we lose the uncompressed bit_depth info
         compression_ratio = calculate_compression_ratio(model)
+
+        # TODO: replace this with compressor classes like we do for sparsity
+        if quantization_status == QuantizationStatus.COMPRESSED:
+            format = "compressed"
+        else:
+            format = "fakequant"
+
         return QuantizationConfig(
             config_groups=config_groups,
             quantization_status=quantization_status,
             global_compression_ratio=compression_ratio,
+            format=format,
             ignore=consolidated_ignore,
         )

From df94b5eb6a57819720b0f44de1cb0fe0d9ead484 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Fri, 3 May 2024 14:07:58 -0400
Subject: [PATCH 04/10] Quantization Compressor Support (#45)

* add classes

* WIP

* moving around classes

* code complete

* tests passing

* unit test bugs

* fill out int decompression

* docstrings

* allow repeat frozens

* int compressor unit tests

* PR comments
---
 src/compressed_tensors/base.py                |   3 +-
 .../compressors/__init__.py                   |   4 +-
 src/compressed_tensors/compressors/base.py    |  64
+---- src/compressed_tensors/compressors/dense.py | 8 +- src/compressed_tensors/compressors/helpers.py | 24 +- .../compressors/int_quantized.py | 95 +++++++ .../compressors/model_compressor.py | 262 ++++++++++++++++++ .../compressors/sparse_bitmask.py | 6 +- src/compressed_tensors/config/base.py | 9 +- src/compressed_tensors/config/dense.py | 8 +- .../config/sparse_bitmask.py | 6 +- .../quantization/lifecycle/compressed.py | 8 +- .../quantization/lifecycle/frozen.py | 4 + .../quantization/quant_config.py | 18 +- .../quantization/lifecycle/test_apply.py | 2 +- tests/test_int_quant.py | 109 ++++++++ tests/test_registry.py | 16 +- tests/test_utils/test_helpers.py | 8 +- 18 files changed, 547 insertions(+), 107 deletions(-) create mode 100644 src/compressed_tensors/compressors/int_quantized.py create mode 100644 src/compressed_tensors/compressors/model_compressor.py create mode 100644 tests/test_int_quant.py diff --git a/src/compressed_tensors/base.py b/src/compressed_tensors/base.py index d096bc86..65803e5c 100644 --- a/src/compressed_tensors/base.py +++ b/src/compressed_tensors/base.py @@ -13,4 +13,5 @@ # limitations under the License. SPARSITY_CONFIG_NAME = "sparsity_config" -QUANTIZATION_CONFIG_NAME = "sparseml_quantization_config" +QUANTIZATION_CONFIG_NAME = "quantization_config" +COMPRESSION_CONFIG_NAME = "compression_config" diff --git a/src/compressed_tensors/compressors/__init__.py b/src/compressed_tensors/compressors/__init__.py index 42724967..17acadb9 100644 --- a/src/compressed_tensors/compressors/__init__.py +++ b/src/compressed_tensors/compressors/__init__.py @@ -14,7 +14,9 @@ # flake8: noqa -from .base import ModelCompressor +from .base import Compressor from .dense import DenseCompressor from .helpers import load_compressed, save_compressed, save_compressed_model +from .int_quantized import IntQuantizationCompressor +from .model_compressor import ModelCompressor from .sparse_bitmask import BitmaskCompressor, BitmaskTensor diff --git a/src/compressed_tensors/compressors/base.py b/src/compressed_tensors/compressors/base.py index e30492d0..a0ceef74 100644 --- a/src/compressed_tensors/compressors/base.py +++ b/src/compressed_tensors/compressors/base.py @@ -12,56 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. -import operator -from typing import Dict, Generator, Optional, Tuple +from typing import Dict, Generator, Tuple, Union -from compressed_tensors.base import SPARSITY_CONFIG_NAME -from compressed_tensors.config import CompressionConfig +from compressed_tensors.config import SparsityCompressionConfig +from compressed_tensors.quantization import QuantizationConfig from compressed_tensors.registry import RegistryMixin -from compressed_tensors.utils import get_safetensors_folder from torch import Tensor -from torch.nn import Module, Parameter -from tqdm import tqdm -from transformers import AutoConfig -__all__ = ["ModelCompressor"] +__all__ = ["Compressor"] -class ModelCompressor(RegistryMixin): +class Compressor(RegistryMixin): """ - Base class representing a model compression algorithm. 
+ Base class representing a model compression algorithm :param config: config specifying compression parameters """ - @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: str - ) -> Optional["ModelCompressor"]: - """ - Given a path to a model config, extract a sparsity config if it exists and - return the associated ModelCompressor - - :param pretrained_model_name_or_path: path to model config on disk or HF hub - :return: matching compressor if config contains a sparsity config - """ - config = AutoConfig.from_pretrained(pretrained_model_name_or_path) - sparsity_config = getattr(config, SPARSITY_CONFIG_NAME, None) - if sparsity_config is None: - return None - - format = sparsity_config.get("format") - sparsity_config = CompressionConfig.load_from_registry( - format, **sparsity_config - ) - compressor = cls.load_from_registry(format, config=sparsity_config) - return compressor - - def __init__(self, config: Optional[CompressionConfig] = None): + def __init__( + self, config: Union[SparsityCompressionConfig, QuantizationConfig, None] = None + ): self.config = config - def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: + def compress(self, model_state: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]: """ Compresses a dense state dict @@ -83,21 +57,3 @@ def decompress( :return: compressed state dict """ raise NotImplementedError() - - def overwrite_weights(self, model_path: str, model: Module): - """ - Overwrites the weights in model with weights decompressed from model_path - - :param model_path: path to compressed weights - :param model: pytorch model to load decompressed weights into - """ - model_path = get_safetensors_folder(model_path) - dense_gen = self.decompress(model_path) - for name, data in tqdm(dense_gen, desc="Decompressing model"): - # loading the decompressed weights into the model - model_device = operator.attrgetter(name)(model).device - data_new = Parameter(data.to(model_device)) - data_old = operator.attrgetter(name)(model) - data_old.data = data_new.data - - setattr(model, SPARSITY_CONFIG_NAME, self.config) diff --git a/src/compressed_tensors/compressors/dense.py b/src/compressed_tensors/compressors/dense.py index d6319980..8f09c8bf 100644 --- a/src/compressed_tensors/compressors/dense.py +++ b/src/compressed_tensors/compressors/dense.py @@ -14,18 +14,18 @@ from typing import Dict, Generator, Tuple -from compressed_tensors.compressors import ModelCompressor +from compressed_tensors.compressors import Compressor from compressed_tensors.config import CompressionFormat from torch import Tensor -@ModelCompressor.register(name=CompressionFormat.dense_sparsity.value) -class DenseCompressor(ModelCompressor): +@Compressor.register(name=CompressionFormat.dense.value) +class DenseCompressor(Compressor): """ Identity compressor for dense models, returns the original state_dict """ - def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: + def compress(self, model_state: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]: return model_state def decompress( diff --git a/src/compressed_tensors/compressors/helpers.py b/src/compressed_tensors/compressors/helpers.py index 64ccfb3d..fe4b361c 100644 --- a/src/compressed_tensors/compressors/helpers.py +++ b/src/compressed_tensors/compressors/helpers.py @@ -16,8 +16,8 @@ from typing import Dict, Generator, Optional, Tuple, Union import torch -from compressed_tensors.compressors import ModelCompressor -from compressed_tensors.config import CompressionConfig, CompressionFormat +from 
compressed_tensors.compressors import Compressor +from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig from compressed_tensors.utils.safetensors_load import get_weight_mappings from safetensors import safe_open from safetensors.torch import save_file @@ -48,20 +48,20 @@ def save_compressed( if tensors is None or len(tensors) == 0: raise ValueError("No tensors or empty tensors provided to compress") - # if no compression_format specified, default to `dense_sparsity` - compression_format = compression_format or CompressionFormat.dense_sparsity.value + # if no compression_format specified, default to `dense` + compression_format = compression_format or CompressionFormat.dense.value if not ( - compression_format in ModelCompressor.registered_names() - or compression_format in ModelCompressor.registered_aliases() + compression_format in Compressor.registered_names() + or compression_format in Compressor.registered_aliases() ): raise ValueError( f"Unknown compression format: {compression_format}. " - f"Must be one of {set(ModelCompressor.registered_names() + ModelCompressor.registered_aliases())}" # noqa E501 + f"Must be one of {set(Compressor.registered_names() + Compressor.registered_aliases())}" # noqa E501 ) # compress - compressor = ModelCompressor.load_from_registry(compression_format) + compressor = Compressor.load_from_registry(compression_format) # save compressed tensors compressed_tensors = compressor.compress(tensors) save_file(compressed_tensors, save_path) @@ -69,7 +69,7 @@ def save_compressed( def load_compressed( compressed_tensors: Union[str, Path], - compression_config: CompressionConfig = None, + compression_config: SparsityCompressionConfig = None, device: Optional[str] = "cpu", ) -> Generator[Tuple[str, Tensor], None, None]: """ @@ -90,9 +90,9 @@ def load_compressed( if ( compression_config is None - or compression_config.format == CompressionFormat.dense_sparsity.value + or compression_config.format == CompressionFormat.dense.value ): - # if no compression_config specified, or `dense_sparsity` format specified, + # if no compression_config specified, or `dense` format specified, # assume tensors are not compressed on disk weight_mappings = get_weight_mappings(compressed_tensors) for weight_name, file_with_weight_name in weight_mappings.items(): @@ -102,7 +102,7 @@ def load_compressed( else: # decompress tensors compression_format = compression_config.format - compressor = ModelCompressor.load_from_registry( + compressor = Compressor.load_from_registry( compression_format, config=compression_config ) yield from compressor.decompress(compressed_tensors, device=device) diff --git a/src/compressed_tensors/compressors/int_quantized.py b/src/compressed_tensors/compressors/int_quantized.py new file mode 100644 index 00000000..6fbc0c66 --- /dev/null +++ b/src/compressed_tensors/compressors/int_quantized.py @@ -0,0 +1,95 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from typing import Dict, Generator, Tuple + +import torch +from compressed_tensors.compressors import Compressor +from compressed_tensors.config import CompressionFormat +from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize +from compressed_tensors.utils import get_nested_weight_mappings, merge_names +from safetensors import safe_open +from torch import Tensor +from tqdm import tqdm + + +__all__ = ["IntQuantizationCompressor"] + +_LOGGER: logging.Logger = logging.getLogger(__name__) + + +@Compressor.register(name=CompressionFormat.int_quantized.value) +class IntQuantizationCompressor(Compressor): + """ + Integer compression for quantized models. Weight of each quantized layer is + converted from its original float type to the format specified by the layer's + quantization scheme. + """ + + COMPRESSION_PARAM_NAMES = ["weight", "weight_scale", "weight_zero_point"] + + def compress(self, model_state: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]: + model_quant_args = kwargs["model_quant_args"] + compressed_dict = {} + _LOGGER.debug( + f"Compressing model with {len(model_state)} parameterized layers..." + ) + + for name, value in tqdm(model_state.items(), desc="Compressing model"): + if name.endswith(".weight"): + prefix = name.removesuffix(".weight") + scale = model_state.get(merge_names(prefix, "weight_scale"), None) + zp = model_state.get(merge_names(prefix, "weight_zero_point"), None) + if scale is not None and zp is not None: + # weight is quantized, compress it + quant_args = model_quant_args[prefix] + try: + bit_depth = torch.finfo(value.dtype).bits + except TypeError: + bit_depth = torch.iinfo(value.dtype).bits + if bit_depth > quant_args.num_bits: + # only quantize if not already quantized + value = quantize( + x=value, + scale=scale, + zero_point=zp, + args=quant_args, + dtype=torch.int8, + ) + + compressed_dict[name] = value.to("cpu") + + return compressed_dict + + def decompress( + self, path_to_model_or_tensors: str, device: str = "cpu" + ) -> Generator[Tuple[str, Tensor], None, None]: + weight_mappings = get_nested_weight_mappings( + path_to_model_or_tensors, self.COMPRESSION_PARAM_NAMES + ) + for weight_name in weight_mappings.keys(): + weight_data = {} + for param_name, safe_path in weight_mappings[weight_name].items(): + full_name = merge_names(weight_name, param_name) + with safe_open(safe_path, framework="pt", device=device) as f: + weight_data[param_name] = f.get_tensor(full_name) + + if len(weight_data) == len(self.COMPRESSION_PARAM_NAMES): + decompressed = dequantize( + x_q=weight_data["weight"], + scale=weight_data["weight_scale"], + zero_point=weight_data["weight_zero_point"], + ) + yield merge_names(weight_name, "weight"), decompressed diff --git a/src/compressed_tensors/compressors/model_compressor.py b/src/compressed_tensors/compressors/model_compressor.py new file mode 100644 index 00000000..be7dba54 --- /dev/null +++ b/src/compressed_tensors/compressors/model_compressor.py @@ -0,0 +1,262 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +import operator +import os +from typing import Dict, Optional, Union + +from compressed_tensors.base import ( + COMPRESSION_CONFIG_NAME, + QUANTIZATION_CONFIG_NAME, + SPARSITY_CONFIG_NAME, +) +from compressed_tensors.compressors import Compressor +from compressed_tensors.config import SparsityCompressionConfig +from compressed_tensors.quantization import ( + QuantizationConfig, + QuantizationStatus, + apply_quantization_config, + load_pretrained_quantization, +) +from compressed_tensors.quantization.utils import ( + is_module_quantized, + iter_named_leaf_modules, +) +from compressed_tensors.utils import get_safetensors_folder +from torch import Tensor +from torch.nn import Module, Parameter +from tqdm import tqdm +from transformers import AutoConfig +from transformers.file_utils import CONFIG_NAME + + +__all__ = ["ModelCompressor"] + +_LOGGER: logging.Logger = logging.getLogger(__name__) + + +class ModelCompressor: + """ + Handles compression and decompression of a model with a sparsity config and/or + quantization config. + + Compression LifeCycle + - compressor = ModelCompressor.from_pretrained_model(model) + - compressed_state_dict = compressor.compress(model, state_dict) + - compressor.quantization_compressor.compress(model, state_dict) + - compressor.sparsity_compressor.compress(model, state_dict) + - model.save_pretrained(output_dir, state_dict=compressed_state_dict) + - compressor.update_config(output_dir) + + Decompression LifeCycle + - compressor = ModelCompressor.from_pretrained(comp_model_path) + - model = AutoModel.from_pretrained(comp_model_path) + - compressor.decompress(comp_model_path, model) + - compressor.sparsity_compressor.decompress(comp_model_path, model) + - compressor.quantization_compressor.decompress(comp_model_path, model) + + :param sparsity_config: config specifying sparsity compression parameters + :param quantization_config: config specifying quantization compression parameters + """ + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + ) -> Optional["ModelCompressor"]: + """ + Given a path to a model config, extract the sparsity and/or quantization + configs and load a ModelCompressor + + :param pretrained_model_name_or_path: path to model config on disk or HF hub + :return: compressor for the extracted configs + """ + config = AutoConfig.from_pretrained(pretrained_model_name_or_path) + compression_config = getattr(config, COMPRESSION_CONFIG_NAME, None) + if compression_config is None: + return None + + sparsity_config = compression_config.get(SPARSITY_CONFIG_NAME, None) + quantization_config = compression_config.get(QUANTIZATION_CONFIG_NAME, None) + + if sparsity_config is None and quantization_config is None: + return None + + if sparsity_config is not None: + format = sparsity_config.get("format") + sparsity_config = SparsityCompressionConfig.load_from_registry( + format, **sparsity_config + ) + if quantization_config is not None: + quantization_config = QuantizationConfig.parse_obj(quantization_config) + + return cls( + sparsity_config=sparsity_config, quantization_config=quantization_config + ) + + @classmethod + def from_pretrained_model( + cls, + model: Module, + sparsity_config: Union[SparsityCompressionConfig, str, None] = None, + quantization_format: Optional[str] = None, + ) -> Optional["ModelCompressor"]: + """ + Given a pytorch model and optional sparsity and/or quantization 
configs,
+        load the appropriate compressors
+
+        :param model: pytorch model to target for compression
+        :param sparsity_config: a filled in sparsity config or string corresponding
+            to a sparsity compression algorithm
+        :param quantization_format: string corresponding to a quantization compression
+            algorithm
+        :return: compressor for the extracted configs
+        """
+        quantization_config = QuantizationConfig.from_pretrained(
+            model, format=quantization_format
+        )
+
+        if isinstance(sparsity_config, str):  # we passed in a sparsity format
+            sparsity_config = SparsityCompressionConfig.load_from_registry(
+                sparsity_config
+            )
+
+        if sparsity_config is None and quantization_config is None:
+            return None
+
+        return cls(
+            sparsity_config=sparsity_config, quantization_config=quantization_config
+        )
+
+    def __init__(
+        self,
+        sparsity_config: Optional[SparsityCompressionConfig] = None,
+        quantization_config: Optional[QuantizationConfig] = None,
+    ):
+        self.sparsity_config = sparsity_config
+        self.quantization_config = quantization_config
+        self.sparsity_compressor = None
+        self.quantization_compressor = None
+
+        if sparsity_config is not None:
+            self.sparsity_compressor = Compressor.load_from_registry(
+                sparsity_config.format, config=sparsity_config
+            )
+        if quantization_config is not None:
+            self.quantization_compressor = Compressor.load_from_registry(
+                quantization_config.format, config=quantization_config
+            )
+
+    def compress(
+        self, model: Module, state_dict: Optional[Dict[str, Tensor]] = None
+    ) -> Dict[str, Tensor]:
+        """
+        Compresses a dense state dict or model with sparsity and/or quantization
+
+        :param model: uncompressed model to compress
+        :param state_dict: optional uncompressed state_dict to insert into model
+        :return: compressed state dict
+        """
+        if state_dict is None:
+            state_dict = model.state_dict()
+
+        compressed_state_dict = state_dict
+        quantized_modules_to_args = _get_weight_arg_mappings(model)
+        if self.quantization_compressor is not None:
+            compressed_state_dict = self.quantization_compressor.compress(
+                state_dict, model_quant_args=quantized_modules_to_args
+            )
+
+        if self.sparsity_compressor is not None:
+            compressed_state_dict = self.sparsity_compressor.compress(state_dict)
+
+        return compressed_state_dict
+
+    def decompress(self, model_path: str, model: Module):
+        """
+        Overwrites the weights in model with weights decompressed from model_path
+
+        :param model_path: path to compressed weights
+        :param model: pytorch model to load decompressed weights into
+        """
+        model_path = get_safetensors_folder(model_path)
+        if self.sparsity_compressor is not None:
+            dense_gen = self.sparsity_compressor.decompress(model_path)
+            self._replace_weights(dense_gen, model)
+            setattr(model, SPARSITY_CONFIG_NAME, self.sparsity_compressor.config)
+
+        if self.quantization_compressor is not None:
+            apply_quantization_config(model, self.quantization_config)
+            load_pretrained_quantization(model, model_path)
+            dense_gen = self.quantization_compressor.decompress(model_path)
+            self._replace_weights(dense_gen, model)
+
+            def update_status(module):
+                module.quantization_status = QuantizationStatus.FROZEN
+
+            model.apply(update_status)
+            setattr(model, QUANTIZATION_CONFIG_NAME, self.quantization_config)
+
+    def update_config(self, save_directory: str):
+        """
+        Update the model config located at save_directory with compression configs
+        for sparsity and/or quantization
+
+        :param save_directory: path to a folder containing a HF model config
+        """
+        config_file_path = os.path.join(save_directory,
CONFIG_NAME) + if not os.path.exists(config_file_path): + _LOGGER.warning( + f"Could not find a valid model config file in " + f"{save_directory}. Compression config will not be saved." + ) + return + + with open(config_file_path, "r") as config_file: + config_data = json.load(config_file) + + config_data[COMPRESSION_CONFIG_NAME] = {} + if self.quantization_config is not None: + quant_config_data = self.quantization_config.model_dump() + config_data[COMPRESSION_CONFIG_NAME][ + QUANTIZATION_CONFIG_NAME + ] = quant_config_data + if self.sparsity_config is not None: + sparsity_config_data = self.sparsity_config.model_dump() + config_data[COMPRESSION_CONFIG_NAME][ + SPARSITY_CONFIG_NAME + ] = sparsity_config_data + + with open(config_file_path, "w") as config_file: + json.dump(config_data, config_file, indent=2, sort_keys=True) + + def _replace_weights(self, dense_weight_generator, model): + for name, data in tqdm(dense_weight_generator, desc="Decompressing model"): + # loading the decompressed weights into the model + model_device = operator.attrgetter(name)(model).device + data_new = Parameter(data.to(model_device)) + data_old = operator.attrgetter(name)(model) + data_old.data = data_new.data + + +def _get_weight_arg_mappings(model: Module) -> Dict: + quantized_modules_to_args = {} + for name, submodule in iter_named_leaf_modules(model): + if is_module_quantized(submodule): + if submodule.quantization_scheme.weights is not None: + quantized_modules_to_args[name] = submodule.quantization_scheme.weights + + return quantized_modules_to_args diff --git a/src/compressed_tensors/compressors/sparse_bitmask.py b/src/compressed_tensors/compressors/sparse_bitmask.py index b4a4e6f2..10b398c1 100644 --- a/src/compressed_tensors/compressors/sparse_bitmask.py +++ b/src/compressed_tensors/compressors/sparse_bitmask.py @@ -17,7 +17,7 @@ import numpy import torch -from compressed_tensors.compressors import ModelCompressor +from compressed_tensors.compressors import Compressor from compressed_tensors.config import CompressionFormat from compressed_tensors.utils import get_nested_weight_mappings, merge_names from safetensors import safe_open @@ -37,8 +37,8 @@ _LOGGER: logging.Logger = logging.getLogger(__name__) -@ModelCompressor.register(name=CompressionFormat.sparse_bitmask.value) -class BitmaskCompressor(ModelCompressor): +@Compressor.register(name=CompressionFormat.sparse_bitmask.value) +class BitmaskCompressor(Compressor): """ Compression for sparse models using bitmasks. 
Non-zero weights are stored
 in a 1d values tensor, with their locations stored in a 2d bitmask
diff --git a/src/compressed_tensors/config/base.py b/src/compressed_tensors/config/base.py
index 96778995..17c8da73 100644
--- a/src/compressed_tensors/config/base.py
+++ b/src/compressed_tensors/config/base.py
@@ -19,17 +19,18 @@
 from pydantic import BaseModel
 
 
-__all__ = ["CompressionConfig", "CompressionFormat"]
+__all__ = ["SparsityCompressionConfig", "CompressionFormat"]
 
 
 class CompressionFormat(Enum):
-    dense_sparsity = "dense-sparsity"
+    dense = "dense"
     sparse_bitmask = "sparse-bitmask"
+    int_quantized = "int-quantized"
 
 
-class CompressionConfig(RegistryMixin, BaseModel):
+class SparsityCompressionConfig(RegistryMixin, BaseModel):
     """
-    Base data class for storing compression parameters
+    Base data class for storing sparsity compression parameters
 
     :param format: name of compression format
     :param global_sparsity: average sparsity of the entire model
diff --git a/src/compressed_tensors/config/dense.py b/src/compressed_tensors/config/dense.py
index 0a18309e..8e7e3b7a 100644
--- a/src/compressed_tensors/config/dense.py
+++ b/src/compressed_tensors/config/dense.py
@@ -14,14 +14,14 @@
 
 from typing import Optional
 
-from compressed_tensors.config import CompressionConfig, CompressionFormat
+from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 
 
 __all__ = ["DenseSparsityConfig"]
 
 
-@CompressionConfig.register(name=CompressionFormat.dense_sparsity.value)
-class DenseSparsityConfig(CompressionConfig):
+@SparsityCompressionConfig.register(name=CompressionFormat.dense.value)
+class DenseSparsityConfig(SparsityCompressionConfig):
     """
     Identity configuration for storing a sparse model in an uncompressed dense format
 
@@ -31,6 +31,6 @@ class DenseSparsityConfig(CompressionConfig):
         "unstructured", "2:4", "8:16" etc
     """
 
-    format: str = CompressionFormat.dense_sparsity.value
+    format: str = CompressionFormat.dense.value
     global_sparsity: Optional[float] = 0.0
     sparsity_structure: Optional[str] = "unstructured"
diff --git a/src/compressed_tensors/config/sparse_bitmask.py b/src/compressed_tensors/config/sparse_bitmask.py
index 9d2015f3..c14d9f7c 100644
--- a/src/compressed_tensors/config/sparse_bitmask.py
+++ b/src/compressed_tensors/config/sparse_bitmask.py
@@ -14,14 +14,14 @@
 
 from typing import Optional
 
-from compressed_tensors.config import CompressionConfig, CompressionFormat
+from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 
 
 __all__ = ["BitmaskConfig"]
 
 
-@CompressionConfig.register(name=CompressionFormat.sparse_bitmask.value)
-class BitmaskConfig(CompressionConfig):
+@SparsityCompressionConfig.register(name=CompressionFormat.sparse_bitmask.value)
+class BitmaskConfig(SparsityCompressionConfig):
     """
     Configuration for storing a sparse model using bitmask compression
 
diff --git a/src/compressed_tensors/quantization/lifecycle/compressed.py b/src/compressed_tensors/quantization/lifecycle/compressed.py
index 7a8aa7fd..84962df4 100644
--- a/src/compressed_tensors/quantization/lifecycle/compressed.py
+++ b/src/compressed_tensors/quantization/lifecycle/compressed.py
@@ -42,6 +42,10 @@ def compress_quantized_weights(module: Module):
         # no quantization scheme or weights not quantized, nothing to do
         return
 
+    if module.quantization_status == QuantizationStatus.COMPRESSED:
+        # module is already compressed, nothing to do
+        return
+
     weight = getattr(module, "weight", None)
     scale = getattr(module, "weight_scale", None)
     zero_point = getattr(module, "weight_zero_point", None)
@@ -49,8 +53,8 @@ def compress_quantized_weights(module: Module):
     if weight is None or scale is None or zero_point is None:
         # no weight, scale, or ZP, nothing to do
 
-        # TODO: Should we mark as compressed anyway here to maintain consistent
-        # status throughout the model?
+        # mark as compressed here to maintain consistent status throughout the model
+        module.quantization_status = QuantizationStatus.COMPRESSED
         return
 
     module.weight.requires_grad = False  # cannot use auto grad after compression
diff --git a/src/compressed_tensors/quantization/lifecycle/frozen.py b/src/compressed_tensors/quantization/lifecycle/frozen.py
index 34d132ec..652f1c3a 100644
--- a/src/compressed_tensors/quantization/lifecycle/frozen.py
+++ b/src/compressed_tensors/quantization/lifecycle/frozen.py
@@ -35,6 +35,10 @@ def freeze_module_quantization(module: Module):
         # no quantization scheme nothing to do
         return
 
+    if module.quantization_status == QuantizationStatus.FROZEN:
+        # nothing to do, already frozen
+        return
+
     # delete observers from module if not dynamic
     if scheme.input_activations and not scheme.input_activations.dynamic:
         delattr(module, "input_observer")
diff --git a/src/compressed_tensors/quantization/quant_config.py b/src/compressed_tensors/quantization/quant_config.py
index 5c3e1cc0..43127b79 100644
--- a/src/compressed_tensors/quantization/quant_config.py
+++ b/src/compressed_tensors/quantization/quant_config.py
@@ -16,6 +16,7 @@
 from typing import Dict, List, Optional
 
 from compressed_tensors.base import QUANTIZATION_CONFIG_NAME
+from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from compressed_tensors.quantization.utils import (
     calculate_compression_ratio,
@@ -139,7 +140,9 @@ def from_model_config(model_name_or_path) -> "QuantizationConfig":
         return QuantizationConfig.parse_obj(quantization_config)
 
     @staticmethod
-    def from_pretrained(model: Module) -> "QuantizationConfig":
+    def from_pretrained(
+        model: Module, format: Optional[str] = None
+    ) -> Optional["QuantizationConfig"]:
         """
         Converts a model into its associated QuantizationConfig based on the
         QuantizationScheme attached to each quantized module
@@ -170,6 +173,9 @@ def from_pretrained(model: Module) -> "QuantizationConfig":
             if not match_found:
                 quant_scheme_to_layers.append(scheme)
 
+        if len(quant_scheme_to_layers) == 0:  # No quantized layers
+            return None
+
         # clean up ignore list, we can leave out layer types if none of the
         # instances are quantized
         consolidated_ignore = []
@@ -189,11 +195,11 @@ def from_pretrained(model: Module) -> "QuantizationConfig":
         # original weight we lose the uncompressed bit_depth info
         compression_ratio = calculate_compression_ratio(model)
 
-        # TODO: replace this with compressor classes like we do for sparsity
-        if quantization_status == QuantizationStatus.COMPRESSED:
-            format = "compressed"
-        else:
-            format = "fakequant"
+        if format is None:
+            if quantization_status == QuantizationStatus.COMPRESSED:
+                format = CompressionFormat.int_quantized.value
+            else:
+                format = CompressionFormat.dense.value
 
         return QuantizationConfig(
             config_groups=config_groups,
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py
index c658f040..82e21afb 100644
--- a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py
+++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py
@@ -96,7 +96,7 @@ def
test_serialize_config_tinyllama(): assert serialized_config.config_groups["group_1"].targets == ["Linear"] assert serialized_config.config_groups["group_1"].input_activations is not None assert serialized_config.quantization_status == QuantizationStatus.FROZEN - assert serialized_config.format == "fakequant" + assert serialized_config.format == "dense" assert serialized_config.quant_method == "sparseml" assert serialized_config.ignore == ["model.layers.1.mlp.down_proj"] assert serialized_config.global_compression_ratio > 1.0 diff --git a/tests/test_int_quant.py b/tests/test_int_quant.py new file mode 100644 index 00000000..b5b2cdb3 --- /dev/null +++ b/tests/test_int_quant.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil + +import torch +from compressed_tensors import IntQuantizationCompressor +from compressed_tensors.quantization import ( + QuantizationArgs, + QuantizationConfig, + QuantizationScheme, +) +from compressed_tensors.quantization.lifecycle.forward import fake_quantize +from safetensors.torch import save_file + + +def get_dummy_quant_config(): + config_groups = { + "group_1": QuantizationScheme(targets=["Linear"], weights=QuantizationArgs()), + } + ignore = ["lm_head"] + quant_config = QuantizationConfig( + config_groups=config_groups, + ignore=ignore, + ) + + return quant_config + + +def test_quant_format(): + dense_state_dict = { + "dummy.weight": torch.rand((512, 1024)), + "dummy.weight_scale": torch.tensor(0.01, dtype=torch.float32), + "dummy.weight_zero_point": torch.tensor(0, dtype=torch.int32), + } + quant_config = get_dummy_quant_config() + + compressor = IntQuantizationCompressor(config=quant_config) + quantized_modules_to_args = {"dummy": quant_config.config_groups["group_1"].weights} + compressed_state_dict = compressor.compress( + dense_state_dict, model_quant_args=quantized_modules_to_args + ) + + # state_dict params should be the same + assert len(dense_state_dict) == len(compressed_state_dict) + + # check compressed to int8 + assert compressed_state_dict["dummy.weight"].dtype == torch.int8 + assert compressed_state_dict["dummy.weight_scale"].dtype == torch.float32 + assert compressed_state_dict["dummy.weight_zero_point"].dtype == torch.int32 + + +def test_reload_match(tmp_path): + dense_state_dict = { + "dummy.weight": torch.rand((511, 350)), + "dummy.weight_scale": torch.tensor(0.01, dtype=torch.float32), + "dummy.weight_zero_point": torch.tensor(0, dtype=torch.int32), + "dummy2.weight": torch.rand((128, 280)), + "dummy2.weight_scale": torch.tensor(0.02, dtype=torch.float32), + "dummy2.weight_zero_point": torch.tensor(15, dtype=torch.int32), + } + quant_config = get_dummy_quant_config() + + compressor = IntQuantizationCompressor(config=quant_config) + quantized_modules_to_args = { + "dummy": quant_config.config_groups["group_1"].weights, + "dummy2": quant_config.config_groups["group_1"].weights, + } + compressed_state_dict = compressor.compress( + 
dense_state_dict, model_quant_args=quantized_modules_to_args + ) + save_file(compressed_state_dict, tmp_path / "model.safetensors") + reconstructed_dense_gen = compressor.decompress(tmp_path) + reconstructed_dense = {} + for name, value in reconstructed_dense_gen: + reconstructed_dense[name] = value + + fake_quant_dummy = fake_quantize( + dense_state_dict["dummy.weight"], + scale=dense_state_dict["dummy.weight_scale"], + zero_point=dense_state_dict["dummy.weight_zero_point"], + args=quantized_modules_to_args["dummy"], + ) + assert torch.equal( + fake_quant_dummy, reconstructed_dense["dummy.weight"].to(torch.float32) + ) + + fake_quant_dummy2 = fake_quantize( + dense_state_dict["dummy2.weight"], + scale=dense_state_dict["dummy2.weight_scale"], + zero_point=dense_state_dict["dummy2.weight_zero_point"], + args=quantized_modules_to_args["dummy2"], + ) + assert torch.equal( + fake_quant_dummy2, reconstructed_dense["dummy2.weight"].to(torch.float32) + ) + + shutil.rmtree(tmp_path) diff --git a/tests/test_registry.py b/tests/test_registry.py index ffe66b85..4726fcf2 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -16,11 +16,11 @@ from compressed_tensors import ( BitmaskCompressor, BitmaskConfig, - CompressionConfig, CompressionFormat, + Compressor, DenseCompressor, DenseSparsityConfig, - ModelCompressor, + SparsityCompressionConfig, ) @@ -28,11 +28,11 @@ "name,type", [ [CompressionFormat.sparse_bitmask.value, BitmaskConfig], - [CompressionFormat.dense_sparsity.value, DenseSparsityConfig], + [CompressionFormat.dense.value, DenseSparsityConfig], ], ) def test_configs(name, type): - config = CompressionConfig.load_from_registry(name) + config = SparsityCompressionConfig.load_from_registry(name) assert isinstance(config, type) assert config.format == name @@ -41,13 +41,13 @@ def test_configs(name, type): "name,type", [ [CompressionFormat.sparse_bitmask.value, BitmaskCompressor], - [CompressionFormat.dense_sparsity.value, DenseCompressor], + [CompressionFormat.dense.value, DenseCompressor], ], ) def test_compressors(name, type): - compressor = ModelCompressor.load_from_registry( - name, config=CompressionConfig(format="none") + compressor = Compressor.load_from_registry( + name, config=SparsityCompressionConfig(format="none") ) assert isinstance(compressor, type) - assert isinstance(compressor.config, CompressionConfig) + assert isinstance(compressor.config, SparsityCompressionConfig) assert compressor.config.format == "none" diff --git a/tests/test_utils/test_helpers.py b/tests/test_utils/test_helpers.py index 7ae0799d..d8a430e4 100644 --- a/tests/test_utils/test_helpers.py +++ b/tests/test_utils/test_helpers.py @@ -44,10 +44,10 @@ def test_save_compressed_sparse_bitmask(tmp_path, tensors): assert (tmp_path / "model.safetensors").exists() -def test_save_compressed_dense_sparsity(tmp_path, tensors): +def test_save_compressed_dense(tmp_path, tensors): save_compressed( tensors, - compression_format="dense-sparsity", + compression_format="dense", save_path=tmp_path / "model.safetensors", ) assert (tmp_path / "model.safetensors").exists() @@ -92,10 +92,10 @@ def test_load_compressed_sparse_bitmask(tmp_path, tensors): assert torch.allclose(tensors[key], loaded_tensors[key]) -def test_load_compressed_dense_sparsity(tmp_path, tensors): +def test_load_compressed_dense(tmp_path, tensors): save_compressed( tensors, - compression_format="dense-sparsity", + compression_format="dense", save_path=tmp_path / "model.safetensors", ) save_compressed( From 650c236544fd2ffc6f2960388a9427403aefed31 Mon 
Sep 17 00:00:00 2001 From: Sara Adkins Date: Fri, 3 May 2024 19:01:58 +0000 Subject: [PATCH 05/10] fix device issue --- src/compressed_tensors/quantization/observers/helpers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/compressed_tensors/quantization/observers/helpers.py b/src/compressed_tensors/quantization/observers/helpers.py index f548dba3..ea65e90a 100644 --- a/src/compressed_tensors/quantization/observers/helpers.py +++ b/src/compressed_tensors/quantization/observers/helpers.py @@ -35,12 +35,13 @@ def calculate_qparams( """ min_vals = torch.min(min_vals, torch.zeros_like(min_vals)) max_vals = torch.max(max_vals, torch.zeros_like(max_vals)) + device = min_vals.device bit_range = 2**quantization_args.num_bits - 1 bit_min = -(bit_range + 1) / 2 bit_max = bit_min + bit_range if quantization_args.symmetric: - zero_points = torch.tensor(0).to(torch.int8) + zero_points = torch.tensor(0, device=device).to(torch.int8) max_val_pos = torch.max(-min_vals, max_vals) scales = max_val_pos / (float(bit_range) / 2) scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps) From bb369367bd74f7b18ddd3bfdc32db2902b604f92 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Fri, 3 May 2024 20:49:23 +0000 Subject: [PATCH 06/10] fixing leaf checker --- .../compressors/model_compressor.py | 4 +++- .../quantization/utils/helpers.py | 21 ++++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/compressed_tensors/compressors/model_compressor.py b/src/compressed_tensors/compressors/model_compressor.py index be7dba54..9d1cf6df 100644 --- a/src/compressed_tensors/compressors/model_compressor.py +++ b/src/compressed_tensors/compressors/model_compressor.py @@ -181,7 +181,9 @@ def compress( ) if self.sparsity_compressor is not None: - compressed_state_dict = self.sparsity_compressor.compress(state_dict) + compressed_state_dict = self.sparsity_compressor.compress( + compressed_state_dict + ) return compressed_state_dict diff --git a/src/compressed_tensors/quantization/utils/helpers.py b/src/compressed_tensors/quantization/utils/helpers.py index 8676ef15..66944eb1 100644 --- a/src/compressed_tensors/quantization/utils/helpers.py +++ b/src/compressed_tensors/quantization/utils/helpers.py @@ -15,6 +15,7 @@ from typing import Tuple import torch +from compressed_tensors.quantization.observers.base import Observer from torch.nn import Module from tqdm import tqdm @@ -78,11 +79,25 @@ def module_type(module: Module) -> str: def iter_named_leaf_modules(model: Module) -> Tuple[str, Module]: - # yields modules that do not have any submodules - # TODO: potentially expand to add list of allowed submodules such as observers + """ + Yields modules that do not have any submodules except observers. 
The observers + themselves are not yielded + + :param model: model to get leaf modules of + :returns: generator tuple of (name, leaf_submodule) + """ for name, submodule in model.named_modules(): - if len(list(submodule.children())) == 0: + children = list(submodule.children()) + if len(children) == 0 and not isinstance(submodule, Observer): yield name, submodule + else: + has_non_observer_children = False + for child in children: + if not isinstance(child, Observer): + has_non_observer_children = True + + if not has_non_observer_children: + yield name, submodule def calculate_compression_ratio(model: Module) -> float: From 1e7fb4215ca85a63040e08931372b4977c364ff8 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 7 May 2024 10:19:17 +0000 Subject: [PATCH 07/10] initial commit --- .../observers/quantization => test_quantization}/__init__.py | 0 .../quantization => test_quantization}/lifecycle/__init__.py | 0 .../quantization => test_quantization}/lifecycle/conftest.py | 0 .../quantization => test_quantization}/lifecycle/test_apply.py | 0 .../lifecycle/test_dynamic_lifecycle.py | 0 .../quantization => test_quantization}/lifecycle/test_forward.py | 0 .../quantization => test_quantization}/lifecycle/test_frozen.py | 0 .../lifecycle/test_initialize.py | 0 .../lifecycle/test_lifecycle.py | 0 .../test_observers}/test_min_max.py | 0 .../quantization => test_quantization}/test_quant_args.py | 0 .../quantization => test_quantization}/test_quant_config.py | 0 .../quantization => test_quantization}/test_quant_scheme.py | 0 13 files changed, 0 insertions(+), 0 deletions(-) rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/__init__.py (100%) rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/__init__.py (100%) rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/conftest.py (100%) rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/test_apply.py (100%) rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/test_dynamic_lifecycle.py (100%) rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/test_forward.py (100%) rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/test_frozen.py (100%) rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/test_initialize.py (100%) rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/test_lifecycle.py (100%) rename tests/{compressed_tensors/quantization/observers => test_quantization/test_observers}/test_min_max.py (100%) rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/test_quant_args.py (100%) rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/test_quant_config.py (100%) rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/test_quant_scheme.py (100%) diff --git a/tests/compressed_tensors/quantization/observers/quantization/__init__.py b/tests/test_quantization/__init__.py similarity index 100% rename from tests/compressed_tensors/quantization/observers/quantization/__init__.py rename to tests/test_quantization/__init__.py diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py 
From 1e7fb4215ca85a63040e08931372b4977c364ff8 Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Tue, 7 May 2024 10:19:17 +0000
Subject: [PATCH 07/10] initial commit

---
 .../observers/quantization => test_quantization}/__init__.py | 0
 .../quantization => test_quantization}/lifecycle/__init__.py | 0
 .../quantization => test_quantization}/lifecycle/conftest.py | 0
 .../quantization => test_quantization}/lifecycle/test_apply.py | 0
 .../lifecycle/test_dynamic_lifecycle.py | 0
 .../quantization => test_quantization}/lifecycle/test_forward.py | 0
 .../quantization => test_quantization}/lifecycle/test_frozen.py | 0
 .../lifecycle/test_initialize.py | 0
 .../lifecycle/test_lifecycle.py | 0
 .../test_observers}/test_min_max.py | 0
 .../quantization => test_quantization}/test_quant_args.py | 0
 .../quantization => test_quantization}/test_quant_config.py | 0
 .../quantization => test_quantization}/test_quant_scheme.py | 0
 13 files changed, 0 insertions(+), 0 deletions(-)
 rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/__init__.py (100%)
 rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/__init__.py (100%)
 rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/conftest.py (100%)
 rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/test_apply.py (100%)
 rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/test_dynamic_lifecycle.py (100%)
 rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/test_forward.py (100%)
 rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/test_frozen.py (100%)
 rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/test_initialize.py (100%)
 rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/lifecycle/test_lifecycle.py (100%)
 rename tests/{compressed_tensors/quantization/observers => test_quantization/test_observers}/test_min_max.py (100%)
 rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/test_quant_args.py (100%)
 rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/test_quant_config.py (100%)
 rename tests/{compressed_tensors/quantization/observers/quantization => test_quantization}/test_quant_scheme.py (100%)

diff --git a/tests/compressed_tensors/quantization/observers/quantization/__init__.py b/tests/test_quantization/__init__.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/quantization/__init__.py
rename to tests/test_quantization/__init__.py
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py b/tests/test_quantization/lifecycle/__init__.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py
rename to tests/test_quantization/lifecycle/__init__.py
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/conftest.py b/tests/test_quantization/lifecycle/conftest.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/quantization/lifecycle/conftest.py
rename to tests/test_quantization/lifecycle/conftest.py
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py b/tests/test_quantization/lifecycle/test_apply.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py
rename to tests/test_quantization/lifecycle/test_apply.py
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_dynamic_lifecycle.py b/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_dynamic_lifecycle.py
rename to tests/test_quantization/lifecycle/test_dynamic_lifecycle.py
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_forward.py b/tests/test_quantization/lifecycle/test_forward.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_forward.py
rename to tests/test_quantization/lifecycle/test_forward.py
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_frozen.py b/tests/test_quantization/lifecycle/test_frozen.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_frozen.py
rename to tests/test_quantization/lifecycle/test_frozen.py
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_initialize.py b/tests/test_quantization/lifecycle/test_initialize.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_initialize.py
rename to tests/test_quantization/lifecycle/test_initialize.py
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_lifecycle.py b/tests/test_quantization/lifecycle/test_lifecycle.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_lifecycle.py
rename to tests/test_quantization/lifecycle/test_lifecycle.py
diff --git a/tests/compressed_tensors/quantization/observers/test_min_max.py b/tests/test_quantization/test_observers/test_min_max.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/test_min_max.py
rename to tests/test_quantization/test_observers/test_min_max.py
diff --git a/tests/compressed_tensors/quantization/observers/quantization/test_quant_args.py b/tests/test_quantization/test_quant_args.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/quantization/test_quant_args.py
rename to tests/test_quantization/test_quant_args.py
diff --git a/tests/compressed_tensors/quantization/observers/quantization/test_quant_config.py b/tests/test_quantization/test_quant_config.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/quantization/test_quant_config.py
rename to tests/test_quantization/test_quant_config.py
diff --git a/tests/compressed_tensors/quantization/observers/quantization/test_quant_scheme.py b/tests/test_quantization/test_quant_scheme.py
similarity index 100%
rename from tests/compressed_tensors/quantization/observers/quantization/test_quant_scheme.py
rename to tests/test_quantization/test_quant_scheme.py
From 26fb3f78d1219123b2ee51bb95998e3155f379c8 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Tue, 7 May 2024 15:53:55 +0000
Subject: [PATCH 08/10] Revert "Merge branch 'main' into compressed-lifecycle"

This reverts commit 8dcdde51bb153036b551a5a0252208f1ff9275e1, reversing
changes made to bb369367bd74f7b18ddd3bfdc32db2902b604f92.
---
 setup.py | 2 +-
 src/compressed_tensors/compressors/sparse_bitmask.py | 2 +-
 .../quantization/observers/quantization}/__init__.py | 0
 .../quantization/observers/quantization}/lifecycle/__init__.py | 0
 .../quantization/observers/quantization}/lifecycle/conftest.py | 0
 .../observers/quantization}/lifecycle/test_apply.py | 0
 .../observers/quantization}/lifecycle/test_dynamic_lifecycle.py | 0
 .../observers/quantization}/lifecycle/test_forward.py | 0
 .../observers/quantization}/lifecycle/test_frozen.py | 0
 .../observers/quantization}/lifecycle/test_initialize.py | 0
 .../observers/quantization}/lifecycle/test_lifecycle.py | 0
 .../quantization/observers/quantization}/test_quant_args.py | 0
 .../quantization/observers/quantization}/test_quant_config.py | 0
 .../quantization/observers/quantization}/test_quant_scheme.py | 0
 .../quantization/observers}/test_min_max.py | 0
 15 files changed, 2 insertions(+), 2 deletions(-)
 rename tests/{test_quantization => compressed_tensors/quantization/observers/quantization}/__init__.py (100%)
 rename tests/{test_quantization => compressed_tensors/quantization/observers/quantization}/lifecycle/__init__.py (100%)
 rename tests/{test_quantization => compressed_tensors/quantization/observers/quantization}/lifecycle/conftest.py (100%)
 rename tests/{test_quantization => compressed_tensors/quantization/observers/quantization}/lifecycle/test_apply.py (100%)
 rename tests/{test_quantization => compressed_tensors/quantization/observers/quantization}/lifecycle/test_dynamic_lifecycle.py (100%)
 rename tests/{test_quantization => compressed_tensors/quantization/observers/quantization}/lifecycle/test_forward.py (100%)
 rename tests/{test_quantization => compressed_tensors/quantization/observers/quantization}/lifecycle/test_frozen.py (100%)
 rename tests/{test_quantization => compressed_tensors/quantization/observers/quantization}/lifecycle/test_initialize.py (100%)
 rename tests/{test_quantization => compressed_tensors/quantization/observers/quantization}/lifecycle/test_lifecycle.py (100%)
 rename tests/{test_quantization => compressed_tensors/quantization/observers/quantization}/test_quant_args.py (100%)
 rename tests/{test_quantization => compressed_tensors/quantization/observers/quantization}/test_quant_config.py (100%)
 rename tests/{test_quantization => compressed_tensors/quantization/observers/quantization}/test_quant_scheme.py (100%)
 rename tests/{test_quantization/test_observers => compressed_tensors/quantization/observers}/test_min_max.py (100%)

diff --git a/setup.py b/setup.py
index 999677d1..3d280a2e 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def _setup_extras() -> Dict:
 
 setup(
     name="compressed-tensors",
-    version="0.3.3",
+    version="0.3.2",
     author="Neuralmagic, Inc.",
     author_email="support@neuralmagic.com",
     license="Apache 2.0",
diff --git a/src/compressed_tensors/compressors/sparse_bitmask.py b/src/compressed_tensors/compressors/sparse_bitmask.py
index c0cc1888..10b398c1 100644
--- a/src/compressed_tensors/compressors/sparse_bitmask.py
+++ b/src/compressed_tensors/compressors/sparse_bitmask.py
@@ -67,7 +67,7 @@ def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]:
                         f"found an existing entry for {key}. The existing entry will "
                         "be replaced."
                     )
-            compressed_dict.update(bitmask_dict)
+            compressed_dict |= bitmask_dict
 
         return compressed_dict
 
diff --git a/tests/test_quantization/__init__.py b/tests/compressed_tensors/quantization/observers/quantization/__init__.py
similarity index 100%
rename from tests/test_quantization/__init__.py
rename to tests/compressed_tensors/quantization/observers/quantization/__init__.py
diff --git a/tests/test_quantization/lifecycle/__init__.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py
similarity index 100%
rename from tests/test_quantization/lifecycle/__init__.py
rename to tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py
diff --git a/tests/test_quantization/lifecycle/conftest.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/conftest.py
similarity index 100%
rename from tests/test_quantization/lifecycle/conftest.py
rename to tests/compressed_tensors/quantization/observers/quantization/lifecycle/conftest.py
diff --git a/tests/test_quantization/lifecycle/test_apply.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py
similarity index 100%
rename from tests/test_quantization/lifecycle/test_apply.py
rename to tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py
diff --git a/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_dynamic_lifecycle.py
similarity index 100%
rename from tests/test_quantization/lifecycle/test_dynamic_lifecycle.py
rename to tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_dynamic_lifecycle.py
diff --git a/tests/test_quantization/lifecycle/test_forward.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_forward.py
similarity index 100%
rename from tests/test_quantization/lifecycle/test_forward.py
rename to tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_forward.py
diff --git a/tests/test_quantization/lifecycle/test_frozen.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_frozen.py
similarity index 100%
rename from tests/test_quantization/lifecycle/test_frozen.py
rename to tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_frozen.py
diff --git a/tests/test_quantization/lifecycle/test_initialize.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_initialize.py
similarity index 100%
rename from tests/test_quantization/lifecycle/test_initialize.py
rename to tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_initialize.py
diff --git a/tests/test_quantization/lifecycle/test_lifecycle.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_lifecycle.py
similarity index 100%
rename from tests/test_quantization/lifecycle/test_lifecycle.py
rename to tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_lifecycle.py
diff --git a/tests/test_quantization/test_quant_args.py b/tests/compressed_tensors/quantization/observers/quantization/test_quant_args.py
similarity index 100%
rename from tests/test_quantization/test_quant_args.py
rename to tests/compressed_tensors/quantization/observers/quantization/test_quant_args.py
diff --git a/tests/test_quantization/test_quant_config.py b/tests/compressed_tensors/quantization/observers/quantization/test_quant_config.py
similarity index 100%
rename from tests/test_quantization/test_quant_config.py
rename to tests/compressed_tensors/quantization/observers/quantization/test_quant_config.py
diff --git a/tests/test_quantization/test_quant_scheme.py b/tests/compressed_tensors/quantization/observers/quantization/test_quant_scheme.py
similarity index 100%
rename from tests/test_quantization/test_quant_scheme.py
rename to tests/compressed_tensors/quantization/observers/quantization/test_quant_scheme.py
diff --git a/tests/test_quantization/test_observers/test_min_max.py b/tests/compressed_tensors/quantization/observers/test_min_max.py
similarity index 100%
rename from tests/test_quantization/test_observers/test_min_max.py
rename to tests/compressed_tensors/quantization/observers/test_min_max.py
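
A side note on the `compressed_dict |= bitmask_dict` line this revert
restores (an observation, not part of the patch): for plain dicts the
in-place union operator behaves like `.update()` but requires Python 3.9+,
which may be why the merge had swapped it out. A quick illustration with
hypothetical keys:

    compressed_dict = {"layer.weight.shape": (4, 4)}
    bitmask_dict = {"layer.weight.bitmask": b"\x0f"}

    compressed_dict |= bitmask_dict  # in-place dict union, Python 3.9+
    # equivalent on older interpreters:
    # compressed_dict.update(bitmask_dict)
    print(sorted(compressed_dict))
    # ['layer.weight.bitmask', 'layer.weight.shape']
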
From 2695be3b36b96609c54d16f5ea81c72d563f7ce8 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Tue, 7 May 2024 15:57:02 +0000
Subject: [PATCH 09/10] update version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 3d280a2e..999677d1 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def _setup_extras() -> Dict:
 
 setup(
     name="compressed-tensors",
-    version="0.3.2",
+    version="0.3.3",
     author="Neuralmagic, Inc.",
     author_email="support@neuralmagic.com",
     license="Apache 2.0",
From dac6a41f2b196b9777b31301d03ce72c09bbbbdd Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Tue, 7 May 2024 16:10:19 +0000
Subject: [PATCH 10/10] fix test

---
 tests/test_quantization/lifecycle/test_lifecycle.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_quantization/lifecycle/test_lifecycle.py b/tests/test_quantization/lifecycle/test_lifecycle.py
index 352fcb4d..7ded8ef9 100644
--- a/tests/test_quantization/lifecycle/test_lifecycle.py
+++ b/tests/test_quantization/lifecycle/test_lifecycle.py
@@ -97,7 +97,7 @@ def test_lifecyle(create_quantization_scheme):
     for _ in range(10):
         layer(torch.randn(4, 4))
 
-    assert initialized_layer_input_zero_point != layer.input_zero_point
+    assert initialized_layer_input_zero_point != 0
     assert initialized_layer_input_scale != layer.input_scale
     assert initialized_layer_weight_scale == layer.weight_scale
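
A plausible reading of this last fix (the patch itself does not explain it):
`initialized_layer_input_zero_point` is captured as `layer.input_zero_point`,
a reference to the module's tensor rather than a snapshot, so once
calibration updates that tensor in place, the old assertion compared the
tensor with itself and could never hold. Checking the calibrated value
against its known initial value of 0 sidesteps the aliasing:

    import torch

    zero_point = torch.zeros(1)  # stands in for layer.input_zero_point
    captured = zero_point        # an alias, not a copy
    zero_point.add_(3)           # calibration updates the tensor in place

    print(bool(captured != zero_point))  # False: same tensor on both sides
    print(bool(captured != 0))           # True: the value did move off 0
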