diff --git a/requirements-common.txt b/requirements-common.txt
index 67de33c57873f..6ace082ad97d6 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -24,4 +24,3 @@ librosa # Required for audio processing
 soundfile # Required for audio processing
 gguf == 0.9.1
 importlib_metadata
-compressed-tensors == 0.5.0
diff --git a/requirements-test.txt b/requirements-test.txt
index 8e003a09703ad..62d6cc49eade4 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -17,7 +17,7 @@ peft
 requests
 ray
 sentence-transformers # required for embedding
-compressed-tensors==0.5.0 # required for compressed-tensors
+compressed-tensors==0.4.0 # required for compressed-tensors
 timm # required for internvl test
 
 # TODO: Add this after fully implementing llava(mantis)
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 7924d187df602..2ea340779b819 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -5,12 +5,13 @@
 
 import pytest
 import torch
-from compressed_tensors.quantization import QuantizationType
 
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
     CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationType)
 
 
 @pytest.mark.parametrize("model_args", [
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 57c57d5d95c71..ae75781927381 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,10 +1,6 @@
 from typing import Any, Dict, List, Optional
 
 import torch
-from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization import (QuantizationArgs,
-                                             QuantizationStrategy,
-                                             QuantizationType)
 from pydantic import BaseModel
 
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
@@ -17,7 +13,8 @@
     CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
     CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    find_matched_target, is_activation_quantization_format,
+    CompressionFormat, QuantizationArgs, QuantizationStrategy,
+    QuantizationType, find_matched_target, is_activation_quantization_format,
     should_ignore_layer)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
index 1671a23d77c63..3d55d55cc390d 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -1,10 +1,11 @@
 from typing import Callable, List, Optional
 
 import torch
-from compressed_tensors.quantization import QuantizationStrategy
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index e0e1b8bad4ec3..8a3d24e2fd258 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -1,11 +1,12 @@
 from typing import Callable, List, Optional
 
 import torch
-from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_fp8_linear, cutlass_fp8_supported, requantize_with_max_scale)
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 5597dc888b7b2..078380f159291 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,11 +1,12 @@
 from typing import Callable, List, Optional
 
 import torch
-from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_int8_linear, convert_to_channelwise)
 from vllm.model_executor.parameter import (BasevLLMParameter,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index a74eaef5efdee..7912cbde5721f 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -1,13 +1,85 @@
 import re
-from typing import Iterable, Optional
+from enum import Enum
+from typing import Any, Dict, Iterable, Optional
 
-from compressed_tensors import CompressionFormat
+from pydantic import BaseModel, Field
 from torch.nn import Module
 
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     FUSED_LAYER_NAME_MAPPING)
 
 
+class CompressionFormat(Enum):
+    dense = "dense"
+    sparse_bitmask = "sparse-bitmask"
+    naive_quantized = "naive-quantized"
+    float_quantized = "float-quantized"
+    int_quantized = "int-quantized"
+    pack_quantized = "pack-quantized"
+    marlin_24 = "marlin-24"
+
+
+class QuantizationType(str, Enum):
+    """
+    Enum storing quantization type options
+    """
+
+    INT = "int"
+    FLOAT = "float"
+
+
+class QuantizationStrategy(str, Enum):
+    """
+    Enum storing quantization strategy options
+    """
+
+    TENSOR = "tensor"
+    CHANNEL = "channel"
+    GROUP = "group"
+    BLOCK = "block"
+    TOKEN = "token"
+
+
+class QuantizationArgs(BaseModel):
+    """
+    User facing arguments used to define a quantization config
+    for weights or activations
+
+    :param num_bits: quantization bit depth
+    :param type: dtype to quantized to, either int or float
+    :param symmetric: whether or not quantization scale is symmetric
+    :param strategy: string determining the scope of scale/zero-point to apply
+    :param group_size: group length to use for the group strategy
+    :param block_structure: 2d block structure to use for the block
+        strategy, must be of the format "2x4", "8x16", etc.
+    :param dynamic: set True to perform dynamic quantization -
+        values will not be calibrated during calibration phase,
+        instead during inference new quantization ranges will be
+        observed with every sample. Defaults to False for static
+        quantization. Note that enabling dynamic quantization
+        will change the default observer to a memoryless one
+    """
+
+    num_bits: int = 8
+    type: QuantizationType = QuantizationType.INT
+    symmetric: bool = True
+    group_size: Optional[int] = None
+    strategy: Optional[QuantizationStrategy] = None
+    block_structure: Optional[str] = None
+    dynamic: bool = False
+    observer: str = Field(
+        default="minmax",
+        description=("The class to use to compute the quantization param - "
+                     "scale and zero-point'"),
+    )
+    observer_kwargs: Dict[str, Any] = Field(
+        default_factory=dict,
+        description=
+        ("optional dict of kwargs to be passed directly to torch quantization "
+         "Observers constructor excluding quantization range or symmetry"),
+    )
+
+
 def is_activation_quantization_format(format: str) -> bool:
     _ACTIVATION_QUANTIZATION_FORMATS = [
         CompressionFormat.naive_quantized.value,