diff --git a/requirements-common.txt b/requirements-common.txt index 05215f25e94b6..dc9b86d1973a8 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -31,4 +31,4 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.7.0 # required for compressed-tensors +compressed-tensors == 0.7.1 # required for compressed-tensors diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 658303a01fb3d..41274e5b39859 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -101,12 +101,20 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": "weights"] = QuantizationArgs.parse_obj( quant_config.get("weights")) + target_scheme_map[target]["input_activations"] = None if is_activation_quantization_format(quant_format): - target_scheme_map[target][ - "input_activations"] = QuantizationArgs.parse_obj( - quant_config.get("input_activations")) - else: - target_scheme_map[target]["input_activations"] = None + input_activations = quant_config.get("input_activations") + # The only case where we have activation quant supported + # but no input_activations provided in the config + # should be w8a16fp8 w8a16fp8 can also run for cases where + # there is an input_quant but it is ignored + if not input_activations: + assert target_scheme_map[target][ + "weights"].type == QuantizationType.FLOAT + else: + target_scheme_map[target][ + "input_activations"] = QuantizationArgs.parse_obj( + quant_config.get("input_activations")) return cls(target_scheme_map=target_scheme_map, ignore=ignore, @@ -245,8 +253,7 @@ def _get_scheme_from_parts( group_size=weight_quant.group_size, actorder=weight_quant.actorder) - # Will only be not None if is_activation_quantization_format is True - if input_quant: + if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): is_fp8_w8a8_supported = self._check_scheme_supported( CompressedTensorsW8A8Fp8.get_min_capability(), error=False) @@ -256,16 +263,19 @@ def _get_scheme_from_parts( is_static_input_scheme=(input_quant and not input_quant.dynamic)) else: + # note: input_quant will be present for converted models; + # will be ignored during inference post loading return CompressedTensorsW8A16Fp8( strategy=weight_quant.strategy, - is_static_input_scheme=(input_quant - and not input_quant.dynamic)) + is_static_input_scheme=not input_quant.dynamic) + # note: input_quant can be None if self._is_fp8_w8a16(weight_quant, input_quant): + is_static_input_scheme = (input_quant is None + or not input_quant.dynamic) return CompressedTensorsW8A16Fp8( strategy=weight_quant.strategy, - is_static_input_scheme=(input_quant - and not input_quant.dynamic)) + is_static_input_scheme=is_static_input_scheme) if self._is_static_tensor_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Int8(