From 53b018edcbc601f0eea9f65f13a9a9620c4be8dc Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Thu, 18 Apr 2024 03:21:55 -0400
Subject: [PATCH] [Bugfix] Get available quantization methods from
 quantization registry (#4098)

---
 benchmarks/benchmark_latency.py                     | 3 ++-
 benchmarks/benchmark_throughput.py                  | 4 +++-
 tests/models/test_marlin.py                         | 7 +++----
 vllm/config.py                                      | 7 ++++---
 vllm/engine/arg_utils.py                            | 3 ++-
 vllm/model_executor/layers/quantization/__init__.py | 7 ++++---
 6 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index aadbc441713fc..44da3bad8d840 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -9,6 +9,7 @@
 from tqdm import tqdm
 
 from vllm import LLM, SamplingParams
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
 
 def main(args: argparse.Namespace):
@@ -101,7 +102,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
     parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', 'gptq', 'squeezellm', None],
+                        choices=[*QUANTIZATION_METHODS, None],
                         default=None)
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--input-len', type=int, default=32)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 6df1e1d628e6c..6bb889d1eceba 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -10,6 +10,8 @@
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
 
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
 
 def sample_requests(
     dataset_path: str,
@@ -267,7 +269,7 @@ def main(args: argparse.Namespace):
     parser.add_argument("--tokenizer", type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', 'gptq', 'squeezellm', None],
+                        choices=[*QUANTIZATION_METHODS, None],
                         default=None)
     parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
     parser.add_argument("--n",
diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py
index 2305db3510060..4fe6daec02520 100644
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -16,13 +16,12 @@
 import pytest
 import torch
 
-from vllm.model_executor.layers.quantization import (
-    _QUANTIZATION_CONFIG_REGISTRY)
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
 capability = torch.cuda.get_device_capability()
 capability = capability[0] * 10 + capability[1]
-marlin_not_supported = (
-    capability < _QUANTIZATION_CONFIG_REGISTRY["marlin"].get_min_capability())
+marlin_not_supported = (capability <
+                        QUANTIZATION_METHODS["marlin"].get_min_capability())
 
 
 @dataclass
diff --git a/vllm/config.py b/vllm/config.py
index 5a29620e85ac6..2912d6ccc2c5b 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -9,6 +9,7 @@
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (get_cpu_memory, get_nvcc_cuda_version, is_cpu, is_hip,
                         is_neuron)
@@ -118,8 +119,8 @@ def _verify_tokenizer_mode(self) -> None:
         self.tokenizer_mode = tokenizer_mode
 
     def _verify_quantization(self) -> None:
-        supported_quantization = ["awq", "gptq", "squeezellm", "marlin"]
-        rocm_not_supported_quantization = ["awq", "marlin"]
+        supported_quantization = [*QUANTIZATION_METHODS]
+        rocm_supported_quantization = ["gptq", "squeezellm"]
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
 
@@ -155,7 +156,7 @@ def _verify_quantization(self) -> None:
                     f"Unknown quantization method: {self.quantization}. Must "
                     f"be one of {supported_quantization}.")
             if is_hip(
-            ) and self.quantization in rocm_not_supported_quantization:
+            ) and self.quantization not in rocm_supported_quantization:
                 raise ValueError(
                     f"{self.quantization} quantization is currently not "
                     f"supported in ROCm.")
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c61c0cc67d7a2..2999ab0a7e72a 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -7,6 +7,7 @@
                          EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          TokenizerPoolConfig, VisionLanguageConfig)
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import str_to_int_tuple
 
 
@@ -286,7 +287,7 @@ def add_cli_args(
         parser.add_argument('--quantization',
                             '-q',
                             type=str,
-                            choices=['awq', 'gptq', 'squeezellm', None],
+                            choices=[*QUANTIZATION_METHODS, None],
                             default=EngineArgs.quantization,
                             help='Method used to quantize the weights. If '
                             'None, we first check the `quantization_config` '
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index ad988d48755b0..a3b89a66469eb 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -7,7 +7,7 @@
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig
 from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
 
-_QUANTIZATION_CONFIG_REGISTRY = {
+QUANTIZATION_METHODS = {
     "awq": AWQConfig,
     "gptq": GPTQConfig,
     "squeezellm": SqueezeLLMConfig,
@@ -16,12 +16,13 @@
 
 
 def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
-    if quantization not in _QUANTIZATION_CONFIG_REGISTRY:
+    if quantization not in QUANTIZATION_METHODS:
        raise ValueError(f"Invalid quantization method: {quantization}")
-    return _QUANTIZATION_CONFIG_REGISTRY[quantization]
+    return QUANTIZATION_METHODS[quantization]
 
 
 __all__ = [
     "QuantizationConfig",
     "get_quantization_config",
+    "QUANTIZATION_METHODS",
 ]
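
For context, a minimal sketch (assuming vLLM at this commit) of the consumer
pattern the diff standardizes on; the parse_args() input below is purely
illustrative:

    import argparse

    from vllm.model_executor.layers.quantization import (
        QUANTIZATION_METHODS, get_quantization_config)

    parser = argparse.ArgumentParser()
    # Every method registered in QUANTIZATION_METHODS is offered here
    # automatically, replacing the hard-coded
    # ['awq', 'gptq', 'squeezellm', None] choices removed by this patch.
    parser.add_argument('--quantization',
                        '-q',
                        choices=[*QUANTIZATION_METHODS, None],
                        default=None)
    args = parser.parse_args(['-q', 'marlin'])

    # Resolve the chosen name to its config class; unknown names raise
    # ValueError inside get_quantization_config().
    quant_cls = get_quantization_config(args.quantization)
    # Called on the class, as tests/models/test_marlin.py does above.
    print(quant_cls.get_min_capability())  # min CUDA compute capability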