diff --git a/ROCm_performance.md b/ROCm_performance.md index bae57ea62d47c..c6b7a0ec77a03 100644 --- a/ROCm_performance.md +++ b/ROCm_performance.md @@ -37,7 +37,7 @@ For more details, please refer to Quark's documentation. To use ammo, please follow this [instruction](https://github.com/ROCm/vllm/tree/main/examples/fp8/quantizer), and set `VLLM_FP8_USE_AMMO=1`. -Both quantizers generate a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder. Then we can run a model with fp8 quantization using vllm. When creating `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantization_param_path={relative path of the safetensors with your model path}`. +Both quantizers generate a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder. Then we can run a model with fp8 quantization using vllm. When creating a `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantized_weights_path={path of the safetensor file relative to your model path}`. ## Gemm Tuning for Fp8 diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 2aca1b23f9b6f..7eeb90516bdfa 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -24,6 +24,7 @@ def main(args: argparse.Namespace): num_speculative_tokens=args.num_speculative_tokens, tokenizer=args.tokenizer, quantization=args.quantization, + quantized_weights_path=args.quantized_weights_path, tensor_parallel_size=args.tensor_parallel_size, trust_remote_code=args.trust_remote_code, dtype=args.dtype, @@ -175,6 +176,13 @@ def run_to_completion(profile_dir: Optional[str] = None): 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' 'cuda version greater than 11.8. 
On ROCm (AMD GPU), FP8_E4M3 is ' 'instead supported for common inference criteria.') + parser.add_argument( + '--quantized-weights-path', + type=str, + default=None, + help='Path to the safetensor file containing the quantized weights ' + 'and scaling factors. This should generally be supplied, when ' + 'quantization is FP8.') parser.add_argument( '--profile', action='store_true', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index fdfbf23c721b3..eb59e38fa2c9d 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -64,6 +64,7 @@ def run_vllm( model: str, tokenizer: str, quantization: Optional[str], + quantized_weights_path: Optional[str], tensor_parallel_size: int, seed: int, n: int, @@ -87,6 +88,7 @@ def run_vllm( model=model, tokenizer=tokenizer, quantization=quantization, + quantized_weights_path=quantized_weights_path, tensor_parallel_size=tensor_parallel_size, seed=seed, trust_remote_code=trust_remote_code, @@ -222,9 +224,9 @@ def main(args: argparse.Namespace): if args.backend == "vllm": elapsed_time = run_vllm( requests, args.model, args.tokenizer, args.quantization, - args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, - args.trust_remote_code, args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, + args.quantized_weights_path, args.tensor_parallel_size, args.seed, + args.n, args.use_beam_search, args.trust_remote_code, args.dtype, + args.max_model_len, args.enforce_eager, args.kv_cache_dtype, args.quantization_param_path, args.device, args.enable_prefix_caching, args.enable_chunked_prefill, args.max_num_batched_tokens, args.gpu_memory_utilization, @@ -342,6 +344,13 @@ def main(args: argparse.Namespace): 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' 'cuda version greater than 11.8. 
On ROCm (AMD GPU), FP8_E4M3 is ' 'instead supported for common inference criteria.') + parser.add_argument( + '--quantized-weights-path', + type=str, + default=None, + help='Path to the safetensor file containing the quantized weights ' + 'and scaling factors. This should generally be supplied, when ' + 'quantization is FP8.') parser.add_argument( "--device", type=str, diff --git a/vllm/config.py b/vllm/config.py index 63471aa5301b1..a810b67777224 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -97,6 +97,7 @@ def __init__( max_model_len: Optional[int] = None, quantization: Optional[str] = None, quantization_param_path: Optional[str] = None, + quantized_weights_path: Optional[str] = None, enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, max_seq_len_to_capture: Optional[int] = None, @@ -116,6 +117,7 @@ def __init__( self.tokenizer_revision = tokenizer_revision self.quantization = quantization self.quantization_param_path = quantization_param_path + self.quantized_weights_path = quantized_weights_path self.enforce_eager = enforce_eager self.max_context_len_to_capture = max_context_len_to_capture if self.max_context_len_to_capture is not None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 04b9e8032c0cf..e49e8b47b50dd 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -54,6 +54,7 @@ class EngineArgs: rope_scaling: Optional[dict] = None tokenizer_revision: Optional[str] = None quantization: Optional[str] = None + quantized_weights_path: Optional[str] = None enforce_eager: bool = False max_context_len_to_capture: Optional[int] = None max_seq_len_to_capture: int = 32768 @@ -337,6 +338,13 @@ def add_cli_args( 'None, we assume the model weights are not ' 'quantized and use `dtype` to determine the data ' 'type of the weights.') + parser.add_argument( + '--quantized-weights-path', + type=nullable_str, + default=None, + help='Path to the safetensor file containing the quantized weights ' + 'and 
scaling factors. This should generally be supplied, when ' + 'quantization is FP8.') parser.add_argument('--rope-scaling', default=None, type=json.loads, @@ -562,10 +570,11 @@ def create_engine_config(self, ) -> EngineConfig: self.trust_remote_code, self.dtype, self.seed, self.revision, self.code_revision, self.rope_scaling, self.tokenizer_revision, self.max_model_len, self.quantization, - self.quantization_param_path, self.enforce_eager, - self.max_context_len_to_capture, self.max_seq_len_to_capture, - self.max_logprobs, self.disable_sliding_window, - self.skip_tokenizer_init, self.served_model_name) + self.quantization_param_path, self.quantized_weights_path, + self.enforce_eager, self.max_context_len_to_capture, + self.max_seq_len_to_capture, self.max_logprobs, + self.disable_sliding_window, self.skip_tokenizer_init, + self.served_model_name) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index a7b8d1ad35620..cc1a792c16c25 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -248,11 +248,11 @@ def load_model(self, *, model_config: ModelConfig, "fall_back_to_pt_during_load", True)), ) if (model_config.quantization == 'fp8' - and model_config.quantization_param_path is not None): + and model_config.quantized_weights_path is not None): model.load_quantized_weights( safetensors_weights_iterator([ model_config.model + - model_config.quantization_param_path + model_config.quantized_weights_path ])) for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None)