From a9be7c9a27d52da945ee677899c0d88143f374f0 Mon Sep 17 00:00:00 2001
From: charlifu
Date: Wed, 19 Jun 2024 20:22:56 +0000
Subject: [PATCH 1/5] add quantization_weights_path for fp8 weights

---
 ROCm_performance.md                        |  2 +-
 benchmarks/benchmark_latency.py            |  8 ++++++++
 benchmarks/benchmark_throughput.py         | 10 ++++++++++
 vllm/config.py                             |  2 ++
 vllm/engine/arg_utils.py                   | 11 ++++++++++-
 vllm/model_executor/model_loader/loader.py |  4 ++--
 6 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/ROCm_performance.md b/ROCm_performance.md
index bae57ea62d47c..e0ff0e9060ef4 100644
--- a/ROCm_performance.md
+++ b/ROCm_performance.md
@@ -37,7 +37,7 @@ For more details, please refer to Quark's documentation.

 To use ammo, please follow this [instruction](https://github.com/ROCm/vllm/tree/main/examples/fp8/quantizer), and set `VLLM_FP8_USE_AMMO=1`.

-Both quantizers generate a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder. Then we can run a model with fp8 quantization using vllm. When creating `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantization_param_path={relative path of the safetensors with your model path}`.
+Both quantizers generate a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder. Then we can run a model with fp8 quantization using vllm. When creating `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantization_weights_path={relative path of the safetensors with your model path}`.

 ## Gemm Tuning for Fp8

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 2aca1b23f9b6f..1694288350275 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -24,6 +24,7 @@ def main(args: argparse.Namespace):
         num_speculative_tokens=args.num_speculative_tokens,
         tokenizer=args.tokenizer,
         quantization=args.quantization,
+        quantization_weights_path=args.quantization_weights_path,
         tensor_parallel_size=args.tensor_parallel_size,
         trust_remote_code=args.trust_remote_code,
         dtype=args.dtype,
@@ -175,6 +176,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
         'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
         'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
         'instead supported for common inference criteria.')
+    parser.add_argument(
+        '--quantization-weights-path',
+        type=str,
+        default=None,
+        help='Path to the safetensor file containing the quantized weights '
+        'and scaling factors. This should generally be supplied, when '
+        'quantization is FP8.')
     parser.add_argument(
         '--profile',
         action='store_true',
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index fdfbf23c721b3..25a64dd3feb20 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -64,6 +64,7 @@ def run_vllm(
     model: str,
     tokenizer: str,
     quantization: Optional[str],
+    quantization_weights_path: Optional[str],
     tensor_parallel_size: int,
     seed: int,
     n: int,
@@ -87,6 +88,7 @@
         model=model,
         tokenizer=tokenizer,
         quantization=quantization,
+        quantization_weights_path=quantization_weights_path,
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
         trust_remote_code=trust_remote_code,
@@ -222,6 +224,7 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         elapsed_time = run_vllm(
             requests, args.model, args.tokenizer, args.quantization,
+            args.quantization_weights_path,
             args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
             args.trust_remote_code, args.dtype, args.max_model_len,
             args.enforce_eager, args.kv_cache_dtype,
@@ -342,6 +345,13 @@ def main(args: argparse.Namespace):
         'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
         'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
         'instead supported for common inference criteria.')
+    parser.add_argument(
+        '--quantization-weights-path',
+        type=str,
+        default=None,
+        help='Path to the safetensor file containing the quantized weights '
+        'and scaling factors. This should generally be supplied, when '
+        'quantization is FP8.')
     parser.add_argument(
         "--device",
         type=str,
diff --git a/vllm/config.py b/vllm/config.py
index 63471aa5301b1..9f6ac0298d2df 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -97,6 +97,7 @@ def __init__(
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
         quantization_param_path: Optional[str] = None,
+        quantization_weights_path: Optional[str] = None,
         enforce_eager: bool = False,
         max_context_len_to_capture: Optional[int] = None,
         max_seq_len_to_capture: Optional[int] = None,
@@ -116,6 +117,7 @@
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
         self.quantization_param_path = quantization_param_path
+        self.quantization_weights_path = quantization_weights_path
         self.enforce_eager = enforce_eager
         self.max_context_len_to_capture = max_context_len_to_capture
         if self.max_context_len_to_capture is not None:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 04b9e8032c0cf..b0383c95e5cb7 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -54,6 +54,7 @@ class EngineArgs:
     rope_scaling: Optional[dict] = None
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
+    quantization_weights_path: Optional[str] = None
     enforce_eager: bool = False
     max_context_len_to_capture: Optional[int] = None
     max_seq_len_to_capture: int = 32768
@@ -337,6 +338,13 @@ def add_cli_args(
                             'None, we assume the model weights are not '
                             'quantized and use `dtype` to determine the data '
                             'type of the weights.')
+        parser.add_argument(
+            '--quantization-weights-path',
+            type=nullable_str,
+            default=None,
+            help='Path to the safetensor file containing the quantized weights '
+            'and scaling factors. This should generally be supplied, when '
+            'quantization is FP8.')
         parser.add_argument('--rope-scaling',
                             default=None,
                             type=json.loads,
@@ -562,7 +570,8 @@ def create_engine_config(self, ) -> EngineConfig:
             self.trust_remote_code, self.dtype, self.seed, self.revision,
             self.code_revision, self.rope_scaling, self.tokenizer_revision,
             self.max_model_len, self.quantization,
-            self.quantization_param_path, self.enforce_eager,
+            self.quantization_param_path, self.quantization_weights_path,
+            self.enforce_eager,
             self.max_context_len_to_capture, self.max_seq_len_to_capture,
             self.max_logprobs, self.disable_sliding_window,
             self.skip_tokenizer_init, self.served_model_name)
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index a7b8d1ad35620..1c13c51738ec0 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -248,11 +248,11 @@ def load_model(self, *, model_config: ModelConfig,
                                                "fall_back_to_pt_during_load",
                                                True)), )
             if (model_config.quantization == 'fp8'
-                    and model_config.quantization_param_path is not None):
+                    and model_config.quantization_weights_path is not None):
                 model.load_quantized_weights(
                     safetensors_weights_iterator([
                         model_config.model +
-                        model_config.quantization_param_path
+                        model_config.quantization_weights_path
                     ]))
             for _, module in model.named_modules():
                 quant_method = getattr(module, "quant_method", None)

From a7971a4a1341a0073420276a77ec1eed4665b060 Mon Sep 17 00:00:00 2001
From: charlifu
Date: Wed, 19 Jun 2024 20:45:11 +0000
Subject: [PATCH 2/5] fix lint

---
 vllm/engine/arg_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index b0383c95e5cb7..03c73686b8ee1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -571,10 +571,10 @@ def create_engine_config(self, ) -> EngineConfig:
             self.code_revision, self.rope_scaling, self.tokenizer_revision,
             self.max_model_len, self.quantization,
             self.quantization_param_path, self.quantization_weights_path,
-            self.enforce_eager,
-            self.max_context_len_to_capture, self.max_seq_len_to_capture,
-            self.max_logprobs, self.disable_sliding_window,
-            self.skip_tokenizer_init, self.served_model_name)
+            self.enforce_eager, self.max_context_len_to_capture,
+            self.max_seq_len_to_capture, self.max_logprobs,
+            self.disable_sliding_window, self.skip_tokenizer_init,
+            self.served_model_name)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,

From f088187bd09d0d2b5419c206d2ee864f5ab65d6d Mon Sep 17 00:00:00 2001
From: charlifu
Date: Wed, 19 Jun 2024 20:46:54 +0000
Subject: [PATCH 3/5] fix lint

---
 benchmarks/benchmark_throughput.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 25a64dd3feb20..37e1e2e8431ec 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -224,11 +224,10 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         elapsed_time = run_vllm(
             requests, args.model, args.tokenizer, args.quantization,
-            args.quantization_weights_path,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-            args.trust_remote_code, args.dtype, args.max_model_len,
-            args.enforce_eager, args.kv_cache_dtype,
-            args.quantization_param_path, args.device,
+            args.quantization_weights_path, args.tensor_parallel_size,
+            args.seed, args.n, args.use_beam_search, args.trust_remote_code,
+            args.dtype, args.max_model_len, args.enforce_eager,
+            args.kv_cache_dtype, args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
             args.max_num_batched_tokens, args.gpu_memory_utilization,
             args.worker_use_ray, args.download_dir)

From 3857ba2a579078a0a896c4778c867f8373b68e4c Mon Sep 17 00:00:00 2001
From: charlifu
Date: Wed, 19 Jun 2024 20:55:54 +0000
Subject: [PATCH 4/5] change to quantized_weights_path

---
 ROCm_performance.md                        | 2 +-
 benchmarks/benchmark_latency.py            | 4 ++--
 benchmarks/benchmark_throughput.py         | 8 ++++----
 vllm/config.py                             | 4 ++--
 vllm/engine/arg_utils.py                   | 6 +++---
 vllm/model_executor/model_loader/loader.py | 4 ++--
 6 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/ROCm_performance.md b/ROCm_performance.md
index e0ff0e9060ef4..c6b7a0ec77a03 100644
--- a/ROCm_performance.md
+++ b/ROCm_performance.md
@@ -37,7 +37,7 @@ For more details, please refer to Quark's documentation.

 To use ammo, please follow this [instruction](https://github.com/ROCm/vllm/tree/main/examples/fp8/quantizer), and set `VLLM_FP8_USE_AMMO=1`.

-Both quantizers generate a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder. Then we can run a model with fp8 quantization using vllm. When creating `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantization_weights_path={relative path of the safetensors with your model path}`.
+Both quantizers generate a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder. Then we can run a model with fp8 quantization using vllm. When creating `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantized_weights_path={relative path of the safetensors with your model path}`.

 ## Gemm Tuning for Fp8

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 1694288350275..7eeb90516bdfa 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -24,7 +24,7 @@ def main(args: argparse.Namespace):
         num_speculative_tokens=args.num_speculative_tokens,
         tokenizer=args.tokenizer,
         quantization=args.quantization,
-        quantization_weights_path=args.quantization_weights_path,
+        quantized_weights_path=args.quantized_weights_path,
         tensor_parallel_size=args.tensor_parallel_size,
         trust_remote_code=args.trust_remote_code,
         dtype=args.dtype,
@@ -177,7 +177,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
         'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
         'instead supported for common inference criteria.')
     parser.add_argument(
-        '--quantization-weights-path',
+        '--quantized-weights-path',
         type=str,
         default=None,
         help='Path to the safetensor file containing the quantized weights '
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 37e1e2e8431ec..5433e1b0ca115 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -64,7 +64,7 @@ def run_vllm(
     model: str,
     tokenizer: str,
     quantization: Optional[str],
-    quantization_weights_path: Optional[str],
+    quantized_weights_path: Optional[str],
     tensor_parallel_size: int,
     seed: int,
     n: int,
@@ -88,7 +88,7 @@
         model=model,
         tokenizer=tokenizer,
         quantization=quantization,
-        quantization_weights_path=quantization_weights_path,
+        quantized_weights_path=quantized_weights_path,
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
         trust_remote_code=trust_remote_code,
@@ -224,7 +224,7 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         elapsed_time = run_vllm(
             requests, args.model, args.tokenizer, args.quantization,
-            args.quantization_weights_path, args.tensor_parallel_size,
+            args.quantized_weights_path, args.tensor_parallel_size,
             args.seed, args.n, args.use_beam_search, args.trust_remote_code,
             args.dtype, args.max_model_len, args.enforce_eager,
             args.kv_cache_dtype, args.quantization_param_path, args.device,
@@ -345,7 +345,7 @@ def main(args: argparse.Namespace):
         'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
         'instead supported for common inference criteria.')
     parser.add_argument(
-        '--quantization-weights-path',
+        '--quantized-weights-path',
         type=str,
         default=None,
         help='Path to the safetensor file containing the quantized weights '
diff --git a/vllm/config.py b/vllm/config.py
index 9f6ac0298d2df..a810b67777224 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -97,7 +97,7 @@ def __init__(
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
         quantization_param_path: Optional[str] = None,
-        quantization_weights_path: Optional[str] = None,
+        quantized_weights_path: Optional[str] = None,
         enforce_eager: bool = False,
         max_context_len_to_capture: Optional[int] = None,
         max_seq_len_to_capture: Optional[int] = None,
@@ -117,7 +117,7 @@
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
         self.quantization_param_path = quantization_param_path
-        self.quantization_weights_path = quantization_weights_path
+        self.quantized_weights_path = quantized_weights_path
         self.enforce_eager = enforce_eager
         self.max_context_len_to_capture = max_context_len_to_capture
         if self.max_context_len_to_capture is not None:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 03c73686b8ee1..e49e8b47b50dd 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -54,7 +54,7 @@ class EngineArgs:
     rope_scaling: Optional[dict] = None
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
-    quantization_weights_path: Optional[str] = None
+    quantized_weights_path: Optional[str] = None
     enforce_eager: bool = False
     max_context_len_to_capture: Optional[int] = None
     max_seq_len_to_capture: int = 32768
@@ -339,7 +339,7 @@ def add_cli_args(
                             'quantized and use `dtype` to determine the data '
                             'type of the weights.')
         parser.add_argument(
-            '--quantization-weights-path',
+            '--quantized-weights-path',
             type=nullable_str,
             default=None,
             help='Path to the safetensor file containing the quantized weights '
@@ -570,7 +570,7 @@ def create_engine_config(self, ) -> EngineConfig:
             self.trust_remote_code, self.dtype, self.seed, self.revision,
             self.code_revision, self.rope_scaling, self.tokenizer_revision,
             self.max_model_len, self.quantization,
-            self.quantization_param_path, self.quantization_weights_path,
+            self.quantization_param_path, self.quantized_weights_path,
             self.enforce_eager, self.max_context_len_to_capture,
             self.max_seq_len_to_capture, self.max_logprobs,
             self.disable_sliding_window, self.skip_tokenizer_init,
             self.served_model_name)
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 1c13c51738ec0..cc1a792c16c25 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -248,11 +248,11 @@ def load_model(self, *, model_config: ModelConfig,
                                                "fall_back_to_pt_during_load",
                                                True)), )
             if (model_config.quantization == 'fp8'
-                    and model_config.quantization_weights_path is not None):
+                    and model_config.quantized_weights_path is not None):
                 model.load_quantized_weights(
                     safetensors_weights_iterator([
                         model_config.model +
-                        model_config.quantization_weights_path
+                        model_config.quantized_weights_path
                     ]))
             for _, module in model.named_modules():
                 quant_method = getattr(module, "quant_method", None)

From c74572258354652064fb56dfa67182205e8fec36 Mon Sep 17 00:00:00 2001
From: charlifu
Date: Wed, 19 Jun 2024 20:59:05 +0000
Subject: [PATCH 5/5] fix lint

---
 benchmarks/benchmark_throughput.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 5433e1b0ca115..eb59e38fa2c9d 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -224,10 +224,10 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         elapsed_time = run_vllm(
             requests, args.model, args.tokenizer, args.quantization,
-            args.quantized_weights_path, args.tensor_parallel_size,
-            args.seed, args.n, args.use_beam_search, args.trust_remote_code,
-            args.dtype, args.max_model_len, args.enforce_eager,
-            args.kv_cache_dtype, args.quantization_param_path, args.device,
+            args.quantized_weights_path, args.tensor_parallel_size, args.seed,
+            args.n, args.use_beam_search, args.trust_remote_code, args.dtype,
+            args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
+            args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
             args.max_num_batched_tokens, args.gpu_memory_utilization,
             args.worker_use_ray, args.download_dir)
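
Below is a minimal usage sketch of the option this series ends up adding (`quantized_weights_path` on `vllm.LLM`, exposed to the benchmark scripts as `--quantized-weights-path`). The model directory and safetensor file name are placeholders, not files shipped with these patches; per the loader change above, the value is concatenated directly onto the model path, so it is written here as a path relative to the model folder with a leading "/".

    # Sketch only: assumes a Quark- or ammo-generated fp8 safetensor file
    # (illustratively named "quantized_model.safetensors") already sits inside
    # the model folder, as described in ROCm_performance.md.
    from vllm import LLM

    llm = LLM(
        model="/models/llama-2-7b",  # placeholder model folder
        quantization="fp8",
        # The loader builds the file path as model_config.model +
        # quantized_weights_path (plain string concatenation), hence the
        # leading "/" on this relative path.
        quantized_weights_path="/quantized_model.safetensors",
    )
    outputs = llm.generate(["Hello, my name is"])
    print(outputs[0].outputs[0].text)

The benchmark scripts take the same value on the command line, e.g. `--quantization fp8 --quantized-weights-path /quantized_model.safetensors`.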