From 93aab3c29490071a0d9a1651bb4b06cc86ca73f1 Mon Sep 17 00:00:00 2001
From: Charlie Fu
Date: Wed, 19 Jun 2024 16:02:34 -0500
Subject: [PATCH] Adding quantized_weights_path arg for fp8 weights (#57)

* add quantization_weights_path for fp8 weights

* fix lint

* fix lint

* change to quantized_weights_path

* fix lint
---
 ROCm_performance.md                        |  2 +-
 benchmarks/benchmark_latency.py            |  8 ++++++++
 benchmarks/benchmark_throughput.py         | 15 ++++++++++++---
 vllm/config.py                             |  2 ++
 vllm/engine/arg_utils.py                   | 17 +++++++++++++----
 vllm/model_executor/model_loader/loader.py |  4 ++--
 6 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/ROCm_performance.md b/ROCm_performance.md
index bae57ea62d47c..c6b7a0ec77a03 100644
--- a/ROCm_performance.md
+++ b/ROCm_performance.md
@@ -37,7 +37,7 @@ For more details, please refer to Quark's documentation.
 
 To use ammo, please follow this [instruction](https://github.com/ROCm/vllm/tree/main/examples/fp8/quantizer), and set `VLLM_FP8_USE_AMMO=1`.
 
-Both quantizers generate a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder. Then we can run a model with fp8 quantization using vllm. When creating `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantization_param_path={relative path of the safetensors with your model path}`.
+Both quantizers generate a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder. Then we can run a model with fp8 quantization using vllm. When creating `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantized_weights_path={relative path of the safetensors with your model path}`.
 
 ## Gemm Tuning for Fp8
 
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 2aca1b23f9b6f..7eeb90516bdfa 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -24,6 +24,7 @@ def main(args: argparse.Namespace):
         num_speculative_tokens=args.num_speculative_tokens,
         tokenizer=args.tokenizer,
         quantization=args.quantization,
+        quantized_weights_path=args.quantized_weights_path,
         tensor_parallel_size=args.tensor_parallel_size,
         trust_remote_code=args.trust_remote_code,
         dtype=args.dtype,
@@ -175,6 +176,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
         'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
         'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
         'instead supported for common inference criteria.')
+    parser.add_argument(
+        '--quantized-weights-path',
+        type=str,
+        default=None,
+        help='Path to the safetensor file containing the quantized weights '
+        'and scaling factors. This should generally be supplied when '
+        'quantization is FP8.')
     parser.add_argument(
         '--profile',
         action='store_true',
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index fdfbf23c721b3..eb59e38fa2c9d 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -64,6 +64,7 @@ def run_vllm(
     model: str,
     tokenizer: str,
     quantization: Optional[str],
+    quantized_weights_path: Optional[str],
     tensor_parallel_size: int,
     seed: int,
     n: int,
@@ -87,6 +88,7 @@ def run_vllm(
         model=model,
         tokenizer=tokenizer,
         quantization=quantization,
+        quantized_weights_path=quantized_weights_path,
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
         trust_remote_code=trust_remote_code,
@@ -222,9 +224,9 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         elapsed_time = run_vllm(
             requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-            args.trust_remote_code, args.dtype, args.max_model_len,
-            args.enforce_eager, args.kv_cache_dtype,
+            args.quantized_weights_path, args.tensor_parallel_size, args.seed,
+            args.n, args.use_beam_search, args.trust_remote_code, args.dtype,
+            args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
             args.max_num_batched_tokens, args.gpu_memory_utilization,
@@ -342,6 +344,13 @@ def main(args: argparse.Namespace):
         'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
         'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
         'instead supported for common inference criteria.')
+    parser.add_argument(
+        '--quantized-weights-path',
+        type=str,
+        default=None,
+        help='Path to the safetensor file containing the quantized weights '
+        'and scaling factors. This should generally be supplied when '
+        'quantization is FP8.')
     parser.add_argument(
         "--device",
         type=str,
diff --git a/vllm/config.py b/vllm/config.py
index 63471aa5301b1..a810b67777224 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -97,6 +97,7 @@ def __init__(
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
         quantization_param_path: Optional[str] = None,
+        quantized_weights_path: Optional[str] = None,
         enforce_eager: bool = False,
         max_context_len_to_capture: Optional[int] = None,
         max_seq_len_to_capture: Optional[int] = None,
@@ -116,6 +117,7 @@ def __init__(
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
         self.quantization_param_path = quantization_param_path
+        self.quantized_weights_path = quantized_weights_path
         self.enforce_eager = enforce_eager
         self.max_context_len_to_capture = max_context_len_to_capture
         if self.max_context_len_to_capture is not None:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 04b9e8032c0cf..e49e8b47b50dd 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -54,6 +54,7 @@ class EngineArgs:
     rope_scaling: Optional[dict] = None
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
+    quantized_weights_path: Optional[str] = None
     enforce_eager: bool = False
     max_context_len_to_capture: Optional[int] = None
     max_seq_len_to_capture: int = 32768
@@ -337,6 +338,13 @@ def add_cli_args(
                            'None, we assume the model weights are not '
                            'quantized and use `dtype` to determine the data '
                            'type of the weights.')
+        parser.add_argument(
+            '--quantized-weights-path',
+            type=nullable_str,
+            default=None,
+            help='Path to the safetensor file containing the quantized weights '
+            'and scaling factors. This should generally be supplied when '
+            'quantization is FP8.')
         parser.add_argument('--rope-scaling',
                             default=None,
                             type=json.loads,
@@ -562,10 +570,11 @@ def create_engine_config(self, ) -> EngineConfig:
             self.trust_remote_code, self.dtype, self.seed, self.revision,
             self.code_revision, self.rope_scaling, self.tokenizer_revision,
             self.max_model_len, self.quantization,
-            self.quantization_param_path, self.enforce_eager,
-            self.max_context_len_to_capture, self.max_seq_len_to_capture,
-            self.max_logprobs, self.disable_sliding_window,
-            self.skip_tokenizer_init, self.served_model_name)
+            self.quantization_param_path, self.quantized_weights_path,
+            self.enforce_eager, self.max_context_len_to_capture,
+            self.max_seq_len_to_capture, self.max_logprobs,
+            self.disable_sliding_window, self.skip_tokenizer_init,
+            self.served_model_name)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index a7b8d1ad35620..cc1a792c16c25 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -248,11 +248,11 @@ def load_model(self, *, model_config: ModelConfig,
                                               "fall_back_to_pt_during_load",
                                               True)), )
             if (model_config.quantization == 'fp8'
-                    and model_config.quantization_param_path is not None):
+                    and model_config.quantized_weights_path is not None):
                 model.load_quantized_weights(
                     safetensors_weights_iterator([
                         model_config.model +
-                        model_config.quantization_param_path
+                        model_config.quantized_weights_path
                     ]))
             for _, module in model.named_modules():
                 quant_method = getattr(module, "quant_method", None)
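
The ROCm_performance.md hunk above describes the intended usage but does not show it end to end, so here is a minimal, illustrative sketch with this patch applied. The model folder and safetensors file name are hypothetical placeholders; because the loader concatenates `model_config.model` with `quantized_weights_path`, the value is given relative to the model folder.

```python
# Sketch only: assumes this patch is applied and that an FP8-quantized
# safetensors file (here the hypothetical name "/quantized_model.safetensors"),
# produced by Quark or AMMO, sits under the model folder.
from vllm import LLM, SamplingParams

llm = LLM(
    model="/models/llama-2-7b-chat",  # hypothetical model folder
    quantization="fp8",
    # Relative to the model folder; the loader joins it with `model` by
    # plain string concatenation, hence the leading slash.
    quantized_weights_path="/quantized_model.safetensors",
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)
```

The same setting is exposed on the command line through the `--quantized-weights-path` flag that this patch adds to the benchmark scripts and the engine arguments.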