From a9be7c9a27d52da945ee677899c0d88143f374f0 Mon Sep 17 00:00:00 2001
From: charlifu
Date: Wed, 19 Jun 2024 20:22:56 +0000
Subject: [PATCH 1/5] add quantization_weights_path for fp8 weights

---
 ROCm_performance.md                        |  2 +-
 benchmarks/benchmark_latency.py            |  8 ++++++++
 benchmarks/benchmark_throughput.py         | 10 ++++++++++
 vllm/config.py                             |  2 ++
 vllm/engine/arg_utils.py                   | 11 ++++++++++-
 vllm/model_executor/model_loader/loader.py |  4 ++--
 6 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/ROCm_performance.md b/ROCm_performance.md
index bae57ea62d47c..e0ff0e9060ef4 100644
--- a/ROCm_performance.md
+++ b/ROCm_performance.md
@@ -37,7 +37,7 @@ For more details, please refer to Quark's documentation.

 To use ammo, please follow this [instruction](https://github.com/ROCm/vllm/tree/main/examples/fp8/quantizer), and set `VLLM_FP8_USE_AMMO=1`.

-Both quantizers generate a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder. Then we can run a model with fp8 quantization using vllm. When creating `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantization_param_path={relative path of the safetensors with your model path}`.
+Both quantizers generate a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder. Then we can run a model with fp8 quantization using vllm. When creating `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantization_weights_path={relative path of the safetensors with your model path}`.

 ## Gemm Tuning for Fp8

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 2aca1b23f9b6f..1694288350275 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -24,6 +24,7 @@ def main(args: argparse.Namespace):
         num_speculative_tokens=args.num_speculative_tokens,
         tokenizer=args.tokenizer,
         quantization=args.quantization,
+        quantization_weights_path=args.quantization_weights_path,
         tensor_parallel_size=args.tensor_parallel_size,
         trust_remote_code=args.trust_remote_code,
         dtype=args.dtype,
@@ -175,6 +176,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
         'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
         'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
         'instead supported for common inference criteria.')
+    parser.add_argument(
+        '--quantization-weights-path',
+        type=str,
+        default=None,
+        help='Path to the safetensor file containing the quantized weights '
+        'and scaling factors. This should generally be supplied, when '
+        'quantization is FP8.')
     parser.add_argument(
         '--profile',
         action='store_true',
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index fdfbf23c721b3..25a64dd3feb20 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -64,6 +64,7 @@ def run_vllm(
     model: str,
     tokenizer: str,
     quantization: Optional[str],
+    quantization_weights_path: Optional[str],
     tensor_parallel_size: int,
     seed: int,
     n: int,
@@ -87,6 +88,7 @@
         model=model,
         tokenizer=tokenizer,
         quantization=quantization,
+        quantization_weights_path=quantization_weights_path,
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
         trust_remote_code=trust_remote_code,
@@ -222,6 +224,7 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         elapsed_time = run_vllm(
             requests, args.model, args.tokenizer, args.quantization,
+            args.quantization_weights_path,
             args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
             args.trust_remote_code, args.dtype, args.max_model_len,
             args.enforce_eager, args.kv_cache_dtype,
@@ -342,6 +345,13 @@ def main(args: argparse.Namespace):
         'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
         'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
         'instead supported for common inference criteria.')
+    parser.add_argument(
+        '--quantization-weights-path',
+        type=str,
+        default=None,
+        help='Path to the safetensor file containing the quantized weights '
+        'and scaling factors. This should generally be supplied, when '
+        'quantization is FP8.')
     parser.add_argument(
         "--device",
         type=str,
diff --git a/vllm/config.py b/vllm/config.py
index 63471aa5301b1..9f6ac0298d2df 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -97,6 +97,7 @@ def __init__(
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
         quantization_param_path: Optional[str] = None,
+        quantization_weights_path: Optional[str] = None,
         enforce_eager: bool = False,
         max_context_len_to_capture: Optional[int] = None,
         max_seq_len_to_capture: Optional[int] = None,
@@ -116,6 +117,7 @@
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
         self.quantization_param_path = quantization_param_path
+        self.quantization_weights_path = quantization_weights_path
         self.enforce_eager = enforce_eager
         self.max_context_len_to_capture = max_context_len_to_capture
         if self.max_context_len_to_capture is not None:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 04b9e8032c0cf..b0383c95e5cb7 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -54,6 +54,7 @@ class EngineArgs:
     rope_scaling: Optional[dict] = None
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
+    quantization_weights_path: Optional[str] = None
     enforce_eager: bool = False
     max_context_len_to_capture: Optional[int] = None
     max_seq_len_to_capture: int = 32768
@@ -337,6 +338,13 @@ def add_cli_args(
                             'None, we assume the model weights are not '
                             'quantized and use `dtype` to determine the data '
                             'type of the weights.')
+        parser.add_argument(
+            '--quantization-weights-path',
+            type=nullable_str,
+            default=None,
+            help='Path to the safetensor file containing the quantized weights '
+            'and scaling factors. This should generally be supplied, when '
+            'quantization is FP8.')
         parser.add_argument('--rope-scaling',
                             default=None,
                             type=json.loads,
@@ -562,7 +570,8 @@ def create_engine_config(self, ) -> EngineConfig:
             self.trust_remote_code, self.dtype, self.seed, self.revision,
             self.code_revision, self.rope_scaling, self.tokenizer_revision,
             self.max_model_len, self.quantization,
-            self.quantization_param_path, self.enforce_eager,
+            self.quantization_param_path, self.quantization_weights_path,
+            self.enforce_eager,
             self.max_context_len_to_capture, self.max_seq_len_to_capture,
             self.max_logprobs, self.disable_sliding_window,
             self.skip_tokenizer_init, self.served_model_name)
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index a7b8d1ad35620..1c13c51738ec0 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -248,11 +248,11 @@ def load_model(self, *, model_config: ModelConfig,
                                                "fall_back_to_pt_during_load",
                                                True)), )
             if (model_config.quantization == 'fp8'
-                    and model_config.quantization_param_path is not None):
+                    and model_config.quantization_weights_path is not None):
                 model.load_quantized_weights(
                     safetensors_weights_iterator([
                         model_config.model +
-                        model_config.quantization_param_path
+                        model_config.quantization_weights_path
                     ]))
             for _, module in model.named_modules():
                 quant_method = getattr(module, "quant_method", None)

From a7971a4a1341a0073420276a77ec1eed4665b060 Mon Sep 17 00:00:00 2001
From: charlifu
Date: Wed, 19 Jun 2024 20:45:11 +0000
Subject: [PATCH 2/5] fix lint

---
 vllm/engine/arg_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index b0383c95e5cb7..03c73686b8ee1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -571,10 +571,10 @@ def create_engine_config(self, ) -> EngineConfig:
             self.code_revision, self.rope_scaling, self.tokenizer_revision,
             self.max_model_len, self.quantization,
             self.quantization_param_path, self.quantization_weights_path,
-            self.enforce_eager,
-            self.max_context_len_to_capture, self.max_seq_len_to_capture,
-            self.max_logprobs, self.disable_sliding_window,
-            self.skip_tokenizer_init, self.served_model_name)
+            self.enforce_eager, self.max_context_len_to_capture,
+            self.max_seq_len_to_capture, self.max_logprobs,
+            self.disable_sliding_window, self.skip_tokenizer_init,
+            self.served_model_name)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,

From f088187bd09d0d2b5419c206d2ee864f5ab65d6d Mon Sep 17 00:00:00 2001
From: charlifu
Date: Wed, 19 Jun 2024 20:46:54 +0000
Subject: [PATCH 3/5] fix lint

---
 benchmarks/benchmark_throughput.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 25a64dd3feb20..37e1e2e8431ec 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -224,11 +224,10 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         elapsed_time = run_vllm(
             requests, args.model, args.tokenizer, args.quantization,
-            args.quantization_weights_path,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-            args.trust_remote_code, args.dtype, args.max_model_len,
-            args.enforce_eager, args.kv_cache_dtype,
-            args.quantization_param_path, args.device,
+            args.quantization_weights_path, args.tensor_parallel_size,
+            args.seed, args.n, args.use_beam_search, args.trust_remote_code,
+            args.dtype, args.max_model_len, args.enforce_eager,
+            args.kv_cache_dtype, args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
             args.max_num_batched_tokens, args.gpu_memory_utilization,
             args.worker_use_ray, args.download_dir)

From 3857ba2a579078a0a896c4778c867f8373b68e4c Mon Sep 17 00:00:00 2001
From: charlifu
Date: Wed, 19 Jun 2024 20:55:54 +0000
Subject: [PATCH 4/5] change to quantized_weights_path

---
 ROCm_performance.md                        | 2 +-
 benchmarks/benchmark_latency.py            | 4 ++--
 benchmarks/benchmark_throughput.py         | 8 ++++----
 vllm/config.py                             | 4 ++--
 vllm/engine/arg_utils.py                   | 6 +++---
 vllm/model_executor/model_loader/loader.py | 4 ++--
 6 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/ROCm_performance.md b/ROCm_performance.md
index e0ff0e9060ef4..c6b7a0ec77a03 100644
--- a/ROCm_performance.md
+++ b/ROCm_performance.md
@@ -37,7 +37,7 @@ For more details, please refer to Quark's documentation.

 To use ammo, please follow this [instruction](https://github.com/ROCm/vllm/tree/main/examples/fp8/quantizer), and set `VLLM_FP8_USE_AMMO=1`.

-Both quantizers generate a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder. Then we can run a model with fp8 quantization using vllm. When creating `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantization_weights_path={relative path of the safetensors with your model path}`.
+Both quantizers generate a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder. Then we can run a model with fp8 quantization using vllm. When creating `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantized_weights_path={relative path of the safetensors with your model path}`.

 ## Gemm Tuning for Fp8

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 1694288350275..7eeb90516bdfa 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -24,7 +24,7 @@ def main(args: argparse.Namespace):
         num_speculative_tokens=args.num_speculative_tokens,
         tokenizer=args.tokenizer,
         quantization=args.quantization,
-        quantization_weights_path=args.quantization_weights_path,
+        quantized_weights_path=args.quantized_weights_path,
         tensor_parallel_size=args.tensor_parallel_size,
         trust_remote_code=args.trust_remote_code,
         dtype=args.dtype,
@@ -177,7 +177,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
         'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
         'instead supported for common inference criteria.')
     parser.add_argument(
-        '--quantization-weights-path',
+        '--quantized-weights-path',
         type=str,
         default=None,
         help='Path to the safetensor file containing the quantized weights '
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 37e1e2e8431ec..5433e1b0ca115 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -64,7 +64,7 @@ def run_vllm(
     model: str,
     tokenizer: str,
     quantization: Optional[str],
-    quantization_weights_path: Optional[str],
+    quantized_weights_path: Optional[str],
     tensor_parallel_size: int,
     seed: int,
     n: int,
@@ -88,7 +88,7 @@
         model=model,
         tokenizer=tokenizer,
         quantization=quantization,
-        quantization_weights_path=quantization_weights_path,
+        quantized_weights_path=quantized_weights_path,
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
         trust_remote_code=trust_remote_code,
@@ -224,7 +224,7 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         elapsed_time = run_vllm(
             requests, args.model, args.tokenizer, args.quantization,
-            args.quantization_weights_path, args.tensor_parallel_size,
+            args.quantized_weights_path, args.tensor_parallel_size,
             args.seed, args.n, args.use_beam_search, args.trust_remote_code,
             args.dtype, args.max_model_len, args.enforce_eager,
             args.kv_cache_dtype, args.quantization_param_path, args.device,
@@ -345,7 +345,7 @@ def main(args: argparse.Namespace):
         'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
         'instead supported for common inference criteria.')
     parser.add_argument(
-        '--quantization-weights-path',
+        '--quantized-weights-path',
         type=str,
         default=None,
         help='Path to the safetensor file containing the quantized weights '
diff --git a/vllm/config.py b/vllm/config.py
index 9f6ac0298d2df..a810b67777224 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -97,7 +97,7 @@ def __init__(
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
         quantization_param_path: Optional[str] = None,
-        quantization_weights_path: Optional[str] = None,
+        quantized_weights_path: Optional[str] = None,
         enforce_eager: bool = False,
         max_context_len_to_capture: Optional[int] = None,
         max_seq_len_to_capture: Optional[int] = None,
@@ -117,7 +117,7 @@
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
         self.quantization_param_path = quantization_param_path
-        self.quantization_weights_path = quantization_weights_path
+        self.quantized_weights_path = quantized_weights_path
         self.enforce_eager = enforce_eager
         self.max_context_len_to_capture = max_context_len_to_capture
         if self.max_context_len_to_capture is not None:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 03c73686b8ee1..e49e8b47b50dd 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -54,7 +54,7 @@ class EngineArgs:
     rope_scaling: Optional[dict] = None
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
-    quantization_weights_path: Optional[str] = None
+    quantized_weights_path: Optional[str] = None
     enforce_eager: bool = False
     max_context_len_to_capture: Optional[int] = None
     max_seq_len_to_capture: int = 32768
@@ -339,7 +339,7 @@ def add_cli_args(
                             'quantized and use `dtype` to determine the data '
                             'type of the weights.')
         parser.add_argument(
-            '--quantization-weights-path',
+            '--quantized-weights-path',
             type=nullable_str,
             default=None,
             help='Path to the safetensor file containing the quantized weights '
@@ -570,7 +570,7 @@ def create_engine_config(self, ) -> EngineConfig:
             self.trust_remote_code, self.dtype, self.seed, self.revision,
             self.code_revision, self.rope_scaling, self.tokenizer_revision,
             self.max_model_len, self.quantization,
-            self.quantization_param_path, self.quantization_weights_path,
+            self.quantization_param_path, self.quantized_weights_path,
             self.enforce_eager, self.max_context_len_to_capture,
             self.max_seq_len_to_capture, self.max_logprobs,
             self.disable_sliding_window, self.skip_tokenizer_init,
             self.served_model_name)
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 1c13c51738ec0..cc1a792c16c25 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -248,11 +248,11 @@ def load_model(self, *, model_config: ModelConfig,
                                                "fall_back_to_pt_during_load",
                                                True)), )
             if (model_config.quantization == 'fp8'
-                    and model_config.quantization_weights_path is not None):
+                    and model_config.quantized_weights_path is not None):
                 model.load_quantized_weights(
                     safetensors_weights_iterator([
                         model_config.model +
-                        model_config.quantization_weights_path
+                        model_config.quantized_weights_path
                     ]))
             for _, module in model.named_modules():
                 quant_method = getattr(module, "quant_method", None)

From c74572258354652064fb56dfa67182205e8fec36 Mon Sep 17 00:00:00 2001
From: charlifu
Date: Wed, 19 Jun 2024 20:59:05 +0000
Subject: [PATCH 5/5] fix lint

---
 benchmarks/benchmark_throughput.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 5433e1b0ca115..eb59e38fa2c9d 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -224,10 +224,10 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         elapsed_time = run_vllm(
             requests, args.model, args.tokenizer, args.quantization,
-            args.quantized_weights_path, args.tensor_parallel_size,
-            args.seed, args.n, args.use_beam_search, args.trust_remote_code,
-            args.dtype, args.max_model_len, args.enforce_eager,
-            args.kv_cache_dtype, args.quantization_param_path, args.device,
+            args.quantized_weights_path, args.tensor_parallel_size, args.seed,
+            args.n, args.use_beam_search, args.trust_remote_code, args.dtype,
+            args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
+            args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
             args.max_num_batched_tokens, args.gpu_memory_utilization,
             args.worker_use_ray, args.download_dir)
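
Below is a minimal usage sketch of the option this series ends up adding (`quantized_weights_path` on `vllm.LLM`, exposed to the benchmark scripts as `--quantized-weights-path`). The model directory and safetensor file name are placeholders, not files shipped with these patches; per the loader change above, the value is concatenated directly onto the model path, so it is written here as a path relative to the model folder with a leading "/".

    # Sketch only: assumes a Quark- or ammo-generated fp8 safetensor file
    # (illustratively named "quantized_model.safetensors") already sits inside
    # the model folder, as described in ROCm_performance.md.
    from vllm import LLM

    llm = LLM(
        model="/models/llama-2-7b",  # placeholder model folder
        quantization="fp8",
        # The loader builds the file path as model_config.model +
        # quantized_weights_path (plain string concatenation), hence the
        # leading "/" on this relative path.
        quantized_weights_path="/quantized_model.safetensors",
    )
    outputs = llm.generate(["Hello, my name is"])
    print(outputs[0].outputs[0].text)

The benchmark scripts take the same value on the command line, e.g. `--quantization fp8 --quantized-weights-path /quantized_model.safetensors`.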