Adding quantization_weights_path for fp8 weights #57

Merged · 5 commits · Jun 19, 2024
2 changes: 1 addition & 1 deletion ROCm_performance.md
@@ -37,7 +37,7 @@ For more details, please refer to Quark's documentation.

To use ammo, please follow this [instruction](https://github.com/ROCm/vllm/tree/main/examples/fp8/quantizer), and set `VLLM_FP8_USE_AMMO=1`.

Both quantizers generate a safetensors file that contains the quantized weights and the corresponding scaling factors of your model. The safetensors file should be placed under your model folder. A model can then be run with fp8 quantization using vLLM. When creating the `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantization_param_path={path of the safetensors file relative to your model folder}`.
Both quantizers generate a safetensors file that contains the quantized weights and the corresponding scaling factors of your model. The safetensors file should be placed under your model folder. A model can then be run with fp8 quantization using vLLM. When creating the `vllm.LLM` object, two additional parameters should be added: `quantization="fp8"` and `quantized_weights_path={path of the safetensors file relative to your model folder}`.

## Gemm Tuning for Fp8

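For reference, a minimal usage sketch of what the updated documentation describes, assuming a hypothetical model folder `/models/llama-70b` that contains a quantizer-generated file `llama_fp8.safetensors` (both names are illustrative). Note the leading separator on the relative path: the loader concatenates it directly onto the model path (see the `loader.py` change below).

```python
from vllm import LLM

# Paths are placeholders; the safetensors file is the one produced by Quark or
# ammo and placed inside the model folder.
llm = LLM(
    model="/models/llama-70b",
    quantization="fp8",
    quantized_weights_path="/llama_fp8.safetensors",
)
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```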
8 changes: 8 additions & 0 deletions benchmarks/benchmark_latency.py
@@ -24,6 +24,7 @@ def main(args: argparse.Namespace):
num_speculative_tokens=args.num_speculative_tokens,
tokenizer=args.tokenizer,
quantization=args.quantization,
quantized_weights_path=args.quantized_weights_path,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
@@ -175,6 +176,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
'--quantized-weights-path',
type=str,
default=None,
help='Path to the safetensor file containing the quantized weights '
'and scaling factors. This should generally be supplied when '
'quantization is FP8.')
parser.add_argument(
'--profile',
action='store_true',
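The same flag is added to `benchmark_throughput.py` below. As a quick illustration of how it surfaces in `args` (not part of the diff), here is a self-contained argparse sketch that mirrors only the added option; the path is a placeholder:

```python
import argparse

# Mimics just the option added to the benchmark scripts; the real parsers
# define many more arguments.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--quantized-weights-path',
    type=str,
    default=None,
    help='Path to the safetensors file containing the quantized weights '
    'and scaling factors. This should generally be supplied when '
    'quantization is FP8.')

# argparse converts the dashes to underscores, which is why the scripts read
# args.quantized_weights_path.
args = parser.parse_args(['--quantized-weights-path', '/llama_fp8.safetensors'])
print(args.quantized_weights_path)  # -> /llama_fp8.safetensors
```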
15 changes: 12 additions & 3 deletions benchmarks/benchmark_throughput.py
@@ -64,6 +64,7 @@ def run_vllm(
model: str,
tokenizer: str,
quantization: Optional[str],
quantized_weights_path: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
@@ -87,6 +88,7 @@
model=model,
tokenizer=tokenizer,
quantization=quantization,
quantized_weights_path=quantized_weights_path,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
@@ -222,9 +224,9 @@ def main(args: argparse.Namespace):
if args.backend == "vllm":
elapsed_time = run_vllm(
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantized_weights_path, args.tensor_parallel_size, args.seed,
args.n, args.use_beam_search, args.trust_remote_code, args.dtype,
args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.gpu_memory_utilization,
@@ -342,6 +344,13 @@ def main(args: argparse.Namespace):
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
'--quantized-weights-path',
type=str,
default=None,
help='Path to the safetensor file containing the quantized weights '
'and scaling factors. This should generally be supplied when '
'quantization is FP8.')
parser.add_argument(
"--device",
type=str,
2 changes: 2 additions & 0 deletions vllm/config.py
@@ -97,6 +97,7 @@ def __init__(
max_model_len: Optional[int] = None,
quantization: Optional[str] = None,
quantization_param_path: Optional[str] = None,
quantized_weights_path: Optional[str] = None,
enforce_eager: bool = False,
max_context_len_to_capture: Optional[int] = None,
max_seq_len_to_capture: Optional[int] = None,
@@ -116,6 +117,7 @@
self.tokenizer_revision = tokenizer_revision
self.quantization = quantization
self.quantization_param_path = quantization_param_path
self.quantized_weights_path = quantized_weights_path
self.enforce_eager = enforce_eager
self.max_context_len_to_capture = max_context_len_to_capture
if self.max_context_len_to_capture is not None:
17 changes: 13 additions & 4 deletions vllm/engine/arg_utils.py
@@ -54,6 +54,7 @@ class EngineArgs:
rope_scaling: Optional[dict] = None
tokenizer_revision: Optional[str] = None
quantization: Optional[str] = None
quantized_weights_path: Optional[str] = None
enforce_eager: bool = False
max_context_len_to_capture: Optional[int] = None
max_seq_len_to_capture: int = 32768
@@ -337,6 +338,13 @@ def add_cli_args(
'None, we assume the model weights are not '
'quantized and use `dtype` to determine the data '
'type of the weights.')
parser.add_argument(
'--quantized-weights-path',
type=nullable_str,
default=None,
help='Path to the safetensor file containing the quantized weights '
'and scaling factors. This should generally be supplied when '
'quantization is FP8.')
parser.add_argument('--rope-scaling',
default=None,
type=json.loads,
@@ -562,10 +570,11 @@ def create_engine_config(self, ) -> EngineConfig:
self.trust_remote_code, self.dtype, self.seed, self.revision,
self.code_revision, self.rope_scaling, self.tokenizer_revision,
self.max_model_len, self.quantization,
self.quantization_param_path, self.enforce_eager,
self.max_context_len_to_capture, self.max_seq_len_to_capture,
self.max_logprobs, self.disable_sliding_window,
self.skip_tokenizer_init, self.served_model_name)
self.quantization_param_path, self.quantized_weights_path,
self.enforce_eager, self.max_context_len_to_capture,
self.max_seq_len_to_capture, self.max_logprobs,
self.disable_sliding_window, self.skip_tokenizer_init,
self.served_model_name)
cache_config = CacheConfig(self.block_size,
self.gpu_memory_utilization,
self.swap_space, self.kv_cache_dtype,
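A sketch (not part of the diff) of how the new field travels from `EngineArgs` through `create_engine_config` into `ModelConfig`; paths are placeholders, and building the config assumes the model files are actually present:

```python
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="/models/llama-70b",                        # placeholder
    quantization="fp8",
    quantized_weights_path="/llama_fp8.safetensors",  # placeholder
)

# create_engine_config passes quantized_weights_path positionally into
# ModelConfig (see the reordered argument list above), so the value ends up
# on the model config that the loader later reads.
engine_config = engine_args.create_engine_config()
print(engine_config.model_config.quantized_weights_path)
```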
4 changes: 2 additions & 2 deletions vllm/model_executor/model_loader/loader.py
@@ -248,11 +248,11 @@ def load_model(self, *, model_config: ModelConfig,
"fall_back_to_pt_during_load",
True)), )
if (model_config.quantization == 'fp8'
and model_config.quantization_param_path is not None):
and model_config.quantized_weights_path is not None):
model.load_quantized_weights(
safetensors_weights_iterator([
model_config.model +
model_config.quantization_param_path
model_config.quantized_weights_path
]))
for _, module in model.named_modules():
quant_method = getattr(module, "quant_method", None)
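Not part of the diff: a small sketch of how the changed loader code assembles the safetensors path. It is plain string concatenation rather than `os.path.join`, so the configured relative path is assumed to carry its own leading separator, as in the examples above:

```python
# Mirrors `model_config.model + model_config.quantized_weights_path` from
# load_model(); values are illustrative.
model = "/models/llama-70b"
quantized_weights_path = "/llama_fp8.safetensors"

full_path = model + quantized_weights_path
print(full_path)  # -> /models/llama-70b/llama_fp8.safetensors

# safetensors_weights_iterator() is then given [full_path] and the resulting
# (name, tensor) pairs are fed to model.load_quantized_weights().
```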