From ee7056418175f5807954b233e53e9224904bf625 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg
Date: Tue, 17 Dec 2024 21:58:03 +0000
Subject: [PATCH] Revert some unwanted changes

---
 docs/source/quantization/fp8_e4m3_kvcache.rst | 10 +++-------
 vllm/engine/arg_utils.py                      |  2 --
 vllm/envs.py                                  |  6 +++---
 3 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/docs/source/quantization/fp8_e4m3_kvcache.rst b/docs/source/quantization/fp8_e4m3_kvcache.rst
index cc52d8f40af8f..a9147f8fd8ff3 100644
--- a/docs/source/quantization/fp8_e4m3_kvcache.rst
+++ b/docs/source/quantization/fp8_e4m3_kvcache.rst
@@ -30,18 +30,14 @@ Here is an example of how to enable this feature:
 
 .. code-block:: python
 
-    # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to
-    # https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own.
+    # To calculate kv cache scales on the fly enable the calculate_kv_scales
+    # parameter
 
     from vllm import LLM, SamplingParams
     sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
     llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
               kv_cache_dtype="fp8",
-              quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
+              calculate_kv_scales=True)
     prompt = "London is the capital of"
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
     print(out)
-
-    # output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial,
-    # output w/o scaling factors: England, located in the southeastern part of the country. It is known
-
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 5bc48c6903606..bac9527f285cb 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -99,7 +99,6 @@ class EngineArgs:
     config_format: ConfigFormat = ConfigFormat.AUTO
     dtype: str = 'auto'
     kv_cache_dtype: str = 'auto'
-    quantization_param_path: Optional[str] = None
     seed: int = 0
     max_model_len: Optional[int] = None
     worker_use_ray: bool = False
@@ -969,7 +968,6 @@ def create_model_config(self) -> ModelConfig:
             tokenizer_revision=self.tokenizer_revision,
             max_model_len=self.max_model_len,
             quantization=self.quantization,
-            quantization_param_path=self.quantization_param_path,
             enforce_eager=self.enforce_eager,
             max_seq_len_to_capture=self.max_seq_len_to_capture,
             max_logprobs=self.max_logprobs,
diff --git a/vllm/envs.py b/vllm/envs.py
index 4f03a7dbd289e..19e520691e436 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -16,7 +16,7 @@
     VLLM_USE_TRITON_FLASH_ATTN: bool = True
     VLLM_USE_ROCM_SKINNY_GEMM: bool = True
     VLLM_USE_ROCM_CUSTOM_PAGED_ATTN: bool = True
-    VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT: bool = False
+    VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT: bool = True
     RANK: int = 0
     LOCAL_RANK: int = 0
     CUDA_VISIBLE_DEVICES: Optional[str] = None
@@ -537,12 +537,12 @@ def get_default_config_root():
     # Divisor for dynamic value scale factor calculation for FP8 KV Cache
     "V_SCALE_CONSTANT":
     lambda: int(os.getenv("V_SCALE_CONSTANT", "100")),
-    "VLLM_LOG_BATCHSIZE_INTERVAL":
-    lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
 
     # If set, enable multiprocessing in LLM for the V1 code path.
     "VLLM_ENABLE_V1_MULTIPROCESSING":
     lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))),
+    "VLLM_LOG_BATCHSIZE_INTERVAL":
+    lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
 }
 
 # end-env-vars-definition
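
Reviewer note: the vllm/envs.py hunks above follow the module's pattern of mapping each environment variable name to a zero-argument lambda, so values are read from the environment at access time rather than once at import. The standalone sketch below only illustrates that pattern with the getters visible in this diff; the ENV_GETTERS dict name, the lookup() helper, and the __main__ block are illustrative stand-ins, not code from the patch or from vllm/envs.py itself.

    # Minimal sketch of the lazy env-var pattern seen in the vllm/envs.py hunks.
    import os
    from typing import Any, Callable, Dict

    # Getters copied from the hunk above; the dict name is illustrative only.
    ENV_GETTERS: Dict[str, Callable[[], Any]] = {
        # Divisor for dynamic value scale factor calculation for FP8 KV Cache
        "V_SCALE_CONSTANT":
        lambda: int(os.getenv("V_SCALE_CONSTANT", "100")),

        # If set, enable multiprocessing in LLM for the V1 code path.
        "VLLM_ENABLE_V1_MULTIPROCESSING":
        lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))),

        # Interval for periodic batch-size logging; "-1" is the default.
        "VLLM_LOG_BATCHSIZE_INTERVAL":
        lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
    }

    def lookup(name: str) -> Any:
        """Resolve a variable when it is accessed, so late exports are seen."""
        return ENV_GETTERS[name]()

    if __name__ == "__main__":
        os.environ["VLLM_LOG_BATCHSIZE_INTERVAL"] = "30"
        print(lookup("VLLM_LOG_BATCHSIZE_INTERVAL"))  # 30.0
        print(lookup("V_SCALE_CONSTANT"))             # 100 (built-in default)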