From ee7056418175f5807954b233e53e9224904bf625 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg
Date: Tue, 17 Dec 2024 21:58:03 +0000
Subject: [PATCH] Revert some unwanted changes

---
 docs/source/quantization/fp8_e4m3_kvcache.rst | 10 +++-------
 vllm/engine/arg_utils.py                      |  2 --
 vllm/envs.py                                  |  6 +++---
 3 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/docs/source/quantization/fp8_e4m3_kvcache.rst b/docs/source/quantization/fp8_e4m3_kvcache.rst
index cc52d8f40af8f..a9147f8fd8ff3 100644
--- a/docs/source/quantization/fp8_e4m3_kvcache.rst
+++ b/docs/source/quantization/fp8_e4m3_kvcache.rst
@@ -30,18 +30,14 @@ Here is an example of how to enable this feature:
 
 .. code-block:: python
 
-    # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to
-    # https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own.
+    # To calculate kv cache scales on the fly enable the calculate_kv_scales
+    # parameter
 
     from vllm import LLM, SamplingParams
     sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
     llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
               kv_cache_dtype="fp8",
-              quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
+              calculate_kv_scales=True)
     prompt = "London is the capital of"
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
     print(out)
-
-    # output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial,
-    # output w/o scaling factors: England, located in the southeastern part of the country. It is known
-
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 5bc48c6903606..bac9527f285cb 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -99,7 +99,6 @@ class EngineArgs:
     config_format: ConfigFormat = ConfigFormat.AUTO
     dtype: str = 'auto'
     kv_cache_dtype: str = 'auto'
-    quantization_param_path: Optional[str] = None
     seed: int = 0
     max_model_len: Optional[int] = None
     worker_use_ray: bool = False
@@ -969,7 +968,6 @@ def create_model_config(self) -> ModelConfig:
             tokenizer_revision=self.tokenizer_revision,
             max_model_len=self.max_model_len,
             quantization=self.quantization,
-            quantization_param_path=self.quantization_param_path,
             enforce_eager=self.enforce_eager,
             max_seq_len_to_capture=self.max_seq_len_to_capture,
             max_logprobs=self.max_logprobs,
diff --git a/vllm/envs.py b/vllm/envs.py
index 4f03a7dbd289e..19e520691e436 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -16,7 +16,7 @@
     VLLM_USE_TRITON_FLASH_ATTN: bool = True
     VLLM_USE_ROCM_SKINNY_GEMM: bool = True
     VLLM_USE_ROCM_CUSTOM_PAGED_ATTN: bool = True
-    VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT: bool = False
+    VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT: bool = True
     RANK: int = 0
     LOCAL_RANK: int = 0
     CUDA_VISIBLE_DEVICES: Optional[str] = None
@@ -537,12 +537,12 @@ def get_default_config_root():
     # Divisor for dynamic value scale factor calculation for FP8 KV Cache
     "V_SCALE_CONSTANT":
     lambda: int(os.getenv("V_SCALE_CONSTANT", "100")),
-    "VLLM_LOG_BATCHSIZE_INTERVAL":
-    lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
 
     # If set, enable multiprocessing in LLM for the V1 code path.
     "VLLM_ENABLE_V1_MULTIPROCESSING":
     lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))),
+    "VLLM_LOG_BATCHSIZE_INTERVAL":
+    lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
 }
 
 # end-env-vars-definition
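
Reviewer note: the vllm/envs.py hunks above follow the module's pattern of mapping each environment variable name to a zero-argument lambda, so values are read from the environment at access time rather than once at import. The standalone sketch below only illustrates that pattern with the getters visible in this diff; the ENV_GETTERS dict name, the lookup() helper, and the __main__ block are illustrative stand-ins, not code from the patch or from vllm/envs.py itself.

    # Minimal sketch of the lazy env-var pattern seen in the vllm/envs.py hunks.
    import os
    from typing import Any, Callable, Dict

    # Getters copied from the hunk above; the dict name is illustrative only.
    ENV_GETTERS: Dict[str, Callable[[], Any]] = {
        # Divisor for dynamic value scale factor calculation for FP8 KV Cache
        "V_SCALE_CONSTANT":
        lambda: int(os.getenv("V_SCALE_CONSTANT", "100")),

        # If set, enable multiprocessing in LLM for the V1 code path.
        "VLLM_ENABLE_V1_MULTIPROCESSING":
        lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))),

        # Interval for periodic batch-size logging; "-1" is the default.
        "VLLM_LOG_BATCHSIZE_INTERVAL":
        lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
    }

    def lookup(name: str) -> Any:
        """Resolve a variable when it is accessed, so late exports are seen."""
        return ENV_GETTERS[name]()

    if __name__ == "__main__":
        os.environ["VLLM_LOG_BATCHSIZE_INTERVAL"] = "30"
        print(lookup("VLLM_LOG_BATCHSIZE_INTERVAL"))  # 30.0
        print(lookup("V_SCALE_CONSTANT"))             # 100 (built-in default)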