Commit ee70564
Revert some unwanted changes
gshtras committed Dec 17, 2024
1 parent 7524323 commit ee70564
Showing 3 changed files with 6 additions and 12 deletions.
10 changes: 3 additions & 7 deletions docs/source/quantization/fp8_e4m3_kvcache.rst
@@ -30,18 +30,14 @@ Here is an example of how to enable this feature:

.. code-block:: python
-    # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to
-    # https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own.
+    # To calculate kv cache scales on the fly enable the calculate_kv_scales
+    # parameter
     from vllm import LLM, SamplingParams
     sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
     llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
               kv_cache_dtype="fp8",
-              quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
+              calculate_kv_scales=True)
     prompt = "London is the capital of"
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
     print(out)
-    # output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial,
-    # output w/o scaling factors: England, located in the southeastern part of the country. It is known
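Taken together, the updated doc example runs as below; this is a minimal runnable sketch assuming a build of this fork that exposes the calculate_kv_scales parameter on LLM, as the new version of the doc describes.

# Minimal sketch of the updated example: FP8 KV cache with scales computed on
# the fly instead of being loaded from a kv_cache_scales.json file.
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
          kv_cache_dtype="fp8",        # store the KV cache in FP8 (e4m3)
          calculate_kv_scales=True)    # compute KV-cache scales at runtime
out = llm.generate("London is the capital of", sampling_params)[0].outputs[0].text
print(out)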
2 changes: 0 additions & 2 deletions vllm/engine/arg_utils.py
@@ -99,7 +99,6 @@ class EngineArgs:
    config_format: ConfigFormat = ConfigFormat.AUTO
    dtype: str = 'auto'
    kv_cache_dtype: str = 'auto'
-   quantization_param_path: Optional[str] = None
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
@@ -969,7 +968,6 @@ def create_model_config(self) -> ModelConfig:
            tokenizer_revision=self.tokenizer_revision,
            max_model_len=self.max_model_len,
            quantization=self.quantization,
-           quantization_param_path=self.quantization_param_path,
            enforce_eager=self.enforce_eager,
            max_seq_len_to_capture=self.max_seq_len_to_capture,
            max_logprobs=self.max_logprobs,
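For context, here is a hedged sketch of how engine configuration is built once quantization_param_path is gone; calculate_kv_scales is an assumed EngineArgs field mirroring the LLM(...) parameter in the doc example above and is not shown in this diff.

from vllm.engine.arg_utils import EngineArgs

# Sketch only: quantization_param_path is no longer an EngineArgs field, so
# FP8 KV-cache scales are no longer loaded from kv_cache_scales.json here.
engine_args = EngineArgs(
    model="meta-llama/Llama-2-7b-chat-hf",
    kv_cache_dtype="fp8",
    calculate_kv_scales=True,  # assumed field; mirrors the doc example above
)
# create_model_config() (the method shown in the hunk above) now builds a
# ModelConfig without any quantization_param_path argument.
model_config = engine_args.create_model_config()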
6 changes: 3 additions & 3 deletions vllm/envs.py
@@ -16,7 +16,7 @@
    VLLM_USE_TRITON_FLASH_ATTN: bool = True
    VLLM_USE_ROCM_SKINNY_GEMM: bool = True
    VLLM_USE_ROCM_CUSTOM_PAGED_ATTN: bool = True
-   VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT: bool = False
+   VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT: bool = True
    RANK: int = 0
    LOCAL_RANK: int = 0
    CUDA_VISIBLE_DEVICES: Optional[str] = None
@@ -537,12 +537,12 @@ def get_default_config_root():
    # Divisor for dynamic value scale factor calculation for FP8 KV Cache
    "V_SCALE_CONSTANT":
    lambda: int(os.getenv("V_SCALE_CONSTANT", "100")),
-   "VLLM_LOG_BATCHSIZE_INTERVAL":
-   lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),

    # If set, enable multiprocessing in LLM for the V1 code path.
    "VLLM_ENABLE_V1_MULTIPROCESSING":
    lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))),
+   "VLLM_LOG_BATCHSIZE_INTERVAL":
+   lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
}

# end-env-vars-definition
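The envs.py entries above follow a lazy-evaluation pattern: each variable name maps to a zero-argument lambda that reads and parses the environment only when the value is requested. The following self-contained sketch illustrates that pattern; the names mirror the entries in the diff, but the lookup helper and the getenv default for the FP8_OUT flag are assumptions, not the fork's actual code.

import os
from typing import Any, Callable, Dict

# Each entry parses its environment variable on demand, so changes to the
# environment made before first access are picked up.
environment_variables: Dict[str, Callable[[], Any]] = {
    # Logging interval for batch-size statistics; -1 disables it.
    "VLLM_LOG_BATCHSIZE_INTERVAL":
        lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
    # Use the FP8-output path of the ROCm custom paged attention kernel
    # (default "1" is an assumption matching the new `bool = True` annotation).
    "VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT":
        lambda: bool(int(os.getenv("VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT", "1"))),
}

def get_env(name: str) -> Any:
    """Resolve one of the variables above at call time."""
    return environment_variables[name]()

if __name__ == "__main__":
    os.environ["VLLM_LOG_BATCHSIZE_INTERVAL"] = "5"
    print(get_env("VLLM_LOG_BATCHSIZE_INTERVAL"))            # 5.0
    print(get_env("VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT"))  # True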
