From 8e874344a20b2969348edb97b8ed713633efc255 Mon Sep 17 00:00:00 2001
From: yuanjian
Date: Sat, 21 Dec 2024 15:06:44 +0800
Subject: [PATCH] fixed available_kv_cache_memory

---
 vllm/worker/worker.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index f51b51d433d3d..9b7155ba3799d 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -207,7 +207,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         memory_for_current_instance = total_gpu_memory * \
             self.cache_config.gpu_memory_utilization
         available_kv_cache_memory = (memory_for_current_instance -
-                                     result.non_kv_cache_memory_in_bytes)
+                                     result.non_kv_cache_memory_in_bytes -
+                                     result.baseline_memory_in_bytes)
 
         # Calculate the number of blocks that can be allocated with the
         # profiled peak memory.