From 6a7b7c4fa293988e45b2ef638d751a02e7022076 Mon Sep 17 00:00:00 2001
From: "weilong.yu"
Date: Fri, 25 Oct 2024 16:11:36 +0800
Subject: [PATCH] Fix issue 9688: clean up dummy LoRAs at the end of profile_run

Move the LoRA cleanup from Worker.determine_num_available_blocks() into
ModelRunner.profile_run(), so the dummy LoRAs loaded for the profiling
forward pass are removed as soon as profiling finishes.
---
 vllm/worker/model_runner.py | 4 ++++
 vllm/worker/worker.py       | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 8b74f06e77be0..f4263525217cb 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1303,6 +1303,10 @@ def profile_run(self) -> None:
         with set_compile_context(batch_size_capture_list):
             self.execute_model(model_input, kv_caches, intermediate_tensors)
         torch.cuda.synchronize()
+        # Cleanup: remove the dummy LoRAs loaded for profiling.
+        if self.lora_config:
+            assert self.lora_manager is not None
+            self.remove_all_loras()
         return
 
     def remove_all_loras(self):
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index fd30962e5d6bb..1fdf06cf301fc 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -270,9 +270,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
             available_kv_cache_memory / (1024**3),
             self.cache_config.gpu_memory_utilization)
 
-        # Final cleanup
-        if self.model_runner.lora_manager:
-            self.model_runner.remove_all_loras()
         gc.collect()
 
         return num_gpu_blocks, num_cpu_blocks
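
Note for reviewers: below is a minimal standalone sketch of the cleanup
pattern this patch adopts. DummyLoRAManager and the free-standing
profile_run() are illustrative stand-ins, not vLLM's actual API; only the
shape of the fix (register dummy adapters for profiling, remove them in the
same scope before memory is measured) mirrors the patch.

    # sketch.py -- hypothetical names, mirroring the patch's cleanup pattern
    from typing import List


    class DummyLoRAManager:
        """Stand-in for the worker's LoRA manager (illustrative only)."""

        def __init__(self) -> None:
            self.active: List[int] = []

        def add_dummy_lora(self, lora_id: int) -> None:
            # Pretend to allocate adapter weights for profiling.
            self.active.append(lora_id)

        def remove_all_loras(self) -> None:
            # Free everything that was registered for profiling.
            self.active.clear()


    def profile_run(manager: DummyLoRAManager, max_loras: int) -> None:
        for lora_id in range(max_loras):
            manager.add_dummy_lora(lora_id)
        try:
            pass  # the dummy forward pass would run here
        finally:
            # The fix: clean up inside profile_run itself, so memory measured
            # afterwards (e.g. when sizing the KV cache) does not include the
            # dummy adapters.
            manager.remove_all_loras()


    if __name__ == "__main__":
        manager = DummyLoRAManager()
        profile_run(manager, max_loras=4)
        assert manager.active == []  # no dummy LoRAs leak past profiling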