Skip to content

Commit

Permalink
bug fix for issue 9688
Browse files Browse the repository at this point in the history
  • Loading branch information
weilong.yu committed Oct 25, 2024
1 parent 9645b9f commit 6a7b7c4
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 3 deletions.
4 changes: 4 additions & 0 deletions vllm/worker/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1303,6 +1303,10 @@ def profile_run(self) -> None:
with set_compile_context(batch_size_capture_list):
self.execute_model(model_input, kv_caches, intermediate_tensors)
torch.cuda.synchronize()
# Cleanup
if self.lora_config:
assert self.lora_manager is not None
self.remove_all_loras()
return

def remove_all_loras(self):
Expand Down
3 changes: 0 additions & 3 deletions vllm/worker/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,9 +270,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
available_kv_cache_memory / (1024**3),
self.cache_config.gpu_memory_utilization)

# Final cleanup
if self.model_runner.lora_manager:
self.model_runner.remove_all_loras()
gc.collect()

return num_gpu_blocks, num_cpu_blocks
Expand Down

0 comments on commit 6a7b7c4

Please sign in to comment.