From c1dea4930fa9c4a9a0b03006e60c51e38912ff1e Mon Sep 17 00:00:00 2001
From: Ahmed Mansy
Date: Tue, 5 Nov 2024 00:18:11 +0200
Subject: [PATCH] [Core] Revert previous implementation and update worker to
 check for GPU blocks override

Signed-off-by: Ahmed Mansy
---
 vllm/worker/worker.py | 23 +++--------------------
 1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index d9b8ad8ab5df8..dd75daaebd5b4 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -265,29 +265,12 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
     def _assert_memory_footprint_increased_during_profiling(self):
         # NOTE(woosuk): Here we assume that the other processes using the same
         # GPU did not change their memory usage during the profiling.
-        free_gpu_memory, total_memory = torch.cuda.mem_get_info()
-        memory_diff = self.init_gpu_memory - free_gpu_memory
-
-        # If we've loaded model weights but memory shows no change,
-        # we're likely in a restricted environment
-        model_loaded = hasattr(self.model_runner, 'model')
-        memory_is_static = memory_diff == 0
-
-        is_restricted_env = model_loaded and memory_is_static
-
-        if is_restricted_env:
-            logger.info("Detected restricted GPU environment. "
-                        "Model is loaded but memory reports static usage. "
-                        "Free memory: %.2fGB, Total memory: %.2fGB",
-                        free_gpu_memory / (1024**3),
-                        total_memory / (1024**3))
-
-        assert memory_diff > 0 or is_restricted_env, (
-            "Error in memory profiling."
+        free_gpu_memory, _ = torch.cuda.mem_get_info()
+        assert self.init_gpu_memory - free_gpu_memory > 0, (
+            "Error in memory profiling. "
             f"Initial free memory {self.init_gpu_memory}, current free memory"
             f" {free_gpu_memory}. This happens when the GPU memory was "
             "not properly cleaned up before initializing the vLLM instance.")
-
     def initialize_cache(self, num_gpu_blocks: int,
                          num_cpu_blocks: int) -> None:
         """Allocate GPU and CPU KV cache with the specified number of blocks.
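
Note: the hunk above only shows the revert of the profiling assertion; the "check for GPU blocks override" mentioned in the subject is not visible in this diff. Below is a minimal sketch of what such a check could look like, assuming it gates the assertion on the user-supplied block-count override (the `--num-gpu-blocks-override` flag and `CacheConfig.num_gpu_blocks_override` do exist in vLLM, but this helper and its wiring into worker.py are hypothetical, not the patch's actual code):

```python
# Hypothetical sketch, not part of the patch above: skip the profiling
# assertion when the user pins the KV cache size, since the profiled block
# count is ignored in that case.
from typing import Optional

import torch


def assert_memory_increased_unless_overridden(
        init_gpu_memory: int,
        num_gpu_blocks_override: Optional[int]) -> None:
    if num_gpu_blocks_override is not None:
        # The block count is forced by the user, so a static free-memory
        # reading during profiling is not treated as an error here.
        return
    free_gpu_memory, _ = torch.cuda.mem_get_info()
    assert init_gpu_memory - free_gpu_memory > 0, (
        "Error in memory profiling. "
        f"Initial free memory {init_gpu_memory}, current free memory"
        f" {free_gpu_memory}. This happens when the GPU memory was "
        "not properly cleaned up before initializing the vLLM instance.")
```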