[Core] Revert previous implementation and update worker to check for GPU blocks override

Signed-off-by: Ahmed Mansy <[email protected]>
Ahmed14z committed Nov 4, 2024
1 parent 18f25a3 commit 86c41e9
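
The override this commit checks for is CacheConfig.num_gpu_blocks_override, which pins the number of GPU KV-cache blocks instead of deriving it from memory profiling. Below is a minimal usage sketch, not part of the commit itself; it assumes the engine arguments in this vLLM build forward num_gpu_blocks_override through to CacheConfig, and the model name and block count are illustrative only.

# Hypothetical usage sketch (not part of this commit): pin the GPU KV-cache
# block count so determine_num_available_blocks() takes the new early-return
# path instead of running the full profiling pass.
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",     # illustrative model choice
    num_gpu_blocks_override=2048,  # assumed to reach CacheConfig.num_gpu_blocks_override
)
outputs = llm.generate(["Hello, my name is"])
print(outputs[0].outputs[0].text)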
Showing 1 changed file with 33 additions and 23 deletions.
56 changes: 33 additions & 23 deletions vllm/worker/worker.py
@@ -179,33 +179,45 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         KV blocks may be allocated without OOMs.
 
         The engine will first conduct a profiling of the existing memory usage.
-        Then, it calculate the maximum possible number of GPU and CPU blocks
+        Then, it calculates the maximum possible number of GPU and CPU blocks
         that can be allocated with the remaining free memory.
 
         .. tip::
             You may limit the usage of GPU memory
             by adjusting the `gpu_memory_utilization` parameter.
         """
-        # Profile the memory usage of the model and get the maximum number of
-        # cache blocks that can be allocated with the remaining free memory.
+        # Check if GPU blocks override is provided; skip profiling if so.
+        num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
+
+        if num_gpu_blocks_override is not None:
+            # Calculate num_cpu_blocks based on available swap space.
+            cache_block_size = self.get_cache_block_size_bytes()
+            num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+                                 cache_block_size) if cache_block_size else 0
+            num_cpu_blocks = max(num_cpu_blocks, 0)
+
+            logger.info(
+                "Using num_gpu_blocks_override=%d, calculated num_cpu_blocks=%d",
+                num_gpu_blocks_override, num_cpu_blocks)
+
+            return num_gpu_blocks_override, num_cpu_blocks
+
+        # Proceed with full profiling if no override is provided.
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
 
         free_memory_pre_profile, total_gpu_memory = torch.cuda.mem_get_info()
 
-        # Execute a forward pass with dummy inputs to profile the memory usage
-        # of the model.
+        # Profile model memory usage with a forward pass.
         self.model_runner.profile_run()
         torch.cuda.synchronize()
 
         self._assert_memory_footprint_increased_during_profiling()
 
-        # Get the peak memory allocation recorded by torch
+        # Gather peak memory allocation.
        peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]
 
-        # Check for any memory left around that may have been allocated on the
-        # gpu outside of `torch`. NCCL operations, for example, can use a few
-        # GB during a forward pass
+        # Adjust for non-torch memory allocations.
         torch.cuda.empty_cache()
         torch_allocated_bytes = torch.cuda.memory_stats(
         )["allocated_bytes.all.current"]
@@ -219,33 +231,31 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
             total_gpu_memory * self.cache_config.gpu_memory_utilization -
             peak_memory)
 
-        # Calculate the number of blocks that can be allocated with the
-        # profiled peak memory.
+        # Calculate the number of blocks for both GPU and CPU.
         cache_block_size = self.get_cache_block_size_bytes()
-        if cache_block_size == 0:
-            num_gpu_blocks = 0
-            num_cpu_blocks = 0
-        else:
-            num_gpu_blocks = int(available_kv_cache_memory // cache_block_size)
-            num_cpu_blocks = int(self.cache_config.swap_space_bytes //
-                                 cache_block_size)
-        num_gpu_blocks = max(num_gpu_blocks, 0)
-        num_cpu_blocks = max(num_cpu_blocks, 0)
+        num_gpu_blocks = int(available_kv_cache_memory //
+                             cache_block_size) if cache_block_size else 0
+        num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+                             cache_block_size) if cache_block_size else 0
+        num_gpu_blocks, num_cpu_blocks = max(num_gpu_blocks,
+                                             0), max(num_cpu_blocks, 0)
 
         logger.info(
             "Memory profiling results: total_gpu_memory=%.2fGiB"
             " initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB"
             " memory_usage_post_profile=%.2fGib"
             " non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB"
-            " gpu_memory_utilization=%.2f", total_gpu_memory / (1024**3),
+            " gpu_memory_utilization=%.2f",
+            total_gpu_memory / (1024**3),
             (total_gpu_memory - free_memory_pre_profile) / (1024**3),
             (peak_memory - non_torch_allocations) / (1024**3),
             total_allocated_bytes / (1024**3),
             non_torch_allocations / (1024**3),
             available_kv_cache_memory / (1024**3),
-            self.cache_config.gpu_memory_utilization)
+            self.cache_config.gpu_memory_utilization,
+        )
 
-        # Final cleanup
+        # Final cleanup.
         if self.model_runner.lora_manager:
             self.model_runner.remove_all_loras()
         gc.collect()
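
For a concrete sense of the profiling path that the rewritten block calculation above feeds into: the KV-cache budget is the utilization-capped GPU memory minus the profiled peak, and the block counts are the floor divisions shown in the diff. A standalone sketch with assumed numbers follows (80 GiB device, 0.90 utilization, 20 GiB peak, 4 GiB swap, 16 MiB blocks are all illustrative).

# Sketch of the profiling-path block arithmetic, using illustrative numbers.
GiB = 1024**3
MiB = 1024**2

total_gpu_memory = 80 * GiB     # assumed device capacity
gpu_memory_utilization = 0.90   # assumed CacheConfig.gpu_memory_utilization
peak_memory = 20 * GiB          # assumed profiled peak (torch + non-torch)
swap_space_bytes = 4 * GiB      # assumed swap space backing CPU blocks
cache_block_size = 16 * MiB     # assumed per-block KV-cache footprint

available_kv_cache_memory = (
    total_gpu_memory * gpu_memory_utilization - peak_memory)

num_gpu_blocks = int(available_kv_cache_memory //
                     cache_block_size) if cache_block_size else 0
num_cpu_blocks = int(swap_space_bytes //
                     cache_block_size) if cache_block_size else 0
num_gpu_blocks, num_cpu_blocks = max(num_gpu_blocks, 0), max(num_cpu_blocks, 0)

print(num_gpu_blocks, num_cpu_blocks)  # 3328 256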
