add --block-allocator parameter to vLLM
KuntaiDu committed Oct 25, 2024
1 parent a9e1897 commit 006a839
Showing 4 changed files with 45 additions and 2 deletions.
11 changes: 11 additions & 0 deletions vllm/config.py
@@ -651,6 +651,7 @@ def __init__(
sliding_window: Optional[int] = None,
enable_prefix_caching: bool = False,
cpu_offload_gb: float = 0,
block_allocator: str = "CpuGpuBlockAllocator",
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
@@ -661,6 +662,7 @@ def __init__(
self.sliding_window = sliding_window
self.enable_prefix_caching = enable_prefix_caching
self.cpu_offload_gb = cpu_offload_gb
self.block_allocator = block_allocator

self._verify_args()
self._verify_cache_dtype()
@@ -681,6 +683,15 @@ def _verify_args(self) -> None:
"GPU memory utilization must be less than 1.0. Got "
f"{self.gpu_memory_utilization}.")

if self.block_allocator not in [
"CpuGpuBlockAllocator",
"CpuOffloadingBlockAllocator"
]:
raise ValueError(
"Only CpuGpuBlockAllocator and CpuOffloadingBlockAllocator are "
f"supported. Got {self.block_allocator}.")

def _verify_cache_dtype(self) -> None:
if self.cache_dtype == "auto":
pass
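For reference, a minimal sketch of the new validation in action. The leading CacheConfig parameters (block_size, gpu_memory_utilization, swap_space, cache_dtype) are assumed from the existing vLLM signature, and the values are illustrative:

from vllm.config import CacheConfig

# Accepted: one of the two allowlisted allocator names.
cache_config = CacheConfig(
    block_size=16,
    gpu_memory_utilization=0.9,
    swap_space=4,
    cache_dtype="auto",
    block_allocator="CpuOffloadingBlockAllocator",
)

# Rejected: any other name fails _verify_args() during construction.
try:
    CacheConfig(
        block_size=16,
        gpu_memory_utilization=0.9,
        swap_space=4,
        cache_dtype="auto",
        block_allocator="MyCustomAllocator",  # hypothetical name
    )
except ValueError as err:
    print(err)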
11 changes: 10 additions & 1 deletion vllm/core/block_manager.py
@@ -5,6 +5,8 @@

from vllm.core.block.block_table import BlockTable
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.core.block.cpu_offloading_block_allocator import (
CpuOffloadingBlockAllocator)
from vllm.core.block.interfaces import Block
from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
LastAccessBlocksTracker)
@@ -17,6 +19,12 @@
EncoderSeqId = str


# Maps the block_allocator config string to the factory that constructs it.
block_allocator_creator = {
"CpuGpuBlockAllocator": CpuGpuBlockAllocator.create,
"CpuOffloadingBlockAllocator": CpuOffloadingBlockAllocator.create,
}


class SelfAttnBlockSpaceManager(BlockSpaceManager):
"""BlockSpaceManager which manages the allocation of KV cache.
@@ -65,6 +73,7 @@ def __init__(
watermark: float = 0.01,
sliding_window: Optional[int] = None,
enable_caching: bool = False,
block_allocator: str = "CpuGpuBlockAllocator",
) -> None:
self.block_size = block_size
self.num_total_gpu_blocks = num_gpu_blocks
@@ -90,7 +99,7 @@ def __init__(

self.watermark_blocks = int(watermark * num_gpu_blocks)

-self.block_allocator = CpuGpuBlockAllocator.create(
+self.block_allocator = block_allocator_creator[block_allocator](
allocator_type="prefix_caching" if enable_caching else "naive",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
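The block_allocator_creator dict above turns the configured string into a factory dispatch. A minimal sketch of the lookup that SelfAttnBlockSpaceManager.__init__ now performs (block counts are illustrative; the keyword arguments, including block_size, are assumed from CpuGpuBlockAllocator.create):

from vllm.core.block_manager import block_allocator_creator

# Select the factory by name, as __init__ does above.
creator = block_allocator_creator["CpuOffloadingBlockAllocator"]
allocator = creator(
    allocator_type="prefix_caching",  # "naive" when enable_caching is False
    num_gpu_blocks=1024,
    num_cpu_blocks=1024,
    block_size=16,
)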
3 changes: 2 additions & 1 deletion vllm/core/scheduler.py
@@ -334,7 +334,8 @@ def __init__(
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
sliding_window=self.cache_config.sliding_window,
-enable_caching=self.cache_config.enable_prefix_caching)
+enable_caching=self.cache_config.enable_prefix_caching,
+block_allocator=self.cache_config.block_allocator)

# Sequence groups in the WAITING state.
# Contain new prefill or preempted requests.
22 changes: 22 additions & 0 deletions vllm/engine/arg_utils.py
@@ -105,6 +105,7 @@ class EngineArgs:
pipeline_parallel_size: int = 1
tensor_parallel_size: int = 1
max_parallel_loading_workers: Optional[int] = None
block_allocator: str = "CpuGpuBlockAllocator"
block_size: int = 16
enable_prefix_caching: bool = False
disable_sliding_window: bool = False
@@ -368,6 +369,17 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
action='store_true',
help='If specified, use nsight to profile Ray workers.')
# KV cache arguments
parser.add_argument('--block-allocator',
type=str,
default='CpuGpuBlockAllocator',
choices=['CpuGpuBlockAllocator',
'CpuOffloadingBlockAllocator'],
help='The block allocator that vLLM uses. Choices '
'are CpuGpuBlockAllocator (the default) and '
'CpuOffloadingBlockAllocator (experimental), '
'which supports offloading the KV cache to '
'CPU. When using CpuOffloadingBlockAllocator, '
'the preemption mode must be recompute.')
parser.add_argument('--block-size',
type=int,
default=EngineArgs.block_size,
@@ -913,6 +925,15 @@ def create_engine_config(self) -> EngineConfig:
assert self.cpu_offload_gb >= 0, (
"CPU offload space must be non-negative"
f", but got {self.cpu_offload_gb}")

if self.block_allocator == "CpuOffloadingBlockAllocator" and \
self.preemption_mode == "swap":
raise ValueError(
"CpuOffloadingBlockAllocator only supports preemption by "
"recomputation, as it internally offloads the request KV cache "
"to CPU. Please add `--preemption-mode recompute` to the vLLM "
"engine args.")

device_config = DeviceConfig(device=self.device)
model_config = self.create_model_config()
@@ -936,6 +957,7 @@ def create_engine_config(self) -> EngineConfig:
sliding_window=model_config.get_sliding_window(),
enable_prefix_caching=self.enable_prefix_caching,
cpu_offload_gb=self.cpu_offload_gb,
block_allocator=self.block_allocator,
)
parallel_config = ParallelConfig(
pipeline_parallel_size=self.pipeline_parallel_size,
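End to end, the new setting threads from EngineArgs through CacheConfig and the scheduler down to the block manager. A hedged usage sketch (model name illustrative; preemption_mode="recompute" is required by the check in create_engine_config above); the CLI equivalent is `--block-allocator CpuOffloadingBlockAllocator --preemption-mode recompute`:

from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="facebook/opt-125m",
    block_allocator="CpuOffloadingBlockAllocator",
    preemption_mode="recompute",
)

# create_engine_config() runs the compatibility check and copies the
# allocator name into CacheConfig, which the scheduler passes down to
# SelfAttnBlockSpaceManager.
engine_config = engine_args.create_engine_config()
assert engine_config.cache_config.block_allocator == \
    "CpuOffloadingBlockAllocator"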
