add --block-allocator parameter to vLLM
KuntaiDu committed Oct 25, 2024
1 parent a9e1897 commit 006a839
Showing 4 changed files with 45 additions and 2 deletions.
11 changes: 11 additions & 0 deletions vllm/config.py
@@ -651,6 +651,7 @@ def __init__(
sliding_window: Optional[int] = None,
enable_prefix_caching: bool = False,
cpu_offload_gb: float = 0,
block_allocator: str = "CpuGpuBlockAllocator",
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
@@ -661,6 +662,7 @@ def __init__(
self.sliding_window = sliding_window
self.enable_prefix_caching = enable_prefix_caching
self.cpu_offload_gb = cpu_offload_gb
self.block_allocator = block_allocator

self._verify_args()
self._verify_cache_dtype()
@@ -681,6 +683,15 @@ def _verify_args(self) -> None:
"GPU memory utilization must be less than 1.0. Got "
f"{self.gpu_memory_utilization}.")

if self.block_allocator not in [
"CpuGpuBlockAllocator",
"CpuOffloadingBlockAllocator"
]:
raise ValueError(
"Only CpuGpuBlockAllocator and CpuOffloadingBlockAllocator are "
f"supported. Got {self.block_allocator}.")

def _verify_cache_dtype(self) -> None:
if self.cache_dtype == "auto":
pass
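For reference, a minimal sketch of the new validation in action. The leading CacheConfig parameters (block_size, gpu_memory_utilization, swap_space, cache_dtype) are assumed from the existing vLLM signature, and the values are illustrative:

from vllm.config import CacheConfig

# Accepted: one of the two allowlisted allocator names.
cache_config = CacheConfig(
    block_size=16,
    gpu_memory_utilization=0.9,
    swap_space=4,
    cache_dtype="auto",
    block_allocator="CpuOffloadingBlockAllocator",
)

# Rejected: any other name fails _verify_args() during construction.
try:
    CacheConfig(
        block_size=16,
        gpu_memory_utilization=0.9,
        swap_space=4,
        cache_dtype="auto",
        block_allocator="MyCustomAllocator",  # hypothetical name
    )
except ValueError as err:
    print(err)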
11 changes: 10 additions & 1 deletion vllm/core/block_manager.py
@@ -5,6 +5,8 @@

from vllm.core.block.block_table import BlockTable
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.core.block.cpu_offloading_block_allocator import (
CpuOffloadingBlockAllocator)
from vllm.core.block.interfaces import Block
from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
LastAccessBlocksTracker)
@@ -17,6 +19,12 @@
EncoderSeqId = str


# Maps the block_allocator config string to the factory that constructs it.
block_allocator_creator = {
"CpuGpuBlockAllocator": CpuGpuBlockAllocator.create,
"CpuOffloadingBlockAllocator": CpuOffloadingBlockAllocator.create,
}


class SelfAttnBlockSpaceManager(BlockSpaceManager):
"""BlockSpaceManager which manages the allocation of KV cache.
@@ -65,6 +73,7 @@ def __init__(
watermark: float = 0.01,
sliding_window: Optional[int] = None,
enable_caching: bool = False,
block_allocator: str = "CpuGpuBlockAllocator",
) -> None:
self.block_size = block_size
self.num_total_gpu_blocks = num_gpu_blocks
@@ -90,7 +99,7 @@ def __init__(

self.watermark_blocks = int(watermark * num_gpu_blocks)

-self.block_allocator = CpuGpuBlockAllocator.create(
+self.block_allocator = block_allocator_creator[block_allocator](
allocator_type="prefix_caching" if enable_caching else "naive",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
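The block_allocator_creator dict above turns the configured string into a factory dispatch. A minimal sketch of the lookup that SelfAttnBlockSpaceManager.__init__ now performs (block counts are illustrative; the keyword arguments, including block_size, are assumed from CpuGpuBlockAllocator.create):

from vllm.core.block_manager import block_allocator_creator

# Select the factory by name, as __init__ does above.
creator = block_allocator_creator["CpuOffloadingBlockAllocator"]
allocator = creator(
    allocator_type="prefix_caching",  # "naive" when enable_caching is False
    num_gpu_blocks=1024,
    num_cpu_blocks=1024,
    block_size=16,
)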
3 changes: 2 additions & 1 deletion vllm/core/scheduler.py
@@ -334,7 +334,8 @@ def __init__(
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
sliding_window=self.cache_config.sliding_window,
-enable_caching=self.cache_config.enable_prefix_caching)
+enable_caching=self.cache_config.enable_prefix_caching,
+block_allocator=self.cache_config.block_allocator)

# Sequence groups in the WAITING state.
# Contain new prefill or preempted requests.
22 changes: 22 additions & 0 deletions vllm/engine/arg_utils.py
@@ -105,6 +105,7 @@ class EngineArgs:
pipeline_parallel_size: int = 1
tensor_parallel_size: int = 1
max_parallel_loading_workers: Optional[int] = None
block_allocator: str = "CpuGpuBlockAllocator"
block_size: int = 16
enable_prefix_caching: bool = False
disable_sliding_window: bool = False
@@ -368,6 +369,17 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
action='store_true',
help='If specified, use nsight to profile Ray workers.')
# KV cache arguments
parser.add_argument('--block-allocator',
type=str,
default='CpuGpuBlockAllocator',
choices=['CpuGpuBlockAllocator',
'CpuOffloadingBlockAllocator'],
help='The block allocator that vLLM uses. Choices '
'are CpuGpuBlockAllocator (the default) and '
'CpuOffloadingBlockAllocator (experimental), '
'which supports offloading the KV cache to '
'CPU. When using CpuOffloadingBlockAllocator, '
'the preemption mode must be recompute.')
parser.add_argument('--block-size',
type=int,
default=EngineArgs.block_size,
@@ -913,6 +925,15 @@ def create_engine_config(self) -> EngineConfig:
assert self.cpu_offload_gb >= 0, (
"CPU offload space must be non-negative"
f", but got {self.cpu_offload_gb}")

if self.block_allocator == "CpuOffloadingBlockAllocator" and \
self.preemption_mode == "swap":
raise ValueError(
"CpuOffloadingBlockAllocator only supports preemption by "
"recomputation, as it internally offloads the request KV cache "
"to CPU. Please add `--preemption-mode recompute` to the vLLM "
"engine args.")

device_config = DeviceConfig(device=self.device)
model_config = self.create_model_config()
@@ -936,6 +957,7 @@ def create_engine_config(self) -> EngineConfig:
sliding_window=model_config.get_sliding_window(),
enable_prefix_caching=self.enable_prefix_caching,
cpu_offload_gb=self.cpu_offload_gb,
block_allocator=self.block_allocator,
)
parallel_config = ParallelConfig(
pipeline_parallel_size=self.pipeline_parallel_size,
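End to end, the new setting threads from EngineArgs through CacheConfig and the scheduler down to the block manager. A hedged usage sketch (model name illustrative; preemption_mode="recompute" is required by the check in create_engine_config above); the CLI equivalent is `--block-allocator CpuOffloadingBlockAllocator --preemption-mode recompute`:

from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="facebook/opt-125m",
    block_allocator="CpuOffloadingBlockAllocator",
    preemption_mode="recompute",
)

# create_engine_config() runs the compatibility check and copies the
# allocator name into CacheConfig, which the scheduler passes down to
# SelfAttnBlockSpaceManager.
engine_config = engine_args.create_engine_config()
assert engine_config.cache_config.block_allocator == \
    "CpuOffloadingBlockAllocator"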
