diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 83bfbb6ade8d7..b44d3e5cb0678 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -23,7 +23,8 @@ def test_prefill(): manager = KVCacheManager( block_size=16, num_gpu_blocks=10, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=16, ) @@ -121,7 +122,8 @@ def test_decode(): manager = KVCacheManager( block_size=16, num_gpu_blocks=10, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=16, ) @@ -172,7 +174,8 @@ def test_evict(): manager = KVCacheManager( block_size=16, num_gpu_blocks=10, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=16, ) @@ -220,7 +223,8 @@ def test_hash_block_correct_reuse(): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=1, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=0, ) @@ -256,7 +260,8 @@ def test_computed_blocks_not_evicted(): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=2, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=0, ) @@ -303,7 +308,8 @@ def test_basic_prefix_caching_disabled(): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=4, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=False, num_preallocate_tokens=0, ) @@ -342,7 +348,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=10, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=num_preallocate_tokens, ) @@ -370,7 +377,8 @@ def test_cache_blocks(): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=5, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=0, ) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 8eb3fb976eb87..b492a755e6dd5 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -17,12 +17,15 @@ def __init__( self, block_size: int, num_gpu_blocks: int, + max_model_len: int, sliding_window: Optional[int] = None, enable_caching: bool = True, num_preallocate_tokens: int = 64, ) -> None: self.block_size = block_size self.num_gpu_blocks = num_gpu_blocks + self.max_model_len = max_model_len + self.max_num_blocks_per_req = cdiv(max_model_len, block_size) self.sliding_window = sliding_window self.enable_caching = enable_caching # NOTE(woosuk): To avoid frequent block allocation, we preallocate some @@ -132,7 +135,14 @@ def append_slots( num_new_blocks = min( num_new_blocks + self.num_preallocate_blocks, self.free_block_queue.num_free_blocks, + # Should not exceed the maximum number of blocks per request. + # This is especially because the block table has the shape + # [..., max_num_blocks_per_req]. + # TODO(woosuk): Check and reject requests if + # num_prompt_tokens + max_tokens > max_model_len. + self.max_num_blocks_per_req - len(req_blocks), ) + assert num_new_blocks > 0 new_blocks = self._get_new_blocks(num_new_blocks) req_blocks.extend(new_blocks) @@ -212,7 +222,14 @@ def allocate_slots( num_required_blocks + self.num_preallocate_blocks, self.free_block_queue.num_free_blocks - num_evictable_computed_blocks, + # Should not exceed the maximum number of blocks per request. + # This is especially because the block table has the shape + # [..., max_num_blocks_per_req]. + # TODO(woosuk): Check and reject requests if + # num_prompt_tokens + max_tokens > max_model_len. + self.max_num_blocks_per_req - len(computed_blocks), ) + assert num_new_blocks > 0 # Concatenate the computed block IDs and the new block IDs. new_blocks = self._get_new_blocks(num_new_blocks) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index ba50a9786d805..f1f26f4e8d443 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -33,22 +33,23 @@ def __init__( # TODO: Support LoRA. assert lora_config is None, "V1 does not support LoRA yet." + # Scheduling constraints. + self.max_num_running_reqs = self.scheduler_config.max_num_seqs + self.max_num_scheduled_tokens = \ + self.scheduler_config.max_num_batched_tokens + self.max_model_len = self.scheduler_config.max_model_len + num_gpu_blocks = cache_config.num_gpu_blocks assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0 - # Create the block space manager. + # Create the KV cache manager. self.kv_cache_manager = KVCacheManager( block_size=self.cache_config.block_size, num_gpu_blocks=num_gpu_blocks, + max_model_len=self.max_model_len, sliding_window=self.cache_config.sliding_window, enable_caching=self.cache_config.enable_prefix_caching) self.block_size = self.cache_config.block_size - # Scheduling constraints. - self.max_num_running_reqs = self.scheduler_config.max_num_seqs - self.max_num_scheduled_tokens = \ - self.scheduler_config.max_num_batched_tokens - self.max_model_len = self.scheduler_config.max_model_len - # req_id -> Request self.requests: Dict[str, Request] = {} # Priority queues for requests.