Skip to content

Commit 2c57188

Browse files
authored
[Bugfix] Move the _touch(computed_blocks) call in the allocate_slots method to after the check for allocating new blocks. (vllm-project#11565)
1 parent 82c49d3 commit 2c57188

File tree

2 files changed

+74
-8
lines changed

2 files changed

+74
-8
lines changed

tests/v1/core/test_prefix_caching.py

+61-2
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,9 @@ def test_prefill():
9898
# Incomplete 1 block (6 tokens)
9999
unique_token_ids = [3] * 6
100100
req2 = make_request("2", common_token_ids + unique_token_ids)
101-
computed_block = manager.get_computed_blocks(req2)
101+
computed_blocks = manager.get_computed_blocks(req2)
102102
assert len(req2.kv_block_hashes) == 3
103-
assert [b.block_id for b in computed_block] == [0, 1, 2]
103+
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
104104
num_new_tokens = 53 - 3 * 16
105105
blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
106106
assert [b.block_id for b in blocks] == [7, 8]
@@ -500,3 +500,62 @@ def test_mm_prefix_caching():
500500
mm_hashes=mm_hashes)
501501
computed_blocks = manager.get_computed_blocks(req1)
502502
assert len(computed_blocks) == 3
503+
504+
505+
def test_prefill_not_enough_free_blocks_with_computed_blocks():
506+
"""
507+
This is a unit test that tests the correctness of the allocate_slots
508+
when there is not enough free blocks. Specifically, when a request
509+
has computed blocks but cannot be allocated due to not enough free blocks,
510+
the computed blocks should not be touched.
511+
"""
512+
block_size = 16
513+
manager = KVCacheManager(
514+
block_size=block_size,
515+
num_gpu_blocks=10,
516+
max_model_len=8192,
517+
sliding_window=None,
518+
enable_caching=True,
519+
num_preallocate_tokens=0,
520+
)
521+
# Complete 3 blocks (48 tokens)
522+
# | Common-0 | Common-1 | Common-2 | ... |
523+
common_token_ids = [i for i in range(3) for _ in range(16)]
524+
req0 = make_request("0", common_token_ids)
525+
computed_blocks = manager.get_computed_blocks(req0)
526+
assert not computed_blocks
527+
manager.allocate_slots(req0, 48, computed_blocks)
528+
block_part0 = manager.req_to_blocks[req0.request_id]
529+
530+
# | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
531+
req1 = make_request("1", common_token_ids * 2)
532+
computed_blocks = manager.get_computed_blocks(req1)
533+
assert computed_blocks == block_part0
534+
manager.allocate_slots(req1, 48, computed_blocks)
535+
block_part1 = manager.req_to_blocks[req1.request_id]
536+
# | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
537+
# | Req1-5(F)| ... |
538+
manager.free(req1)
539+
assert {block.ref_cnt for block in block_part1[:3]} == {1}
540+
assert {block.ref_cnt for block in block_part1[3:]} == {0}
541+
542+
# | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
543+
# | Req1-5(F)| Req2-0 | Req2-1 | ... |
544+
req2 = make_request("2", [7] * block_size * 2)
545+
computed_blocks = manager.get_computed_blocks(req2)
546+
assert not computed_blocks
547+
manager.allocate_slots(req2, block_size * 2, computed_blocks)
548+
549+
# Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed,
550+
# but it cannot be allocated due to insufficient free blocks (2).
551+
# In this case, the ref_cnt of the computed blocks should not be changed.
552+
assert manager.free_block_queue.num_free_blocks == 5
553+
req3 = make_request("3", common_token_ids * 3)
554+
computed_blocks = manager.get_computed_blocks(req3)
555+
assert computed_blocks == block_part1
556+
# Req3 cannot be allocated.
557+
assert manager.allocate_slots(req3, 48, computed_blocks) is None
558+
# Block 0-2 are used by Req 1.
559+
assert {block.ref_cnt for block in block_part1[:3]} == {1}
560+
# Block 3-5 are free.
561+
assert {block.ref_cnt for block in block_part1[3:]} == {0}

vllm/v1/core/kv_cache_manager.py

+13-6
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ def allocate_slots(
191191
request: The request to allocate slots.
192192
num_tokens: The number of tokens to allocate. Note that this does
193193
not include the tokens that have already been computed.
194-
computed_blocks: The blocks that have already been computed.
194+
computed_blocks: A list of computed blocks.
195195
196196
Returns:
197197
A list of new allocated blocks.
@@ -200,6 +200,18 @@ def allocate_slots(
200200
raise ValueError(
201201
f"num_tokens must be greater than 0, got {num_tokens}")
202202

203+
# If a computed block of a request is an eviction candidate (in the
204+
# free queue and ref_cnt == 0), it cannot be counted as a free block
205+
# when allocating this request.
206+
num_evictable_computed_blocks = sum(1 for blk in computed_blocks
207+
if blk.ref_cnt == 0)
208+
209+
num_required_blocks = cdiv(num_tokens, self.block_size)
210+
if (num_required_blocks > self.free_block_queue.num_free_blocks -
211+
num_evictable_computed_blocks):
212+
# Cannot allocate new blocks.
213+
return None
214+
203215
# Touch the computed blocks to make sure they won't be evicted.
204216
if self.enable_caching:
205217
self._touch(computed_blocks)
@@ -208,11 +220,6 @@ def allocate_slots(
208220
"Computed blocks should be empty when "
209221
"prefix caching is disabled")
210222

211-
num_required_blocks = cdiv(num_tokens, self.block_size)
212-
if (num_required_blocks > self.free_block_queue.num_free_blocks):
213-
# Cannot allocate new blocks.
214-
return None
215-
216223
# Determine the number of new blocks to allocate considering
217224
# preallocated blocks.
218225
num_new_blocks = min(

0 commit comments

Comments
 (0)