fix
Signed-off-by: Cody Yu <[email protected]>
comaniac committed Oct 24, 2024
1 parent 855a598 commit a200257
Showing 2 changed files with 28 additions and 6 deletions.
23 changes: 18 additions & 5 deletions vllm/v1/core/kv_cache_manager.py
@@ -27,6 +27,13 @@ class KVCacheBlock:
# is closer to the end of a prompt and more likely to be evicted.
num_hashed_tokens: int = 0

def reset(self):
self.prev_block_id = None
self.ref_cnt = 0
self.token_ids.clear()
self.block_hash = None
self.num_hashed_tokens = 0


class KVCacheManager:
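For orientation, here is a minimal, self-contained sketch of the block state that the reset() added above clears (the field names are taken from the hunk; the class name _ToyKVCacheBlock and the example values are illustrative, and the real KVCacheBlock carries more state). A recycled block may still hold its previous owner's tokens and hash, and reset() wipes that state before the block is reused.

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class _ToyKVCacheBlock:
    # Illustrative subset of the fields that reset() clears.
    block_id: int
    prev_block_id: Optional[int] = None
    ref_cnt: int = 0
    token_ids: List[int] = field(default_factory=list)
    block_hash: Optional[int] = None
    num_hashed_tokens: int = 0

    def reset(self):
        self.prev_block_id = None
        self.ref_cnt = 0
        self.token_ids.clear()
        self.block_hash = None
        self.num_hashed_tokens = 0

# A freed block may still carry the hash and tokens of its old owner;
# reset() wipes that state before the block is handed to a new request.
blk = _ToyKVCacheBlock(block_id=3, token_ids=[1, 2, 3], block_hash=0xBEEF)
blk.reset()
assert blk.token_ids == [] and blk.block_hash is None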

@@ -180,8 +187,8 @@ def append_slots(
# No new block is needed. When caching is enabled,
# then token_id_idx must be equal to len(new_token_ids),
# meaning that all tokens are added to allocated blocks.
assert not self.enable_caching or token_id_idx == len(
new_token_ids)
assert not self.enable_caching or token_id_idx == num_tokens, \
f"{token_id_idx=} != {num_tokens=}"
return []

num_new_blocks = num_required_blocks - len(req_block_ids)
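A toy calculation of the invariant behind the tightened assert (all numbers and the helper variables free_slots, overflow, and num_new_blocks_needed are made up for illustration, not vLLM's): when the appended tokens all fit into the blocks already allocated to the request, no new block is needed and the number of consumed token IDs must equal num_tokens.

# Illustrative numbers only.
block_size = 16
tokens_in_last_block = 10          # slots already used in the last block
num_tokens = 6                     # new tokens to append this step
free_slots = block_size - tokens_in_last_block

# Tokens that can be placed into the existing allocation.
token_id_idx = min(num_tokens, free_slots)

# No extra block is required, so every new token was consumed.
overflow = max(0, num_tokens - free_slots)
num_new_blocks_needed = (overflow + block_size - 1) // block_size
assert num_new_blocks_needed == 0
assert token_id_idx == num_tokens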
@@ -195,8 +202,12 @@ def append_slots(
if self.enable_caching:
new_token_ids = new_token_ids[token_id_idx:]
prev_block_id = req_block_ids[-1]
new_block_ids = self._get_new_blocks(num_new_blocks, new_token_ids,
prev_block_id)
else:
new_token_ids = None
prev_block_id = None
new_blocks = self._get_new_blocks(num_new_blocks, new_token_ids,
prev_block_id)
new_block_ids = [blk.block_id for blk in new_blocks]
req_block_ids.extend(new_block_ids)
return new_block_ids
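A hedged sketch of the branch added above, written as a standalone helper (the function _allocation_inputs is hypothetical, not vLLM's API): with prefix caching disabled there are no block hashes to maintain, so neither the leftover token IDs nor the previous block ID is needed, and both are passed to the allocator as None.

from typing import List, Optional, Tuple

def _allocation_inputs(enable_caching: bool,
                       new_token_ids: List[int],
                       token_id_idx: int,
                       req_block_ids: List[int]
                       ) -> Tuple[Optional[List[int]], Optional[int]]:
    """What gets handed to the block allocator in each mode."""
    if enable_caching:
        # Only the tokens that did not fit into existing blocks, plus the
        # block they chain from, are needed to compute block hashes.
        return new_token_ids[token_id_idx:], req_block_ids[-1]
    # Without caching there is nothing to hash, so pass placeholders.
    return None, None

assert _allocation_inputs(True, [7, 8, 9], 1, [0, 1]) == ([8, 9], 1)
assert _allocation_inputs(False, [7, 8, 9], 1, [0, 1]) == (None, None)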

@@ -235,7 +246,8 @@ def allocate_slots(
# request, so we must have all new token IDs in the prompt.
num_computed_tokens = len(computed_block_ids) * self.block_size
if self.enable_caching:
new_token_ids = request.prompt_token_ids[num_computed_tokens:]
new_token_ids = request.prompt_token_ids[
num_computed_tokens:num_computed_tokens + num_tokens]
if not new_token_ids:
raise RuntimeError(
"Failed to infer the token IDs for allocation. "
@@ -337,6 +349,7 @@ def _get_new_blocks(
else:
del self.cached_block_hash_to_block[block_hash][
curr_block.block_id]
curr_block.reset()

curr_block.ref_cnt = 1
ret.append(curr_block)
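A simplified sketch of the eviction path above, using plain dicts in place of vLLM's structures (the helper _evict_and_reuse is hypothetical): when a free block that still carries a cached hash is handed out again, its entry is dropped from the hash-to-block map and its stale metadata is cleared, which is what the added curr_block.reset() call does.

# Simplified stand-ins for the real structures (illustrative only).
cached_block_hash_to_block = {0xBEEF: {3: {"block_id": 3, "token_ids": [5, 6]}}}

def _evict_and_reuse(block_hash: int, block_id: int) -> dict:
    blocks_with_hash = cached_block_hash_to_block[block_hash]
    block = blocks_with_hash.pop(block_id)      # drop the cache entry
    if not blocks_with_hash:                    # last block with this hash
        del cached_block_hash_to_block[block_hash]
    block["token_ids"] = []                     # the reset() analogue
    return block

reused = _evict_and_reuse(0xBEEF, 3)
assert reused["token_ids"] == [] and 0xBEEF not in cached_block_hash_to_block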
11 changes: 10 additions & 1 deletion vllm/v1/core/scheduler.py
@@ -34,7 +34,7 @@ def __init__(
block_size=self.cache_config.block_size,
num_gpu_blocks=num_gpu_blocks,
sliding_window=self.cache_config.sliding_window,
enable_caching=True)
enable_caching=self.cache_config.enable_prefix_caching)
self.block_size = self.cache_config.block_size

# Scheduling constraints.
@@ -137,6 +137,15 @@ def schedule(self) -> "SchedulerOutput":
# `request.num_prompt_tokens` to consider the resumed requests,
# which have output tokens.
num_new_tokens = request.num_tokens - num_computed_tokens
if num_new_tokens == 0:
# FIXME: This happens when the prompt length is divisible by
# the block size and all blocks are cached. We have to
# support query_len=0 in the model runner to handle this case.
# For now we force recomputation of the last block, which
# hurts performance and introduces duplicated computation.
num_computed_tokens -= self.block_size
num_new_tokens = self.block_size
computed_block_ids.pop()
num_new_tokens = min(num_new_tokens, token_budget)
assert num_new_tokens > 0
new_block_ids = self.kv_cache_manager.allocate_slots(
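A worked toy example of the corner case the FIXME above describes (block_size, prompt length, and block IDs are illustrative): a fully cached prompt whose length is a multiple of the block size would yield num_new_tokens == 0, so the scheduler rolls back one block and recomputes it.

# Illustrative numbers only.
block_size = 16
num_prompt_tokens = 32
computed_block_ids = [0, 1]                 # both blocks hit the prefix cache
num_computed_tokens = len(computed_block_ids) * block_size
num_new_tokens = num_prompt_tokens - num_computed_tokens
assert num_new_tokens == 0                  # the problematic case

# Workaround from the diff: force recomputation of the last cached block
# so the model runner never sees a zero-length query.
if num_new_tokens == 0:
    num_computed_tokens -= block_size
    num_new_tokens = block_size
    computed_block_ids.pop()

assert num_new_tokens == block_size and computed_block_ids == [0]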
