Commit ba5c809
fix
Signed-off-by: Cody Yu <[email protected]>
comaniac committed Oct 24, 2024
1 parent a200257 commit ba5c809
Showing 2 changed files with 6 additions and 7 deletions.
1 change: 1 addition & 0 deletions vllm/v1/core/kv_cache_manager.py
@@ -295,6 +295,7 @@ def free(self, request: Request) -> None:
                  # end of the free_block_list to maintain the eviction
                  # order.
                  self.free_block_queue.remove(self.block_pool[block_id])
+                 self.lazy_remove_block_ids.remove(block_id)
                  self.free_block_queue.append(self.block_pool[block_id])
                  self.num_free_blocks += 1
              else:
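For context, the hunk above can be read as a small piece of free-queue bookkeeping: a block that is freed while already sitting in the free queue is moved to the tail so the eviction order is preserved, and with this fix any lazy removal still pending for its id is cancelled. The sketch below is a minimal, standalone illustration of that idea; the class, method name, deque, and plain set are assumptions for the example, not vLLM's actual API.

    from collections import deque


    class ToyFreeBlockBookkeeping:
        """Illustrative only; not vLLM's real KVCacheManager."""

        def __init__(self, num_blocks: int) -> None:
            self.block_pool = [f"block-{i}" for i in range(num_blocks)]
            # Head of the queue is evicted first, tail last.
            self.free_block_queue: deque[str] = deque(self.block_pool)
            # Hypothetical set of block ids scheduled for deferred removal.
            self.lazy_remove_block_ids: set[int] = set()

        def refresh_freed_block(self, block_id: int) -> None:
            """Mirror of the hunk above: move an already-queued block to the
            tail of the eviction order and cancel its pending lazy removal."""
            block = self.block_pool[block_id]
            if block in self.free_block_queue:
                self.free_block_queue.remove(block)    # O(n) here; fine for a sketch
                self.lazy_remove_block_ids.discard(block_id)
                self.free_block_queue.append(block)    # back of the eviction order


    mgr = ToyFreeBlockBookkeeping(num_blocks=4)
    mgr.lazy_remove_block_ids.add(2)
    mgr.refresh_freed_block(2)
    assert mgr.free_block_queue[-1] == "block-2"
    assert 2 not in mgr.lazy_remove_block_ids

Using discard keeps the toy robust when no lazy removal is pending; the real code in the diff calls remove on its own lazy_remove_block_ids container.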
12 changes: 5 additions & 7 deletions vllm/v1/core/scheduler.py
@@ -138,13 +138,11 @@ def schedule(self) -> "SchedulerOutput":
              # which have output tokens.
              num_new_tokens = request.num_tokens - num_computed_tokens
              if num_new_tokens == 0:
-                 # FIXME: This happens when the prompt length is divisible by
-                 # the block size and all blocks are cached. We have to
-                 # support query_len=0 in the model runner to handle this case.
-                 # For now we force recomputation of the last block, which
-                 # hurts performance and introduces duplication.
-                 num_computed_tokens -= self.block_size
-                 num_new_tokens = self.block_size
+                 # This happens when the prompt length is divisible by the
+                 # block size and all blocks are cached. For now we force
+                 # recomputation of the last token.
+                 num_computed_tokens -= 1
+                 num_new_tokens = 1
                  computed_block_ids.pop()
              num_new_tokens = min(num_new_tokens, token_budget)
              assert num_new_tokens > 0
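To make the arithmetic of the scheduler change concrete, the sketch below walks through the edge case the comment describes: a prompt whose length is an exact multiple of the block size and whose blocks are all prefix-cached yields num_new_tokens == 0, and the fix backs off by a single token instead of a whole block before clamping against the token budget. The helper name, BLOCK_SIZE value, and block ids are made up for illustration; this is not the scheduler's real API.

    BLOCK_SIZE = 16  # illustrative; the real block size is a config value


    def adjust_fully_cached_prompt(num_prompt_tokens: int,
                                   num_computed_tokens: int,
                                   computed_block_ids: list[int]) -> tuple[int, int]:
        """Sketch of the scheduler fix: if every prompt token is covered by
        cached blocks, recompute just the last token instead of a whole block."""
        num_new_tokens = num_prompt_tokens - num_computed_tokens
        if num_new_tokens == 0:
            # Prompt length divisible by the block size and all blocks cached:
            # back off by one token so the model runner gets a non-empty query.
            num_computed_tokens -= 1
            num_new_tokens = 1
            computed_block_ids.pop()  # the last block is no longer fully reused
        assert num_new_tokens > 0
        return num_computed_tokens, num_new_tokens


    # A prompt of exactly two full blocks that is fully prefix-cached:
    blocks = [7, 8]  # made-up ids of the two cached blocks
    prompt_len = 2 * BLOCK_SIZE
    computed, new = adjust_fully_cached_prompt(prompt_len, prompt_len, blocks)
    assert (computed, new) == (31, 1)  # only the final token is recomputed
    assert blocks == [7]               # last block id dropped, as in the diff

Recomputing one token instead of block_size tokens is why the old FIXME about hurting performance and duplicating work could be dropped.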
