Commit ba5c809
fix
Signed-off-by: Cody Yu <[email protected]>
comaniac committed Oct 24, 2024
1 parent a200257 commit ba5c809
Showing 2 changed files with 6 additions and 7 deletions.
1 change: 1 addition & 0 deletions vllm/v1/core/kv_cache_manager.py
@@ -295,6 +295,7 @@ def free(self, request: Request) -> None:
                  # end of the free_block_list to maintain the eviction
                  # order.
                  self.free_block_queue.remove(self.block_pool[block_id])
+                 self.lazy_remove_block_ids.remove(block_id)
                  self.free_block_queue.append(self.block_pool[block_id])
                  self.num_free_blocks += 1
              else:
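For context, the hunk above can be read as a small piece of free-queue bookkeeping: a block that is freed while already sitting in the free queue is moved to the tail so the eviction order is preserved, and with this fix any lazy removal still pending for its id is cancelled. The sketch below is a minimal, standalone illustration of that idea; the class, method name, deque, and plain set are assumptions for the example, not vLLM's actual API.

    from collections import deque


    class ToyFreeBlockBookkeeping:
        """Illustrative only; not vLLM's real KVCacheManager."""

        def __init__(self, num_blocks: int) -> None:
            self.block_pool = [f"block-{i}" for i in range(num_blocks)]
            # Head of the queue is evicted first, tail last.
            self.free_block_queue: deque[str] = deque(self.block_pool)
            # Hypothetical set of block ids scheduled for deferred removal.
            self.lazy_remove_block_ids: set[int] = set()

        def refresh_freed_block(self, block_id: int) -> None:
            """Mirror of the hunk above: move an already-queued block to the
            tail of the eviction order and cancel its pending lazy removal."""
            block = self.block_pool[block_id]
            if block in self.free_block_queue:
                self.free_block_queue.remove(block)    # O(n) here; fine for a sketch
                self.lazy_remove_block_ids.discard(block_id)
                self.free_block_queue.append(block)    # back of the eviction order


    mgr = ToyFreeBlockBookkeeping(num_blocks=4)
    mgr.lazy_remove_block_ids.add(2)
    mgr.refresh_freed_block(2)
    assert mgr.free_block_queue[-1] == "block-2"
    assert 2 not in mgr.lazy_remove_block_ids

Using discard keeps the toy robust when no lazy removal is pending; the real code in the diff calls remove on its own lazy_remove_block_ids container.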
12 changes: 5 additions & 7 deletions vllm/v1/core/scheduler.py
@@ -138,13 +138,11 @@ def schedule(self) -> "SchedulerOutput":
              # which have output tokens.
              num_new_tokens = request.num_tokens - num_computed_tokens
              if num_new_tokens == 0:
-                 # FIXME: This happens when the prompt length is divisible by
-                 # the block size and all blocks are cached. We have to
-                 # support query_len=0 in the model runner to handle this case.
-                 # For now we force recomputation of the last block, which
-                 # hurts performance and introduces duplication.
-                 num_computed_tokens -= self.block_size
-                 num_new_tokens = self.block_size
+                 # This happens when the prompt length is divisible by the
+                 # block size and all blocks are cached. For now we force
+                 # recomputation of the last token.
+                 num_computed_tokens -= 1
+                 num_new_tokens = 1
                  computed_block_ids.pop()
              num_new_tokens = min(num_new_tokens, token_budget)
              assert num_new_tokens > 0
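To make the arithmetic of the scheduler change concrete, the sketch below walks through the edge case the comment describes: a prompt whose length is an exact multiple of the block size and whose blocks are all prefix-cached yields num_new_tokens == 0, and the fix backs off by a single token instead of a whole block before clamping against the token budget. The helper name, BLOCK_SIZE value, and block ids are made up for illustration; this is not the scheduler's real API.

    BLOCK_SIZE = 16  # illustrative; the real block size is a config value


    def adjust_fully_cached_prompt(num_prompt_tokens: int,
                                   num_computed_tokens: int,
                                   computed_block_ids: list[int]) -> tuple[int, int]:
        """Sketch of the scheduler fix: if every prompt token is covered by
        cached blocks, recompute just the last token instead of a whole block."""
        num_new_tokens = num_prompt_tokens - num_computed_tokens
        if num_new_tokens == 0:
            # Prompt length divisible by the block size and all blocks cached:
            # back off by one token so the model runner gets a non-empty query.
            num_computed_tokens -= 1
            num_new_tokens = 1
            computed_block_ids.pop()  # the last block is no longer fully reused
        assert num_new_tokens > 0
        return num_computed_tokens, num_new_tokens


    # A prompt of exactly two full blocks that is fully prefix-cached:
    blocks = [7, 8]  # made-up ids of the two cached blocks
    prompt_len = 2 * BLOCK_SIZE
    computed, new = adjust_fully_cached_prompt(prompt_len, prompt_len, blocks)
    assert (computed, new) == (31, 1)  # only the final token is recomputed
    assert blocks == [7]               # last block id dropped, as in the diff

Recomputing one token instead of block_size tokens is why the old FIXME about hurting performance and duplicating work could be dropped.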
