-
-
Notifications
You must be signed in to change notification settings - Fork 5.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Performance] Enable chunked prefill and prefix caching together #7753
Changes from all commits
d893717
1f16ece
94315d4
1daa758
79563bf
f1e9548
d57951f
324fcec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1145,7 +1145,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: | |
# will crash the vLLM instance / will not retry. | ||
for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: | ||
self.block_manager.mark_blocks_as_computed( | ||
scheduled_seq_group.seq_group) | ||
scheduled_seq_group.seq_group, | ||
scheduled_seq_group.token_chunk_size) | ||
|
||
scheduler_time = time.perf_counter() - scheduler_start_time | ||
# Add this to scheduler time to all the sequences that are currently | ||
|
@@ -1347,10 +1348,27 @@ def _get_num_new_tokens(self, seq_group: SequenceGroup, | |
for seq in seqs: | ||
num_new_tokens += seq.get_num_new_tokens() | ||
assert num_new_tokens > 0 | ||
# Chunk if a running request cannot fit in. | ||
# If number of seq > 1, it means it is doing beam search in a | ||
# decode phase. Do not chunk in that case. | ||
# Chunk if a running request cannot fit in the given budget. | ||
# If number of seq > 1, it means it is doing beam search | ||
# in a decode phase. Do not chunk. | ||
if enable_chunking and len(seqs) == 1: | ||
num_new_tokens = min(num_new_tokens, | ||
budget.remaining_token_budget()) | ||
remaining_token_budget = budget.remaining_token_budget() | ||
comaniac marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if self.cache_config.enable_prefix_caching: | ||
# When prefix caching is enabled, we always allocate | ||
# the number of new tokens that is dividable by the block size | ||
# to avoid partial block matching. | ||
block_size = self.cache_config.block_size | ||
reminder = budget.token_budget % block_size | ||
if reminder != 0: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Btw, should we raise this exception at the engine start time instead and just add assert here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel we could just raise here for now because this constraint should be able to be removed once we refactor the schedule to consider prefix caching. |
||
raise ValueError("When enabling chunked prefill and " | ||
"prefix caching, max_num_batched_tokens " | ||
"(chunk size) must be dividable by " | ||
"block size, but got chunk_size " | ||
f"({budget.token_budget}) % block_size " | ||
f"({block_size}) = {reminder}") | ||
if remaining_token_budget < num_new_tokens: | ||
comaniac marked this conversation as resolved.
Show resolved
Hide resolved
|
||
num_new_tokens = (remaining_token_budget // | ||
block_size) * block_size | ||
else: | ||
num_new_tokens = min(num_new_tokens, remaining_token_budget) | ||
return num_new_tokens |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we have corresponding test in v2?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't need to test v2 because v2 automatically mark touched blocks as computed.