Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prefix Cache Aware Scheduling #1

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions benchmarks/benchmark_prefix_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,8 @@ def main(args):
input_length_range = tuple(map(int, args.input_length_range.split(':')))
random.seed(args.seed)
if args.dataset_path is not None:
print(f"Start to sample {args.num_prompts} prompts"
"from {args.dataset_path}")
print(f"Start to sample {args.num_prompts} prompts "
f"from {args.dataset_path}")
filtered_datasets = sample_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
Expand All @@ -132,6 +132,7 @@ def main(args):
] * args.num_prompts

engine_args = EngineArgs.from_cli_args(args)
engine_args.enable_chunked_prefill = True

llm = LLM(**dataclasses.asdict(engine_args))

Expand Down
76 changes: 75 additions & 1 deletion tests/core/block/test_block_manager.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import math
import pytest

from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
Expand Down Expand Up @@ -205,6 +206,75 @@
assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE


@pytest.mark.parametrize("block_size", [1, 4])
@pytest.mark.parametrize("num_prefill_tokens", [1, 2, 4, 5, 6, 8, 10])
@pytest.mark.parametrize("prefix_shared_percentage", [0.0, 0.3, 0.5, 0.7, 1.0])
def test_can_allocate_with_prefix_cache(
    block_size: int,
    num_prefill_tokens: int,
    prefix_shared_percentage: float,
):
    """Check `can_allocate` for a second sequence that shares a prefix.

    The block manager is sized to hold 1.5 sequences' worth of blocks
    (rounded up). Sequence 1 is allocated and its blocks are marked as
    computed; sequence 2 reuses the first `num_tokens_shared` tokens of
    sequence 1 and differs afterwards. Sequence 2 is expected to be
    allocatable (`AllocStatus.OK`) exactly when the blocks needed by both
    sequences, minus the fully shared blocks, fit in the GPU block budget;
    otherwise `AllocStatus.LATER` is expected.
    """
    num_seqs_fittable = 1.5
    num_blocks_required_seq = math.ceil(num_prefill_tokens / block_size)
    num_gpu_blocks = math.ceil(num_seqs_fittable * num_blocks_required_seq)

    num_tokens_shared = int(num_prefill_tokens * prefix_shared_percentage)
    # Only blocks that are completely covered by the shared prefix count.
    num_blocks_shared = num_tokens_shared // block_size

    tokens_1 = list(range(num_prefill_tokens))
    # Same prefix as tokens_1, then every remaining token is shifted by 10
    # so that it differs from the token at the same position in tokens_1.
    tokens_2 = tokens_1[:num_tokens_shared] + [
        t + 10 for t in tokens_1[num_tokens_shared:]
    ]

    print(f"tokens_1: {tokens_1}")
    print(f"tokens_2: {tokens_2}")
    print(f"num_blocks_shared: {num_blocks_shared}")
    print(f"num_blocks_required_seq: {num_blocks_required_seq}")
    print(f"num_gpu_blocks: {num_gpu_blocks}")

    # Num blocks needed for 2 seqs, minus the number of blocks shared.
    num_blocks_required_with_sharing = (2 * num_blocks_required_seq -
                                        num_blocks_shared)
    print(
        f"num_blocks_required_with_sharing: {num_blocks_required_with_sharing}"
    )

    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        enable_caching=True,  # Prefix cache
    )

    seq_group_1 = create_seq_group(
        seq_output_lens=[0],
        request_id="0",
        seq_id_start=0,
        prompt_token_ids=tokens_1,
        block_size=block_size,
    )
    assert block_manager.can_allocate(seq_group_1) == AllocStatus.OK
    # Allocate seq 1.
    block_manager.allocate(seq_group_1)

    # Mark seq 1 as computed (in reality the scheduler would do this).
    block_manager.mark_blocks_as_computed(seq_group=seq_group_1,
                                          token_chunk_size=len(tokens_1))

    # Check whether seq 2 can be allocated.
    seq_group_2 = create_seq_group(
        seq_output_lens=[0],
        request_id="1",
        seq_id_start=1,
        prompt_token_ids=tokens_2,
        block_size=block_size,
    )
    if num_blocks_required_with_sharing <= num_gpu_blocks:
        assert block_manager.can_allocate(seq_group_2) == AllocStatus.OK
        block_manager.allocate(seq_group_2)
    else:
        assert block_manager.can_allocate(seq_group_2) == AllocStatus.LATER


@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("prompt_len", [1, 7, 8])
@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
Expand Down Expand Up @@ -328,7 +398,10 @@
watermark=0,
enable_caching=enable_caching)
prompt, seq_group = create_dummy_prompt(
"1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
"1",
prompt_length=(num_gpu_blocks - 1) * block_size - 1,
block_size=block_size,
)
prompt.status = SequenceStatus.WAITING
block_manager.allocate(seq_group)
prompt.status = SequenceStatus.RUNNING
Expand Down Expand Up @@ -484,6 +557,7 @@
for token_id in range(num_slots_to_append):
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
seq.data.update_num_computed_tokens(1)
block_manager._computed_blocks_tracker.update_seq(seq)
block_manager.append_slots(seq, num_lookahead_slots=0)
if prompt_len < sliding_window + 10:
check_used(0, sliding_blocks + 1)
Expand Down
103 changes: 85 additions & 18 deletions tests/core/block/test_block_table.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,33 @@
from typing import List
from typing import List, Optional

import pytest

from tests.core.utils import create_dummy_sequence
from vllm.core.block.block_table import BlockTable
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.sequence import Logprob
from vllm.utils import Device, cdiv, chunk_list


def make_sequence(
    request_id: int,
    token_ids: List[int],
    block_size: int,
    num_output_tokens: int = 0,
    output_tokens: Optional[List[int]] = None,
):
    """Build a dummy sequence for the block table tests.

    Args:
        request_id: Passed to `create_dummy_sequence` as `sequence_id`.
        token_ids: Prompt tokens of the sequence.
        block_size: Block size forwarded to `create_dummy_sequence`.
        num_output_tokens: Number of placeholder output tokens
            (`0 .. num_output_tokens - 1`) to generate when
            `output_tokens` is not given.
        output_tokens: Explicit output tokens; takes precedence over
            `num_output_tokens` when not None.

    Returns:
        Whatever `create_dummy_sequence` returns for these arguments.
    """
    generated = (list(range(num_output_tokens))
                 if output_tokens is None else output_tokens)
    return create_dummy_sequence(
        sequence_id=request_id,
        prompt_tokens=token_ids,
        block_size=block_size,
        output_tokens=generated,
    )


@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_naive(block_size: int, sequence_len: int):
Expand Down Expand Up @@ -35,12 +56,13 @@
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc

seq = make_sequence(i, token_ids, block_size)
block_tables.append(
BlockTable(
block_size=block_size,
block_allocator=allocator,
))
block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
block_tables[-1].allocate(seq=seq, device=Device.GPU)


@pytest.mark.parametrize("block_size", [16])
Expand Down Expand Up @@ -82,8 +104,10 @@
BlockTable(
block_size=block_size,
block_allocator=allocator,
enable_prefix_caching=True,
))
block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
seq = make_sequence(alloc_i, token_ids, block_size)
block_tables[-1].allocate(seq=seq, device=Device.GPU)

# Expect all sequences to share allocations, except for their last block
# (which may be mutable).
Expand Down Expand Up @@ -123,10 +147,13 @@
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
enable_prefix_caching=True
if allocator_type == "prefix_caching" else False,

Check failure on line 151 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (SIM210)

tests/core/block/test_block_table.py:150:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 151 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.8)

Ruff (SIM210)

tests/core/block/test_block_table.py:150:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 151 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.11)

Ruff (SIM210)

tests/core/block/test_block_table.py:150:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 151 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.9)

Ruff (SIM210)

tests/core/block/test_block_table.py:150:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 151 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.10)

Ruff (SIM210)

tests/core/block/test_block_table.py:150:31: SIM210 Remove unnecessary `True if ... else False`
)

for i in range(5):
block_table.allocate(token_ids=token_ids, device=device)
seq = make_sequence(i, token_ids, block_size)
block_table.allocate(seq=seq, device=device)
assert allocator.get_num_free_blocks(
device) == num_device_blocks - num_blocks_per_alloc
assert all(block_id is not None
Expand Down Expand Up @@ -166,6 +193,8 @@
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
enable_prefix_caching=True
if allocator_type == "prefix_caching" else False,

Check failure on line 197 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (SIM210)

tests/core/block/test_block_table.py:196:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 197 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.8)

Ruff (SIM210)

tests/core/block/test_block_table.py:196:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 197 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.11)

Ruff (SIM210)

tests/core/block/test_block_table.py:196:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 197 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.9)

Ruff (SIM210)

tests/core/block/test_block_table.py:196:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 197 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.10)

Ruff (SIM210)

tests/core/block/test_block_table.py:196:31: SIM210 Remove unnecessary `True if ... else False`
)

num_expected_blocks_before_append = len(
Expand All @@ -174,11 +203,18 @@
list(chunk_list(token_ids + token_ids_to_append,
block_size))) - num_expected_blocks_before_append

block_table.allocate(token_ids=token_ids, device=Device.GPU)
seq = make_sequence(0, token_ids, block_size)

block_table.allocate(seq=seq, device=Device.GPU)

assert len(
block_table.physical_block_ids) == num_expected_blocks_before_append
block_table.append_token_ids(token_ids_to_append)

# Update the sequence.
for token_id in token_ids_to_append:
seq.append_token_id(token_id, {token_id: Logprob(0.0)})

block_table.append_slots(seq=seq, num_lookahead_slots=0)
assert len(
block_table.physical_block_ids
) == num_expected_blocks_before_append + num_expected_appended_blocks
Expand Down Expand Up @@ -215,6 +251,8 @@
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
enable_prefix_caching=True
if allocator_type == "prefix_caching" else False,

Check failure on line 255 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (SIM210)

tests/core/block/test_block_table.py:254:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 255 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.8)

Ruff (SIM210)

tests/core/block/test_block_table.py:254:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 255 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.11)

Ruff (SIM210)

tests/core/block/test_block_table.py:254:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 255 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.9)

Ruff (SIM210)

tests/core/block/test_block_table.py:254:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 255 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.10)

Ruff (SIM210)

tests/core/block/test_block_table.py:254:31: SIM210 Remove unnecessary `True if ... else False`
)

num_expected_blocks_before_append = len(
Expand All @@ -223,7 +261,9 @@
list(chunk_list(token_ids + [-1] * num_empty_slots,
block_size))) - num_expected_blocks_before_append

block_table.allocate(token_ids=token_ids, device=Device.GPU)
seq = make_sequence(0, token_ids, block_size)

block_table.allocate(seq=seq, device=Device.GPU)

# Assert that the empty slots consume the expected number of additional
# blocks.
Expand All @@ -236,7 +276,10 @@

# Now, ensure no additional blocks consumed as we fill up the empty slots.
num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU)
block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
tokens_to_append = list(range(num_empty_slots))
for token_id in tokens_to_append:
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
block_table.append_slots(seq=seq, num_lookahead_slots=0)
assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU)


Expand Down Expand Up @@ -267,12 +310,18 @@
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
enable_prefix_caching=True
if allocator_type == "prefix_caching" else False,

Check failure on line 314 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (SIM210)

tests/core/block/test_block_table.py:313:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 314 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.8)

Ruff (SIM210)

tests/core/block/test_block_table.py:313:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 314 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.11)

Ruff (SIM210)

tests/core/block/test_block_table.py:313:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 314 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.9)

Ruff (SIM210)

tests/core/block/test_block_table.py:313:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 314 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.10)

Ruff (SIM210)

tests/core/block/test_block_table.py:313:31: SIM210 Remove unnecessary `True if ... else False`
)
block_table.allocate(token_ids=token_ids, device=Device.GPU)
seq = make_sequence(0, token_ids, block_size)
block_table.allocate(seq=seq, device=Device.GPU)

appended_so_far: List[int] = []
for append in chunk_list(token_ids_to_append, append_size):
block_table.append_token_ids(append)
for token_id in append:
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
block_table.append_slots(seq=seq, num_lookahead_slots=0)

appended_so_far.extend(append)

assert block_table._get_all_token_ids() == token_ids + appended_so_far
Expand Down Expand Up @@ -307,9 +356,12 @@
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
enable_prefix_caching=True
if allocator_type == "prefix_caching" else False,

Check failure on line 360 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (SIM210)

tests/core/block/test_block_table.py:359:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 360 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.8)

Ruff (SIM210)

tests/core/block/test_block_table.py:359:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 360 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.11)

Ruff (SIM210)

tests/core/block/test_block_table.py:359:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 360 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.9)

Ruff (SIM210)

tests/core/block/test_block_table.py:359:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 360 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.10)

Ruff (SIM210)

tests/core/block/test_block_table.py:359:31: SIM210 Remove unnecessary `True if ... else False`
)

block_table.allocate(token_ids)
seq = make_sequence(0, token_ids, block_size)
block_table.allocate(seq=seq, device=Device.GPU)

num_free_blocks_before_fork = allocator.get_num_free_blocks(
device=Device.GPU)
Expand Down Expand Up @@ -366,13 +418,16 @@
original_block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
enable_prefix_caching=True
if allocator_type == "prefix_caching" else False,

Check failure on line 422 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (SIM210)

tests/core/block/test_block_table.py:421:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 422 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.8)

Ruff (SIM210)

tests/core/block/test_block_table.py:421:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 422 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.11)

Ruff (SIM210)

tests/core/block/test_block_table.py:421:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 422 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.9)

Ruff (SIM210)

tests/core/block/test_block_table.py:421:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 422 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.10)

Ruff (SIM210)

tests/core/block/test_block_table.py:421:31: SIM210 Remove unnecessary `True if ... else False`
)

num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
num_expected_cow_blocks = cdiv(sequence_len + append_len,
block_size) - (sequence_len // block_size)

original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
seq = make_sequence(0, token_ids, block_size)
original_block_table.allocate(seq=seq, device=Device.GPU)
original_block_ids = original_block_table.physical_block_ids[:]

print("original_block_ids = {}".format(original_block_ids))
Expand All @@ -392,7 +447,9 @@
raise ValueError(f"unknown test config {appender=}")

# Write tokens.
appender_block_table.append_token_ids(token_ids_to_append)
for token_id in token_ids_to_append:
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
appender_block_table.append_slots(seq=seq, num_lookahead_slots=0)

# Expect the non-appending block table to have no change.
assert static_block_table.physical_block_ids == original_block_ids
Expand Down Expand Up @@ -452,9 +509,12 @@
original_block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
enable_prefix_caching=True
if allocator_type == "prefix_caching" else False,

Check failure on line 513 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (SIM210)

tests/core/block/test_block_table.py:512:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 513 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.8)

Ruff (SIM210)

tests/core/block/test_block_table.py:512:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 513 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.11)

Ruff (SIM210)

tests/core/block/test_block_table.py:512:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 513 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.9)

Ruff (SIM210)

tests/core/block/test_block_table.py:512:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 513 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.10)

Ruff (SIM210)

tests/core/block/test_block_table.py:512:31: SIM210 Remove unnecessary `True if ... else False`
)

original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
seq = make_sequence(0, token_ids, block_size)
original_block_table.allocate(seq=seq, device=Device.GPU)

# Allocate lookahead slots.
original_block_table.ensure_num_empty_slots(lookahead_slots)
Expand All @@ -472,7 +532,9 @@
raise ValueError(f"unknown test config {appender=}")

# Write tokens.
appender_block_table.append_token_ids(token_ids_to_append)
for token_id in token_ids_to_append:
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
appender_block_table.append_slots(seq=seq, num_lookahead_slots=0)

# Expect the non-appending block table to have no change.
assert static_block_table.physical_block_ids == original_block_ids
Expand Down Expand Up @@ -534,9 +596,11 @@
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
enable_prefix_caching=True
if allocator_type == "prefix_caching" else False,

Check failure on line 600 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (SIM210)

tests/core/block/test_block_table.py:599:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 600 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.8)

Ruff (SIM210)

tests/core/block/test_block_table.py:599:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 600 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.11)

Ruff (SIM210)

tests/core/block/test_block_table.py:599:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 600 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.9)

Ruff (SIM210)

tests/core/block/test_block_table.py:599:31: SIM210 Remove unnecessary `True if ... else False`

Check failure on line 600 in tests/core/block/test_block_table.py

View workflow job for this annotation

GitHub Actions / ruff (3.10)

Ruff (SIM210)

tests/core/block/test_block_table.py:599:31: SIM210 Remove unnecessary `True if ... else False`
)

block_table.allocate(token_ids=token_ids, device=Device.GPU)
seq = make_sequence(0, token_ids, block_size)
block_table.allocate(seq=seq, device=Device.GPU)

# Add lookahead before fork so both sequences have the same lookahead
# blocks.
Expand All @@ -556,7 +620,10 @@
#
# We expect append_slots to CoW all mutated blocks that have refcount>1.
num_free_blocks_before_append = allocator.get_num_free_blocks(Device.GPU)
block_table.append_token_ids(token_ids_to_append, num_lookahead_slots)
for token_id in token_ids_to_append:
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
block_table.append_slots(seq=seq, num_lookahead_slots=num_lookahead_slots)

num_consumed_blocks = (num_free_blocks_before_append -
allocator.get_num_free_blocks(Device.GPU))

Expand Down
Loading
Loading