rickyyx · rickyyx · Oct 10, 2024 · Oct 11, 2024 · Oct 28, 2024 · Oct 28, 2024
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
@@ -117,8 +117,8 @@ def main(args):
     input_length_range = tuple(map(int, args.input_length_range.split(':')))
     random.seed(args.seed)
     if args.dataset_path is not None:
-        print(f"Start to sample {args.num_prompts} prompts"
-              "from {args.dataset_path}")
+        print(f"Start to sample {args.num_prompts} prompts "
+              f"from {args.dataset_path}")
         filtered_datasets = sample_requests(
             dataset_path=args.dataset_path,
             num_requests=args.num_prompts,
@@ -132,6 +132,7 @@ def main(args):
                              ] * args.num_prompts
 
     engine_args = EngineArgs.from_cli_args(args)
+    engine_args.enable_chunked_prefill = True
 
     llm = LLM(**dataclasses.asdict(engine_args))
 

diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py
@@ -1,3 +1,4 @@
+import math
 import pytest
 
 from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
@@ -205,6 +206,75 @@
     assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
 
 
+@pytest.mark.parametrize("block_size", [1, 4])
+@pytest.mark.parametrize("num_prefill_tokens", [1, 2, 4, 5, 6, 8, 10])
+@pytest.mark.parametrize("prefix_shared_percentage", [0.0, 0.3, 0.5, 0.7, 1.0])
+def test_can_allocate_with_prefix_cache(block_size: int,
+                                        num_prefill_tokens: int,
+                                        prefix_shared_percentage: float):
+    """Check can_allocate() for a 2nd prompt sharing a prefix with the 1st.
+
+    The pool holds ceil(1.5x) the blocks of one prompt, so the second prompt
+    is expected to fit only when enough full blocks are shared.
+    """
+    blocks_per_seq = math.ceil(num_prefill_tokens / block_size)
+    pool_size = math.ceil(1.5 * blocks_per_seq)
+
+    shared_len = int(num_prefill_tokens * prefix_shared_percentage)
+    # Only whole blocks are counted as shared.
+    shared_blocks = shared_len // block_size
+
+    tokens_1 = list(range(num_prefill_tokens))
+    # Same prefix as tokens_1, every later token shifted by 10.
+    tokens_2 = tokens_1[:shared_len] + [t + 10 for t in tokens_1[shared_len:]]
+
+    # Two sequences' worth of blocks, minus the blocks counted as shared.
+    blocks_needed = 2 * blocks_per_seq - shared_blocks
+
+    print(f"tokens_1: {tokens_1}")
+    print(f"tokens_2: {tokens_2}")
+    print(f"num_blocks_shared: {shared_blocks}")
+    print(f"num_blocks_required_seq: {blocks_per_seq}")
+    print(f"num_gpu_blocks: {pool_size}")
+    print(f"num_blocks_required_with_sharing: {blocks_needed}")
+
+    block_manager = SelfAttnBlockSpaceManager(
+        block_size=block_size,
+        num_gpu_blocks=pool_size,
+        num_cpu_blocks=0,
+        enable_caching=True,  # Prefix cache
+    )
+
+    # The first sequence must fit into the empty pool.
+    seq_group_1 = create_seq_group(
+        seq_output_lens=[0],
+        request_id="0",
+        seq_id_start=0,
+        prompt_token_ids=tokens_1,
+        block_size=block_size,
+    )
+    assert block_manager.can_allocate(seq_group_1) == AllocStatus.OK
+    block_manager.allocate(seq_group_1)
+
+    # Mark all of seq 1 as computed; in reality the scheduler does this.
+    block_manager.mark_blocks_as_computed(seq_group=seq_group_1,
+                                          token_chunk_size=len(tokens_1))
+
+    # The second sequence fits only if the pool covers both after sharing.
+    seq_group_2 = create_seq_group(
+        seq_output_lens=[0],
+        request_id="1",
+        seq_id_start=1,
+        prompt_token_ids=tokens_2,
+        block_size=block_size,
+    )
+    fits = blocks_needed <= pool_size
+    expected_status = AllocStatus.OK if fits else AllocStatus.LATER
+    assert block_manager.can_allocate(seq_group_2) == expected_status
+    if fits:
+        block_manager.allocate(seq_group_2)
+
+
 @pytest.mark.parametrize("block_size", [1, 8])
 @pytest.mark.parametrize("prompt_len", [1, 7, 8])
 @pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
@@ -328,7 +398,10 @@
                                               watermark=0,
                                               enable_caching=enable_caching)
     prompt, seq_group = create_dummy_prompt(
-        "1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
+        "1",
+        prompt_length=(num_gpu_blocks - 1) * block_size - 1,
+        block_size=block_size,
+    )
     prompt.status = SequenceStatus.WAITING
     block_manager.allocate(seq_group)
     prompt.status = SequenceStatus.RUNNING
@@ -484,6 +557,7 @@
     for token_id in range(num_slots_to_append):
         seq.append_token_id(token_id, {token_id: Logprob(0.0)})
         seq.data.update_num_computed_tokens(1)
+        block_manager._computed_blocks_tracker.update_seq(seq)
         block_manager.append_slots(seq, num_lookahead_slots=0)
         if prompt_len < sliding_window + 10:
             check_used(0, sliding_blocks + 1)

diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py
@@ -1,12 +1,33 @@
-from typing import List
+from typing import List, Optional
 
 import pytest
 
+from tests.core.utils import create_dummy_sequence
 from vllm.core.block.block_table import BlockTable
 from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
+from vllm.sequence import Logprob
 from vllm.utils import Device, cdiv, chunk_list
 
 
+def make_sequence(
+    request_id: int,
+    token_ids: List[int],
+    block_size: int,
+    num_output_tokens: int = 0,
+    output_tokens: Optional[List[int]] = None,
+):
+    """Build a dummy sequence; output defaults to range(num_output_tokens)."""
+    # An explicit `output_tokens` list wins over `num_output_tokens`.
+    generated = (list(range(num_output_tokens))
+                 if output_tokens is None else output_tokens)
+    return create_dummy_sequence(
+        sequence_id=request_id,
+        prompt_tokens=token_ids,
+        block_size=block_size,
+        output_tokens=generated,
+    )
+
+
 @pytest.mark.parametrize("block_size", [16])
 @pytest.mark.parametrize("sequence_len", [1, 16, 129])
 def test_allocate_naive(block_size: int, sequence_len: int):
@@ -35,12 +56,13 @@
         assert allocator.get_num_free_blocks(
             device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
 
+        seq = make_sequence(i, token_ids, block_size)
         block_tables.append(
             BlockTable(
                 block_size=block_size,
                 block_allocator=allocator,
             ))
-        block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
+        block_tables[-1].allocate(seq=seq, device=Device.GPU)
 
 
 @pytest.mark.parametrize("block_size", [16])
@@ -82,8 +104,10 @@
             BlockTable(
                 block_size=block_size,
                 block_allocator=allocator,
+                enable_prefix_caching=True,
             ))
-        block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
+        seq = make_sequence(alloc_i, token_ids, block_size)
+        block_tables[-1].allocate(seq=seq, device=Device.GPU)
 
         # Expect all sequences to share allocations, except for their last block
         # (which may be mutable).
@@ -123,10 +147,13 @@
     block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
 
     for i in range(5):
-        block_table.allocate(token_ids=token_ids, device=device)
+        seq = make_sequence(i, token_ids, block_size)
+        block_table.allocate(seq=seq, device=device)
         assert allocator.get_num_free_blocks(
             device) == num_device_blocks - num_blocks_per_alloc
         assert all(block_id is not None
@@ -166,6 +193,8 @@
     block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
 
     num_expected_blocks_before_append = len(
@@ -174,11 +203,18 @@
         list(chunk_list(token_ids + token_ids_to_append,
                         block_size))) - num_expected_blocks_before_append
 
-    block_table.allocate(token_ids=token_ids, device=Device.GPU)
+    seq = make_sequence(0, token_ids, block_size)
+
+    block_table.allocate(seq=seq, device=Device.GPU)
 
     assert len(
         block_table.physical_block_ids) == num_expected_blocks_before_append
-    block_table.append_token_ids(token_ids_to_append)
+
+    # Update the sequence.
+    for token_id in token_ids_to_append:
+        seq.append_token_id(token_id, {token_id: Logprob(0.0)})
+
+    block_table.append_slots(seq=seq, num_lookahead_slots=0)
     assert len(
         block_table.physical_block_ids
     ) == num_expected_blocks_before_append + num_expected_appended_blocks
@@ -215,6 +251,8 @@
     block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
 
     num_expected_blocks_before_append = len(
@@ -223,7 +261,9 @@
         list(chunk_list(token_ids + [-1] * num_empty_slots,
                         block_size))) - num_expected_blocks_before_append
 
-    block_table.allocate(token_ids=token_ids, device=Device.GPU)
+    seq = make_sequence(0, token_ids, block_size)
+
+    block_table.allocate(seq=seq, device=Device.GPU)
 
     # Assert that the empty slots consume the expected number of additional
     # blocks.
@@ -236,7 +276,10 @@
 
     # Now, ensure no additional blocks consumed as we fill up the empty slots.
     num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU)
-    block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
+    tokens_to_append = list(range(num_empty_slots))
+    for token_id in tokens_to_append:
+        seq.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_table.append_slots(seq=seq, num_lookahead_slots=0)
     assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU)
 
 
@@ -267,12 +310,18 @@
     block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
-    block_table.allocate(token_ids=token_ids, device=Device.GPU)
+    seq = make_sequence(0, token_ids, block_size)
+    block_table.allocate(seq=seq, device=Device.GPU)
 
     appended_so_far: List[int] = []
     for append in chunk_list(token_ids_to_append, append_size):
-        block_table.append_token_ids(append)
+        for token_id in append:
+            seq.append_token_id(token_id, {token_id: Logprob(0.0)})
+        block_table.append_slots(seq=seq, num_lookahead_slots=0)
+
         appended_so_far.extend(append)
 
         assert block_table._get_all_token_ids() == token_ids + appended_so_far
@@ -307,9 +356,12 @@
     block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
 
-    block_table.allocate(token_ids)
+    seq = make_sequence(0, token_ids, block_size)
+    block_table.allocate(seq=seq, device=Device.GPU)
 
     num_free_blocks_before_fork = allocator.get_num_free_blocks(
         device=Device.GPU)
@@ -366,13 +418,16 @@
     original_block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
 
     num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
     num_expected_cow_blocks = cdiv(sequence_len + append_len,
                                    block_size) - (sequence_len // block_size)
 
-    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
+    seq = make_sequence(0, token_ids, block_size)
+    original_block_table.allocate(seq=seq, device=Device.GPU)
     original_block_ids = original_block_table.physical_block_ids[:]
 
     print("original_block_ids = {}".format(original_block_ids))
@@ -392,7 +447,9 @@
         raise ValueError(f"unknown test config {appender=}")
 
     # Write tokens.
-    appender_block_table.append_token_ids(token_ids_to_append)
+    for token_id in token_ids_to_append:
+        seq.append_token_id(token_id, {token_id: Logprob(0.0)})
+    appender_block_table.append_slots(seq=seq, num_lookahead_slots=0)
 
     # Expect the non-appending block table to have no change.
     assert static_block_table.physical_block_ids == original_block_ids
@@ -452,9 +509,12 @@
     original_block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
 
-    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
+    seq = make_sequence(0, token_ids, block_size)
+    original_block_table.allocate(seq=seq, device=Device.GPU)
 
     # Allocate lookahead slots.
     original_block_table.ensure_num_empty_slots(lookahead_slots)
@@ -472,7 +532,9 @@
         raise ValueError(f"unknown test config {appender=}")
 
     # Write tokens.
-    appender_block_table.append_token_ids(token_ids_to_append)
+    for token_id in token_ids_to_append:
+        seq.append_token_id(token_id, {token_id: Logprob(0.0)})
+    appender_block_table.append_slots(seq=seq, num_lookahead_slots=0)
 
     # Expect the non-appending block table to have no change.
     assert static_block_table.physical_block_ids == original_block_ids
@@ -534,9 +596,11 @@
     block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
-
-    block_table.allocate(token_ids=token_ids, device=Device.GPU)
+    seq = make_sequence(0, token_ids, block_size)
+    block_table.allocate(seq=seq, device=Device.GPU)
 
     # Add lookahead before fork so both sequences have the same lookahead
     # blocks.
@@ -556,7 +620,10 @@
     #
     # We expect append_token_ids to CoW all mutated blocks that have refcount>1.
     num_free_blocks_before_append = allocator.get_num_free_blocks(Device.GPU)
-    block_table.append_token_ids(token_ids_to_append, num_lookahead_slots)
+    for token_id in token_ids_to_append:
+        seq.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_table.append_slots(seq=seq, num_lookahead_slots=num_lookahead_slots)
+
     num_consumed_blocks = (num_free_blocks_before_append -
                            allocator.get_num_free_blocks(Device.GPU))