From cf718b07d7aa925a1534f7338209041c84c988f9 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Fri, 9 Aug 2024 15:14:59 -0700 Subject: [PATCH 1/2] Fix edge case in chunked prefill + block manager v2 --- tests/core/block/e2e/test_correctness.py | 21 ++++++++++++++++++--- vllm/core/block/block_table.py | 6 ++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index e0dee43f500a0..b87e0fd26157d 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -261,11 +261,25 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, # skip cuda graph creation for fast test. "enforce_eager": True, "enable_chunked_prefill": True, - "max_num_batched_tokens": 2, - "max_num_seqs": 2, }, ]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ +{ + "block_size": 8, + "max_num_batched_tokens": 2, + "max_num_seqs": 2, +}, +{ + "block_size": 8, + "max_num_batched_tokens": 3, + "max_num_seqs": 2, +}, +{ + "block_size": 8, + "max_num_batched_tokens": 256, + "max_num_seqs": 10, +} +]) @pytest.mark.parametrize("baseline_llm_kwargs", [ { "use_v2_block_manager": False, @@ -294,6 +308,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator, prompts = [ "Hello, my name is", "The president of the United States is", + ("1 + " * 50) + " 1 = ", # Longer prompt. "The capital of France is", "The future of AI is", ] diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 06b816eb367f5..c002dd1397f96 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -356,7 +356,13 @@ def _chunk_token_blocks_for_append( appended to blocks. The first such "token block" may have less token ids than the block size, since the last allocated block may be partially full. + + If no token ids are provided, then no chunks are returned. """ + + if not token_ids: + return [] + first_chunk_size = self._block_size - (self._num_full_slots % self._block_size) token_blocks = [token_ids[:first_chunk_size]] From f19d2f81dbab7bc88dacb0af48578d67068bd61f Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Fri, 9 Aug 2024 15:16:29 -0700 Subject: [PATCH 2/2] format --- tests/core/block/e2e/test_correctness.py | 33 +++++++++++------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index b87e0fd26157d..b3d3667b37d88 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -263,23 +263,20 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, "enable_chunked_prefill": True, }, ]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ -{ - "block_size": 8, - "max_num_batched_tokens": 2, - "max_num_seqs": 2, -}, -{ - "block_size": 8, - "max_num_batched_tokens": 3, - "max_num_seqs": 2, -}, -{ - "block_size": 8, - "max_num_batched_tokens": 256, - "max_num_seqs": 10, -} -]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", + [{ + "block_size": 8, + "max_num_batched_tokens": 2, + "max_num_seqs": 2, + }, { + "block_size": 8, + "max_num_batched_tokens": 3, + "max_num_seqs": 2, + }, { + "block_size": 8, + "max_num_batched_tokens": 256, + "max_num_seqs": 10, + }]) @pytest.mark.parametrize("baseline_llm_kwargs", [ { "use_v2_block_manager": False, @@ -308,7 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator, prompts = [ "Hello, my name is", "The president of the United States is", - ("1 + " * 50) + " 1 = ", # Longer prompt. + ("1 + " * 50) + " 1 = ", # Longer prompt. "The capital of France is", "The future of AI is", ]