diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index e0dee43f500a0..b3d3667b37d88 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -261,11 +261,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
         # skip cuda graph creation for fast test.
         "enforce_eager": True,
         "enable_chunked_prefill": True,
-        "max_num_batched_tokens": 2,
-        "max_num_seqs": 2,
     },
 ])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("per_test_common_llm_kwargs",
+                         [{
+                             "block_size": 8,
+                             "max_num_batched_tokens": 2,
+                             "max_num_seqs": 2,
+                         }, {
+                             "block_size": 8,
+                             "max_num_batched_tokens": 3,
+                             "max_num_seqs": 2,
+                         }, {
+                             "block_size": 8,
+                             "max_num_batched_tokens": 256,
+                             "max_num_seqs": 10,
+                         }])
 @pytest.mark.parametrize("baseline_llm_kwargs", [
     {
         "use_v2_block_manager": False,
@@ -294,6 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
+        ("1 + " * 50) + " 1 = ",  # Longer prompt.
         "The capital of France is",
         "The future of AI is",
     ]
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index 06b816eb367f5..c002dd1397f96 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -356,7 +356,13 @@ def _chunk_token_blocks_for_append(
         appended to blocks. The first such "token block" may have less token
         ids than the block size, since the last allocated block may be
         partially full.
+
+        If no token ids are provided, then no chunks are returned.
         """
+
+        if not token_ids:
+            return []
+
         first_chunk_size = self._block_size - (self._num_full_slots %
                                                self._block_size)
         token_blocks = [token_ids[:first_chunk_size]]
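
For reviewers, below is a minimal standalone sketch of the chunking logic this patch guards, with the BlockTable attributes self._block_size and self._num_full_slots passed in as plain parameters. The function name and signature here are illustrative, not part of the vLLM API; it only demonstrates why the early return matters: given an empty token_ids list, the unguarded code would still emit one empty "token block".

from typing import List


def chunk_token_blocks_for_append(token_ids: List[int], block_size: int,
                                  num_full_slots: int) -> List[List[int]]:
    """Sketch of BlockTable._chunk_token_blocks_for_append as patched above.

    Splits token_ids into block-sized chunks for appending to blocks. The
    first chunk may hold fewer ids than block_size, since the last allocated
    block may be partially full. An empty input yields no chunks.
    """
    if not token_ids:
        return []

    # Fill whatever room remains in the partially full last block first.
    first_chunk_size = block_size - (num_full_slots % block_size)
    token_blocks = [token_ids[:first_chunk_size]]
    # The rest splits into block_size-sized chunks (the last may be short).
    token_blocks.extend(token_ids[i:i + block_size]
                        for i in range(first_chunk_size, len(token_ids),
                                       block_size))
    return token_blocks


# Last block has 3 of 8 slots used, so the first chunk gets 5 token ids.
assert chunk_token_blocks_for_append(list(range(10)), 8, 3) == [
    [0, 1, 2, 3, 4], [5, 6, 7, 8, 9]
]
# Without the `if not token_ids` guard, this would return [[]] instead.
assert chunk_token_blocks_for_append([], 8, 3) == []

The new test parameters (max_num_batched_tokens of 2 and 3 against block_size 8, plus the longer ("1 + " * 50) prompt) appear chosen so that chunked prefill schedules appends that start mid-block and can be empty, exercising exactly this path.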