From cf718b07d7aa925a1534f7338209041c84c988f9 Mon Sep 17 00:00:00 2001
From: Cade Daniel <edacih@gmail.com>
Date: Fri, 9 Aug 2024 15:14:59 -0700
Subject: [PATCH 1/2] Fix edge case in chunked prefill + block manager v2

---
 tests/core/block/e2e/test_correctness.py | 21 ++++++++++++++++++---
 vllm/core/block/block_table.py           |  6 ++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index e0dee43f500a0..b87e0fd26157d 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -261,11 +261,25 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
             # skip cuda graph creation for fast test.
             "enforce_eager": True,
             "enable_chunked_prefill": True,
-            "max_num_batched_tokens": 2,
-            "max_num_seqs": 2,
         },
     ])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [
+{
+    "block_size": 8,
+    "max_num_batched_tokens": 2,
+    "max_num_seqs": 2,
+},
+{
+    "block_size": 8,
+    "max_num_batched_tokens": 3,
+    "max_num_seqs": 2,
+},
+{
+    "block_size": 8,
+    "max_num_batched_tokens": 256,
+    "max_num_seqs": 10,
+}
+])
 @pytest.mark.parametrize("baseline_llm_kwargs", [
     {
         "use_v2_block_manager": False,
@@ -294,6 +308,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
+        ("1 + " * 50) + " 1 = ", # Longer prompt.
         "The capital of France is",
         "The future of AI is",
     ]
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index 06b816eb367f5..c002dd1397f96 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -356,7 +356,13 @@ def _chunk_token_blocks_for_append(
         appended to blocks. The first such "token block" may have less token ids
         than the block size, since the last allocated block may be partially
         full.
+
+        If no token ids are provided, then no chunks are returned.
         """
+
+        if not token_ids:
+            return []
+
         first_chunk_size = self._block_size - (self._num_full_slots %
                                                self._block_size)
         token_blocks = [token_ids[:first_chunk_size]]

From f19d2f81dbab7bc88dacb0af48578d67068bd61f Mon Sep 17 00:00:00 2001
From: Cade Daniel <edacih@gmail.com>
Date: Fri, 9 Aug 2024 15:16:29 -0700
Subject: [PATCH 2/2] Reformat test_correctness.py (no functional change)

---
 tests/core/block/e2e/test_correctness.py | 33 +++++++++++-------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index b87e0fd26157d..b3d3667b37d88 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -263,23 +263,20 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
             "enable_chunked_prefill": True,
         },
     ])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [
-{
-    "block_size": 8,
-    "max_num_batched_tokens": 2,
-    "max_num_seqs": 2,
-},
-{
-    "block_size": 8,
-    "max_num_batched_tokens": 3,
-    "max_num_seqs": 2,
-},
-{
-    "block_size": 8,
-    "max_num_batched_tokens": 256,
-    "max_num_seqs": 10,
-}
-])
+@pytest.mark.parametrize("per_test_common_llm_kwargs",
+                         [{
+                             "block_size": 8,
+                             "max_num_batched_tokens": 2,
+                             "max_num_seqs": 2,
+                         }, {
+                             "block_size": 8,
+                             "max_num_batched_tokens": 3,
+                             "max_num_seqs": 2,
+                         }, {
+                             "block_size": 8,
+                             "max_num_batched_tokens": 256,
+                             "max_num_seqs": 10,
+                         }])
 @pytest.mark.parametrize("baseline_llm_kwargs", [
     {
         "use_v2_block_manager": False,
@@ -308,7 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
-        ("1 + " * 50) + " 1 = ", # Longer prompt.
+        ("1 + " * 50) + " 1 = ",  # Longer prompt.
         "The capital of France is",
         "The future of AI is",
     ]