[None][doc] update feature_combination_matrix of disaggregated and chunked prefill

leslie-fang25 · leslie-fang25 · commit 1e71ba2e9bad · 2025-08-18T23:43:25.000-07:00
Signed-off-by: leslie-fang25 <leslief@nvidia.com>
diff --git a/docs/source/torch/features/feature_combination_matrix.md b/docs/source/torch/features/feature_combination_matrix.md
@@ -6,13 +6,13 @@
 | CUDA Graph                 | Yes               | ---        |                            |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |
 | Attention Data Parallelism | Yes               | Yes        | ---                        |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |
 | Disaggregated Serving      | Yes               | Yes        | Yes                        | ---                   |                 |          |                           |                           |               |                  |                |                        |                       |                 |
-| Chunked Prefill            | Yes               | Yes        | Yes                        | Untested              | ---             |          |                           |                           |               |                  |                |                        |                       |                 |
+| Chunked Prefill            | Yes               | Yes        | Yes                        | Yes                   | ---             |          |                           |                           |               |                  |                |                        |                       |                 |
 | MTP                        | Yes               | Yes        | Yes                        | Yes                   | Yes        | ---      |                           |                           |               |                  |                |                        |                       |                 |
 | EAGLE-3(One Model Engine)  | Yes               | Yes        | Yes                        | Yes                   | Yes             | No       | ---                       |                           |               |                  |                |                        |                       |                 |
-| EAGLE-3(Two Model Engine)  | NO                | Yes        | Yes                        | Yes                   | Yes             | No       | No                        | ---                       |               |                  |                |                        |                       |                 |
+| EAGLE-3(Two Model Engine)  | No                | Yes        | Yes                        | Yes                   | Yes             | No       | No                        | ---                       |               |                  |                |                        |                       |                 |
 | Torch Sampler              | Yes               | Yes        | Yes                        | Yes                   | Yes             | Yes      | Yes                       | Yes                       | ---           |                  |                |                        |                       |                 |
 | TLLM C++ Sampler           | Yes               | Yes        | Yes                        | Yes                   | Yes             | No       | No                        | No                        | No            | ---              |                |                        |                       |                 |
-| KV Cache Reuse             | Yes               | Yes        | Yes                        | Untested              | Yes             | Untested | Yes                       | No                        | Yes           | Yes              | ---            |                        |                       |                 |
-| Slide Window Attention     | Yes               | Yes        | Yes                        | Untested              | No              | Untested | Untested                  | Untested                  | Yes           | Yes              | WIP            | ---                    |                       |                 |
+| KV Cache Reuse             | Yes               | Yes        | Yes                        | Yes                   | Yes             | Untested | Yes                       | No                        | Yes           | Yes              | ---            |                        |                       |                 |
+| Slide Window Attention     | Yes               | Yes        | Yes                        | Yes                   | No              | Untested | Untested                  | Untested                  | Yes           | Yes              | WIP            | ---                    |                       |                 |
 | Logits Post Processor      | No                | Yes        | Yes                        | No                    | Yes             | No       | No                        | No                        | Yes           | Yes              | Yes            | Yes                    | ---                   |                 |
 | Guided Decoding            | Yes               | Yes        | Yes                        | Yes                   | Yes             | No       | No                        | Yes                       | Yes           | Yes              | Yes            | Yes                    | Yes                   | ---             |
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -797,6 +797,43 @@ def test_auto_dtype(self, overlap_scheduler):
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 
+    def test_chunked_prefill(self):  # Evaluates GSM8K and MMLU through a disaggregated launch whose context config enables chunked prefill.
+        ctx_server_config = {  # first of the two per-server configs handed to launch_disaggregated_llm
+            "disable_overlap_scheduler": True,
+            "cuda_graph_config": None,
+            "cache_transceiver_config": {
+                "backend": "DEFAULT"
+            },
+            "enable_chunked_prefill": True,  # the option under test; it is set only in this config
+            "max_num_tokens": 256,  # NOTE(review): presumably kept small so prompts span several chunks - confirm
+        }
+        gen_server_config = {  # second per-server config; enable_chunked_prefill is not set here
+            "cuda_graph_config": None,
+            "cache_transceiver_config": {
+                "backend": "DEFAULT"
+            }
+        }
+        disaggregated_server_config = {  # top-level config: localhost:8000, one context and one generation instance
+            "hostname": "localhost",
+            "port": 8000,
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8001"]  # NOTE(review): fixed port; assumes 8001 is free on the test host
+            },
+            "generation_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8002"]  # NOTE(review): fixed port; assumes 8002 is free on the test host
+            }
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
+            task = GSM8K(self.MODEL_NAME)  # both tasks are evaluated against the same llm object
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
 @skip_pre_blackwell
 @pytest.mark.timeout(3600)
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -41,6 +41,7 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
   - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
   - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_chunked_prefill
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]