diff --git a/docs/source/torch/features/feature_combination_matrix.md b/docs/source/torch/features/feature_combination_matrix.md
index 8dd701f689e..214d37b61d8 100644
--- a/docs/source/torch/features/feature_combination_matrix.md
+++ b/docs/source/torch/features/feature_combination_matrix.md
@@ -8,8 +8,8 @@
 | Disaggregated Serving | Yes | Yes | Yes | --- | | | | | | | | | | |
 | Chunked Prefill | Yes | Yes | Yes | Untested | --- | | | | | | | | | |
 | MTP | Yes | Yes | Yes | Yes | Untested | --- | | | | | | | | |
-| EAGLE-3(One Model Engine) | Yes | Yes | Yes | No | Untested | No | --- | | | | | | | |
-| EAGLE-3(Two Model Engine) | NO | Yes | Yes | No | Untested | No | No | --- | | | | | | |
+| EAGLE-3(One Model Engine) | Yes | Yes | Yes | No | Yes | No | --- | | | | | | | |
+| EAGLE-3(Two Model Engine) | No | Yes | Yes | No | Yes | No | No | --- | | | | | | |
 | Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | |
 | TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | |
 | KV Cache Reuse | Yes | Yes | Yes | Untested | Yes | Untested | Yes | No | Yes | Yes | --- | | | |
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 03ef770d454..276ad131217 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1955,7 +1955,9 @@ def test_bf16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)
 
-    def test_eagle3(self):
+    @parametrize_with_ids("eagle3_one_model", [True, False])
+    @parametrize_with_ids("enable_chunked_prefill", [False, True])
+    def test_eagle3(self, enable_chunked_prefill, eagle3_one_model):
         pytorch_config = dict(
             disable_overlap_scheduler=True,
             cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
@@ -1967,11 +1969,13 @@ def test_eagle3(self):
 
         draft_len = 4
         spec_config = EagleDecodingConfig(max_draft_len=draft_len,
-                                          speculative_model_dir=eagle_model_dir)
+                                          speculative_model_dir=eagle_model_dir,
+                                          eagle3_one_model=eagle3_one_model)
 
         llm = LLM(model=target_model_dir,
                   **pytorch_config,
                   kv_cache_config=kv_cache_config,
+                  enable_chunked_prefill=enable_chunked_prefill,
                   speculative_config=spec_config,
                   build_config=None)
 
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index 42ba6aee54b..0289c317a1b 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -43,7 +43,10 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=False]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=True]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=True]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding[mtp_nextn=0]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding[mtp_nextn=2]
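
Note (not part of the patch): the parametrized test above reduces to the LLM-API call pattern sketched below. This is a minimal sketch, assuming the usual tensorrt_llm / tensorrt_llm.llmapi import paths; the model directories and kv-cache settings are placeholders, since the diff only shows the keyword arguments it touches (eagle3_one_model, enable_chunked_prefill).

# Minimal sketch of the configuration test_eagle3 exercises.
# Import paths, model directories, and KvCacheConfig settings are assumptions,
# not taken from the diff.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

target_model_dir = "/path/to/Qwen3-8B"     # placeholder target checkpoint
eagle_model_dir = "/path/to/eagle3-draft"  # placeholder EAGLE-3 draft model

spec_config = EagleDecodingConfig(
    max_draft_len=4,                       # draft_len used by the test
    speculative_model_dir=eagle_model_dir,
    eagle3_one_model=True,                 # False selects the two-model engine
)

llm = LLM(
    model=target_model_dir,
    disable_overlap_scheduler=True,        # mirrors pytorch_config in the test
    cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
    kv_cache_config=KvCacheConfig(enable_block_reuse=False),  # assumed settings
    enable_chunked_prefill=True,           # the second axis the test sweeps
    speculative_config=spec_config,
    build_config=None,
)

# Single-prompt generation; speculative decoding is transparent to the caller.
print(llm.generate("The capital of France is").outputs[0].text)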