@@ -229,35 +229,6 @@ def test_fp8_beam_search(self):
                           sampling_params=sampling_params,
                           extra_acc_spec="beam_width=4")
 
-    @skip_pre_hopper
-    @pytest.mark.parametrize("eagle3_one_model", [True, False],
-                             ids=["one_model", "two_model"])
-    def test_eagle3(self, eagle3_one_model):
-        pytorch_config = dict(
-            disable_overlap_scheduler=True,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
-        )
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                        free_gpu_memory_fraction=0.7)
-
-        eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B"
-        target_model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
-
-        draft_len = 4
-        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
-                                          speculative_model_dir=eagle_model_dir,
-                                          eagle3_one_model=eagle3_one_model)
-
-        with LLM(model=target_model_dir,
-                 **pytorch_config,
-                 kv_cache_config=kv_cache_config,
-                 speculative_config=spec_config,
-                 build_config=None) as llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
     @skip_pre_hopper
     def test_ngram(self):
         pytorch_config = dict(
@@ -370,26 +341,24 @@ def test_auto_dtype_tp8(self):
                           extra_evaluator_kwargs=dict(apply_chat_template=True))
 
     @pytest.mark.skip_less_mpi_world_size(8)
-    @pytest.mark.parametrize("eagle3_one_model", [True, False],
-                             ids=["one_model", "two_model"])
+    @pytest.mark.parametrize("eagle3_one_model", [True, False])
     def test_eagle3_tp8(self, eagle3_one_model):
         model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         spec_config = EagleDecodingConfig(max_draft_len=4,
                                           speculative_model_dir=eagle_model_dir,
                                           eagle3_one_model=eagle3_one_model)
+        pytorch_config = dict(disable_overlap_scheduler=True)
         with LLM(model_path,
                  tensor_parallel_size=8,
                  speculative_config=spec_config,
-                 kv_cache_config=kv_cache_config) as llm:
-            task = MMLU(self.MODEL_NAME)
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            task = GSM8K(self.MODEL_NAME)
+            task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
-            task = GPQADiamond(self.MODEL_NAME)
-            task.evaluate(llm,
-                          extra_evaluator_kwargs=dict(apply_chat_template=True))
 
     @pytest.mark.skip_less_device(4)
     @skip_pre_hopper
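
For reference, the Eagle3 speculative-decoding pattern exercised by test_eagle3_tp8 can be reproduced outside the test harness. The sketch below is illustration only, not part of this change: the symbols (LLM, EagleDecodingConfig, KvCacheConfig) appear in the diff itself, but the import locations, the placeholder model paths, and the generate() call are assumptions.

    # Minimal sketch, assuming the tensorrt_llm LLM API this test file uses.
    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig  # assumed import path

    spec_config = EagleDecodingConfig(
        max_draft_len=4,                      # draft tokens proposed per step
        speculative_model_dir="<eagle3-draft-model-dir>",  # placeholder path
        eagle3_one_model=True)                # True: fused one-model mode; False: two-model mode
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)

    with LLM("<target-model-dir>",            # placeholder path
             tensor_parallel_size=8,
             speculative_config=spec_config,
             kv_cache_config=kv_cache_config,
             disable_overlap_scheduler=True) as llm:
        for out in llm.generate(["The capital of France is"]):
            print(out.outputs[0].text)
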
@@ -469,18 +438,21 @@ def test_chunked_prefill(self, attn_backend):
 
     @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("torch_compile", [True, False])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1)],
                              ids=["tp8"])
-    def test_fp8_eagle3(self, cuda_graph, tp_size, pp_size, ep_size):
+    def test_fp8_eagle3(self, tp_size, pp_size, ep_size, torch_compile):
         model_path = f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"
         eagle_model_dir = f"{llm_models_root()}/Llama-4-Maverick-17B-128E-Eagle3"
         spec_config = EagleDecodingConfig(max_draft_len=3,
                                           speculative_model_dir=eagle_model_dir)
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.75)
         pytorch_config = dict(
-            disable_overlap_scheduler=not cuda_graph,
             cuda_graph_config=CudaGraphConfig(max_batch_size=8),
-            enable_attention_dp=False)
+            enable_attention_dp=False,
+            torch_compile_config=TorchCompileConfig(
+                enable_fullgraph=torch_compile))
         with LLM(model_path,
                  kv_cache_config=kv_cache_config,
                  tensor_parallel_size=tp_size,
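
The new torch_compile parametrization drives TorchCompileConfig, which gates full-graph compilation of the PyTorch backend config built in this hunk. A minimal sketch of the toggle, with import locations assumed:

    from tensorrt_llm.llmapi import CudaGraphConfig, TorchCompileConfig  # assumed import path

    # enable_fullgraph=True requests a single full-graph torch.compile of the
    # model; False exercises the uncompiled path (the second test variant).
    pytorch_config = dict(
        cuda_graph_config=CudaGraphConfig(max_batch_size=8),
        enable_attention_dp=False,
        torch_compile_config=TorchCompileConfig(enable_fullgraph=True))
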