
Commit e1e1f22

update test config
Signed-off-by: Ivy Zhang <[email protected]>
1 parent 09ba8d8 commit e1e1f22

File tree: 4 files changed (+19, -46 lines)

tests/integration/defs/accuracy/references/cnn_dailymail.yaml

Lines changed: 2 additions & 0 deletions
@@ -169,6 +169,8 @@ meta-llama/Llama-3.2-3B:
     kv_cache_quant_algo: FP8
     accuracy: 33.629
 meta-llama/Llama-3.3-70B-Instruct:
+  - spec_dec_algo: Eagle
+    accuracy: 33.244
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 34.383

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 86.40
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    spec_dec_algo: Eagle3
+    spec_dec_algo: Eagle
     accuracy: 86.40
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
   - accuracy: 80.00

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 14 additions & 42 deletions
@@ -229,35 +229,6 @@ def test_fp8_beam_search(self):
                       sampling_params=sampling_params,
                       extra_acc_spec="beam_width=4")

-    @skip_pre_hopper
-    @pytest.mark.parametrize("eagle3_one_model", [True, False],
-                             ids=["one_model", "two_model"])
-    def test_eagle3(self, eagle3_one_model):
-        pytorch_config = dict(
-            disable_overlap_scheduler=True,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
-        )
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                        free_gpu_memory_fraction=0.7)
-
-        eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B"
-        target_model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
-
-        draft_len = 4
-        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
-                                          speculative_model_dir=eagle_model_dir,
-                                          eagle3_one_model=eagle3_one_model)
-
-        with LLM(model=target_model_dir,
-                 **pytorch_config,
-                 kv_cache_config=kv_cache_config,
-                 speculative_config=spec_config,
-                 build_config=None) as llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
     @skip_pre_hopper
     def test_ngram(self):
         pytorch_config = dict(
@@ -370,26 +341,24 @@ def test_auto_dtype_tp8(self):
                           extra_evaluator_kwargs=dict(apply_chat_template=True))

     @pytest.mark.skip_less_mpi_world_size(8)
-    @pytest.mark.parametrize("eagle3_one_model", [True, False],
-                             ids=["one_model", "two_model"])
+    @pytest.mark.parametrize("eagle3_one_model", [True, False])
     def test_eagle3_tp8(self, eagle3_one_model):
         model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         spec_config = EagleDecodingConfig(max_draft_len=4,
                                           speculative_model_dir=eagle_model_dir,
                                           eagle3_one_model=eagle3_one_model)
+        pytorch_config = dict(disable_overlap_scheduler=True, )
         with LLM(model_path,
                  tensor_parallel_size=8,
                  speculative_config=spec_config,
-                 kv_cache_config=kv_cache_config) as llm:
-            task = MMLU(self.MODEL_NAME)
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            task = GSM8K(self.MODEL_NAME)
+            task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
-            task = GPQADiamond(self.MODEL_NAME)
-            task.evaluate(llm,
-                          extra_evaluator_kwargs=dict(apply_chat_template=True))

     @pytest.mark.skip_less_device(4)
     @skip_pre_hopper
@@ -469,18 +438,21 @@ def test_chunked_prefill(self, attn_backend):

     @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("torch_compile", [True, False])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1)],
                              ids=["tp8"])
-    def test_fp8_eagle3(self, cuda_graph, tp_size, pp_size, ep_size):
+    def test_fp8_eagle3(self, tp_size, pp_size, ep_size, torch_compile):
         model_path = f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"
         eagle_model_dir = f"{llm_models_root()}/Llama-4-Maverick-17B-128E-Eagle3"
         spec_config = EagleDecodingConfig(max_draft_len=3,
                                           speculative_model_dir=eagle_model_dir)
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.75)
         pytorch_config = dict(
-            disable_overlap_scheduler=not cuda_graph,
             cuda_graph_config=CudaGraphConfig(max_batch_size=8),
-            enable_attention_dp=False)
+            enable_attention_dp=False,
+            torch_compile_config=TorchCompileConfig(
+                enable_fullgraph=torch_compile))
         with LLM(model_path,
                  kv_cache_config=kv_cache_config,
                  tensor_parallel_size=tp_size,
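
For readability, here is roughly how the reworked test_eagle3_tp8 reads once the middle hunk above is applied. This is a sketch assembled from the diff, not separate new code: the method presumably stays inside the existing Llama-3.3-70B-Instruct test class in test_llm_api_pytorch.py, and LLM, KvCacheConfig, EagleDecodingConfig, CnnDailymail, MMLU, and llm_models_root come from imports elsewhere in that file, which this commit does not touch.

    @pytest.mark.skip_less_mpi_world_size(8)
    @pytest.mark.parametrize("eagle3_one_model", [True, False])
    def test_eagle3_tp8(self, eagle3_one_model):
        # Target and EAGLE3 draft checkpoints, resolved from the shared model root.
        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
        eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B"
        # KV-cache memory fraction lowered from 0.7 to 0.6 by this commit.
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
        spec_config = EagleDecodingConfig(max_draft_len=4,
                                          speculative_model_dir=eagle_model_dir,
                                          eagle3_one_model=eagle3_one_model)
        # Overlap scheduler disabled, passed through to LLM via **pytorch_config.
        pytorch_config = dict(disable_overlap_scheduler=True)
        with LLM(model_path,
                 tensor_parallel_size=8,
                 speculative_config=spec_config,
                 kv_cache_config=kv_cache_config,
                 **pytorch_config) as llm:
            # GSM8K and GPQADiamond were dropped; only these two tasks remain.
            task = CnnDailymail(self.MODEL_NAME)
            task.evaluate(llm)
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)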

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 2 additions & 3 deletions
@@ -441,8 +441,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[one_model]
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[two_model]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
@@ -459,7 +457,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
-accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
