diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index bab8cc59f69..ea169b29ecc 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -172,7 +172,8 @@ meta-llama/Llama-3.2-3B:
     kv_cache_quant_algo: FP8
     accuracy: 33.629
 meta-llama/Llama-3.3-70B-Instruct:
-  - spec_dec_algo: Eagle
+  - quant_algo: FP8
+    spec_dec_algo: Eagle
     accuracy: 33.244
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 70cfb64bfbe..e7ec5b58b66 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -59,7 +59,8 @@ meta-llama/Llama-3.2-3B:
     accuracy: 60.60
 meta-llama/Llama-3.3-70B-Instruct:
   - accuracy: 81.31
-  - spec_dec_algo: Eagle
+  - quant_algo: FP8
+    spec_dec_algo: Eagle
     accuracy: 81.31
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index ecb8a1980bf..04c16169f03 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -383,25 +383,27 @@ def test_auto_dtype_tp8(self):
             task.evaluate(llm,
                           extra_evaluator_kwargs=dict(apply_chat_template=True))
 
+    @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("eagle3_one_model", [True, False])
-    def test_eagle3_tp8(self, eagle3_one_model):
-        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
+    def test_fp8_eagle3_tp8(self, eagle3_one_model):
+        model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B"
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         spec_config = EagleDecodingConfig(max_draft_len=4,
                                           speculative_model_dir=eagle_model_dir,
                                           eagle3_one_model=eagle3_one_model)
-        pytorch_config = dict(disable_overlap_scheduler=True, )
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(max_batch_size=1))
         with LLM(model_path,
+                 max_batch_size=16,
                  tensor_parallel_size=8,
                  speculative_config=spec_config,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
     @skip_pre_hopper
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index 5c0a9585ffc..24e3b44e48e 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -450,8 +450,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=True]
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False]
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True]
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False]
 accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt
index 4a1eb942062..8be15cf469f 100644
--- a/tests/integration/test_lists/qa/llm_function_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_sanity.txt
@@ -66,8 +66,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=True]
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False]
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True]
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]