@@ -28,8 +28,9 @@
 from defs.trt_test_alternative import (check_call, check_call_negative_test,
                                        check_output)
 
-from .common import (PluginOptions, convert_weights, prune_checkpoint,
-                     quantize_data, refit_model, venv_check_call)
+from .common import (PluginOptions, convert_weights, get_mmlu_accuracy,
+                     prune_checkpoint, quantize_data, refit_model,
+                     venv_check_call)
 from .conftest import (llm_models_root, skip_no_sm120, skip_nvlink_inactive,
                        skip_post_blackwell, skip_pre_blackwell, skip_pre_hopper,
                        tests_path, unittest_path)
@@ -42,6 +43,7 @@
     os.environ['TLLM_LOG_LEVEL'] = 'INFO'
 
 _MEM_FRACTION_50 = 0.5
+_MEM_FRACTION_80 = 0.8
 _MEM_FRACTION_95 = 0.95
 
 
@@ -2677,4 +2679,43 @@ def test_ptp_quickstart_advanced_llama_multi_nodes(llm_root, llm_venv,
     check_call(" ".join(run_cmd), shell=True, env=llm_venv._new_env)
 
 
-# End of Pivot-To-Python examples
+@pytest.mark.timeout(5400)
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.skip_less_device(4)
+@pytest.mark.parametrize("eval_task", ["mmlu"])
+@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(16, 1, 8), (8, 2, 8)],
+                         ids=["tp16", "tp8pp2"])
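+# Both layouts span 16 GPUs: pure 16-way tensor parallelism, or 8-way tensor
+# parallelism combined with 2-way pipeline parallelism; ep_size=8 shards the
+# experts of the MoE models.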
+@pytest.mark.parametrize("model_path", [
+    pytest.param('llama-3.3-models/Llama-3.3-70B-Instruct',
+                 marks=skip_pre_hopper),
+    pytest.param('llama4-models/Llama-4-Maverick-17B-128E-Instruct',
+                 marks=skip_pre_hopper),
+    pytest.param('llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8',
+                 marks=skip_pre_hopper),
+    pytest.param('Qwen3/Qwen3-235B-A22B', marks=skip_pre_hopper),
+    pytest.param('Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf',
+                 marks=skip_pre_blackwell),
+    pytest.param('DeepSeek-R1/DeepSeek-R1-0528-FP4', marks=skip_pre_blackwell),
+])
+def test_multi_nodes_eval(llm_venv, model_path, tp_size, pp_size, ep_size,
+                          eval_task):
+    if "Llama-4" in model_path and tp_size == 16:
+        pytest.skip("Llama-4 with tp16 is not supported")
+
+    mmlu_threshold = 81.5
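+    # trtllm-llmapi-launch wraps trtllm-eval so the LLM-API workers are
+    # spawned across every rank of the multi-node allocation.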
+    run_cmd = [
+        "trtllm-llmapi-launch",
+        "trtllm-eval",
+        f"--model={llm_models_root()}/{model_path}",
+        f"--ep_size={ep_size}",
+        f"--tp_size={tp_size}",
+        f"--pp_size={pp_size}",
+        f"--kv_cache_free_gpu_memory_fraction={_MEM_FRACTION_80}",
+        "--max_batch_size=32",
+        eval_task,
+    ]
+    output = check_output(" ".join(run_cmd), shell=True, env=llm_venv._new_env)
+
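+    # Validate accuracy only on SLURM rank 0; the other ranks just
+    # participate in the distributed run and skip the assertion.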
+    if os.environ.get("SLURM_PROCID", '0') == '0':
+        mmlu_accuracy = get_mmlu_accuracy(output)
+        assert mmlu_accuracy > mmlu_threshold, f"MMLU accuracy {mmlu_accuracy} is less than threshold {mmlu_threshold}"
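
Aside: `get_mmlu_accuracy` is imported from `.common` above, but its body is outside this diff. A minimal sketch of what such a helper could look like, assuming the captured eval output contains a line like `MMLU weighted average accuracy: 81.9 (14042)`; the log format and regex here are assumptions, not the actual `.common` implementation:

```python
import re


def get_mmlu_accuracy(output: str) -> float:
    """Pull the MMLU accuracy out of captured trtllm-eval output.

    Assumes the log contains a line such as
    'MMLU weighted average accuracy: 81.9 (14042)'. Raises if the
    pattern is absent so a log-format change fails loudly instead of
    letting the accuracy check pass vacuously.
    """
    match = re.search(r"MMLU weighted average accuracy:\s*([\d.]+)", output)
    if match is None:
        raise ValueError("MMLU accuracy not found in eval output")
    return float(match.group(1))
```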