Commit 5eefdf2

crazydemodc3671 authored and committed

tests: Add llama4 functional cases (#6392)

Signed-off-by: Ivy Zhang <[email protected]>

1 parent e1eca33 · commit 5eefdf2

File tree

3 files changed: +134 -0 lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 89 additions & 0 deletions
@@ -489,6 +489,51 @@ def test_chunked_prefill(self, attn_backend):
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @skip_pre_hopper
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
+        with LLM(
+                f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
+                tensor_parallel_size=tp_size,
+                # Keep this low to avoid warmup OOM in CI
+                max_seq_len=8192,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                use_cuda_graph=cuda_graph) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_hopper
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8)],
+                             ids=["tp8ep8"])
+    def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
+        with LLM(
+                f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
+                tensor_parallel_size=tp_size,
+                # Keep this low to avoid warmup OOM in CI
+                max_seq_len=8192,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                enable_chunked_prefill=True,
+                max_num_tokens=256,
+                use_cuda_graph=cuda_graph) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("torch_compile", [True, False])

@@ -587,6 +632,50 @@ def test_fp4(self, cuda_graph, tp_size, pp_size, ep_size):
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @skip_pre_hopper
+    @pytest.mark.skip_less_mpi_world_size(4)
+    @parametrize_with_ids("cuda_graph", [True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
+                             ids=["tp4ep4"])
+    def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
+        with LLM(
+                f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8",
+                tensor_parallel_size=tp_size,
+                max_seq_len=22000,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                enable_chunked_prefill=True,
+                max_num_tokens=256,
+                use_cuda_graph=cuda_graph) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("cuda_graph", [True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
+                             ids=["tp4ep4"])
+    def test_fp4_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
+        with LLM(
+                f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4",
+                tensor_parallel_size=tp_size,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                max_seq_len=22000,
+                enable_chunked_prefill=True,
+                max_num_tokens=256,
+                use_cuda_graph=cuda_graph) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
 class TestMistral7B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "mistralai/Mistral-7B-v0.1"
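
The new chunked-prefill cases above pair enable_chunked_prefill=True with max_num_tokens=256 and a large max_seq_len. A minimal sketch of the arithmetic behind those settings (not part of the diff; it assumes max_num_tokens bounds the tokens scheduled per prefill chunk, which appears to be how the option is exercised here):

import math

# Values taken from the new Scout chunked-prefill tests above.
max_seq_len = 22000
max_num_tokens = 256

# Hypothetical worst-case prompt that fills the whole sequence budget.
prompt_len = max_seq_len
prefill_chunks = math.ceil(prompt_len / max_num_tokens)
print(prefill_chunks)  # 86 chunks of at most 256 tokens each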

tests/integration/defs/test_e2e.py

Lines changed: 34 additions & 0 deletions
@@ -1923,6 +1923,40 @@ def test_ptp_quickstart_advanced_8gpus(llm_root, llm_venv, model_name,
         _check_mem_usage(running_log, [mapping[model_name], 0, 0, 0], 8)
 
 
+@skip_pre_hopper
+@pytest.mark.skip_less_device(8)
+@pytest.mark.parametrize("cuda_graph", [False, True])
+@pytest.mark.parametrize("model_name,model_path", [
+    ("Llama-4-Maverick-17B-128E-Instruct-FP8",
+     "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"),
+    ("Llama-4-Scout-17B-16E-Instruct-FP8",
+     "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"),
+    pytest.param('Llama-4-Scout-17B-16E-Instruct-FP4',
+                 'llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4',
+                 marks=skip_pre_blackwell),
+])
+def test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k(
+        llm_root, llm_venv, model_name, model_path, cuda_graph):
+    print(f"Testing {model_name} on 8 GPUs.")
+    example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
+    cmd = [
+        str(example_root / "quickstart_advanced.py"),
+        "--enable_chunked_prefill",
+        "--model_dir",
+        f"{llm_models_root()}/{model_path}",
+        "--tp_size=8",
+        "--moe_ep_size=8",
+        "--max_seq_len=22000",
+        "--kv_cache_fraction=0.1",
+    ]
+    if cuda_graph:
+        cmd.extend([
+            "--use_cuda_graph",
+            "--cuda_graph_padding_enabled",
+        ])
+    llm_venv.run_cmd(cmd)
+
+
 # This test is specifically to be run on 2 GPUs on Blackwell RTX 6000 Pro (SM120) architecture
 # TODO: remove once we have a node with 8 GPUs and reuse test_ptp_quickstart_advanced_8gpus
 @skip_no_sm120
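
For reference, a rough standalone approximation of the command the new e2e test assembles. This is a sketch, not the repo's harness: subprocess.run stands in for llm_venv.run_cmd, and the script/model paths are placeholders that the real test resolves via llm_root and llm_models_root().

import subprocess

# Placeholder paths; the actual test derives these from fixtures.
example_script = "examples/pytorch/quickstart_advanced.py"
model_dir = "<models_root>/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"

cmd = [
    "python", example_script,
    "--enable_chunked_prefill",
    "--model_dir", model_dir,
    "--tp_size=8",
    "--moe_ep_size=8",
    "--max_seq_len=22000",
    "--kv_cache_fraction=0.1",
    # The test appends these two flags only when its cuda_graph parameter is True.
    "--use_cuda_graph",
    "--cuda_graph_padding_enabled",
]
subprocess.run(cmd, check=True)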

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 11 additions & 0 deletions
@@ -459,6 +459,11 @@ accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]

@@ -468,6 +473,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]

@@ -542,6 +549,10 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-mode
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Mixtral-8x7B-BF16-Mixtral-8x7B-v0.1]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1]
 test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
+test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Maverick-17B-128E-Instruct-FP8-llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-False]
+test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Maverick-17B-128E-Instruct-FP8-llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-True]
+test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP8-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8-True]
+test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP4-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4-True]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False]
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False]
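
The added entries follow the existing node-ID convention (test method name plus the parametrize ids), so an individual case can be selected directly with pytest. A hedged sketch, assuming the IDs resolve relative to tests/integration/defs as the surrounding entries suggest:

import pytest

# Run one of the newly listed Maverick FP8 cases by its full node ID.
pytest.main([
    "accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True]",
    "-s",
])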
