Commit c0d099c

add llama4 tp4 tests
Signed-off-by: Xin He (SW-GPU) <[email protected]>
1 parent a15af87 commit c0d099c

5 files changed: 74 additions, 18 deletions
tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,9 @@ meta-llama/Llama-3.3-70B-Instruct:
     accuracy: 84.08
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 92.20
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 90.20
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
   - accuracy: 89.70
   - quant_algo: NVFP4

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 3 additions & 0 deletions
@@ -71,6 +71,9 @@ meta-llama/Llama-3.3-70B-Instruct:
     accuracy: 80.34
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 86.40
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 86.40
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     spec_dec_algo: Eagle
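
The two YAML changes above register a separate FP8 reference score for Llama-4-Maverick, keyed by quant_algo and kv_cache_quant_algo, alongside the existing baseline and Eagle rows. The snippet below is a minimal, hypothetical sketch of how such keyed reference rows can be resolved against a run's quantization settings; it is not the repository's accuracy_core code, and lookup_reference, FIELDS, and the matching rule are illustrative assumptions.

# Hypothetical sketch: resolving a reference accuracy from rows keyed the same
# way as the YAML entries above (quant_algo / kv_cache_quant_algo / spec_dec_algo).
FIELDS = ("quant_algo", "kv_cache_quant_algo", "spec_dec_algo")

# Rows mirroring the mmlu.yaml entry for Llama-4-Maverick after this commit.
references = [
    {"accuracy": 86.40},  # unquantized baseline
    {"quant_algo": "FP8", "kv_cache_quant_algo": "FP8", "accuracy": 86.40},  # added here
    {"quant_algo": "FP8", "kv_cache_quant_algo": "FP8",
     "spec_dec_algo": "Eagle", "accuracy": 86.40},
]

def lookup_reference(rows, **run_config):
    """Return the accuracy of the first row whose keyed fields all match the run."""
    for row in rows:
        if all(row.get(field) == run_config.get(field) for field in FIELDS):
            return row["accuracy"]
    raise KeyError("no matching reference entry")

# An FP8 run without speculative decoding picks up the row added in this commit.
assert lookup_reference(references, quant_algo="FP8",
                        kv_cache_quant_algo="FP8") == 86.40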

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 43 additions & 18 deletions
@@ -27,7 +27,8 @@
                                  SamplingParams, TorchCompileConfig)
 from tensorrt_llm.quantization import QuantAlgo
 
-from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_hopper,
+from ..conftest import (get_device_count, get_device_memory, llm_models_root,
+                        parametrize_with_ids, skip_no_hopper,
                         skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
                         skip_pre_hopper)
 from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
@@ -509,19 +510,26 @@ class TestLlama4MaverickInstruct(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Maverick-17B-128E-Instruct"
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [False, True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                         (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
+                                    (4, 1, 2), (4, 1, 4)],
+        ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
     def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_memory() < 270000 and get_device_count() < 8:
+            pytest.skip("Not enough memory for this test")
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         with LLM(
                 self.MODEL_PATH,
                 tensor_parallel_size=tp_size,
                 # Keep this low to avoid warmup OOM in CI
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
+                kv_cache_config=kv_cache_config,
                 cuda_graph_config=CudaGraphConfig()
                 if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
@@ -547,20 +555,27 @@ def test_chunked_prefill(self, attn_backend):
         task.evaluate(llm)
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_device_memory(80000)
     @parametrize_with_ids("cuda_graph", [False, True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                         (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
+                                    (4, 1, 2), (4, 1, 4)],
+        ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
     def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_memory() < 140000 and get_device_count() < 8:
+            pytest.skip("Not enough memory for this test")
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         with LLM(
                 f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
                 tensor_parallel_size=tp_size,
                 # Keep this low to avoid warmup OOM in CI
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
-                use_cuda_graph=cuda_graph) as llm:
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
@@ -583,7 +598,8 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
                 moe_expert_parallel_size=ep_size,
                 enable_chunked_prefill=True,
                 max_num_tokens=256,
-                use_cuda_graph=cuda_graph) as llm:
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
@@ -622,16 +638,20 @@ def test_fp8_eagle3(self, tp_size, pp_size, ep_size, torch_compile):
         task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device_memory(80000)
 class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [False, True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                         (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
+                                    (4, 1, 2), (4, 1, 4)],
+        ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
     def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"
         with LLM(
                 model_path,
@@ -648,11 +668,13 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
         task.evaluate(llm)
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [True])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
                              ids=["tp8ep8", "tp4"])
     def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"
         with LLM(
                 model_path,
@@ -661,6 +683,7 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
+                kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8),
                 cuda_graph_config=CudaGraphConfig()
                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
@@ -670,11 +693,13 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
         task.evaluate(llm)
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [True])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
                              ids=["tp8ep8", "tp4"])
     def test_fp4(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4"
         with LLM(
                 model_path,
@@ -715,7 +740,7 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
         task.evaluate(llm)
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_mpi_world_size(4)
     @parametrize_with_ids("cuda_graph", [True])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
                              ids=["tp4ep4"])

tests/integration/test_lists/qa/llm_function_full.txt

Lines changed: 9 additions & 0 deletions
@@ -461,18 +461,27 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4-cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep2-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
tests/integration/test_lists/qa/llm_function_sanity.txt

Lines changed: 16 additions & 0 deletions
@@ -71,17 +71,33 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagl
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4-cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep2-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
