Skip to content

Commit bff5fdf

Browse files
authored
[TRTLLM-6541][test] Add NIM Related Cases Part 1 (#6684)
Signed-off-by: Ivy Zhang <[email protected]>
1 parent daa2a65 commit bff5fdf

File tree

7 files changed

+136
-10
lines changed

7 files changed

+136
-10
lines changed

tests/integration/defs/accuracy/references/cnn_dailymail.yaml

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,8 @@ microsoft/Phi-3.5-mini-instruct:
4545
- accuracy: 31.354
4646
microsoft/Phi-4-mini-instruct:
4747
- accuracy: 32.921
48+
- quant_algo: FP8
49+
accuracy: 32.823
4850
bigcode/starcoder2-7b:
4951
- accuracy: 26.611
5052
- quant_algo: FP8
@@ -132,6 +134,8 @@ meta-llama/Llama-3.1-8B-Instruct:
132134
- accuracy: 33.640
133135
- spec_dec_algo: Eagle
134136
accuracy: 33.640
137+
- extra_acc_spec: logprobs=2
138+
accuracy: 30.522
135139
- quant_algo: FP8
136140
accuracy: 33.841
137141
- quant_algo: FP8
@@ -207,7 +211,8 @@ mistralai/Mistral-7B-Instruct-v0.3:
207211
accuracy: 31.201
208212
mistralai/Mistral-Small-3.1-24B-Instruct-2503:
209213
- accuracy: 29.20
210-
mistralai/Mistral-Nemo-Base-2407:
214+
mistralai/Mistral-Nemo-12b-Base:
215+
- accuracy: 28.906
211216
- quant_algo: FP8
212217
kv_cache_quant_algo: FP8
213218
accuracy: 24.0

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -232,6 +232,8 @@ nvidia/Nemotron-H-56B-Base-8K:
232232
accuracy: 83.82
233233
microsoft/Phi-4-mini-instruct:
234234
- accuracy: 68.98
235+
- quant_algo: FP8
236+
accuracy: 68.30
235237
bigcode/starcoder2-7b:
236238
- accuracy: 41.35
237239
- quant_algo: FP8
@@ -275,3 +277,7 @@ GPT-OSS/MXFP4:
275277
accuracy: 75.50
276278
- quant_algo: W4A8_MXFP4_FP8
277279
accuracy: 75.50
280+
mistralai/Mistral-Nemo-12b-Base:
281+
- accuracy: 69.66
282+
- quant_algo: FP8
283+
accuracy: 69.66

tests/integration/defs/accuracy/test_llm_api.py

Lines changed: 81 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,9 @@
1515
import pytest
1616

1717
from tensorrt_llm._tensorrt_engine import LLM
18-
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
18+
from tensorrt_llm.llmapi import (EagleDecodingConfig,
19+
ExtendedRuntimePerfKnobConfig, KvCacheConfig,
20+
SamplingParams)
1921
from tensorrt_llm.models.modeling_utils import QuantConfig
2022
from tensorrt_llm.quantization import QuantAlgo
2123

@@ -76,6 +78,27 @@ def test_guided_decoding_4gpus(self, backend: str):
7678
task = JsonModeEval(self.MODEL_NAME)
7779
task.evaluate(llm)
7880

81+
def test_gather_generation_logits_cuda_graph(self):
82+
"""RCCA: https://nvbugs/5365525"""
83+
extended_runtime_perf_knob_config = ExtendedRuntimePerfKnobConfig(
84+
cuda_graph_mode=True, cuda_graph_cache_size=1)
85+
llm = LLM(
86+
self.MODEL_PATH,
87+
gather_generation_logits=True,
88+
extended_runtime_perf_knob_config=extended_runtime_perf_knob_config)
89+
with llm:
90+
task = CnnDailymail(self.MODEL_NAME)
91+
task.evaluate(llm)
92+
93+
def test_logprobs(self):
94+
sampling_config = SamplingParams(logprobs=2)
95+
llm = LLM(self.MODEL_PATH, gather_generation_logits=True)
96+
with llm:
97+
task = CnnDailymail(self.MODEL_NAME)
98+
task.evaluate(llm,
99+
sampling_params=sampling_config,
100+
extra_acc_spec="logprobs=2")
101+
79102

80103
class TestLlama3_2_1B(LlmapiAccuracyTestHarness):
81104
MODEL_NAME = "meta-llama/Llama-3.2-1B"
@@ -177,18 +200,49 @@ def test_quant_tp4(self, quant):
177200
task.evaluate(llm)
178201

179202

180-
class TestMistral_Nemo_12B_Base(LlmapiAccuracyTestHarness):
181-
MODEL_NAME = "mistralai/Mistral-Nemo-Base-2407"
203+
class TestMistralNemo12B(LlmapiAccuracyTestHarness):
204+
MODEL_NAME = "mistralai/Mistral-Nemo-12b-Base"
182205
MODEL_PATH = f"{llm_models_root()}/Mistral-Nemo-Base-2407"
183206

207+
@pytest.mark.skip_less_device_memory(80000)
208+
def test_auto_dtype(self):
209+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
210+
211+
with LLM(self.MODEL_PATH,
212+
kv_cache_config=kv_cache_config,
213+
max_batch_size=8) as llm:
214+
task = CnnDailymail(self.MODEL_NAME)
215+
task.evaluate(llm)
216+
task = MMLU(self.MODEL_NAME)
217+
task.evaluate(llm)
218+
219+
def test_auto_dtype_tp2(self):
220+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
221+
222+
with LLM(self.MODEL_PATH,
223+
kv_cache_config=kv_cache_config,
224+
tensor_parallel_size=2,
225+
max_batch_size=8) as llm:
226+
task = CnnDailymail(self.MODEL_NAME)
227+
task.evaluate(llm)
228+
task = MMLU(self.MODEL_NAME)
229+
task.evaluate(llm)
230+
231+
@pytest.mark.skip_less_device_memory(80000)
184232
@skip_pre_ada
185233
def test_fp8(self):
186-
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
234+
quant_config = QuantConfig(QuantAlgo.FP8,
187235
kv_cache_quant_algo=QuantAlgo.FP8)
236+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
188237

189-
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
238+
with LLM(self.MODEL_PATH,
239+
quant_config=quant_config,
240+
kv_cache_config=kv_cache_config,
241+
max_batch_size=8) as llm:
190242
task = CnnDailymail(self.MODEL_NAME)
191243
task.evaluate(llm)
244+
task = MMLU(self.MODEL_NAME)
245+
task.evaluate(llm)
192246

193247

194248
class TestMistral_NeMo_Minitron_8B_Instruct(LlmapiAccuracyTestHarness):
@@ -244,6 +298,27 @@ def test_awq_tp2(self):
244298
task.evaluate(llm)
245299

246300

301+
class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
302+
MODEL_NAME = "microsoft/Phi-4-mini-instruct"
303+
MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct"
304+
305+
def test_auto_dtype(self):
306+
with LLM(self.MODEL_PATH) as llm:
307+
task = CnnDailymail(self.MODEL_NAME)
308+
task.evaluate(llm)
309+
task = MMLU(self.MODEL_NAME)
310+
task.evaluate(llm)
311+
312+
@skip_pre_ada
313+
def test_fp8(self):
314+
quant_config = QuantConfig(QuantAlgo.FP8)
315+
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
316+
task = CnnDailymail(self.MODEL_NAME)
317+
task.evaluate(llm)
318+
task = MMLU(self.MODEL_NAME)
319+
task.evaluate(llm)
320+
321+
247322
class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
248323
MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
249324
MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
@@ -378,7 +453,7 @@ def test_fp8(self):
378453
@skip_pre_ada
379454
def test_fp8_kvcache(self):
380455
"RCCA: https://nvbugs/5065080"
381-
quant_config = QuantConfig(QuantAlgo.FP8,
456+
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
382457
kv_cache_quant_algo=QuantAlgo.FP8)
383458
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
384459
task = CnnDailymail(self.MODEL_NAME)

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1773,6 +1773,36 @@ def test_auto_dtype_tp8(self):
17731773
task.evaluate(llm)
17741774

17751775

1776+
class TestMistralNemo12B(LlmapiAccuracyTestHarness):
1777+
MODEL_NAME = "mistralai/Mistral-Nemo-12b-Base"
1778+
MODEL_PATH = f"{llm_models_root()}/Mistral-Nemo-Base-2407"
1779+
1780+
@pytest.mark.skip_less_device_memory(80000)
1781+
def test_auto_dtype(self):
1782+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
1783+
1784+
with LLM(self.MODEL_PATH,
1785+
kv_cache_config=kv_cache_config,
1786+
max_batch_size=8) as llm:
1787+
task = CnnDailymail(self.MODEL_NAME)
1788+
task.evaluate(llm)
1789+
task = MMLU(self.MODEL_NAME)
1790+
task.evaluate(llm)
1791+
1792+
@pytest.mark.skip_less_device(2)
1793+
def test_auto_dtype_tp2(self):
1794+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
1795+
1796+
with LLM(self.MODEL_PATH,
1797+
kv_cache_config=kv_cache_config,
1798+
tensor_parallel_size=2,
1799+
max_batch_size=8) as llm:
1800+
task = CnnDailymail(self.MODEL_NAME)
1801+
task.evaluate(llm)
1802+
task = MMLU(self.MODEL_NAME)
1803+
task.evaluate(llm)
1804+
1805+
17761806
@pytest.mark.timeout(5400)
17771807
@pytest.mark.skip_less_device_memory(80000)
17781808
class TestLlama3_3NemotronSuper49Bv1(LlmapiAccuracyTestHarness):

tests/integration/test_lists/qa/llm_function_full.txt

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -420,6 +420,10 @@ accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp4
420420
accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp2pp2
421421
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
422422
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
423+
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph
424+
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_logprobs
425+
accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_auto_dtype
426+
accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8
423427
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype
424428
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only
425429
accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise
@@ -431,7 +435,9 @@ accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8_kvcache
431435
accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4]
432436
accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4_awq]
433437
accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int8_awq]
434-
accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8
438+
accuracy/test_llm_api.py::TestMistralNemo12B::test_auto_dtype
439+
accuracy/test_llm_api.py::TestMistralNemo12B::test_auto_dtype_tp2
440+
accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8
435441
accuracy/test_llm_api.py::TestMistral_NeMo_Minitron_8B_Instruct::test_fp8
436442
accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2
437443
accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
@@ -576,6 +582,8 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
576582
accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
577583
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
578584
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
585+
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
586+
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2
579587

580588
test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
581589
test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -253,7 +253,7 @@ l0_h100:
253253
- examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1]
254254
- examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] # 5 mins
255255
- accuracy/test_llm_api.py::TestMistral_NeMo_Minitron_8B_Instruct::test_fp8
256-
- accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8
256+
- accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8
257257
- examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] # 7 mins
258258
- examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
259259
- examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]

tests/integration/test_lists/waives.txt

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -300,7 +300,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
300300
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5457489)
301301
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5457489)
302302
disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5457504)
303-
accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8 SKIP (https://nvbugs/5413197)
303+
accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197)
304304
triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349)
305305
triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624)
306306
triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343)
@@ -314,3 +314,5 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
314314
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5459817)
315315
llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5461796)
316316
disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5459811)
317+
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437384)
318+
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5365525)

0 commit comments

Comments (0)