|
15 | 15 | import pytest |
16 | 16 |
|
17 | 17 | from tensorrt_llm._tensorrt_engine import LLM |
18 | | -from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig |
| 18 | +from tensorrt_llm.llmapi import (EagleDecodingConfig, |
| 19 | + ExtendedRuntimePerfKnobConfig, KvCacheConfig, |
| 20 | + SamplingParams) |
19 | 21 | from tensorrt_llm.models.modeling_utils import QuantConfig |
20 | 22 | from tensorrt_llm.quantization import QuantAlgo |
21 | 23 |
|
@@ -76,6 +78,27 @@ def test_guided_decoding_4gpus(self, backend: str): |
76 | 78 | task = JsonModeEval(self.MODEL_NAME) |
77 | 79 | task.evaluate(llm) |
78 | 80 |
|
| 81 | + def test_gather_generation_logits_cuda_graph(self): |
| 82 | + """RCCA: https://nvbugs/5365525""" |
| 83 | + extended_runtime_perf_knob_config = ExtendedRuntimePerfKnobConfig( |
| 84 | + cuda_graph_mode=True, cuda_graph_cache_size=1) |
| 85 | + llm = LLM( |
| 86 | + self.MODEL_PATH, |
| 87 | + gather_generation_logits=True, |
| 88 | + extended_runtime_perf_knob_config=extended_runtime_perf_knob_config) |
| 89 | + with llm: |
| 90 | + task = CnnDailymail(self.MODEL_NAME) |
| 91 | + task.evaluate(llm) |
| 92 | + |
| 93 | + def test_logprobs(self): |
| 94 | + sampling_config = SamplingParams(logprobs=2) |
| 95 | + llm = LLM(self.MODEL_PATH, gather_generation_logits=True) |
| 96 | + with llm: |
| 97 | + task = CnnDailymail(self.MODEL_NAME) |
| 98 | + task.evaluate(llm, |
| 99 | + sampling_params=sampling_config, |
| 100 | + extra_acc_spec="logprobs=2") |
| 101 | + |
79 | 102 |
|
80 | 103 | class TestLlama3_2_1B(LlmapiAccuracyTestHarness): |
81 | 104 | MODEL_NAME = "meta-llama/Llama-3.2-1B" |
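
For readers unfamiliar with the two knobs exercised by the new tests above, here is a minimal sketch of how `gather_generation_logits` and `logprobs` surface in `generate()` output. The model path and prompt are placeholders, and the `CompletionOutput` field names are as I recall them from the LLM API, so worth double-checking:

```python
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import (ExtendedRuntimePerfKnobConfig,
                                 SamplingParams)

# Enable CUDA graphs in the runtime, as in the new regression test.
perf_knobs = ExtendedRuntimePerfKnobConfig(cuda_graph_mode=True,
                                           cuda_graph_cache_size=1)
with LLM("<model-path>",  # placeholder path
         gather_generation_logits=True,
         extended_runtime_perf_knob_config=perf_knobs) as llm:
    out = llm.generate(["The capital of France is"],
                       SamplingParams(max_tokens=8, logprobs=2))
    completion = out[0].outputs[0]
    # Populated only because gather_generation_logits=True was set.
    print(completion.generation_logits.shape)
    # Top-2 log-probabilities per generated token (logprobs=2).
    print(completion.logprobs)
```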
@@ -177,18 +200,49 @@ def test_quant_tp4(self, quant): |
177 | 200 | task.evaluate(llm) |
178 | 201 |
|
179 | 202 |
|
180 | | -class TestMistral_Nemo_12B_Base(LlmapiAccuracyTestHarness): |
181 | | - MODEL_NAME = "mistralai/Mistral-Nemo-Base-2407" |
| 203 | +class TestMistralNemo12B(LlmapiAccuracyTestHarness): |
| 204 | + MODEL_NAME = "mistralai/Mistral-Nemo-12b-Base" |
182 | 205 | MODEL_PATH = f"{llm_models_root()}/Mistral-Nemo-Base-2407" |
183 | 206 |
|
| 207 | + @pytest.mark.skip_less_device_memory(80000) |
| 208 | + def test_auto_dtype(self): |
| 209 | + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) |
| 210 | + |
| 211 | + with LLM(self.MODEL_PATH, |
| 212 | + kv_cache_config=kv_cache_config, |
| 213 | + max_batch_size=8) as llm: |
| 214 | + task = CnnDailymail(self.MODEL_NAME) |
| 215 | + task.evaluate(llm) |
| 216 | + task = MMLU(self.MODEL_NAME) |
| 217 | + task.evaluate(llm) |
| 218 | + |
| 219 | + def test_auto_dtype_tp2(self): |
| 220 | + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) |
| 221 | + |
| 222 | + with LLM(self.MODEL_PATH, |
| 223 | + kv_cache_config=kv_cache_config, |
| 224 | + tensor_parallel_size=2, |
| 225 | + max_batch_size=8) as llm: |
| 226 | + task = CnnDailymail(self.MODEL_NAME) |
| 227 | + task.evaluate(llm) |
| 228 | + task = MMLU(self.MODEL_NAME) |
| 229 | + task.evaluate(llm) |
| 230 | + |
| 231 | + @pytest.mark.skip_less_device_memory(80000) |
184 | 232 | @skip_pre_ada |
185 | 233 | def test_fp8(self): |
186 | | - quant_config = QuantConfig(quant_algo=QuantAlgo.FP8, |
| 234 | + quant_config = QuantConfig(QuantAlgo.FP8, |
187 | 235 | kv_cache_quant_algo=QuantAlgo.FP8) |
| 236 | + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) |
188 | 237 |
|
189 | | - with LLM(self.MODEL_PATH, quant_config=quant_config) as llm: |
| 238 | + with LLM(self.MODEL_PATH, |
| 239 | + quant_config=quant_config, |
| 240 | + kv_cache_config=kv_cache_config, |
| 241 | + max_batch_size=8) as llm: |
190 | 242 | task = CnnDailymail(self.MODEL_NAME) |
191 | 243 | task.evaluate(llm) |
| 244 | + task = MMLU(self.MODEL_NAME) |
| 245 | + task.evaluate(llm) |
192 | 246 |
|
193 | 247 |
|
194 | 248 | class TestMistral_NeMo_Minitron_8B_Instruct(LlmapiAccuracyTestHarness): |
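
The new Mistral Nemo tests all pair `KvCacheConfig(free_gpu_memory_fraction=0.9)` with `max_batch_size=8` behind an 80GB device-memory gate. A short sketch of what those settings mean; the memory arithmetic in the comments is approximate:

```python
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

# free_gpu_memory_fraction sizes the KV-cache pool as a fraction of the GPU
# memory still free after the weights are loaded; 0.9 leaves ~10% headroom.
# Capping max_batch_size=8 bounds activation memory so a 12B model fits on
# a single 80GB device (hence skip_less_device_memory(80000)).
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

# FP8 weights plus an FP8 KV cache roughly halve both footprints relative to
# FP16, so test_fp8 can keep the same max_batch_size=8 budget on one 80GB GPU.
quant_config = QuantConfig(QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)
```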
@@ -244,6 +298,27 @@ def test_awq_tp2(self): |
244 | 298 | task.evaluate(llm) |
245 | 299 |
|
246 | 300 |
|
| 301 | +class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness): |
| 302 | + MODEL_NAME = "microsoft/Phi-4-mini-instruct" |
| 303 | + MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct" |
| 304 | + |
| 305 | + def test_auto_dtype(self): |
| 306 | + with LLM(self.MODEL_PATH) as llm: |
| 307 | + task = CnnDailymail(self.MODEL_NAME) |
| 308 | + task.evaluate(llm) |
| 309 | + task = MMLU(self.MODEL_NAME) |
| 310 | + task.evaluate(llm) |
| 311 | + |
| 312 | + @skip_pre_ada |
| 313 | + def test_fp8(self): |
| 314 | + quant_config = QuantConfig(QuantAlgo.FP8) |
| 315 | + with LLM(self.MODEL_PATH, quant_config=quant_config) as llm: |
| 316 | + task = CnnDailymail(self.MODEL_NAME) |
| 317 | + task.evaluate(llm) |
| 318 | + task = MMLU(self.MODEL_NAME) |
| 319 | + task.evaluate(llm) |
| 320 | + |
| 321 | + |
247 | 322 | class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness): |
248 | 323 | MODEL_NAME = "Qwen/Qwen2-7B-Instruct" |
249 | 324 | MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct" |
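
Both new `test_fp8` cases are gated with `@skip_pre_ada`, since FP8 (E4M3) kernels need compute capability 8.9 or newer. A rough sketch of the kind of guard that decorator applies; the real helper lives in the test utilities and may be implemented differently:

```python
import pytest
import torch

def _skip_if_pre_ada():
    # FP8 requires Ada (SM 8.9) or newer; Hopper (9.0) also passes this check.
    major, minor = torch.cuda.get_device_capability()
    if (major, minor) < (8, 9):
        pytest.skip("FP8 requires compute capability >= 8.9 (Ada or newer)")
```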
@@ -378,7 +453,7 @@ def test_fp8(self): |
378 | 453 | @skip_pre_ada |
379 | 454 | def test_fp8_kvcache(self): |
380 | 455 | "RCCA: https://nvbugs/5065080" |
381 | | - quant_config = QuantConfig(QuantAlgo.FP8, |
| 456 | + quant_config = QuantConfig(quant_algo=QuantAlgo.FP8, |
382 | 457 | kv_cache_quant_algo=QuantAlgo.FP8) |
383 | 458 | with LLM(self.MODEL_PATH, quant_config=quant_config) as llm: |
384 | 459 | task = CnnDailymail(self.MODEL_NAME) |
|
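
This hunk and the Mistral Nemo hunk swap `QuantConfig` between its keyword and positional spellings in opposite directions. Assuming `QuantConfig` is the dataclass used elsewhere in the codebase, with `quant_algo` as its first field, the two forms build identical configs:

```python
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

# quant_algo is QuantConfig's first field, so the positional and keyword
# forms are equivalent (dataclass equality compares field values).
assert QuantConfig(QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8) == \
       QuantConfig(quant_algo=QuantAlgo.FP8,
                   kv_cache_quant_algo=QuantAlgo.FP8)
```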