2323 sys .path .insert (0 , project_root )
2424
2525from tests .model_loader .utils import (
26- check_tokens_id_and_text_close ,
2726 form_model_get_output_topp0 ,
28- form_model_get_output_topp1 ,
2927 get_paddle_model_path ,
30- get_torch_model_path ,
3128 run_with_timeout ,
3229)
3330
3633
# Shared prompt set for all parametrized model-loader cases:
# one Chinese prompt (exercises non-ASCII tokenization) and one English prompt.
prompts = [
    "解释下”温故而知新”",
    "Hello, how are you?",
]
3835
# Expected baseline output keyed by parametrized case id
# (format: "<model>.<quantization>.<backend>").
# NOTE(review): the value "test" looks like a placeholder, and no consumer of
# this mapping is visible in this chunk — confirm it is wired up before merge.
baseline = {
    "ernie-4_5-21b-a3b-bf16-paddle.wint8.default": "test",
}
3940
# Per-model configuration driving test parametrization.
# Keys are model directory names; values hold launch options
# (max_num_seqs, tensor_parallel_size) and the quantization variants to run.
#
# NOTE(review): all entries except the ernie-4_5 MoE model are currently
# disabled (commented out) — this looks like temporary debug scoping of the
# test matrix. Confirm whether the commented entries should be restored
# before merging.
model_param_map = {
    # "Qwen3-0.6B": {
    #     "max_num_seqs": 1,
    #     "quantizations": ["None", "wint8", "wint4"],
    # },
    "ernie-4_5-21b-a3b-bf16-paddle": {
        "max_num_seqs": 1,
        "tensor_parallel_size": 2,
        "quantizations": ["wint8"],
    },
    # "Qwen2-7B-Instruct": {
    #     "max_num_seqs": 1,
    #     "quantizations": ["wint4"],
    # },
    # "Qwen3-30B-A3B": {
    #     "tensor_parallel_size": 2,
    #     "max_num_seqs": 1,
    #     "quantizations": [
    #         {
    #             "quant_type": "block_wise_fp8",
    #             "backend": "triton",
    #             "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
    #         },
    #         {
    #             "quant_type": "block_wise_fp8",
    #             "backend": "deepgemm",
    #             "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17", "FD_USE_DEEP_GEMM": "1"},
    #         },
    #     ],
    # },
}
9374
9475
10990 quant ,
11091 cfg .get ("max_tokens" , 32 ),
11192 env ,
112- cfg .get ("is_mm" , False ),
11393 marks = [pytest .mark .core_model ],
11494 id = f"{ model } .{ quant } .{ backend } " ,
11595 )
11696 )
11797
11898
11999@pytest .mark .parametrize (
120- "model_name_or_path,torch_model_name_or_path,tensor_parallel_size,max_num_seqs,max_model_len,quantization,max_tokens,env,is_mm " ,
100+ "model_name_or_path,torch_model_name_or_path,tensor_parallel_size,max_num_seqs,max_model_len,quantization,max_tokens,env" ,
121101 params ,
122102)
123103def test_common_model (
@@ -130,31 +110,17 @@ def test_common_model(
130110 max_tokens : int ,
131111 quantization : str ,
132112 env ,
133- is_mm : bool ,
113+ request ,
134114 monkeypatch ,
135115) -> None :
116+ print ("当前用例 id:" , request .node .callspec .id )
136117 model_path = get_paddle_model_path (model_name_or_path )
137118 if env :
138119 for k , v in env .items ():
139120 monkeypatch .setenv (k , v )
140121
141- form_model_get_output = form_model_get_output_topp0 if not is_mm else form_model_get_output_topp1
142- fd_outputs_v0 = run_with_timeout (
143- target = form_model_get_output ,
144- args = (
145- fd_runner ,
146- model_path ,
147- tensor_parallel_size ,
148- max_num_seqs ,
149- max_model_len ,
150- max_tokens ,
151- quantization ,
152- "default" ,
153- FD_ENGINE_QUEUE_PORT ,
154- prompts ,
155- FD_CACHE_QUEUE_PORT ,
156- ),
157- )
122+ form_model_get_output = form_model_get_output_topp0
123+
158124 fd_outputs_v1 = run_with_timeout (
159125 target = form_model_get_output ,
160126 args = (
@@ -171,35 +137,11 @@ def test_common_model(
171137 FD_CACHE_QUEUE_PORT ,
172138 ),
173139 )
140+ print (fd_outputs_v1 )
174141
175- check_tokens_id_and_text_close (
176- outputs_0_lst = fd_outputs_v0 ,
177- outputs_1_lst = fd_outputs_v1 ,
178- name_0 = "default loader" ,
179- name_1 = "default_v1 loader" ,
180- )
181-
182- if torch_model_name_or_path != "" :
183- torch_model_path = get_torch_model_path (torch_model_name_or_path )
184- fd_outputs_v1_torch = run_with_timeout (
185- target = form_model_get_output ,
186- args = (
187- fd_runner ,
188- torch_model_path ,
189- tensor_parallel_size ,
190- max_num_seqs ,
191- max_model_len ,
192- max_tokens ,
193- quantization ,
194- "default_v1" ,
195- FD_ENGINE_QUEUE_PORT ,
196- prompts ,
197- FD_CACHE_QUEUE_PORT ,
198- ),
199- )
200- check_tokens_id_and_text_close (
201- outputs_0_lst = fd_outputs_v1 ,
202- outputs_1_lst = fd_outputs_v1_torch ,
203- name_0 = "default loader" ,
204- name_1 = "default_v1 loader" ,
205- )
142+ # check_tokens_id_and_text_close(
143+ # outputs_0_lst=fd_outputs_v0,
144+ # outputs_1_lst=fd_outputs_v1,
145+ # name_0="default loader",
146+ # name_1="default_v1 loader",
147+ # )
0 commit comments