@@ -45,7 +45,7 @@ class RequestFuncOutput:
     tpot: float = 0.0  # avg next-token latencies
     prompt_len: int = 0
     error: str = ""
-    decode_iteration: int = 0  # Number of decoding iterations
+    avg_decoded_tokens_per_iter: float = 0.0  # Average tokens decoded per iteration
 
 
 async def async_request_trt_llm(
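A note on the metric this patch switches to: `avg_decoded_tokens_per_iter` is the mean number of tokens the engine commits per decode iteration, so it stays near 1.0 for plain autoregressive decoding and rises above 1.0 when speculative decoding accepts draft tokens. A minimal sketch of the arithmetic, with made-up counts that are not from this patch:

```python
# Made-up counts for illustration: with speculative decoding, one engine
# iteration can commit several tokens, so the average exceeds 1.0.
total_decoded_tokens = 96  # tokens generated for the request
engine_iterations = 32     # decode iterations the engine actually ran
avg_decoded_tokens_per_iter = total_decoded_tokens / engine_iterations
print(avg_decoded_tokens_per_iter)  # 3.0 -> ~3 tokens accepted per iteration
```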
@@ -82,7 +82,6 @@ async def async_request_trt_llm(
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
-    decode_iteration_count = 0  # Track decoding iterations
     try:
         async with request_session.post(url=api_url, json=payload) as response:
             if response.status == 200:
@@ -108,22 +107,27 @@ async def async_request_trt_llm(
                         else:
                             output.itl.append(timestamp - most_recent_timestamp)
 
-                        # Increment decode iteration for each chunk
-                        decode_iteration_count += 1
                         most_recent_timestamp = timestamp
 
+                        # Extract avg_decoded_tokens_per_iter from TensorRT-LLM response
+                        if "avg_decoded_tokens_per_iter" in data:
+                            output.avg_decoded_tokens_per_iter = data[
+                                "avg_decoded_tokens_per_iter"]
+
                     output.latency = most_recent_timestamp - st
-                    output.decode_iteration = decode_iteration_count
+
                 else:
                     content = await response.content.read()
                     data = json.loads(content.decode())
                     output.ttft = -1
                     output.itl = []
                     output.generated_text = data["text_output"]
                     output.latency = time.perf_counter() - st
-                    # For non-streaming, estimate decode_iteration as number of output tokens
-                    output.decode_iteration = len(output.generated_text.split(
-                    )) if output.generated_text else 1
+
+                    # Extract avg_decoded_tokens_per_iter from non-streaming TensorRT-LLM response
+                    if "avg_decoded_tokens_per_iter" in data:
+                        output.avg_decoded_tokens_per_iter = data[
+                            "avg_decoded_tokens_per_iter"]
 
             else:
                 output.error = response.reason or ""
@@ -138,6 +142,7 @@ async def async_request_trt_llm(
 
     if pbar:
         pbar.update(1)
+
     return output
 
 
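To see what the new TensorRT-LLM streaming branch consumes, here is a stand-alone sketch of pulling the field out of one SSE chunk. The chunk payload shape is an assumption inferred from the keys the patch reads (`text_output`, top-level `avg_decoded_tokens_per_iter`), not a documented schema:

```python
import json

# Hypothetical SSE chunk from a TensorRT-LLM endpoint; only the keys this
# patch touches are shown, and the payload shape is an assumption.
chunk_bytes = b'data: {"text_output": " world", "avg_decoded_tokens_per_iter": 2.5}'

data = json.loads(chunk_bytes.decode("utf-8").removeprefix("data: "))
if "avg_decoded_tokens_per_iter" in data:  # same guarded read as the patch
    print(data["avg_decoded_tokens_per_iter"])  # 2.5
```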
@@ -183,7 +188,6 @@ async def async_request_openai_completions(
     generated_text = ""
     st = time.perf_counter()
     most_recent_timestamp = st
-    decode_iteration_count = 0  # Track decoding iterations
     try:
         async with request_session.post(url=api_url,
                                         json=payload,
@@ -220,11 +224,13 @@ async def async_request_openai_completions(
                                     output.itl.append(timestamp -
                                                       most_recent_timestamp)
 
-                                # Increment decode iteration for each chunk with text
-                                if text is not None:
-                                    decode_iteration_count += 1
                                 most_recent_timestamp = timestamp
                                 generated_text += text or ""
+
+                                # Extract avg_decoded_tokens_per_iter from streaming response
+                                if "avg_decoded_tokens_per_iter" in choices[0]:
+                                    output.avg_decoded_tokens_per_iter = choices[
+                                        0]["avg_decoded_tokens_per_iter"]
                             elif usage := data.get("usage"):
                                 output.output_tokens = usage.get(
                                     "completion_tokens")
@@ -237,7 +243,6 @@ async def async_request_openai_completions(
237243 "This response will be marked as failed!" )
238244 output .generated_text = generated_text
239245 output .latency = most_recent_timestamp - st
240- output .decode_iteration = decode_iteration_count
241246 else :
242247 content = await response .content .read ()
243248 data = json .loads (content .decode ())
@@ -248,8 +253,11 @@ async def async_request_openai_completions(
                     output.ttft = -1
                     output.itl = []
                     output.output_tokens = data["usage"]["completion_tokens"]
-                    # For non-streaming, estimate decode_iteration as number of output tokens
-                    output.decode_iteration = output.output_tokens if output.output_tokens > 0 else 1
+                    # Extract avg_decoded_tokens_per_iter if available
+                    choice = data["choices"][0]
+                    if "avg_decoded_tokens_per_iter" in choice:
+                        output.avg_decoded_tokens_per_iter = choice[
+                            "avg_decoded_tokens_per_iter"]
             else:
                 output.error = response.reason or ""
                 output.success = False
@@ -263,6 +271,7 @@ async def async_request_openai_completions(
 
     if pbar:
         pbar.update(1)
+
     return output
 
 
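Since the field now defaults to 0.0 and is only set when the server reports it, a downstream summary should skip requests that never received it. A sketch of that aggregation; this helper is hypothetical, not part of the patch:

```python
def summarize_avg_decoded_tokens_per_iter(outputs) -> float:
    """Hypothetical aggregation over a list of RequestFuncOutput objects:
    average the metric across successful requests that reported it,
    ignoring the 0.0 default left by servers that omit the field."""
    values = [o.avg_decoded_tokens_per_iter for o in outputs
              if o.success and o.avg_decoded_tokens_per_iter > 0.0]
    return sum(values) / len(values) if values else 0.0
```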
@@ -322,7 +331,6 @@ async def async_request_openai_chat_completions(
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
-    decode_iteration_count = 0  # Track decoding iterations
     try:
         async with request_session.post(url=api_url,
                                         json=payload,
@@ -353,10 +361,12 @@ async def async_request_openai_chat_completions(
                                     output.itl.append(timestamp -
                                                       most_recent_timestamp)
 
-                                # Increment decode iteration for each chunk with content
-                                if content is not None:
-                                    decode_iteration_count += 1
                                 generated_text += content or ""
+
+                                # Extract avg_decoded_tokens_per_iter from streaming chat response
+                                if "avg_decoded_tokens_per_iter" in choices[0]:
+                                    output.avg_decoded_tokens_per_iter = choices[
+                                        0]["avg_decoded_tokens_per_iter"]
                             elif usage := data.get("usage"):
                                 output.output_tokens = usage.get(
                                     "completion_tokens")
@@ -365,7 +375,6 @@ async def async_request_openai_chat_completions(
 
                     output.generated_text = generated_text
                     output.latency = most_recent_timestamp - st
-                    output.decode_iteration = decode_iteration_count
                 else:
                     content = await response.content.read()
                     data = json.loads(content.decode())
@@ -375,8 +384,12 @@ async def async_request_openai_chat_completions(
                     output.itl = []
                     output.latency = time.perf_counter() - st
                     output.ttft = -1
-                    # For non-streaming, estimate decode_iteration as number of output tokens
-                    output.decode_iteration = output.output_tokens if output.output_tokens > 0 else 1
+
+                    # Extract avg_decoded_tokens_per_iter if available
+                    choice = data["choices"][0]
+                    if "avg_decoded_tokens_per_iter" in choice:
+                        output.avg_decoded_tokens_per_iter = choice[
+                            "avg_decoded_tokens_per_iter"]
 
             else:
                 output.error = response.reason or ""
@@ -391,6 +404,7 @@ async def async_request_openai_chat_completions(
 
     if pbar:
         pbar.update(1)
+
     return output
 
 
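For the two OpenAI-style paths, note the field is read from `choices[0]` rather than the top level of the response. A quick illustration with a fabricated response body; carrying `avg_decoded_tokens_per_iter` inside a choice is a TensorRT-LLM server extension assumed here for illustration, not a standard OpenAI field:

```python
import json

# Fabricated non-streaming chat completion body; only the keys the patch
# actually reads matter here.
body = json.loads("""{
  "choices": [{
    "message": {"content": "hello"},
    "avg_decoded_tokens_per_iter": 1.8
  }],
  "usage": {"completion_tokens": 5}
}""")

choice = body["choices"][0]
if "avg_decoded_tokens_per_iter" in choice:  # same guard as the patch
    print(choice["avg_decoded_tokens_per_iter"])  # 1.8
```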