@@ -276,6 +276,8 @@ async def chat_completion_stream_generator(
                 if first_iteration:
                     num_prompt_tokens = len(prompt_token_ids)
                     num_cached_tokens = res.get("num_cached_tokens", 0)
+                    num_input_image_tokens = res.get("num_input_image_tokens", 0)
+                    num_input_video_tokens = res.get("num_input_video_tokens", 0)
                     for i in range(num_choices):
                         choice = ChatCompletionResponseStreamChoice(
                             index=i,
@@ -312,7 +314,11 @@ async def chat_completion_stream_generator(
                             prompt_tokens=num_prompt_tokens,
                             completion_tokens=0,
                             total_tokens=num_prompt_tokens,
-                            prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
+                            prompt_tokens_details=PromptTokenUsageInfo(
+                                cached_tokens=num_cached_tokens,
+                                image_tokens=num_input_image_tokens,
+                                video_tokens=num_input_video_tokens,
+                            ),
                             completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=0),
                         )
                         yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
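
For context, here is a minimal sketch of the usage shape this hunk emits on the first streamed chunk. The `PromptTokenUsageInfo` and `UsageInfo` classes below are stand-ins for the project's own protocol models (assumed pydantic-style), and the defaults on the new `image_tokens` / `video_tokens` fields are assumptions, not taken from this diff:

```python
# Stand-in models; the real PromptTokenUsageInfo / UsageInfo live in the
# project's protocol module. Field defaults here are assumptions.
from typing import Optional
from pydantic import BaseModel


class PromptTokenUsageInfo(BaseModel):
    cached_tokens: int = 0
    image_tokens: Optional[int] = 0  # new in this PR: prompt-side image tokens
    video_tokens: Optional[int] = 0  # new in this PR: prompt-side video tokens


class UsageInfo(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    prompt_tokens_details: Optional[PromptTokenUsageInfo] = None


# First-chunk usage as built above: no completion tokens yet, but the
# prompt-side cached/image/video counts are already reported.
usage = UsageInfo(
    prompt_tokens=128,
    completion_tokens=0,
    total_tokens=128,
    prompt_tokens_details=PromptTokenUsageInfo(
        cached_tokens=16, image_tokens=64, video_tokens=0
    ),
)
print(usage.model_dump_json(exclude_unset=True))
```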
@@ -476,6 +482,8 @@ async def chat_completion_full_generator(
         draft_logprob_contents = [[] for _ in range(num_choices)]
         completion_token_ids = [[] for _ in range(num_choices)]
         num_cached_tokens = [0] * num_choices
+        num_input_image_tokens = [0] * num_choices
+        num_input_video_tokens = [0] * num_choices
         num_image_tokens = [0] * num_choices
         response_processor = ChatResponseProcessor(
             data_processor=self.engine_client.data_processor,
@@ -546,14 +554,15 @@ async def chat_completion_full_generator(
                         previous_num_tokens[idx] += data["outputs"].get("image_token_num")
                         num_image_tokens[idx] = data["outputs"].get("image_token_num")
                     choice = await self._create_chat_completion_choice(
-                        output=output,
-                        index=idx,
+                        data=data,
                         request=request,
-                        previous_num_tokens=previous_num_tokens[idx],
                         prompt_token_ids=prompt_token_ids,
                         prompt_tokens=prompt_tokens,
                         completion_token_ids=completion_token_ids[idx],
+                        previous_num_tokens=previous_num_tokens[idx],
                         num_cached_tokens=num_cached_tokens,
+                        num_input_image_tokens=num_input_image_tokens,
+                        num_input_video_tokens=num_input_video_tokens,
                         num_image_tokens=num_image_tokens,
                         logprob_contents=logprob_contents,
                         response_processor=response_processor,
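
Worth noting at this call site: the per-choice counters (`num_cached_tokens`, `num_input_image_tokens`, `num_input_video_tokens`, `num_image_tokens`) are passed as whole lists rather than scalars, so the helper can fill in its own slot and the caller can `sum()` them afterwards. A minimal sketch of that in-place accumulator pattern, with illustrative names:

```python
# Hedged sketch: per-choice counters sized to num_choices, mutated in
# place by a helper that owns one slot, then aggregated by the caller.
num_choices = 2
num_input_image_tokens = [0] * num_choices
num_input_video_tokens = [0] * num_choices


def fill_choice_slot(idx: int, data: dict, image_acc: list, video_acc: list) -> None:
    # Mirrors how _create_chat_completion_choice writes its own slot.
    image_acc[idx] = data.get("num_input_image_tokens", 0)
    video_acc[idx] = data.get("num_input_video_tokens", 0)


fill_choice_slot(1, {"num_input_image_tokens": 64},
                 num_input_image_tokens, num_input_video_tokens)
print(sum(num_input_image_tokens), sum(num_input_video_tokens))  # 64 0
```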
@@ -571,11 +580,16 @@ async def chat_completion_full_generator(
             prompt_tokens=num_prompt_tokens,
             completion_tokens=num_generated_tokens,
             total_tokens=num_prompt_tokens + num_generated_tokens,
-            prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=sum(num_cached_tokens)),
+            prompt_tokens_details=PromptTokenUsageInfo(
+                cached_tokens=sum(num_cached_tokens),
+                image_tokens=sum(num_input_image_tokens),
+                video_tokens=sum(num_input_video_tokens),
+            ),
             completion_tokens_details=CompletionTokenUsageInfo(
                 reasoning_tokens=num_reasoning_tokens, image_tokens=sum(num_image_tokens)
             ),
         )
+
         choices = sorted(choices, key=lambda x: x.index)
         res = ChatCompletionResponse(
             id=request_id,
@@ -589,18 +603,21 @@ async def chat_completion_full_generator(
 
     async def _create_chat_completion_choice(
         self,
-        output: dict,
-        index: int,
+        data: dict,
         request: ChatCompletionRequest,
-        previous_num_tokens: int,
         prompt_token_ids: list,
         prompt_tokens: str,
         completion_token_ids: list,
+        previous_num_tokens: int,
         num_cached_tokens: list,
+        num_input_image_tokens: list,
+        num_input_video_tokens: list,
         num_image_tokens: list,
         logprob_contents: list,
         response_processor: ChatResponseProcessor,
     ) -> ChatCompletionResponseChoice:
+        idx = int(data["request_id"].split("_")[-1])
+        output = data["outputs"]
 
         if output is not None and output.get("metrics") and output["metrics"].get("request_start_time"):
             work_process_metrics.e2e_request_latency.observe(
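
The helper no longer takes `index` explicitly; it recovers the choice index from the per-choice request id. This relies on the engine naming sub-requests with a `_<idx>` suffix (e.g. `chatcmpl-…_0`, `chatcmpl-…_1`), an assumption inferred here from the parsing logic rather than stated in the diff:

```python
# Assumption inferred from the diff: per-choice sub-request ids end in
# "_<idx>", so the index can be rebuilt by splitting on "_".
def choice_index(request_id: str) -> int:
    return int(request_id.split("_")[-1])


assert choice_index("chatcmpl-123_0") == 0
assert choice_index("chatcmpl-123_7") == 7
# Caveat: this raises ValueError if an id ever lacks the numeric suffix,
# e.g. choice_index("chatcmpl-123").
```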
@@ -621,13 +638,15 @@ async def _create_chat_completion_choice(
             message.content = output["text"]
 
         logprobs_full_res = None
-        if logprob_contents[index]:
-            logprobs_full_res = LogProbs(content=logprob_contents[index])
+        if logprob_contents[idx]:
+            logprobs_full_res = LogProbs(content=logprob_contents[idx])
 
         has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
         max_tokens = request.max_completion_tokens or request.max_tokens
-        num_cached_tokens[index] = output.get("num_cached_tokens", 0)
-        num_image_tokens[index] = output.get("num_image_tokens", 0)
+        num_cached_tokens[idx] = data.get("num_cached_tokens", 0)
+        num_input_image_tokens[idx] = data.get("num_input_image_tokens", 0)
+        num_input_video_tokens[idx] = data.get("num_input_video_tokens", 0)
+        num_image_tokens[idx] = output.get("num_image_tokens", 0)
 
         finish_reason = "stop"
         if has_no_token_limit or previous_num_tokens != max_tokens:
@@ -640,7 +659,7 @@ async def _create_chat_completion_choice(
                 finish_reason = "recover_stop"
 
         return ChatCompletionResponseChoice(
-            index=index,
+            index=idx,
             message=message,
             logprobs=logprobs_full_res,
             finish_reason=finish_reason,
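
End to end, the non-streaming response now reports multimodal prompt usage aggregated across choices via the `sum()` calls in the usage hunk above. An illustrative walk-through with invented values:

```python
# Illustrative only: two choices, one with image input, one with video.
num_cached_tokens = [16, 16]
num_input_image_tokens = [64, 0]
num_input_video_tokens = [0, 32]

prompt_tokens_details = {
    "cached_tokens": sum(num_cached_tokens),      # 32
    "image_tokens": sum(num_input_image_tokens),  # 64
    "video_tokens": sum(num_input_video_tokens),  # 32
}
print(prompt_tokens_details)
```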