@@ -276,10 +276,19 @@ def create_response(
276276 return None
277277 else :
278278 return LlmResponse (request_id = request .py_request_id ,
279- result = LlmResult (result , request .py_result ),
279+ result = LlmResult (result , request .py_result ,
280+ result .is_final ),
280281 client_id = request .py_client_id )
281282
282283
284+ def finish_by (request : Union [
285+ 'LlmRequest' , tensorrt_llm .bindings .internal .batch_manager .LlmRequest ],
286+ reason : FinishReason , beam : int ) -> None :
287+ """Mark the request as GENERATION_COMPLETE and set its finished reason for `beam`; CPP finish by reason does not support beam_width > 1"""
288+ request .state = LlmRequestState .GENERATION_COMPLETE
289+ request .set_finished_reason (reason , beam )
290+
291+
283292class LlmRequest (tensorrt_llm .bindings .internal .batch_manager .LlmRequest ):
284293 """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest`
285294 but detours some features to the Python implementation."""
@@ -298,6 +307,7 @@ def __init__(
298307 stop_words_list : list [list [int ]] | None = None ,
299308 is_draft : bool = False ,
300309 ** kwargs ):
310+
301311 self .py_logits_post_processors = kwargs .pop ("py_logits_post_processors" ,
302312 None )
303313 # Multimodal data
@@ -377,8 +387,9 @@ def create_child_request(self, request_id: int):
377387 child_request .is_cuda_graph_dummy = self .is_cuda_graph_dummy
378388 child_request .is_dummy = self .is_dummy
379389
380- # Override create_response to return the child request
390+ # Bind the module-level create_response / finish_by helpers to the child so it mimics the behavior of the original LlmRequest.
381391 child_request .create_response = partial (create_response , child_request )
392+ child_request .finish_by = partial (finish_by , child_request )
382393
383394 return child_request
384395
@@ -394,8 +405,7 @@ def is_dummy(self):
394405
395406 def finish_by (self , reason : FinishReason , beam : int ) -> None :
396407 """CPP finish by reason does not support beam_width > 1"""
397- self .state = LlmRequestState .GENERATION_COMPLETE
398- self .set_finished_reason (reason , beam )
408+ finish_by (self , reason , beam )
399409
400410
401411def convert_wordlist (word_list ) -> List [List [int ]]:
0 commit comments