Skip to content

Commit e8ce36b

Browse files
fix child request's create_response
Signed-off-by: Jaedeok Kim <[email protected]>
1 parent 653f456 commit e8ce36b

File tree

2 files changed

+25
-11
lines changed

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from dataclasses import dataclass
2+
from functools import partial
23
from typing import List, Optional, Union
34

45
import torch
@@ -257,6 +258,23 @@ def has_error(self):
257258
return self.error_msg is not None
258259

259260

261+
def create_response(
262+
request: Union['LlmRequest',
263+
tensorrt_llm.bindings.internal.batch_manager.LlmRequest],
264+
use_fast_logits=False,
265+
mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None:
266+
""" Create a response for a given request. """
267+
268+
result = request.create_result(use_fast_logits, mpi_world_rank)
269+
270+
if result is None:
271+
return None
272+
else:
273+
return LlmResponse(request_id=request.py_request_id,
274+
result=LlmResult(result, request.py_result),
275+
client_id=request.py_client_id)
276+
277+
260278
class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest):
261279
"""LlmRequest wraps `bindings.internal.batch_manager.LlmRequest`
262280
but detour some features to Python implementation"""
@@ -375,27 +393,23 @@ def create_child_request(self, request_id: int):
375393
exclude_last_generation_logits=self.
376394
py_exclude_last_generation_logits)
377395

378-
# Note: This mimics the behavior of the original LlmRequest.
396+
# Note: The below mimics the behavior of the original LlmRequest.
397+
379398
# We need to ensure the child request behaves like the parent
380399
# LlmRequest by copying any additional Python-specific attributes that
381400
# might be needed for proper request handling and response generation.
382401
child_request.is_dummy = self.is_dummy
383402

403+
# Override create_response to return the child request
404+
child_request.create_response = partial(create_response, child_request)
405+
384406
return child_request
385407

386408
def create_response(
387409
self,
388410
use_fast_logits=False,
389411
mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None:
390-
391-
result = super().create_result(use_fast_logits, mpi_world_rank)
392-
393-
if result is None:
394-
return None
395-
else:
396-
return LlmResponse(request_id=self.py_request_id,
397-
result=LlmResult(result, self.py_result),
398-
client_id=self.py_client_id)
412+
return create_response(self, use_fast_logits, mpi_world_rank)
399413

400414
@property
401415
def is_dummy(self):

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,7 @@ def enqueue_requests(self, requests: List[ExecutorRequest]):
343343
child_req_ids = None
344344
sampling_config = request.sampling_config
345345
beam_width = sampling_config.beam_width
346-
num_return_sequences = sampling_config.num_return_sequences
346+
num_return_sequences = sampling_config.num_return_sequences or beam_width
347347

348348
if beam_width == 1 and num_return_sequences > 1:
349349
# Reserve request ids for child requests.

0 commit comments

Comments (0)