Commit 653f456

fix: Enable num_return_sequences (n) support in PyTorch backend

This PR enables the `n` parameter (num_return_sequences) in the PyTorch backend, which is the default path for the LLM API. The feature was already implemented in the TRT backend via the C++ Executor but was missing in the PyExecutor. This PR closes the gap by adding the necessary APIs to the pybind bindings of the `LlmRequest` class.

Changes:
- Added a `create_child_request` method to `pyexecutor.LlmRequest` that wraps the C++ `createChildRequest` method, so parent and child requests can properly track each other's state.
- Updated the C++ `LlmRequest` and related Python bindings to expose additional properties required by the PyTorch backend.
- Enhanced `PyExecutor` to create child requests, ensuring requests are handled correctly when `num_return_sequences > 1`.

Signed-off-by: Jaedeok Kim <[email protected]>
1 parent 5bc3a15 commit 653f456
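
With this change, asking for multiple sampled sequences per prompt (`n` > 1) should work on the PyTorch path without switching backends. A minimal usage sketch, assuming the high-level LLM API and a `SamplingParams` with an `n` field; the model path and prompt below are placeholders, not taken from this commit:

    from tensorrt_llm import LLM, SamplingParams

    llm = LLM(model="/path/to/model")  # the LLM API defaults to the PyTorch backend
    # n maps to num_return_sequences: two independently sampled sequences per prompt.
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=32, n=2, temperature=0.8))
    for output in outputs:
        for seq in output.outputs:  # one entry per returned sequence
            print(seq.text)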

File tree: 5 files changed (+170, -35 lines)


cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 15 additions & 0 deletions
@@ -472,6 +472,11 @@ class GenericLlmRequest
         mExcludeInputFromOutput = exclude;
     }

+    bool getExcludeInputFromOutput()
+    {
+        return mExcludeInputFromOutput;
+    }
+
     /// @brief Get the params of the context
     /// @return The params of the context
     [[nodiscard]] std::optional<executor::ContextPhaseParams> const& getContextPhaseParams() const noexcept
@@ -769,6 +774,11 @@ class GenericLlmRequest
         return mParentRequestId;
     }

+    [[nodiscard]] SizeType32 getSequenceIndex() const
+    {
+        return mSequenceIndex;
+    }
+
     /// @brief Return a vector of the last-generated tokens of shape [num_beams]
     [[nodiscard]] VecTokens const& getLastTokens()
     {
@@ -1856,6 +1866,11 @@ class GenericLlmRequest
     // current position of the prompt tuning table (only used in chunked prefill mode)
     SizeType32 mPtableCurrentPosition{0};

+    [[nodiscard]] std::shared_ptr<std::vector<bool>> getSequenceFinalVec() const
+    {
+        return mSequenceFinalVec;
+    }
+
 protected:
     bool mIsStreaming;

cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp

Lines changed: 3 additions & 0 deletions
@@ -113,6 +113,8 @@ void initBindings(pybind11::module_& m)
         .def("set_generated_tokens", &GenLlmReq::setGeneratedTokens, py::arg("generated_beam_tokens"))
         .def("pause", &GenLlmReq::pause, py::arg("max_input_len"))
         .def_property("max_sent_token_len", &GenLlmReq::getMaxSentTokenLen, &GenLlmReq::setMaxSentTokenLen)
+        .def_property(
+            "exclude_input_from_output", &GenLlmReq::getExcludeInputFromOutput, &GenLlmReq::setExcludeInputFromOutput)
         .def_property_readonly("prompt_embedding_table", &GenLlmReq::getPromptEmbeddingTable)
         .def_property_readonly("multimodal_embedding", &GenLlmReq::getMultimodalEmbedding)
         .def_property_readonly("mrope_rotary_cos_sin", &GenLlmReq::getMropeRotaryCosSin)
@@ -377,6 +379,7 @@ void initBindings(pybind11::module_& m)
         .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager"))
         .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager"))
         .def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason"))
+        .def("create_child_request", &tb::LlmRequest::createChildRequest, py::arg("request_id"))
         .def("set_first_scheduled_time", &tb::LlmRequest::setFirstScheduledTime)
         .def("update_perf_metrics", &tb::LlmRequest::updatePerfMetrics, py::arg("iter_counter"));

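On the Python side, these two new bindings are the surface the PyTorch backend builds on. A rough sketch of how they are exercised, assuming `req` is an already constructed bound `LlmRequest` (construction and the exact import path of the binding module are not shown in this diff):

    # req: a bound tensorrt_llm LlmRequest instance (construction omitted here)
    req.exclude_input_from_output = True           # new read/write property
    assert req.exclude_input_from_output is True   # getter added in llmRequest.h

    # new method: clones the request under a fresh id and links it to the parent
    child = req.create_child_request(request_id=req.request_id + 1)
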
tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 71 additions & 15 deletions
@@ -229,19 +229,15 @@ class LlmResult:
     py_result_properties = frozenset(
         ('context_logits', 'generation_logits', 'log_probs', 'cum_log_probs'))

-    def __init__(self,
-                 result: Union[bytes, tensorrt_llm.bindings.executor.Result],
-                 py_result: PyResult,
-                 is_final: bool = False):
+    def __init__(self, result: Union[bytes,
+                                     tensorrt_llm.bindings.executor.Result],
+                 py_result: PyResult):
         self._result = result
         self._py_result = py_result
-        self.is_final = is_final

     def __getattr__(self, item):
         if item in self.py_result_properties:
             return getattr(self._py_result, item)
-        if item == 'is_final':
-            return object.__getattribute__(self, 'is_final')
         result = object.__getattribute__(self, '_result')
         return getattr(result, item)
@@ -316,6 +312,7 @@ def __init__(
         self.py_return_logits_device_memory = return_logits_device_memory
         self.py_is_draft = is_draft
         self.py_seq_slot = None
+        self.py_exclude_last_generation_logits = exclude_last_generation_logits

         # TODO: remove this when use DynamicDecodeOp in pytorch flow.
         # currently, keep py_stop_words_list as python list, rather than tensor.
@@ -327,19 +324,78 @@
             return_generation_logits,
             exclude_last_generation_logits)

-    def is_generation_only_request(self):
-        return self.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY
+    def create_child_request(self, request_id: int):
+        # Create the child request via the C++ API so that parent and child
+        # track each other's state.
+        child_request = super().create_child_request(request_id)
+
+        # Copy Python-specific attributes from parent to child
+        child_request.py_client_id = self.py_client_id
+        child_request.py_parent_request_id = self.py_request_id
+        child_request.py_request_id = child_request.request_id
+        child_request.py_llm_request_type = child_request.llm_request_type
+        child_request.py_end_id = child_request.end_id
+        child_request.py_prompt_len = child_request.prompt_len
+        child_request.py_orig_prompt_len = child_request.orig_prompt_len
+        child_request.py_max_new_tokens = child_request.max_new_tokens
+
+        # Input tokens are already cloned in create_child_request.
+        child_request.py_tokens = child_request.get_tokens()
+
+        # Copy Python-specific configuration from parent
+        child_request.py_return_log_probs = self.py_return_log_probs
+        child_request.py_return_context_logits = self.py_return_context_logits
+        child_request.py_return_generation_logits = self.py_return_generation_logits
+        child_request.py_return_logits_device_memory = self.py_return_logits_device_memory
+        child_request.py_exclude_last_generation_logits = self.py_exclude_last_generation_logits
+        child_request.py_stop_words_list = self.py_stop_words_list
+        child_request.py_logits_post_processors = self.py_logits_post_processors
+        child_request.py_rewind_len = self.py_rewind_len
+        child_request.py_decoding_iter = self.py_decoding_iter
+        child_request.py_draft_tokens = self.py_draft_tokens.copy(
+        ) if self.py_draft_tokens else []
+        child_request.py_last_draft_tokens = self.py_last_draft_tokens.copy(
+        ) if self.py_last_draft_tokens else None
+        child_request.py_num_accepted_draft_tokens = self.py_num_accepted_draft_tokens
+        child_request.py_lora_task_layer_module_configs = self.py_lora_task_layer_module_configs
+
+        # Initialize Python-specific runtime state
+        child_request.py_batch_idx = None
+        child_request.is_attention_dp_dummy = self.is_attention_dp_dummy
+        child_request.is_cuda_graph_dummy = self.is_cuda_graph_dummy
+
+        # Create PyResult for child
+        child_request.py_result = PyResult(
+            prompt_len=child_request.py_prompt_len,
+            max_new_tokens=child_request.py_max_new_tokens,
+            use_device_memory=self.py_return_logits_device_memory,
+            streaming=child_request.streaming,
+            return_log_probs=self.py_return_log_probs,
+            return_context_logits=self.py_return_context_logits,
+            return_generation_logits=self.py_return_generation_logits,
+            exclude_last_generation_logits=self.
+            py_exclude_last_generation_logits)
+
+        # Note: This mimics the behavior of the original LlmRequest.
+        # We need to ensure the child request behaves like the parent
+        # LlmRequest by copying any additional Python-specific attributes that
+        # might be needed for proper request handling and response generation.
+        child_request.is_dummy = self.is_dummy
+
+        return child_request

     def create_response(
             self,
             use_fast_logits=False,
             mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None:
-        result, is_final = super().create_serialized_result(
-            use_fast_logits, mpi_world_rank)
-        return LlmResponse(
-            request_id=self.py_request_id,
-            result=LlmResult(result, self.py_result, is_final),
-            client_id=self.py_client_id) if len(result) > 0 else None
+
+        result = super().create_result(use_fast_logits, mpi_world_rank)
+
+        if result is None:
+            return None
+        else:
+            return LlmResponse(request_id=self.py_request_id,
+                               result=LlmResult(result, self.py_result),
+                               client_id=self.py_client_id)

     @property
     def is_dummy(self):

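To make the parent/child relationship concrete, here is a hedged sketch of how the wrapper above is meant to be used; `parent` and the id values are hypothetical, only `create_child_request`, `py_request_id`, and `py_parent_request_id` come from the code in this file:

    # parent: an LlmRequest produced by executor_request_to_llm_request(...) with id 100;
    # ids 101 and 102 were reserved up front by the executor for its children.
    children = [parent.create_child_request(rid) for rid in (101, 102)]

    for child in children:
        # Each child shares the parent's prompt and sampling configuration but
        # reports its results under its own request id.
        assert child.py_parent_request_id == parent.py_request_id
        assert child.py_request_id != parent.py_request_id
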
tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 78 additions & 18 deletions
@@ -58,6 +58,7 @@ class RequestQueueItem:
     id: int
     request: Optional[ExecutorRequest] = None
     query: Optional[list] = None  # only used in `StarAttention`
+    child_req_ids: Optional[List[int]] = None  # for num_return_sequences > 1

     def is_shutdown_request(self):
         return self.id == SHUTDOWN_REQUEST_ID
@@ -319,6 +320,11 @@ def __enter__(self):
     def __exit__(self):
         self.shutdown()

+    def _get_request_id(self):
+        req_id = self.next_req_id
+        self.next_req_id += 1
+        return req_id
+
     def enqueue_requests(self, requests: List[ExecutorRequest]):
         """
        Enqueue new requests
@@ -327,13 +333,33 @@ def enqueue_requests(self, requests: List[ExecutorRequest]):
         try:
             self.enqueue_lock.acquire()
             assert self.active, "PyExecutor has already been shutdown."
-            start_time = time.time()
             for request in requests:
-                self.start_times[self.next_req_id] = start_time
+                req_id = self._get_request_id()
+
+                if self.enable_iter_perf_stats:
+                    self.start_times[req_id] = time.time()
+
+                # Generate child request IDs if needed
+                child_req_ids = None
+                sampling_config = request.sampling_config
+                beam_width = sampling_config.beam_width
+                num_return_sequences = sampling_config.num_return_sequences
+
+                if beam_width == 1 and num_return_sequences > 1:
+                    # Reserve request ids for child requests.
+                    child_req_ids = []
+                    for _ in range(num_return_sequences - 1):
+                        child_req_id = self._get_request_id()
+                        if self.enable_iter_perf_stats:
+                            self.start_times[child_req_id] = time.time()
+                        child_req_ids.append(child_req_id)
+
                 self.request_queue.put(
-                    RequestQueueItem(self.next_req_id, request))
-                req_ids.append(self.next_req_id)
-                self.next_req_id += 1
+                    RequestQueueItem(req_id,
+                                     request,
+                                     query=None,
+                                     child_req_ids=child_req_ids))
+                req_ids.append(req_id)
         finally:
             self.enqueue_lock.release()
         return req_ids
@@ -370,6 +396,12 @@ def cancel_request(self, id: int):
         """
         self.canceled_req_ids.insert(id)

+        # Also cancel all child requests if this is a parent request.
+        # Look through active requests to find child requests.
+        for request in self.active_requests:
+            if request.py_parent_request_id == id:
+                self.canceled_req_ids.insert(request.py_request_id)
+
     def shutdown(self):
         """
        Signals the server to shutdown.
@@ -438,15 +470,34 @@ def enqueue_request(self,
         try:
             self.enqueue_lock.acquire()
             assert self.active, "PyExecutor has already been shutdown."
-            req_id = self.next_req_id
+            # Allocate the main request ID first
+            req_id = self._get_request_id()
+
             if self.enable_iter_perf_stats:
                 self.start_times[req_id] = time.time()

-            if query is not None:
-                self.request_queue.put(RequestQueueItem(req_id, request, query))
-            else:
-                self.request_queue.put(RequestQueueItem(req_id, request))
-            self.next_req_id += 1
+            # Generate child request IDs if needed
+            child_req_ids = None
+            sampling_config = request.sampling_config
+            beam_width = (sampling_config.beam_width
+                          if sampling_config.beam_width else 1)
+            num_return_sequences = (sampling_config.num_return_sequences if
+                                    sampling_config.num_return_sequences else 1)
+
+            # Only create child requests if beam_width == 1 and num_return_sequences > 1
+            if beam_width == 1 and num_return_sequences > 1:
+                child_req_ids = []
+                for i in range(num_return_sequences - 1):
+                    child_req_id = self._get_request_id()
+                    if self.enable_iter_perf_stats:
+                        self.start_times[child_req_id] = time.time()
+                    child_req_ids.append(child_req_id)
+
+            self.request_queue.put(
+                RequestQueueItem(req_id,
+                                 request,
+                                 query=query,
+                                 child_req_ids=child_req_ids))
         finally:
             self.enqueue_lock.release()
         return req_id
@@ -1396,6 +1447,9 @@ def _merge_star_attention_requests(self,
                                        new_requests: list[RequestQueueItem]):
         result = []
         for req_item in new_requests:
+            assert req_item.child_req_ids is None, (
+                "Star attention does not yet support sampling_config.num_return_sequences > 1"
+            )
             req_id, exe_req, query_token_ids = req_item.id, req_item.request, req_item.query
             ctx_len0 = len(exe_req.input_token_ids)
             ctx_blocks, position_blocks, last_block_padding_num = [
@@ -1461,12 +1515,18 @@ def _merge_requests(self, new_requests: list[RequestQueueItem]):
             else:
                 raise NotImplementedError(f'unsupport cp type {cp_type}')
         else:
-            return [
-                executor_request_to_llm_request(
+            llm_reqs = []
+            for req_item in new_requests:
+                llm_req = executor_request_to_llm_request(
                     req_item.id, req_item.request,
                     self._should_exclude_last_generation_logits())
-                for req_item in new_requests
-            ]
+                if req_item.child_req_ids:
+                    # Create subrequests for n-returns using pre-generated child request ids.
+                    for child_req_id in req_item.child_req_ids:
+                        child_req = llm_req.create_child_request(child_req_id)
+                        llm_reqs.append(child_req)
+                llm_reqs.append(llm_req)
+            return llm_reqs

     @nvtx_range("_schedule")
     def _schedule(self):
@@ -1982,7 +2042,7 @@ def _enqueue_responses(self, responses: Dict[int, LlmResponse]):
             if req_id in self.responses.keys():
                 self.responses[req_id].append(resp)
             else:
-                self.responses.update({req_id: [resp]})
+                self.responses[req_id] = [resp]
             self.response_cv.notify_all()

     @nvtx_range("_handle_first_token_response")
@@ -2013,7 +2073,7 @@ def _handle_responses(self):
                 requests_to_terminate.append(request)
                 continue

-            if request.is_generation_only_request():
+            if request.is_generation_only_request:
                 # If request is in transmission, so we don't need to emit a response
                 # Also, for the first iteration with overlap, we should skip since first
                 # token has already been emitted previously
@@ -2033,7 +2093,7 @@ def _handle_responses(self):
             if self.model_engine.iter_counter % self.stream_interval == 0 or request.is_finished:
                 response = request.create_response(False, self.dist.rank)
                 if response:
-                    request_done = response.result.is_final
+                    request_done = response.result.is_sequence_final
                     new_responses.update({req_id: response})

                 if request_done:

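The id-reservation scheme used above is worth spelling out: for a request with `beam_width == 1` and `num_return_sequences == n`, the executor consumes n consecutive ids at enqueue time, one for the parent and n-1 for the children, so that `_merge_requests` can attach the children later without touching the id counter. A standalone sketch of that bookkeeping (the function name and return shape below are illustrative, not part of this PR):

    from typing import List, Optional, Tuple

    def reserve_request_ids(next_id: int, beam_width: int,
                            num_return_sequences: int
                            ) -> Tuple[int, Optional[List[int]], int]:
        """Mirror the allocation done in enqueue_request / enqueue_requests."""
        parent_id = next_id
        next_id += 1
        child_ids = None
        if beam_width == 1 and num_return_sequences > 1:
            # Reserve one extra id per additional returned sequence.
            child_ids = list(range(next_id, next_id + num_return_sequences - 1))
            next_id += num_return_sequences - 1
        return parent_id, child_ids, next_id

    # e.g. n=3 starting at id 7 -> parent 7, children [8, 9], counter advances to 10
    print(reserve_request_ids(7, beam_width=1, num_return_sequences=3))
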
tensorrt_llm/executor/result.py

Lines changed: 3 additions & 2 deletions
@@ -299,8 +299,9 @@ def _handle_response(self,
                 handler(response.error_msg)

             response_result = response.result
-            if hasattr(response_result, "_result"):
-                response_result.deserialize()
+            # TODO(jaedeokk): Need to check. Why do we need to deserialize the result? Is it for disaggregated serving?
+            # if hasattr(response_result, "_result"):
+            #     response_result.deserialize()

             self._done = response_result.is_final
             context_phase_params = response_result.context_phase_params
