cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp (1 change: 1 addition, 0 deletions)
@@ -377,6 +377,7 @@ void initBindings(pybind11::module_& m)
.def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager"))
.def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager"))
.def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason"))
.def("create_child_request", &tb::LlmRequest::createChildRequest, py::arg("request_id"))
.def("set_first_scheduled_time", &tb::LlmRequest::setFirstScheduledTime)
.def("update_perf_metrics", &tb::LlmRequest::updatePerfMetrics, py::arg("iter_counter"));

examples/llm-api/quickstart_advanced.py (35 changes: 26 additions, 9 deletions)
@@ -103,6 +103,7 @@ def add_llm_args(parser):
parser.add_argument("--temperature", type=float, default=None)
parser.add_argument("--top_k", type=int, default=None)
parser.add_argument("--top_p", type=float, default=None)
parser.add_argument("--n", type=int, default=1)
parser.add_argument('--load_format', type=str, default='auto')
parser.add_argument('--max_beam_width', type=int, default=1)

@@ -186,10 +187,21 @@ def setup_llm(args):
else:
spec_config = None

# TorchSampler needs mixed_sampler set to True for non-greedy decoding.
greedy_decoding = ((args.temperature == 0.0)
or (args.top_k == 1 and
(args.top_p == 0.0 or args.top_p is None)))
mixed_sampler = (
not greedy_decoding and not args.enable_trtllm_sampler
# Eagle3 does not support the mixed sampler;
# see TorchSampler._process_requests.
and spec_decode_algo != 'EAGLE3')

cuda_graph_config = CudaGraphConfig(
batch_sizes=args.cuda_graph_batch_sizes,
padding_enabled=args.cuda_graph_padding_enabled,
) if args.use_cuda_graph else None

llm = LLM(
model=args.model_dir,
backend='pytorch',
@@ -209,6 +221,7 @@ def setup_llm(args):
if args.use_torch_compile else None,
moe_backend=args.moe_backend,
enable_trtllm_sampler=args.enable_trtllm_sampler,
enable_mixed_sampler=mixed_sampler,
max_seq_len=args.max_seq_len,
max_batch_size=args.max_batch_size,
max_num_tokens=args.max_num_tokens,
@@ -224,6 +237,10 @@ def setup_llm(args):
gather_generation_logits=args.return_generation_logits,
max_beam_width=args.max_beam_width)

if args.max_beam_width > 1:
# If beam search is used, set n to the beam width.
args.n = args.max_beam_width

sampling_params = SamplingParams(
max_tokens=args.max_tokens,
temperature=args.temperature,
@@ -232,7 +249,7 @@ def setup_llm(args):
return_context_logits=args.return_context_logits,
return_generation_logits=args.return_generation_logits,
logprobs=args.logprobs,
n=args.max_beam_width,
n=args.n,
use_beam_search=args.max_beam_width > 1)
return llm, sampling_params

@@ -246,23 +263,23 @@ def main():

for i, output in enumerate(outputs):
prompt = output.prompt
for beam_idx, beam in enumerate(output.outputs):
generated_text = beam.text
# Skip printing the beam_idx if no beam search was used
beam_id_text = f"[{beam_idx}]" if args.max_beam_width > 1 else ""
for seq_idx, seq_output in enumerate(output.outputs):
# Skip printing the sequence index if a single sequence is returned.
seq_id_text = f"[{seq_idx}]" if args.n > 1 else ""
generated_text = seq_output.text
print(
f"[{i}]{beam_id_text} Prompt: {prompt!r}, Generated text: {generated_text!r}"
f"[{i}]{seq_id_text} Prompt: {prompt!r}, Generated text: {generated_text!r}"
)
if args.return_context_logits:
print(
f"[{i}]{beam_id_text} Context logits: {output.context_logits}"
f"[{i}]{seq_id_text} Context logits: {output.context_logits}"
)
if args.return_generation_logits:
print(
f"[{i}]{beam_id_text} Generation logits: {beam.generation_logits}"
f"[{i}]{seq_id_text} Generation logits: {beam.generation_logits}"
)
if args.logprobs:
print(f"[{i}]{beam_id_text} Logprobs: {beam.logprobs}")
print(f"[{i}]{seq_id_text} Logprobs: {beam.logprobs}")


if __name__ == '__main__':
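For context on the --n flag above: the script now separates returning several independently sampled sequences (--n together with --temperature/--top_p) from beam search (--max_beam_width, which overrides n with the beam width). Below is a minimal sketch of the same distinction through the LLM API directly; the import path, model path, and prompt are placeholders assumed to match the quickstart script rather than taken from this PR.

from tensorrt_llm import LLM, SamplingParams  # assumed import path

llm = LLM(model="/path/to/model", backend="pytorch")  # placeholder model path

# n > 1 without beam search: two independently sampled sequences per prompt.
sampled = SamplingParams(max_tokens=32, temperature=0.8, top_p=0.95, n=2)

# Beam search: the script forces args.n to the beam width so that n and
# use_beam_search stay consistent.
beamed = SamplingParams(max_tokens=32, n=4, use_beam_search=True)

# beamed would be passed the same way when --max_beam_width > 1.
for output in llm.generate(["The capital of France is"], sampled):
    for seq_idx, seq in enumerate(output.outputs):
        print(f"[{seq_idx}] {seq.text!r}")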
tensorrt_llm/_torch/pyexecutor/llm_request.py (92 changes: 82 additions, 10 deletions)
@@ -1,4 +1,6 @@
import copy
from dataclasses import dataclass
from functools import partial
from typing import List, Optional, Union

import torch
@@ -261,6 +263,38 @@ def has_error(self):
return self.error_msg is not None


def create_response(
request: Union['LlmRequest',
tensorrt_llm.bindings.internal.batch_manager.LlmRequest],
use_fast_logits=False,
mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None:
""" Create a response for a given request. """
create_serialized_result = \
super(LlmRequest, request).create_serialized_result \
if isinstance(request, LlmRequest) else \
request.create_serialized_result
result, is_final = create_serialized_result(use_fast_logits, mpi_world_rank)
return LlmResponse(
request_id=request.py_request_id,
result=LlmResult(result, request.py_result, is_final),
client_id=request.py_client_id) if len(result) > 0 else None


def finish_by(request: Union[
'LlmRequest', tensorrt_llm.bindings.internal.batch_manager.LlmRequest],
reason: FinishReason, beam: int) -> None:
"""CPP finish by reason does not support beam_width > 1"""
request.state = LlmRequestState.GENERATION_COMPLETE
request.set_finished_reason(reason, beam)


def is_generation_only_request(
request: Union['LlmRequest',
tensorrt_llm.bindings.internal.batch_manager.LlmRequest]
) -> bool:
return request.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY


class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest):
"""LlmRequest wraps `bindings.internal.batch_manager.LlmRequest`
but detours some features to a Python implementation."""
@@ -279,6 +313,7 @@ def __init__(
stop_words_list: list[list[int]] | None = None,
is_draft: bool = False,
**kwargs):

self.py_logits_post_processors = kwargs.pop("py_logits_post_processors",
None)
# Multimodal data
@@ -316,6 +351,7 @@ def __init__(
self.py_return_logits_device_memory = return_logits_device_memory
self.py_is_draft = is_draft
self.py_seq_slot = None
self.py_exclude_last_generation_logits = exclude_last_generation_logits

# TODO: remove this once DynamicDecodeOp is used in the pytorch flow.
# For now, keep py_stop_words_list as a Python list rather than a tensor.
@@ -327,28 +363,64 @@ def __init__(
return_generation_logits,
exclude_last_generation_logits)

def is_generation_only_request(self):
return self.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY
def is_generation_only(self):
# is_generation_only_request is a property getter on the C++ side,
# so the PyTorch backend uses a different name here.
return is_generation_only_request(self)

def create_child_request(self, request_id: int):
""" Create a child request.

NOTE: This function generate a child request by C++'s API to track the
states each other and returns the object of type batch_manager.LlmRequest,
which is not a llm_request.LlmRequest. As a workaround, to ensure the
child request behaves like its parent, this function mimics the behavior
of the original LlmRequest by dynamically adding the required attributes
of the parent request to the child request. This function will be
implemented when LlmRequest becomes pure-python.

See: https://github.com/NVIDIA/TensorRT-LLM/issues/3034
"""

child_request = super().create_child_request(request_id)

# Copy all py_* attributes from parent to child.
for attr_name, attr_value in self.__dict__.items():
if attr_name.startswith('py_'):
setattr(child_request, attr_name, copy.deepcopy(attr_value))

# Override specific attributes that should use child_request values.
child_request.py_parent_request_id = self.py_request_id
child_request.py_request_id = child_request.request_id
child_request.py_llm_request_type = child_request.llm_request_type
child_request.py_batch_idx = None
child_request.py_seq_slot = None

# Mimic the behavior of the original LlmRequest.
child_request.is_attention_dp_dummy = self.is_attention_dp_dummy
child_request.is_cuda_graph_dummy = self.is_cuda_graph_dummy
child_request.is_dummy = self.is_dummy
child_request.is_generation_only = partial(is_generation_only_request,
child_request)
child_request.create_response = partial(create_response, child_request)
child_request.finish_by = partial(finish_by, child_request)

return child_request

def create_response(
self,
use_fast_logits=False,
mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None:
result, is_final = super().create_serialized_result(
use_fast_logits, mpi_world_rank)
return LlmResponse(
request_id=self.py_request_id,
result=LlmResult(result, self.py_result, is_final),
client_id=self.py_client_id) if len(result) > 0 else None
return create_response(self, use_fast_logits, mpi_world_rank)

@property
def is_dummy(self):
return self.is_attention_dp_dummy or self.is_cuda_graph_dummy or self.is_dummy_request

def finish_by(self, reason: FinishReason, beam: int) -> None:
"""CPP finish by reason does not support beam_width > 1"""
self.state = LlmRequestState.GENERATION_COMPLETE
self.set_finished_reason(reason, beam)
finish_by(self, reason, beam)


def convert_wordlist(word_list) -> List[List[int]]:
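A note on the pattern behind create_child_request: the parent's Python-side state (the py_* attributes) is copied onto a C++-backed object, and the module-level helpers (create_response, finish_by, is_generation_only_request) are rebound to that object with functools.partial so it exposes them like methods. The sketch below reproduces the pattern with stand-in classes; every name in it is illustrative and not a TensorRT-LLM type.

import copy
from functools import partial


class NativeRequest:
    """Stand-in for a C++-backed request; attributes can be attached dynamically."""

    def __init__(self, request_id: int):
        self.request_id = request_id


def describe(request) -> str:
    # Module-level helper that works on any request-like object.
    return f"request {request.py_request_id} (parent: {request.py_parent_request_id})"


class PyRequest(NativeRequest):

    def __init__(self, request_id: int, prompt: list[int]):
        super().__init__(request_id)
        self.py_request_id = request_id
        self.py_parent_request_id = None
        self.py_prompt = prompt

    def create_child(self, child_id: int) -> NativeRequest:
        # Imagine this object came back from a C++ factory, so it is not a PyRequest.
        child = NativeRequest(child_id)
        # Copy every py_* attribute so the child carries the parent's Python state.
        for name, value in self.__dict__.items():
            if name.startswith("py_"):
                setattr(child, name, copy.deepcopy(value))
        child.py_parent_request_id = self.py_request_id
        child.py_request_id = child.request_id
        # Rebind the module-level helper so the child exposes it like a method.
        child.describe = partial(describe, child)
        return child


parent = PyRequest(1, prompt=[101, 102, 103])
child = parent.create_child(2)
print(child.describe())  # -> request 2 (parent: 1)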