cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp (1 change: 1 addition, 0 deletions)
@@ -377,6 +377,7 @@ void initBindings(pybind11::module_& m)
.def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager"))
.def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager"))
.def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason"))
.def("create_child_request", &tb::LlmRequest::createChildRequest, py::arg("request_id"))
.def("set_first_scheduled_time", &tb::LlmRequest::setFirstScheduledTime)
.def("update_perf_metrics", &tb::LlmRequest::updatePerfMetrics, py::arg("iter_counter"));

examples/llm-api/quickstart_advanced.py (35 changes: 26 additions, 9 deletions)
@@ -103,6 +103,7 @@ def add_llm_args(parser):
parser.add_argument("--temperature", type=float, default=None)
parser.add_argument("--top_k", type=int, default=None)
parser.add_argument("--top_p", type=float, default=None)
parser.add_argument("--n", type=int, default=1)
parser.add_argument('--load_format', type=str, default='auto')
parser.add_argument('--max_beam_width', type=int, default=1)

@@ -186,10 +187,21 @@ def setup_llm(args):
else:
spec_config = None

# TorchSampler needs mixed_sampler set to True for non-greedy decoding.
greedy_decoding = ((args.temperature == 0.0)
or (args.top_k == 1 and
(args.top_p == 0.0 or args.top_p is None)))
mixed_sampler = (
not greedy_decoding and not args.enable_trtllm_sampler
# Eagle3 does not support the mixed sampler;
# see TorchSampler._process_requests.
and spec_decode_algo != 'EAGLE3')

cuda_graph_config = CudaGraphConfig(
batch_sizes=args.cuda_graph_batch_sizes,
padding_enabled=args.cuda_graph_padding_enabled,
) if args.use_cuda_graph else None

llm = LLM(
model=args.model_dir,
backend='pytorch',
@@ -209,6 +221,7 @@ def setup_llm(args):
if args.use_torch_compile else None,
moe_backend=args.moe_backend,
enable_trtllm_sampler=args.enable_trtllm_sampler,
enable_mixed_sampler=mixed_sampler,
max_seq_len=args.max_seq_len,
max_batch_size=args.max_batch_size,
max_num_tokens=args.max_num_tokens,
@@ -224,6 +237,10 @@ def setup_llm(args):
gather_generation_logits=args.return_generation_logits,
max_beam_width=args.max_beam_width)

if args.max_beam_width > 1:
# If beam search is used, set n to the beam width.
args.n = args.max_beam_width

sampling_params = SamplingParams(
max_tokens=args.max_tokens,
temperature=args.temperature,
@@ -232,7 +249,7 @@ def setup_llm(args):
return_context_logits=args.return_context_logits,
return_generation_logits=args.return_generation_logits,
logprobs=args.logprobs,
n=args.max_beam_width,
n=args.n,
use_beam_search=args.max_beam_width > 1)
return llm, sampling_params

@@ -246,23 +263,23 @@ def main():

for i, output in enumerate(outputs):
prompt = output.prompt
for beam_idx, beam in enumerate(output.outputs):
generated_text = beam.text
# Skip printing the beam_idx if no beam search was used
beam_id_text = f"[{beam_idx}]" if args.max_beam_width > 1 else ""
for seq_idx, seq_output in enumerate(output.outputs):
# Skip printing the sequence index if a single sequence is returned.
seq_id_text = f"[{seq_idx}]" if args.n > 1 else ""
generated_text = seq_output.text
print(
f"[{i}]{beam_id_text} Prompt: {prompt!r}, Generated text: {generated_text!r}"
f"[{i}]{seq_id_text} Prompt: {prompt!r}, Generated text: {generated_text!r}"
)
if args.return_context_logits:
print(
f"[{i}]{beam_id_text} Context logits: {output.context_logits}"
f"[{i}]{seq_id_text} Context logits: {output.context_logits}"
)
if args.return_generation_logits:
print(
f"[{i}]{beam_id_text} Generation logits: {beam.generation_logits}"
f"[{i}]{seq_id_text} Generation logits: {beam.generation_logits}"
)
if args.logprobs:
print(f"[{i}]{beam_id_text} Logprobs: {beam.logprobs}")
print(f"[{i}]{seq_id_text} Logprobs: {beam.logprobs}")


if __name__ == '__main__':
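For context on the --n flag above: the script now separates returning several independently sampled sequences (--n together with --temperature/--top_p) from beam search (--max_beam_width, which overrides n with the beam width). Below is a minimal sketch of the same distinction through the LLM API directly; the import path, model path, and prompt are placeholders assumed to match the quickstart script rather than taken from this PR.

from tensorrt_llm import LLM, SamplingParams  # assumed import path

llm = LLM(model="/path/to/model", backend="pytorch")  # placeholder model path

# n > 1 without beam search: two independently sampled sequences per prompt.
sampled = SamplingParams(max_tokens=32, temperature=0.8, top_p=0.95, n=2)

# Beam search: the script forces args.n to the beam width so that n and
# use_beam_search stay consistent.
beamed = SamplingParams(max_tokens=32, n=4, use_beam_search=True)

# beamed would be passed the same way when --max_beam_width > 1.
for output in llm.generate(["The capital of France is"], sampled):
    for seq_idx, seq in enumerate(output.outputs):
        print(f"[{seq_idx}] {seq.text!r}")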
tensorrt_llm/_torch/pyexecutor/llm_request.py (92 changes: 82 additions, 10 deletions)
@@ -1,4 +1,6 @@
import copy
from dataclasses import dataclass
from functools import partial
from typing import List, Optional, Union

import torch
@@ -261,6 +263,38 @@ def has_error(self):
return self.error_msg is not None


def create_response(
request: Union['LlmRequest',
tensorrt_llm.bindings.internal.batch_manager.LlmRequest],
use_fast_logits=False,
mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None:
""" Create a response for a given request. """
create_serialized_result = \
super(LlmRequest, request).create_serialized_result \
if isinstance(request, LlmRequest) else \
request.create_serialized_result
result, is_final = create_serialized_result(use_fast_logits, mpi_world_rank)
return LlmResponse(
request_id=request.py_request_id,
result=LlmResult(result, request.py_result, is_final),
client_id=request.py_client_id) if len(result) > 0 else None


def finish_by(request: Union[
'LlmRequest', tensorrt_llm.bindings.internal.batch_manager.LlmRequest],
reason: FinishReason, beam: int) -> None:
"""CPP finish by reason does not support beam_width > 1"""
request.state = LlmRequestState.GENERATION_COMPLETE
request.set_finished_reason(reason, beam)


def is_generation_only_request(
request: Union['LlmRequest',
tensorrt_llm.bindings.internal.batch_manager.LlmRequest]
) -> bool:
return request.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY


class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest):
"""LlmRequest wraps `bindings.internal.batch_manager.LlmRequest`
but detours some features to a Python implementation."""
@@ -279,6 +313,7 @@ def __init__(
stop_words_list: list[list[int]] | None = None,
is_draft: bool = False,
**kwargs):

self.py_logits_post_processors = kwargs.pop("py_logits_post_processors",
None)
# Multimodal data
@@ -316,6 +351,7 @@ def __init__(
self.py_return_logits_device_memory = return_logits_device_memory
self.py_is_draft = is_draft
self.py_seq_slot = None
self.py_exclude_last_generation_logits = exclude_last_generation_logits

# TODO: remove this once DynamicDecodeOp is used in the pytorch flow.
# For now, keep py_stop_words_list as a Python list rather than a tensor.
@@ -327,28 +363,64 @@ def __init__(
return_generation_logits,
exclude_last_generation_logits)

def is_generation_only_request(self):
return self.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY
def is_generation_only(self):
# is_generation_only_request is a property getter on the C++ side,
# so the PyTorch backend uses a different name here.
return is_generation_only_request(self)

def create_child_request(self, request_id: int):
""" Create a child request.

NOTE: This function generate a child request by C++'s API to track the
states each other and returns the object of type batch_manager.LlmRequest,
which is not a llm_request.LlmRequest. As a workaround, to ensure the
child request behaves like its parent, this function mimics the behavior
of the original LlmRequest by dynamically adding the required attributes
of the parent request to the child request. This function will be
implemented when LlmRequest becomes pure-python.

See: https://github.com/NVIDIA/TensorRT-LLM/issues/3034
"""

child_request = super().create_child_request(request_id)

# Copy all py_* attributes from parent to child.
for attr_name, attr_value in self.__dict__.items():
if attr_name.startswith('py_'):
setattr(child_request, attr_name, copy.deepcopy(attr_value))

# Override specific attributes that should use child_request values.
child_request.py_parent_request_id = self.py_request_id
child_request.py_request_id = child_request.request_id
child_request.py_llm_request_type = child_request.llm_request_type
child_request.py_batch_idx = None
child_request.py_seq_slot = None

# Mimic the behavior of the original LlmRequest.
child_request.is_attention_dp_dummy = self.is_attention_dp_dummy
child_request.is_cuda_graph_dummy = self.is_cuda_graph_dummy
child_request.is_dummy = self.is_dummy
child_request.is_generation_only = partial(is_generation_only_request,
child_request)
child_request.create_response = partial(create_response, child_request)
child_request.finish_by = partial(finish_by, child_request)

return child_request

def create_response(
self,
use_fast_logits=False,
mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None:
result, is_final = super().create_serialized_result(
use_fast_logits, mpi_world_rank)
return LlmResponse(
request_id=self.py_request_id,
result=LlmResult(result, self.py_result, is_final),
client_id=self.py_client_id) if len(result) > 0 else None
return create_response(self, use_fast_logits, mpi_world_rank)

@property
def is_dummy(self):
return self.is_attention_dp_dummy or self.is_cuda_graph_dummy or self.is_dummy_request

def finish_by(self, reason: FinishReason, beam: int) -> None:
"""CPP finish by reason does not support beam_width > 1"""
self.state = LlmRequestState.GENERATION_COMPLETE
self.set_finished_reason(reason, beam)
finish_by(self, reason, beam)


def convert_wordlist(word_list) -> List[List[int]]:
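A note on the pattern behind create_child_request: the parent's Python-side state (the py_* attributes) is copied onto a C++-backed object, and the module-level helpers (create_response, finish_by, is_generation_only_request) are rebound to that object with functools.partial so it exposes them like methods. The sketch below reproduces the pattern with stand-in classes; every name in it is illustrative and not a TensorRT-LLM type.

import copy
from functools import partial


class NativeRequest:
    """Stand-in for a C++-backed request; attributes can be attached dynamically."""

    def __init__(self, request_id: int):
        self.request_id = request_id


def describe(request) -> str:
    # Module-level helper that works on any request-like object.
    return f"request {request.py_request_id} (parent: {request.py_parent_request_id})"


class PyRequest(NativeRequest):

    def __init__(self, request_id: int, prompt: list[int]):
        super().__init__(request_id)
        self.py_request_id = request_id
        self.py_parent_request_id = None
        self.py_prompt = prompt

    def create_child(self, child_id: int) -> NativeRequest:
        # Imagine this object came back from a C++ factory, so it is not a PyRequest.
        child = NativeRequest(child_id)
        # Copy every py_* attribute so the child carries the parent's Python state.
        for name, value in self.__dict__.items():
            if name.startswith("py_"):
                setattr(child, name, copy.deepcopy(value))
        child.py_parent_request_id = self.py_request_id
        child.py_request_id = child.request_id
        # Rebind the module-level helper so the child exposes it like a method.
        child.describe = partial(describe, child)
        return child


parent = PyRequest(1, prompt=[101, 102, 103])
child = parent.create_child(2)
print(child.describe())  # -> request 2 (parent: 1)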