|
| 1 | +import copy |
1 | 2 | from dataclasses import dataclass |
2 | 3 | from functools import partial |
3 | 4 | from typing import List, Optional, Union |
@@ -230,15 +231,19 @@ class LlmResult: |
    # Result fields that live on the Python-side PyResult rather than on the
    # wrapped C++ executor Result; __getattr__ routes lookups for these names
    # to self._py_result.
    py_result_properties = frozenset(
        ('context_logits', 'generation_logits', 'log_probs', 'cum_log_probs'))
|
233 | | - def __init__(self, result: Union[bytes, |
234 | | - tensorrt_llm.bindings.executor.Result], |
235 | | - py_result: PyResult): |
| 234 | + def __init__(self, |
| 235 | + result: Union[bytes, tensorrt_llm.bindings.executor.Result], |
| 236 | + py_result: PyResult, |
| 237 | + is_final: bool = False): |
236 | 238 | self._result = result |
237 | 239 | self._py_result = py_result |
| 240 | + self.is_final = is_final |
238 | 241 |
|
239 | 242 | def __getattr__(self, item): |
240 | 243 | if item in self.py_result_properties: |
241 | 244 | return getattr(self._py_result, item) |
| 245 | + if item == 'is_final': |
| 246 | + return object.__getattribute__(self, 'is_final') |
242 | 247 | result = object.__getattribute__(self, '_result') |
243 | 248 | return getattr(result, item) |
244 | 249 |
|
@@ -343,58 +348,33 @@ def __init__( |
343 | 348 | exclude_last_generation_logits) |
344 | 349 |
|
345 | 350 | def create_child_request(self, request_id: int): |
346 | | - # Create a child request by C++'s API to track the states each other. |
| 351 | + """ Create a child request. |
| 352 | +
|
| 353 | + NOTE: This function generate a child request by C++'s API to track the |
| 354 | + states each other and returns the object of type batch_manager.LlmRequest, |
| 355 | + which is not a llm_request.LlmRequest. As a workaround, to ensure the |
| 356 | + child request behaves like its parent, this function mimics the behavior |
| 357 | + of the original LlmRequest by dynamically adding the required attributes |
| 358 | + of the parent request to the child request. This function will be |
| 359 | + implemented when LlmRequest becomes pure-python. |
| 360 | +
|
| 361 | + See: https://github.com/NVIDIA/TensorRT-LLM/issues/3034 |
| 362 | + """ |
| 363 | + |
347 | 364 | child_request = super().create_child_request(request_id) |
348 | 365 |
|
349 | | - # Copy Python-specific attributes from parent to child |
350 | | - child_request.py_client_id = self.py_client_id |
| 366 | + # Copy all py_* attributes from parent to child |
| 367 | + for attr_name, attr_value in self.__dict__.items(): |
| 368 | + if attr_name.startswith('py_'): |
| 369 | + attr_value = getattr(self, attr_name) |
| 370 | + setattr(child_request, attr_name, copy.deepcopy(attr_value)) |
| 371 | + |
| 372 | + # Override specific attributes that should use child_request values. |
351 | 373 | child_request.py_parent_request_id = self.py_request_id |
352 | 374 | child_request.py_request_id = child_request.request_id |
353 | | - child_request.py_llm_request_type = child_request.llm_request_type |
354 | | - child_request.py_end_id = child_request.end_id |
355 | | - child_request.py_prompt_len = child_request.prompt_len |
356 | | - child_request.py_orig_prompt_len = child_request.orig_prompt_len |
357 | | - child_request.py_max_new_tokens = child_request.max_new_tokens |
358 | | - |
359 | | - # Copy Python-specific configuration from parent |
360 | | - child_request.py_return_log_probs = self.py_return_log_probs |
361 | | - child_request.py_return_context_logits = self.py_return_context_logits |
362 | | - child_request.py_return_generation_logits = self.py_return_generation_logits |
363 | | - child_request.py_return_logits_device_memory = self.py_return_logits_device_memory |
364 | | - child_request.py_exclude_last_generation_logits = self.py_exclude_last_generation_logits |
365 | | - child_request.py_stop_words_list = self.py_stop_words_list |
366 | | - child_request.py_logits_post_processors = self.py_logits_post_processors |
367 | | - child_request.py_rewind_len = self.py_rewind_len |
368 | | - child_request.py_decoding_iter = self.py_decoding_iter |
369 | | - child_request.py_draft_tokens = self.py_draft_tokens.copy( |
370 | | - ) if self.py_draft_tokens else [] |
371 | | - child_request.py_last_draft_tokens = self.py_last_draft_tokens.copy( |
372 | | - ) if self.py_last_draft_tokens else None |
373 | | - child_request.py_num_accepted_draft_tokens = self.py_num_accepted_draft_tokens |
374 | | - child_request.py_lora_task_layer_module_configs = self.py_lora_task_layer_module_configs |
375 | | - |
376 | | - # Initialize Python-specific runtime state |
377 | 375 | child_request.py_batch_idx = None |
378 | 376 | child_request.is_attention_dp_dummy = self.is_attention_dp_dummy |
379 | 377 | child_request.is_cuda_graph_dummy = self.is_cuda_graph_dummy |
380 | | - |
381 | | - # Create PyResult for child |
382 | | - child_request.py_result = PyResult( |
383 | | - prompt_len=child_request.py_prompt_len, |
384 | | - max_new_tokens=child_request.py_max_new_tokens, |
385 | | - use_device_memory=self.py_return_logits_device_memory, |
386 | | - streaming=child_request.streaming, |
387 | | - return_log_probs=self.py_return_log_probs, |
388 | | - return_context_logits=self.py_return_context_logits, |
389 | | - return_generation_logits=self.py_return_generation_logits, |
390 | | - exclude_last_generation_logits=self. |
391 | | - py_exclude_last_generation_logits) |
392 | | - |
393 | | - # Note: The below mimics the behavior of the original LlmRequest. |
394 | | - |
395 | | - # We need to ensure the child request behaves like the parent |
396 | | - # LlmRequest by copying any additional Python-specific attributes that |
397 | | - # might be needed for proper request handling and response generation. |
398 | 378 | child_request.is_dummy = self.is_dummy |
399 | 379 |
|
400 | 380 | # Override create_response to return the child request |
|
0 commit comments