Skip to content

Commit 54ec2c1

Browse files
authored
[None][opt] Add batch wait timeout in fetching requests (#6923)
Signed-off-by: Shunkang <[email protected]>
1 parent 636c622 commit 54ec2c1

File tree

7 files changed

+120
-2
lines changed

7 files changed

+120
-2
lines changed

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ def __init__(
136136
self.pytorch_backend_config.attention_dp_enable_balance = False
137137
self.pytorch_backend_config.attention_dp_time_out_iters = 50
138138
self.pytorch_backend_config.attention_dp_batching_wait_iters = 10
139+
self.pytorch_backend_config.batch_wait_timeout_ms = 0
139140
self.iter_counter = 0
140141

141142
# NOTE (lucaslie): not a declared base member in the base class; required by PyExecutor...

tensorrt_llm/_torch/pyexecutor/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ class PyTorchConfig:
5050
attention_dp_time_out_iters: int = 50
5151
attention_dp_batching_wait_iters: int = 10
5252

53+
batch_wait_timeout_ms: float = 0
54+
5355
attn_backend: str = 'TRTLLM'
5456
moe_backend: str = 'CUTLASS'
5557

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,13 @@ class ExecutorRequestQueue:
4545
def __init__(self, dist: Distributed, enable_attention_dp: bool,
4646
max_batch_size: int, max_beam_width: int,
4747
max_num_active_requests: int, enable_iter_perf_stats: bool,
48-
is_disaggregated: bool):
48+
batch_wait_timeout_ms: float, is_disaggregated: bool):
4949
self.dist = dist
5050
self.request_queue: queue.Queue[RequestQueueItem] = queue.Queue()
5151
self.waiting_queue: deque[RequestQueueItem] = deque()
5252
self.canceled_req_ids = []
5353
self.enable_attention_dp = enable_attention_dp
54+
self.max_batch_size = max_batch_size
5455
self.max_beam_width = max_beam_width
5556
self.max_num_active_requests = max_num_active_requests
5657
self.is_disaggregated = is_disaggregated
@@ -59,6 +60,7 @@ def __init__(self, dist: Distributed, enable_attention_dp: bool,
5960
self.enable_iter_perf_stats = enable_iter_perf_stats
6061
self.start_times = {}
6162
self.active = True
63+
self.batch_wait_timeout_ms = batch_wait_timeout_ms
6264

6365
# State tracking
6466
self.num_fetch_requests = 0
@@ -74,6 +76,7 @@ def _get_from_request_queue(
7476

7577
items = []
7678
timeout_secs = timeout.total_seconds() if timeout is not None else None
79+
7780
try:
7881
if self.request_queue.empty() and (timeout_secs is None
7982
or timeout_secs > 0):
@@ -86,6 +89,26 @@ def _get_from_request_queue(
8689
items.append(queue_item)
8790
except queue.Empty:
8891
pass
92+
93+
if self.batch_wait_timeout_ms == 0:
94+
return items
95+
96+
if len(items) >= self.max_batch_size:
97+
return items
98+
99+
deadline = time.monotonic() + self.batch_wait_timeout_ms / 1000.0
100+
while len(items) < self.max_batch_size:
101+
remaining_timeout = deadline - time.monotonic()
102+
103+
if remaining_timeout <= 0:
104+
break
105+
106+
try:
107+
item = self.request_queue.get(timeout=remaining_timeout)
108+
items.append(item)
109+
except queue.Empty:
110+
break
111+
89112
return items
90113

91114
@staticmethod

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ def __init__(self,
186186
self.attention_dp_enable_balance = model_engine.pytorch_backend_config.attention_dp_enable_balance
187187
self.attention_dp_time_out_iters = model_engine.pytorch_backend_config.attention_dp_time_out_iters
188188
self.attention_dp_batching_wait_iters = model_engine.pytorch_backend_config.attention_dp_batching_wait_iters
189+
self.batch_wait_timeout_ms = model_engine.pytorch_backend_config.batch_wait_timeout_ms
189190
self.num_fetch_requests_cur_rank = 0
190191
self.num_fetch_requests = 0
191192
self.shutdown_event = threading.Event()
@@ -239,6 +240,7 @@ def __init__(self,
239240
max_beam_width=self.max_beam_width,
240241
max_num_active_requests=self.max_num_active_requests,
241242
enable_iter_perf_stats=self.enable_iter_perf_stats,
243+
batch_wait_timeout_ms=self.batch_wait_timeout_ms,
242244
is_disaggregated=kv_cache_transceiver is not None,
243245
)
244246
self.executor_request_queue.set_exclude_last_generation_logits(

tensorrt_llm/llmapi/llm_args.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2098,6 +2098,12 @@ class TorchLlmArgs(BaseLlmArgs):
20982098
description="Print iteration logs.",
20992099
status="beta")
21002100

2101+
batch_wait_timeout_ms: float = Field(
2102+
default=0,
2103+
description=
2104+
"If greater than 0, the request queue might wait up to batch_wait_timeout_ms to receive max_batch_size requests, if fewer than max_batch_size requests are currently available. If 0, no waiting occurs.",
2105+
status="prototype")
2106+
21012107
torch_compile_config: Optional[TorchCompileConfig] = Field(
21022108
default=None, description="Torch compile config.", status="prototype")
21032109

@@ -2344,6 +2350,13 @@ def validate_attention_dp_config(self) -> 'TorchLlmArgs':
23442350
)
23452351
return self
23462352

2353+
@model_validator(mode='after')
def validate_batch_wait_timeout_ms(self) -> 'TorchLlmArgs':
    """Validate that batch_wait_timeout_ms is non-negative.

    Returns:
        The validated TorchLlmArgs instance.

    Raises:
        ValueError: If batch_wait_timeout_ms is negative.
    """
    # 0 is a valid value (it disables batch waiting per the field
    # description), so only negative values are rejected — the error
    # message must say "greater than or equal to 0", not "greater than 0".
    if self.batch_wait_timeout_ms < 0:
        raise ValueError(
            "batch_wait_timeout_ms must be greater than or equal to 0")
    return self
23472360
# TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
23482361
def get_pytorch_backend_config(self) -> "PyTorchConfig":
23492362
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
@@ -2409,7 +2422,8 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
24092422
AttentionDpConfig.model_fields['timeout_iters'].default,
24102423
attention_dp_batching_wait_iters=self.attention_dp_config.
24112424
batching_wait_iters if self.attention_dp_config is not None else
2412-
AttentionDpConfig.model_fields['batching_wait_iters'].default)
2425+
AttentionDpConfig.model_fields['batching_wait_iters'].default,
2426+
batch_wait_timeout_ms=self.batch_wait_timeout_ms)
24132427

24142428

24152429
def update_llm_args_with_extra_dict(

tests/unittest/_torch/test_executor_request_queue.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ def executor_queue(mock_dist):
4040
max_beam_width=1,
4141
max_num_active_requests=16,
4242
enable_iter_perf_stats=True,
43+
batch_wait_timeout_ms=0.0,
4344
is_disaggregated=False)
4445

4546

@@ -52,6 +53,7 @@ def integration_queue(mock_dist):
5253
max_beam_width=2,
5354
max_num_active_requests=8,
5455
enable_iter_perf_stats=True,
56+
batch_wait_timeout_ms=0.0,
5557
is_disaggregated=False)
5658

5759

@@ -215,6 +217,75 @@ def test_get_from_request_queue_with_timeout(executor_queue):
215217
assert elapsed < 0.2 # Should finish within timeout
216218

217219

220+
def test_get_from_request_queue_async_behavior(executor_queue):
    """Requests arriving after the fetch begins are only collected when
    batch_wait_timeout_ms lets _get_from_request_queue keep waiting."""
    import threading

    def enqueue_after(delay, count):
        # Sleep *delay* seconds, then push *count* requests with ids 10, 11, ...
        time.sleep(delay)
        for offset in range(count):
            executor_queue.request_queue.put(
                RequestQueueItem(offset + 10, Mock()))

    # --- Case 1: timeout disabled -> only already-queued requests come back.
    executor_queue.batch_wait_timeout_ms = 0.0

    num_initial = 3
    for req_id in range(num_initial):
        executor_queue.request_queue.put(RequestQueueItem(req_id, Mock()))

    producer = threading.Thread(target=enqueue_after, args=(0.05, 2))
    producer.start()

    # Fetch immediately — the delayed requests must not be included.
    began = time.time()
    fetched = executor_queue._get_from_request_queue(None)
    took = time.time() - began

    assert len(fetched) == num_initial
    assert took < 0.1
    assert all(req.id < 10 for req in fetched)

    producer.join()

    # --- Case 2: timeout enabled -> the fetch waits for late arrivals.
    executor_queue.batch_wait_timeout_ms = 200.0

    # Drain whatever Case 1's producer left behind before re-seeding.
    while not executor_queue.request_queue.empty():
        try:
            executor_queue.request_queue.get_nowait()
        except queue.Empty:
            break

    num_initial = 2
    for req_id in range(num_initial):
        executor_queue.request_queue.put(RequestQueueItem(req_id + 20, Mock()))

    producer = threading.Thread(target=enqueue_after, args=(0.05, 3))
    producer.start()

    # Fetch with the timeout active — should block long enough to pick up
    # both the immediate and the delayed requests.
    began = time.time()
    fetched = executor_queue._get_from_request_queue(None)
    took = time.time() - began

    assert len(fetched) == num_initial + 3
    assert 0.05 <= took < 0.3

    assert len({req.id for req in fetched if 20 <= req.id < 30}) == num_initial
    assert len({req.id for req in fetched if 10 <= req.id < 20}) == 3

    producer.join()
218289
def test_get_from_waiting_queue(executor_queue):
219290
"""Test getting items from waiting queue."""
220291
# Add items to waiting queue
@@ -371,6 +442,7 @@ def attention_dp_queue(mock_dist_attention_dp):
371442
max_beam_width=2,
372443
max_num_active_requests=8,
373444
enable_iter_perf_stats=True,
445+
batch_wait_timeout_ms=0.0,
374446
is_disaggregated=False)
375447
# Initialize all_ranks_num_active_requests
376448
return queue

tests/unittest/api_stability/references/llm.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,10 @@ methods:
123123
annotation: bool
124124
default: False
125125
status: prototype
126+
batch_wait_timeout_ms:
127+
annotation: float
128+
default: 0
129+
status: prototype
126130
print_iter_log:
127131
annotation: bool
128132
default: False

0 commit comments

Comments
 (0)