Commit bb5b16f

feat: Return context response immediately when stream_interval > 1 (#5836)
Signed-off-by: Kaiyu Xie <[email protected]>
1 parent: 3079e8c · commit: bb5b16f

File tree

2 files changed, +36 -1 lines changed
tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 2 additions & 1 deletion

@@ -2037,7 +2037,8 @@ def _handle_responses(self):
                 request.update_perf_metrics(self.model_engine.iter_counter)
 
             request_done = False
-            if self.model_engine.iter_counter % self.stream_interval == 0 or request.is_finished:
+            if request.py_decoding_iter == 1 or request.is_finished or \
+                    request.py_decoding_iter % self.stream_interval == 0:
                 response = request.create_response(False, self.dist.rank)
                 if response:
                     request_done = response.result.is_final
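
In effect, the old condition gated streaming responses on the engine-wide iteration counter, so a request's first (context-phase) response could wait up to stream_interval iterations before being returned; the new condition keys off the per-request decoding iteration and always emits the first and final responses. A minimal sketch of the new gating, with stand-in names (decoding_iter, stream_interval, finished) for request.py_decoding_iter, self.stream_interval, and request.is_finished rather than the executor's own state:

# Hedged sketch of the gating introduced above, not the executor code itself.
def should_emit_response(decoding_iter: int, stream_interval: int, finished: bool) -> bool:
    # Always emit the first response (the context-phase result) and the final one;
    # in between, emit every `stream_interval` decoding iterations.
    return decoding_iter == 1 or finished or decoding_iter % stream_interval == 0

# With stream_interval=4 and 13 generated tokens, responses land at decoding
# iterations 1, 4, 8, 12, and 13, i.e. per-step token deltas of [1, 3, 4, 4, 1],
# which is exactly what the new test below asserts.
emitted = [i for i in range(1, 14) if should_emit_response(i, 4, finished=(i == 13))]
assert emitted == [1, 4, 8, 12, 13]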

tests/unittest/llmapi/test_llm.py

Lines changed: 34 additions & 0 deletions

@@ -534,6 +534,40 @@ async def main():
     test_non_streaming_usage_wait()
 
 
+@pytest.mark.parametrize("chunked", [True, False])
+@pytest.mark.part0
+def test_llm_generate_async_with_stream_interval(chunked):
+    model_path = f"{llm_models_root()}/nvfp4-quantized/Meta-Llama-3.1-8B"
+    max_num_tokens = 256
+    with LLM_torch(model_path,
+                   max_num_tokens=max_num_tokens,
+                   stream_interval=4,
+                   enable_chunked_prefill=chunked) as llm:
+        sampling_params = SamplingParams(max_tokens=13,
+                                         ignore_eos=True,
+                                         detokenize=False)
+        step = 0
+        last_step_len = 0
+        prompt = "The capital of France is "
+        if chunked:
+            prompt = prompt * max_num_tokens
+        for output in llm.generate_async(prompt,
+                                         sampling_params=sampling_params,
+                                         streaming=True):
+            current_step_len = len(output.outputs[0].token_ids)
+            # The output lens of each step need to be [1, 3, 4, 4, 1]
+            if step == 0:
+                assert current_step_len == 1
+            elif step == 1:
+                assert current_step_len - last_step_len == 3
+            elif step == 2 or step == 3:
+                assert current_step_len - last_step_len == 4
+            else:
+                assert current_step_len - last_step_len == 1
+            step += 1
+            last_step_len = current_step_len
+
+
 @pytest.fixture(scope="module")
 def llm_for_sampling_params():
     build_config = BuildConfig(max_beam_width=3)
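
For orientation, here is a minimal consumer of the same streaming API, restricted to the call pattern the test exercises (LLM_torch with stream_interval, SamplingParams, generate_async with streaming=True). The import paths, model path, and prompt are illustrative assumptions, not part of the commit:

# Usage sketch based only on the API surface exercised by the test above.
# The import paths and argument values here are assumptions for illustration.
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM as LLM_torch  # assumed import path for the PyTorch-backend LLM

def stream_with_interval(model_path: str, prompt: str) -> None:
    with LLM_torch(model_path, stream_interval=4) as llm:
        params = SamplingParams(max_tokens=13, ignore_eos=True, detokenize=False)
        last_len = 0
        for output in llm.generate_async(prompt, sampling_params=params, streaming=True):
            token_ids = output.outputs[0].token_ids
            # Each streamed update carries all tokens generated so far; with this
            # commit the first update arrives right after the context phase instead
            # of waiting for a full stream_interval of engine iterations.
            print(f"step delta: {len(token_ids) - last_len} tokens")
            last_len = len(token_ids)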
