InternLM · lvhan028 · Feb 27, 2024 · Feb 21, 2024 · Feb 22, 2024 · Feb 23, 2024
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
@@ -6,7 +6,7 @@
 import sys
 from configparser import ConfigParser
 from contextlib import contextmanager
-from queue import Queue
+from queue import LifoQueue, Queue
 from threading import Thread
 from typing import Iterable, List, Optional, Union
 
@@ -507,6 +507,27 @@ def _func(device_id, enque_output):
             t.start()
             self.threads[device_id] = t
 
+    def _async_forward_callback(self, result, ctx, que: LifoQueue):
+        que.put((False, result))  # False: consumer reads this flag as "not finished"
+
+    def _async_forward_thread(self, inputs, que: LifoQueue):
+        instance_comm = self.tm_model.model_comm.create_instance_comm(
+            self.gpu_count)  # one comm object, passed to every device's forward
+
+        def _func(device_id, enque_output):
+            with cuda_ctx(device_id):
+                output = self.model_insts[device_id].forward(
+                    inputs, instance_comm)
+                if enque_output:
+                    que.put((True, output))  # True: consumer reads this as "finished"
+
+        for device_id in range(self.gpu_count):
+            t = Thread(target=_func,
+                       args=(device_id, device_id == 0),  # only device 0 enqueues
+                       daemon=True)
+            t.start()
+            self.threads[device_id] = t  # joined by the consumer once finished
+
     def _update_generation_config(self, config: EngineGenerationConfig,
                                   **kwargs: dict):
         if config is None:
@@ -699,8 +720,13 @@ async def async_stream_infer(self,
             stream_output (bool): indicator for stream output
             kwargs (dict): kwargs for backward compatibility
         """
+        # per-request queue, bound into the callback and the thread launcher
+        que = LifoQueue()
+        from functools import partial
+        _forward_callback = partial(self._async_forward_callback, que=que)
+        _forward_thread = partial(self._async_forward_thread, que=que)
         if stream_output and not stop:
-            self.model_insts[0].register_callback(self._forward_callback)
+            self.model_insts[0].register_callback(_forward_callback)
 
         gen_config = self._update_generation_config(gen_config, **kwargs)
         inputs, input_lengths = self.prepare_inputs(
@@ -715,23 +741,17 @@ async def async_stream_infer(self,
             gen_config=gen_config)
 
         tm_inputs = _np_dict_to_tm_dict(inputs)
-        # start forward thread
-        self.que = Queue()
-        self._forward_thread(tm_inputs)
+        _forward_thread(tm_inputs)
 
         seq_start = input_lengths + input_lengths.new_tensor(step)
 
+        prev_len = 0
         # generator
         while True:
-            # Thanks for https://github.com/frankxyy and his issue
-            # https://github.com/InternLM/lmdeploy/issues/832
-            while self.que.qsize() == 0:
-                await asyncio.sleep(0.002)  # sleep(0) makes server unstable
+            while que.qsize() == 0:  # let other requests in
+                await asyncio.sleep(0.002)
 
-            while self.que.qsize() > 1:
-                self.que.get()
-
-            finish, tm_outputs = self.que.get()
+            finish, tm_outputs = que.get()
 
             outputs = _tm_dict_to_torch_dict(tm_outputs)
 
@@ -756,13 +776,15 @@ async def async_stream_infer(self,
                     outputs = (status, output[:-1].tolist(), len_)
                 else:
                     outputs = (status, output.tolist(), len_)
+            if outputs[-1] < prev_len and not finish:
+                continue
+            else:
+                prev_len = outputs[-1]
             yield outputs
 
             if finish:
                 for t in self.threads:
                     t.join()
-                while self.que.qsize() > 0:
-                    self.que.get()
                 break
 
         if stream_output and not stop: