From 9512f594d2875d6501fe3dcd9162f313c78acb73 Mon Sep 17 00:00:00 2001
From: ZedalHuang <1520787127@qq.com>
Date: Thu, 25 Jul 2024 15:06:19 +0800
Subject: [PATCH] use await in _background_process_outputs to improve
 api_server throughput

---
 llumnix/entrypoints/vllm/api_server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llumnix/entrypoints/vllm/api_server.py b/llumnix/entrypoints/vllm/api_server.py
index 21663cd9..344b879b 100644
--- a/llumnix/entrypoints/vllm/api_server.py
+++ b/llumnix/entrypoints/vllm/api_server.py
@@ -49,7 +49,8 @@
 
 async def _background_process_outputs():
     while True:
-        request_outputs = request_output_queue.get_nowait_batch(num_items=request_output_queue.qsize())
+        qsize = await request_output_queue.actor.qsize.remote()
+        request_outputs = await request_output_queue.actor.get_nowait_batch.remote(qsize)
         for request_output in request_outputs:
             request_id = request_output.request_id
             # Request could be dispatched twice when manager is dead, the first request will free the request_streams when finished.
@@ -59,7 +60,6 @@ async def _background_process_outputs():
             if request_output.finished:
                 request_streams[request_id].finish()
                 del request_streams[request_id]
-        await asyncio.sleep(0.01)
 
 # pylint: disable=unused-argument
 @asynccontextmanager