Commit 653ba46

[Core] Add back ray queue to put request output tokens back to the api server (#41)
KuilongCui authored Oct 9, 2024
1 parent e9cf870 commit 653ba46
Showing 38 changed files with 364 additions and 157 deletions.
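Note: the gist of the change is that request outputs generated on any llumlet instance are put onto a queue the API server can read, and a Ray-backed queue is restored as one transport for that path (selected via the new QUEUE_TYPE / QueueType plumbing below). A minimal sketch of that producer/consumer shape with ray.util.queue.Queue follows; it is illustrative only, since llumnix wraps this in RayQueueServer and a matching client.

# Minimal sketch of the output path this commit restores: engine actors
# put request outputs on a shared Ray queue; the API server drains it.
# Illustrative only; llumnix wraps this in RayQueueServer and a client.
import ray
from ray.util.queue import Queue as RayQueue

ray.init()
request_output_queue = RayQueue()  # shared, actor-backed queue

@ray.remote
def engine_step(queue, token):
    queue.put(token)  # producer side: an engine instance puts output tokens

ray.get([engine_step.remote(request_output_queue, f"token-{i}") for i in range(3)])
while not request_output_queue.empty():
    print(request_output_queue.get())  # consumer side: the API server
ray.shutdown()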
3 changes: 0 additions & 3 deletions .github/workflows/bench_test.yml
@@ -1,9 +1,6 @@
 name: bench_test

 on:
-  push:
-    branches:
-      - main
   pull_request:
     branches:
       - main
3 changes: 0 additions & 3 deletions .github/workflows/e2e_test.yml
@@ -1,9 +1,6 @@
 name: e2e_test

 on:
-  push:
-    branches:
-      - main
   pull_request:
     branches:
       - main
3 changes: 0 additions & 3 deletions .github/workflows/migration_test.yml
@@ -1,9 +1,6 @@
 name: migration_test

 on:
-  push:
-    branches:
-      - main
   pull_request:
     branches:
       - main
3 changes: 0 additions & 3 deletions .github/workflows/offline_inference.yml
@@ -1,9 +1,6 @@
 name: offline_inference

 on:
-  push:
-    branches:
-      - main
   pull_request:
     branches:
       - main
3 changes: 0 additions & 3 deletions .github/workflows/pylint.yml
@@ -1,9 +1,6 @@
 name: pylint

 on:
-  push:
-    branches:
-      - main
   pull_request:
     branches:
       - main
3 changes: 0 additions & 3 deletions .github/workflows/unit_test.yml
@@ -1,9 +1,6 @@
 name: unit_test

 on:
-  push:
-    branches:
-      - main
   pull_request:
     branches:
       - main
3 changes: 0 additions & 3 deletions .github/workflows/whl.yml → .github/workflows/whl_build.yml
@@ -1,9 +1,6 @@
 name: whl_build

 on:
-  push:
-    branches:
-      - main
   pull_request:
     branches:
       - main
3 changes: 2 additions & 1 deletion configs/base.yml
@@ -1,6 +1,7 @@
 SERVER:
   HOST: '127.0.0.1'
   PORT: 37000
+  QUEUE_TYPE: "rayqueue"

 RAY:
   RAY_CLUSTER_PORT: 30037
@@ -19,7 +20,7 @@ MANAGER:
   ENABLE_DEFRAG: True
   REQUEST_MIGRATION_POLICY: 'SJF'

-  MIGRATION_BACKEND: 'rpc'
+  MIGRATION_BACKEND: 'gloo'
   MIGRATION_CACHE_BLOCKS: 512

   ENABLE_SCALING: False
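Note: the two knobs this diff touches can be read straight out of the YAML. A hedged sketch with PyYAML; the valid-value sets in the asserts are assumptions, not something this commit states.

# Sketch: reading the two settings this diff changes, with PyYAML only.
# The allowed-value sets below are assumptions, not from this commit.
import yaml

with open("configs/base.yml") as f:
    cfg = yaml.safe_load(f)

queue_type = cfg["SERVER"]["QUEUE_TYPE"]                 # "rayqueue"
migration_backend = cfg["MANAGER"]["MIGRATION_BACKEND"]  # "gloo"
assert queue_type in ("rayqueue", "zmq")                 # assumed choices
assert migration_backend in ("gloo", "rpc")              # assumed choices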
27 changes: 8 additions & 19 deletions examlpes/offline_inference.py
@@ -1,21 +1,14 @@
 from typing import List
 import os
 import uuid
 import asyncio

 import ray
-from ray.util.queue import Queue as RayQueue
-from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
-
 from llumnix import launch_ray_cluster, connect_to_ray_cluster, init_manager, init_llumlets
 from llumnix import (SamplingParams, ServerInfo, EngineManagerArgs, LLMEngineManager, Llumlet,
-                     EngineArgs, RequestOutput)
+                     EngineArgs, QueueType)
 from llumnix.utils import random_uuid
-from llumnix.rpc.queue_server import QueueServer
-from llumnix.rpc.queue_client import QueueClient
-from llumnix.rpc.utils import get_open_zmq_ipc_path
-from llumnix.entrypoints.llumnix_utils import get_ip_address
-
+from llumnix.queue.ray_queue_server import RayQueueServer

 # Sample prompts.
 prompts = [
@@ -45,8 +38,10 @@
 # Create llumlets.
 llumlet_ids: List[str] = None
 llumlets: List[Llumlet] = None
-llumlet_ids, llumlets = init_llumlets(manager_args, engine_args,
-                                      node_id=ray.get_runtime_context().get_node_id())
+llumlet_ids, llumlets = init_llumlets(
+    manager_args, engine_args, ray.get_runtime_context().get_node_id(),
+    QueueType("rayqueue")
+)


 # Create a manager. If the manager is created first, and then the llumlets are created, manager.scale_up
@@ -55,11 +50,8 @@

 # The requests' outputs will be put to the request_output_queue no matter which instance it's running in.
 server_id = random_uuid()
-ip = get_ip_address()
-port = 1234
-server_info = ServerInfo(server_id, ip, port)
-rpc_path = get_open_zmq_ipc_path(server_info.request_output_queue_ip, server_info.request_output_queue_port)
-request_output_queue = QueueServer(rpc_path)
+request_output_queue = RayQueueServer()
+server_info = ServerInfo(server_id, QueueType("rayqueue"), request_output_queue, None, None)

 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
@@ -94,9 +86,6 @@ async def main():
 for actor in named_actors:
     try:
         actor_handle = ray.get_actor(actor['name'], namespace=actor['namespace'])
-    except:
-        continue
-    try:
         ray.kill(actor_handle)
     except:
         continue
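Note: on the consuming side, the example's server now drains a RayQueueServer instead of a zmq QueueServer. A hedged sketch of what that loop could look like; treating get() as an awaitable is an assumption layered on vLLM's documented RequestOutput shape.

# Hedged sketch of the consumer loop; assumes RayQueueServer exposes an
# async get() and that queued items are vLLM RequestOutput objects.
import asyncio

async def drain_outputs(request_output_queue, num_requests: int):
    finished = 0
    while finished < num_requests:
        request_output = await request_output_queue.get()  # assumed API
        if request_output.finished:
            finished += 1
            print(request_output.outputs[0].text)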
4 changes: 3 additions & 1 deletion llumnix/__init__.py
@@ -20,6 +20,7 @@
 from llumnix.arg_utils import EngineManagerArgs
 from llumnix.llm_engine_manager import LLMEngineManager
 from llumnix.llumlet.llumlet import Llumlet
+from llumnix.queue.queue_type import QueueType

 from .version import __version__

@@ -32,7 +33,8 @@
     "init_llumlets",
     "EngineManagerArgs",
     "LLMEngineManager",
-    "Llumlet"
+    "Llumlet",
+    "QueueType",
 ]

 __all__.extend(getattr(vllm, "__all__", []))
13 changes: 7 additions & 6 deletions llumnix/backends/utils.py
@@ -12,30 +12,31 @@
 # limitations under the License.

 from typing import Optional, Tuple

 import ray
 # pylint: disable=unused-import
 from ray.util.placement_group import PlacementGroup

 from llumnix.backends.backend_interface import BackendInterface, BackendType
+from llumnix.queue.queue_type import QueueType

-
-def init_backend_engine(instance_id: str, backend_type: BackendType, *args, **kwargs) -> BackendInterface:
+def init_backend_engine(instance_id: str, output_queue_type: QueueType,
+                        backend_type: BackendType, *args, **kwargs) -> BackendInterface:
     if backend_type == BackendType.VLLM:
         # pylint: disable=import-outside-toplevel
         from llumnix.backends.vllm.llm_engine import BackendVLLM
-        backend_engine = BackendVLLM(instance_id, *args, **kwargs)
+        backend_engine = BackendVLLM(instance_id, output_queue_type, *args, **kwargs)
     elif backend_type == BackendType.SIM_VLLM:
         # pylint: disable=import-outside-toplevel
         from llumnix.backends.vllm.simulator import BackendSimVLLM
-        backend_engine = BackendSimVLLM(instance_id, *args, **kwargs)
+        backend_engine = BackendSimVLLM(instance_id, output_queue_type, *args, **kwargs)
     else:
         raise ValueError(f'Unsupported backend: {backend_type}')
     return backend_engine

 def initialize_placement_group(
     world_size: int = 1,
     detached: bool = False
-) -> Tuple[str, Optional["PlacementGroup"]]:
+) -> Tuple[str, Optional[PlacementGroup]]:
     """Initialize the distributed cluster probably with Ray.

     Args:
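Note: callers of the factory now thread the queue type through as the second argument. A hedged sketch; only the argument order shown in the diff is from this commit, and the trailing arguments are assumptions about what *args forwards to the backend.

# Hedged sketch of calling the updated factory.
from llumnix.backends.backend_interface import BackendType
from llumnix.backends.utils import init_backend_engine
from llumnix.queue.queue_type import QueueType

def build_backend(instance_id, migration_config, engine_args):
    # output_queue_type now comes right after instance_id (per this commit);
    # the remaining arguments are forwarded to the chosen backend via *args.
    return init_backend_engine(instance_id, QueueType("rayqueue"),
                               BackendType.VLLM, migration_config, engine_args)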
17 changes: 12 additions & 5 deletions llumnix/backends/vllm/llm_engine.py
@@ -35,16 +35,19 @@
 from llumnix.backends.profiling import LatencyMemData
 from llumnix.server_info import ServerInfo
 from llumnix.internal_config import MigrationConfig
-from llumnix.rpc.queue_client import QueueClient
+from llumnix.queue.queue_client_base import QueueClientBase
+from llumnix.queue.utils import get_output_queue_client, QueueType

 logger = init_logger(__name__)


 class AsyncPutQueueThread(threading.Thread):
-    def __init__(self, instance_id):
+    def __init__(self, instance_id, output_queue_type: QueueType):
         super().__init__()
         self.instance_id = instance_id
-        self.request_output_queue_client = QueueClient()
+
+        self.request_output_queue_client: QueueClientBase \
+            = get_output_queue_client(output_queue_type)
         self.engine_actor_handle = None
         self.loop = asyncio.new_event_loop()
         self.daemon = True
@@ -82,20 +85,21 @@ def put_nowait_batch_to_servers(self,


 class LLMEngineLlumnix(LLMEngine):
-    def __init__(self, instance_id: str, *arg, **kwargs) -> None:
+    def __init__(self, instance_id: str, output_queue_type: QueueType, *arg, **kwargs) -> None:
         super().__init__(*arg, **kwargs)
         self.instance_id = instance_id
         self.step_counter = Counter()
         self.instance_info = None
         # TODO(s5u13b): Reduce the overhead.
-        self.async_put_queue_thread = AsyncPutQueueThread(instance_id)
+        self.async_put_queue_thread = AsyncPutQueueThread(instance_id, output_queue_type)
         self.async_put_queue_thread.start()

     # pylint: disable=W0221
     @classmethod
     def from_engine_args(
         cls,
         engine_args: EngineArgs,
+        output_queue_type: QueueType,
         migration_config: MigrationConfig,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
         instance_id: str = None,
@@ -124,6 +128,7 @@ def from_engine_args(
         # Create the LLM engine.
         engine = cls(
             instance_id=instance_id,
+            output_queue_type=output_queue_type,
             **engine_config.to_dict(),
             executor_class=executor_class,
             log_stats=not engine_args.disable_log_stats,
@@ -217,12 +222,14 @@ class BackendVLLM(BackendInterface):
     def __init__(
         self,
         instance_id: str,
+        output_queue_type: QueueType,
         migration_config: MigrationConfig,
         engine_args: EngineArgs,
         placement_group: PlacementGroup = None,
         node_id: str = None
     ) -> None:
         self.engine: LLMEngineLlumnix = LLMEngineLlumnix.from_engine_args(engine_args=engine_args,
+                                                                          output_queue_type=output_queue_type,
                                                                           migration_config=migration_config,
                                                                           instance_id=instance_id,
                                                                           placement_group=placement_group,
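Note: the AsyncPutQueueThread change is the heart of the commit. The engine's synchronous step loop hands output batches to a daemon thread that owns a private asyncio loop, so puts to a possibly remote queue never stall token generation. A self-contained sketch of the same pattern using only the standard library; all names here are illustrative, not llumnix APIs.

import asyncio
import threading
import time

class AsyncPutThread(threading.Thread):
    """Daemon thread owning a private asyncio loop, so the producer
    (the engine's synchronous step loop) never blocks on queue puts."""

    def __init__(self):
        super().__init__(daemon=True)
        self.loop = asyncio.new_event_loop()

    def run(self):
        asyncio.set_event_loop(self.loop)
        self.loop.run_forever()

    def put_nowait_batch(self, outputs):
        # Called from the engine thread; schedules work on the loop
        # thread and returns immediately.
        asyncio.run_coroutine_threadsafe(self._put_batch(outputs), self.loop)

    async def _put_batch(self, outputs):
        # Stand-in for an async put to a remote queue (e.g. Ray or zmq).
        await asyncio.sleep(0.01)  # simulated network latency
        print(f"delivered batch of {len(outputs)}")

thread = AsyncPutThread()
thread.start()
for step in range(3):
    thread.put_nowait_batch([f"token-{step}"])  # engine keeps stepping
time.sleep(0.1)  # let the daemon flush before the demo exits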
8 changes: 6 additions & 2 deletions llumnix/backends/vllm/simulator.py
@@ -14,6 +14,7 @@
 import os
 import threading
 from typing import List
+import ray.actor

 from vllm.engine.arg_utils import EngineArgs

@@ -22,7 +23,7 @@
 from llumnix.backends.vllm.scheduler import SchedulerLlumnix
 from llumnix.backends.vllm.llm_engine import LLMEngineLlumnix, BackendVLLM
 from llumnix.backends.profiling import ProfilingDatabase, LatencyMemData, ProfilingResult, SimParallelConfig
-
+from llumnix.queue.queue_type import QueueType

 logger = init_logger(__name__)

@@ -31,6 +32,7 @@ class BackendSimVLLM(BackendVLLM):
     def __init__(
         self,
         instance_id: str,
+        output_queue_type: QueueType,
         migration_config: MigrationConfig,
         profiling_result_file_path: str,
         engine_args: EngineArgs,
@@ -54,6 +56,7 @@ def __init__(
         latency_mem: LatencyMemData = profiling_result.para_dict[sim_parallel_config]
         # multi-instance args
         self.engine: LLMEngineLlumnix = LLMEngineLlumnix.from_engine_args(engine_args=engine_args,
+                                                                          output_queue_type=output_queue_type,
                                                                           migration_config=migration_config,
                                                                           instance_id=instance_id,
                                                                           latency_mem=latency_mem)
@@ -66,5 +69,6 @@ def __init__(
         )
         self._thread.start()

-    def send_blocks(self, dst_ray_actor: "ray.actor.ActorHandle", src_blocks: List[int], dst_blocks: List[int]) -> None:
+    # pylint: disable=unused-argument
+    def send_blocks(self, dst_ray_actor: ray.actor.ActorHandle, src_blocks: List[int], dst_blocks: List[int]) -> None:
         self.engine.model_executor.send_blocks(len(src_blocks))
4 changes: 3 additions & 1 deletion llumnix/config/default.py
@@ -26,6 +26,8 @@
 _C.SERVER.HOST = "localhost"
 # Port number for the server
 _C.SERVER.PORT = 8000
+# Queue type for request output queue
+_C.SERVER.QUEUE_TYPE = "rayqueue"
 # Port number for the request output queue
 _C.SERVER.REQUEST_OUTPUT_QUEUE_PORT = 1234
 # Path to SSL key file for secure connections
@@ -95,7 +97,7 @@
 _C.MANAGER.LAST_STAGE_MAX_BLOCKS = 16

 # Communication backend of migration
-_C.MANAGER.MIGRATION_BACKEND = "rpc"
+_C.MANAGER.MIGRATION_BACKEND = "gloo"
 # Timeout(s) for initializing migration backend
 _C.MANAGER.MIGRATION_BACKEND_INIT_TIMEOUT = 10.0
 # Number of cache blocks in migration
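Note: the _C.SECTION.KEY pattern suggests a yacs-style CfgNode. Assuming that, overriding the new defaults looks like the sketch below; the yacs API is real, but the llumnix wiring around it is an assumption.

# Sketch of overriding the new defaults, assuming a yacs-style CfgNode
# (suggested by the file's _C.SERVER.* / _C.MANAGER.* pattern).
from yacs.config import CfgNode as CN

_C = CN()
_C.SERVER = CN()
_C.SERVER.QUEUE_TYPE = "rayqueue"      # new default from this commit
_C.MANAGER = CN()
_C.MANAGER.MIGRATION_BACKEND = "gloo"  # changed default, was "rpc"

cfg = _C.clone()
cfg.merge_from_list(["MANAGER.MIGRATION_BACKEND", "rpc"])  # opt back into rpc
print(cfg.SERVER.QUEUE_TYPE, cfg.MANAGER.MIGRATION_BACKEND)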
(Diffs for the remaining 24 changed files are not shown.)
