Get the BladeLLM migration worker and its gRPC unit test working

KuilongCui · KuilongCui · commit d05c4cdb21bf · 2024-10-25T03:54:57.000Z
diff --git a/llumnix/__init__.py b/llumnix/__init__.py
@@ -11,8 +11,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import vllm
-from vllm import *
+# import vllm
+# from vllm import *
 
 from llumnix.server_info import ServerInfo
 from llumnix.entrypoints.utils import (launch_ray_cluster,
@@ -39,4 +39,4 @@
     "QueueType",
 ]
 
-__all__.extend(getattr(vllm, "__all__", []))
+# __all__.extend(getattr(vllm, "__all__", []))
diff --git a/llumnix/backends/bladellm/migration_backend.py b/llumnix/backends/bladellm/migration_backend.py
@@ -36,11 +36,10 @@ def __init__(self, worker_address: str, migration_config: MigrationConfig, state
         self.worker_address = worker_address
         self.state_manager = state_manager
         self.num_migration_cache_blocks = migration_config.migration_cache_blocks
-
-        migration_cache_key_shape = state_manager._kv_cache[0].shape
-        migration_cache_key_shape[0] = migration_config.migration_cache_blocks
-        migration_cache_value_shape = state_manager._kv_cache[1].shape
-        migration_cache_value_shape[0] = migration_config.migration_cache_blocks
+        migration_cache_key_shape = list([len(state_manager._kv_cache[0])]) + list(state_manager._kv_cache[0][0].shape)
+        migration_cache_key_shape[1] = migration_config.migration_cache_blocks
+        migration_cache_value_shape = list([len(state_manager._kv_cache[1])]) + list(state_manager._kv_cache[1][0].shape)
+        migration_cache_value_shape[1] = migration_config.migration_cache_blocks
         if state_manager.dtype in NUMPY_SUPPORTED_DTYPES:
             self.migration_cache_dtype = state_manager.dtype
         else:
diff --git a/llumnix/backends/bladellm/proto/migration_worker_pb2_grpc.py b/llumnix/backends/bladellm/proto/migration_worker_pb2_grpc.py
@@ -3,7 +3,7 @@
 import grpc
 
 from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2
-import migration_worker_pb2 as migration__worker__pb2
+import llumnix.backends.bladellm.proto.migration_worker_pb2 as migration__worker__pb2
 
 
 class MigrationWorkerStub(object):
diff --git a/llumnix/backends/bladellm/worker.py b/llumnix/backends/bladellm/worker.py
@@ -30,12 +30,11 @@
 
 logger = init_logger(__name__)
 
-class MigrationWorker(migration_worker_pb2_grpc.LlumnixWorkerServicer, RemoteWorker):
+class MigrationWorker(migration_worker_pb2_grpc.MigrationWorkerServicer, RemoteWorker):
     def __init__(self, instance_id: str, worker_addr: str, migration_config: MigrationConfig,
                  *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-
-        torch.cuda.set_device(args.device)
+        torch.cuda.set_device(args[1].device)
         self.instance_id = instance_id
         self.migration_backend = get_migration_backend(worker_addr, migration_config, self._engine._state_manager)
 
@@ -89,7 +88,7 @@ async def worker_server(rank: int, args: ServingArgs, instance_id: str, migratio
         else f"unix://{args.worker_socket_path}.{instance_id}.{rank}"
     )
 
-    worker = MigrationWorker(rank, args, instance_id, listen_addr, migration_config)
+    worker = MigrationWorker(instance_id, listen_addr, migration_config, rank, args)
 
     server = grpc.aio.server(migration_thread_pool=ThreadPoolExecutor(max_workers=1))
     bladellm_pb2_grpc.add_WorkerServicer_to_server(worker, server)
diff --git a/tests/unit_test/backends/bladellm/proto/mock_migration_worker_pb2_grpc.py b/tests/unit_test/backends/bladellm/proto/mock_migration_worker_pb2_grpc.py
@@ -3,7 +3,7 @@
 import grpc
 
 from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2
-import mock_migration_worker_pb2 as mock__migration__worker__pb2
+import tests.unit_test.backends.bladellm.proto.mock_migration_worker_pb2 as mock__migration__worker__pb2
 
 
 class MockMigrationWorkerStub(object):
diff --git a/tests/unit_test/backends/bladellm/test_migration_backend.py b/tests/unit_test/backends/bladellm/test_migration_backend.py
@@ -11,13 +11,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import time
 from typing import List
 import random
 import asyncio
 import torch
 import pytest
 import grpc
-from multiprocessing import Process
+from multiprocessing import Process, set_start_method
 from concurrent.futures import ThreadPoolExecutor
 from google.protobuf import empty_pb2
 import numpy as np
@@ -30,10 +31,12 @@
 from llumnix.backends.bladellm.proto import migration_worker_pb2_grpc, migration_worker_pb2
 from llumnix.internal_config import MigrationConfig
 from llumnix.utils import random_uuid
+from llumnix.arg_utils import EngineManagerArgs
 
-from .proto import mock_migration_worker_pb2_grpc, mock_migration_worker_pb2
+from tests.unit_test.backends.bladellm.proto import mock_migration_worker_pb2_grpc, mock_migration_worker_pb2
 
-class MockMigrationWorker(mock_migration_worker_pb2_grpc.MockMigrationWorkerServicer, MigrationWorker):
+# class MockMigrationWorker(mock_migration_worker_pb2_grpc.MockMigrationWorkerServicer, MigrationWorker):
+class MockMigrationWorker(MigrationWorker):
     def get_kv_cache_meta(self, request, context):
         return mock_migration_worker_pb2.KvCacheMeta(
             shape=self.migration_backend.dummy_key_cache.shape,
@@ -56,63 +59,68 @@ def get_gpu_cache(self, request, context):
         torch.cuda.synchronize()
         return mock_migration_worker_pb2.KvCacheData(key=key, value=value)
 
-async def worker_main(rank: int, args: ServingArgs, instance_id: str, migration_config: MigrationConfig):
-    asyncio.run(launch_worker(rank, args, instance_id, migration_config))
-
-async def launch_worker(rank: int, args: ServingArgs, instance_id: str, migration_config: MigrationConfig):
-    listen_addr = f"unix://{args.worker_socket_path}.{instance_id}.{rank}"
-    worker = MockMigrationWorker(rank, args, instance_id, listen_addr, migration_config)
+def worker_main(listen_addr: str, rank: int, args: ServingArgs, instance_id: str, migration_config: MigrationConfig):
+    asyncio.run(launch_worker(listen_addr, rank, args, instance_id, migration_config))
 
+async def launch_worker(listen_addr: str, rank: int, args: ServingArgs, instance_id: str, migration_config: MigrationConfig):
+    worker = MockMigrationWorker(instance_id, listen_addr, migration_config, rank, args)
     server = grpc.aio.server(migration_thread_pool=ThreadPoolExecutor(max_workers=1))
     bladellm_pb2_grpc.add_WorkerServicer_to_server(worker, server)
     migration_worker_pb2_grpc.add_MigrationWorkerServicer_to_server(worker, server)
-    mock_migration_worker_pb2_grpc.add_MockMigrationWorkerServicer_to_server(worker, server)
+    # mock_migration_worker_pb2_grpc.add_MockMigrationWorkerServicer_to_server(worker, server)
     server.add_insecure_port(listen_addr)
+    print(f"Starting server on {listen_addr}")
     await server.start()
+    print(f"Server running on {listen_addr}")
     await server.wait_for_termination()
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPU to run the test.")
 @pytest.mark.parametrize("backend", ['grpc'])
 def test_migrate_cache(backend):
     worker_count = 2
     worker_args = ServingArgs(
-        max_gpu_memory_utilization=0.1,
+        max_gpu_memory_utilization=0.5,
         block_size=3,
         load_model_options=LoadModelOptions(
-            model='/mnt/self-hosted/model/Qwen2.5-7B', attn_cls="paged", disable_cuda_graph=True
+            model='facebook/opt-125m', attn_cls="paged", disable_cuda_graph=True
         )
     )
-    worker_socket_addr = []
-    migration_config = MigrationConfig(migration_backend=backend, migration_cache_blocks=8)
+    
+    worker_socket_addrs = []
+    migration_config = EngineManagerArgs(migration_backend=backend, migration_cache_blocks=8).create_migration_config()
 
+    set_start_method("spawn", force=True)
     backends: List[Process] = []
     for i in range(worker_count):
         instance_id = random_uuid()
-        p = Process(target=worker_main, args=(0, worker_args, instance_id, migration_config))
-        worker_socket_addr.append(f"unix://{worker_args.worker_socket_path}.{instance_id}.{0}")
+        worker_args.device=f"cuda:{i}"
+        worker_socket_addrs.append(f"localhost:{1234+i}")
+
+        p = Process(target=worker_main, args=(worker_socket_addrs[-1], i, worker_args, instance_id, migration_config))
         p.start()
         backends.append(p)
 
     # assert all(ray.get([worker0.execute_method.remote('rebuild_migration_backend',
     #                                     instance_rank=instance_rank, group_name=group_name),
     #                     worker1.execute_method.remote('rebuild_migration_backend',
     #                                     instance_rank=instance_rank, group_name=group_name)]))
+    time.sleep(5)
 
     for i in range(worker_count):
-        with grpc.insecure_channel(worker_socket_addr[i]) as channel:
+        with grpc.insecure_channel(worker_socket_addrs[i]) as channel:
             stub = migration_worker_pb2_grpc.MigrationWorkerStub(channel)
             responce = stub.warmup(empty_pb2.Empty())
             assert responce.ok
 
-    with grpc.insecure_channel(worker_socket_addr[0]) as channel:
+    with grpc.insecure_channel(worker_socket_addrs[0]) as channel:
         stub = mock_migration_worker_pb2_grpc.MockMigrationWorkerStub(channel)
         responce = stub.get_kv_cache_meta(empty_pb2.Empty())
         kv_cache_shape, dtype, total_gpu_blocks = responce.shape, responce.dtype, responce.num_gpu_blocks
-    
+
     dummy_key_data = torch.randn(size=kv_cache_shape, dtype=dtype)
     dummy_value_data = torch.randn(size=kv_cache_shape, dtype=dtype)
 
-    with grpc.insecure_channel(worker_socket_addr[0]) as channel:
+    with grpc.insecure_channel(worker_socket_addrs[0]) as channel:
         stub = mock_migration_worker_pb2_grpc.MockMigrationWorkerStub(channel)
         responce = stub.set_gpu_cache(mock_migration_worker_pb2.KvCacheData(
             key=dummy_key_data.numpy().tobytes(),
@@ -126,15 +134,15 @@ def test_migrate_cache(backend):
     dst_blocks = list(range(total_gpu_blocks))
     random.shuffle(dst_blocks)
 
-    with grpc.insecure_channel(worker_socket_addr[1]) as channel:
+    with grpc.insecure_channel(worker_socket_addrs[1]) as channel:
         src_stub = migration_worker_pb2_grpc.MigrationWorkerStub(channel)
         src_stub.migrate_cache(migration_worker_pb2.MigrateRequest(
-            src_handlers=[None, worker_socket_addr[0]],
+            src_handlers=[None, worker_socket_addrs[0]],
             src_blocks=list(range(total_gpu_blocks)),
             dst_blocks=dst_blocks,
         ))
 
-    with grpc.insecure_channel(worker_socket_addr[1]) as channel:
+    with grpc.insecure_channel(worker_socket_addrs[1]) as channel:
         stub = mock_migration_worker_pb2_grpc.MockMigrationWorkerStub(channel)
         responce = stub.get_gpu_cache(empty_pb2.Empty())
         worker1_key_data = torch.from_numpy(np.frombuffer(responce.key, dtype=dtype))