[WIP] upgrade vllm to v0.6.3.post1 #69

Open · wants to merge 19 commits into base: main
9 changes: 7 additions & 2 deletions Makefile
@@ -22,7 +22,7 @@ install:
.PHONY: lint
lint: check_pylint_installed check_pytest_installed
	@pylint --rcfile=.pylintrc -s n --jobs=128 ./llumnix

	@pylint --rcfile=.pylintrc \
		--disable=protected-access,super-init-not-called,unused-argument,redefined-outer-name,invalid-name \
		-s n --jobs=128 ./tests
@@ -61,22 +61,27 @@ test: check_pytest_installed

.PHONY: unit_test
unit_test: check_pytest_installed
+	@ray stop
	@pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings

.PHONY: offline_test
offline_test:
+	@ray stop
	@python examlpes/offline_inference.py

.PHONY: e2e_test
e2e_test:
+	@ray stop
	@pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py

.PHONY: bench_test
bench_test:
+	@ray stop
	@pytest -v -x -s --tb=long ./tests/e2e_test/test_bench.py

.PHONY: migration_test
migration_test:
+	@ray stop
	@pytest -v -x -s --tb=long ./tests/e2e_test/test_migration.py

####################################### test end ########################################
16 changes: 7 additions & 9 deletions benchmark/benchmark_serving.py
@@ -84,14 +84,12 @@ async def query_model_vllm(prompt, verbose, ip_ports):

    async with aiohttp.ClientSession(timeout=timeout) as session:
        best_of = 1
-        use_beam_search = False
        output_len = expected_response_len
        request_dict = {
            "prompt": prompt,
            "n": 1,
            "best_of": best_of,
-            "use_beam_search": use_beam_search,
-            "temperature": 0.0 if use_beam_search else 1.0,
+            "temperature": 1.0,
            "top_k": 1,
            "max_tokens": max(output_len, 1),
            "ignore_eos": True,
@@ -815,18 +813,18 @@ def main():
    except FileNotFoundError:
        os.mknod(file_name)
    with open(file_name, 'w') as f:
        results.append({"qps": args.qps,
                        "cv": args.coefficient_variation,
                        "request_ids": request_ids,
                        "request_lens": request_lens,
                        "request_latencies": request_latencies,
                        "prefill_token_latencies": prefill_token_latencies,
                        "decode_token_latencies": decode_token_latencies,
                        "decode_sum_latencies": decode_sum_latencies,
                        "all_decode_token_latencies": all_decode_token_latencies,
                        "inference_latencies": inference_latencies,
                        "per_token_latencies_breakdown_dict": per_token_latencies_breakdown_dict,
                        "throughput": throughput,
                        "instance_num": avg_instance_num})
        json.dump(results, f)
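
Since the benchmark dumps one record per run, the file can be read back for offline analysis; a small sketch using only keys visible in the dict above:

```python
import json

# Hedged sketch: read back the metrics written by the benchmark above.
def load_benchmark_results(file_name: str) -> list:
    with open(file_name) as f:
        results = json.load(f)
    for run in results:
        # Keys mirror the dict dumped above.
        print(run["qps"], run["throughput"], run["instance_num"])
    return results
```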

2 changes: 1 addition & 1 deletion configs/vllm.yml
@@ -18,7 +18,7 @@ MANAGER:
ENABLE_DEFRAG: True
REQUEST_MIGRATION_POLICY: 'SR'

-MIGRATION_BACKEND: 'gloo'
+MIGRATION_BACKEND: 'rayrpc'
MIGRATION_BUFFER_BLOCKS: 512

ENABLE_SCALING: False
3 changes: 1 addition & 2 deletions docs/Quickstart.md
@@ -82,11 +82,10 @@ HEAD_NODE=1 python -m llumnix.entrypoints.vllm.api_server \
    --initial-instances $INITIAL_INSTANCES \
    --launch-ray-cluster \
    --model $MODEL_PATH \
-   --engine-use-ray \
    --worker-use-ray \
    --max-model-len 4096
```
`CONFIG_PATH` is the path to the configuration file for Llumnix, and we give an example configuration file [here](../configs/base.yml). `MODEL_PATH` defines the location of your model. `INITIAL_INSTANCES` determines the number of instances to be launched on the current node.

Second, you can run the benchmark to evaluate the serving performance:

6 changes: 3 additions & 3 deletions examlpes/offline_inference.py
@@ -74,10 +74,10 @@ async def main():
    for request in prompts:
        request_id = random_uuid()
        await manager.generate.remote(request_id=request_id,
                                      server_info=server_info,
                                      prompt=request,
-                                     sampling_params=sampling_params,)
+                                     params=sampling_params,)

    await output_task

asyncio.run(main())
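
The `output_task` awaited above drains generation results. A hypothetical consumer, assuming outputs arrive on an asyncio queue and carry vLLM `RequestOutput`-style fields (both are assumptions, not shown in this diff):

```python
import asyncio

# Hypothetical stand-in for the coroutine behind `output_task` above.
# Queue-based delivery and the RequestOutput-style fields are assumptions.
async def consume_outputs(queue: asyncio.Queue, num_requests: int) -> None:
    finished = 0
    while finished < num_requests:
        output = await queue.get()
        if output.finished:                  # request fully generated
            finished += 1
            print(output.outputs[-1].text)   # last completion's text
```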
11 changes: 6 additions & 5 deletions llumnix/backends/backend_interface.py
@@ -13,7 +13,7 @@

from abc import ABC, abstractmethod
from enum import Enum
-from typing import Iterable, List, Union, Deque
+from typing import Iterable, List, Union, Deque, Tuple

from llumnix.llumlet.request import LlumnixRequest, RequestStatus
from llumnix.server_info import ServerInfo
@@ -71,7 +71,7 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:

    # Methods for migration
    @abstractmethod
-    def get_request_incremental_blocks(self, backend_request: LlumnixRequest, pre_stage_num_blocks: int) -> List[int]:
+    def get_request_incremental_blocks(self, backend_request: LlumnixRequest, pre_stage_num_blocks: int) -> Tuple[List[int], List[int]]:
"""Retrieves the incremental block table for a given request.

This method is used to fetch a list of block numbers that represent the incremental
Expand All @@ -88,7 +88,7 @@ def get_request_incremental_blocks(self, backend_request: LlumnixRequest, pre_st
need to be fetched in the current stage.

Returns:
A list of integers, where each integer represents a block number that indicates
A list of integers and its token ids, where each integer represents a block number that indicates
physical index of kv cache block tensor. These block numbers can then be used
to transfer to dstination instance.
"""
@@ -191,7 +191,8 @@ def pre_alloc(self,
                  request_id: str,
                  request_status: RequestStatus,
                  request_arrival_time: float,
-                 block_num: int) -> List[int]:
+                 block_num: int,
+                 token_ids: List[int]) -> List[int]:
"""Pre-allocates cache blocks for a migrating request.

This method selects a specified number of free cache blocks to be reserved for an incoming
@@ -207,7 +208,7 @@
            request_status: The status (waiting/running) of the request.
            request_arrival_time: The arrival time of the request.
            block_num: The number of cache blocks that need to be pre-allocated for the request.
+            token_ids: The token IDs of the request.
        Returns:
            A list of integers where each integer represents the block table reserved for the migration request.
4 changes: 2 additions & 2 deletions llumnix/backends/migration_backend_interface.py
@@ -33,9 +33,9 @@ def migrate_cache(self, src_handle, src_blocks: List[int], dst_blocks: List[int]
        raise NotImplementedError

    @abstractmethod
-    def do_send(self, dst_handle, blocks: List[int]):
+    def do_send(self, dst_handle, blocks: List[int], virtuel_engine: int):
        raise NotImplementedError

    @abstractmethod
-    def do_recv(self, src_handle, blocks: List[int]):
+    def do_recv(self, src_handle, blocks: List[int], virtuel_engine: int):
        raise NotImplementedError
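
A minimal sketch of a concrete backend honoring the new per-virtual-engine signature; the per-engine `gpu_cache` layout is an assumption, and the parameter spelling follows the interface above:

```python
from typing import List
import torch

# Hypothetical backend: moves KV blocks through a staging tensor.
class StagingMigrationBackend:
    def __init__(self, gpu_cache: List[torch.Tensor]):
        # One KV cache tensor per virtual engine, indexed by block on dim 0
        # (this layout is an assumption for illustration).
        self.gpu_cache = gpu_cache

    def do_send(self, dst_handle, blocks: List[int], virtuel_engine: int) -> torch.Tensor:
        # Gather the requested blocks of this engine's cache into one buffer.
        return self.gpu_cache[virtuel_engine][blocks].clone()

    def do_recv(self, src_handle, blocks: List[int], virtuel_engine: int) -> None:
        # Pull the buffer from the source and scatter it into local slots.
        payload = src_handle.do_send(None, blocks, virtuel_engine)
        self.gpu_cache[virtuel_engine][blocks] = payload
```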