diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index 5913ee11..16af3014 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -362,9 +362,9 @@ def commit_dst_request(self, backend_request: SequenceGroupLlumnix) -> None: async def send_blocks(self, dst_ray_actor: "ray.actor.ActorHandle", src_blocks: List[int], dst_blocks: List[int]) -> None: await dst_ray_actor.execute_engine_method.remote("_run_workers", "migrate_cache", - dst_blocks=dst_blocks, - src_blocks=src_blocks, - src_worker_handle_list=self.worker_handle_list) + dst_blocks=dst_blocks, + src_blocks=src_blocks, + src_worker_handle_list=self.worker_handle_list) def _run_workers(self, *args, **kwargs): # pylint: disable=protected-access diff --git a/llumnix/backends/vllm/sequence.py b/llumnix/backends/vllm/sequence.py index 1b226ba1..ce5ca429 100644 --- a/llumnix/backends/vllm/sequence.py +++ b/llumnix/backends/vllm/sequence.py @@ -22,7 +22,6 @@ class SequenceGroupLlumnix(SequenceGroup, LlumnixRequest): def __init__(self, request_id, server_info, expected_steps: int, *args, **kwargs) -> None: SequenceGroup.__init__(self, request_id, *args, **kwargs) LlumnixRequest.__init__(self, request_id, server_info, expected_steps) - self.try_schedule_times = 0 @property def prompt_len(self) -> int: diff --git a/llumnix/llumlet/request.py b/llumnix/llumlet/request.py index 085dbd71..b47f9112 100644 --- a/llumnix/llumlet/request.py +++ b/llumnix/llumlet/request.py @@ -36,6 +36,7 @@ def __init__(self, request_id: int, server_info: ServerInfo, expected_steps: int self.last_preemption_time = None self.stage_timestamps = [] self.stage_num_blocks_list = [] + self.try_schedule_times = 0 self.waiting_migrating = False # end-of-migration self.eom = False diff --git a/tests/e2e_test/test_e2e.py b/tests/e2e_test/test_e2e.py index d49e5012..352710c7 100644 --- a/tests/e2e_test/test_e2e.py +++ b/tests/e2e_test/test_e2e.py @@ -59,7 +59,7 @@ def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool f"--max-model-len {max_model_len} " f"--dispatch-policy {dispatch_policy} " f"--trust-remote-code " - f"--request-migration-policy SR " + f"--request-migration-policy LCR " f"--migration-backend {migration_backend} " f"--migration-cache-blocks 32 " f"--tensor-parallel-size 1 "