AlibabaPAI · s5u13b · Dec 10, 2024 · Nov 18, 2024 · Nov 21, 2024 · Nov 21, 2024
diff --git a/.github/workflows/bench_test.yml b/.github/workflows/bench_test.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   cancel_previous_workflows:
     runs-on: ubuntu-latest
-    timeout-minutes: 3
+    timeout-minutes: 1
     steps:
     - uses: styfle/[email protected]
       with:
@@ -20,15 +20,15 @@ jobs:
   bench_tests:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
-    timeout-minutes: 60
+    timeout-minutes: 30
     steps:
     - name: Checkout
       uses: actions/checkout@v4
     - name: Kill Running Containers
       run: |
         [[ -n $(docker ps -q) ]] && docker kill $(docker ps -q) || echo "No running containers to kill."
     - name: Build And Test
-      run: ./tools/bench_test.sh
+      run: ./tools/run_test.sh bench_test
     - name: Create comment from file
       if: ${{ github.event_name != 'push' }}
       uses: actions/github-script@v7

diff --git a/.github/workflows/e2e_test.yml b/.github/workflows/e2e_test.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   cancel_previous_workflows:
     runs-on: ubuntu-latest
-    timeout-minutes: 3
+    timeout-minutes: 1
     steps:
     - uses: styfle/[email protected]
       with:
@@ -20,12 +20,12 @@ jobs:
   e2e_tests:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
-    timeout-minutes: 60
+    timeout-minutes: 30
     steps:
     - name: Checkout
       uses: actions/checkout@v4
     - name: Kill Running Containers
       run: |
         [[ -n $(docker ps -q) ]] && docker kill $(docker ps -q) || echo "No running containers to kill."
     - name: Build And Test
-      run: ./tools/e2e_test.sh
+      run: ./tools/run_test.sh e2e_test
diff --git a/.github/workflows/migration_test.yml b/.github/workflows/migration_test.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   cancel_previous_workflows:
     runs-on: ubuntu-latest
-    timeout-minutes: 3
+    timeout-minutes: 1
     steps:
     - uses: styfle/[email protected]
       with:
@@ -20,15 +20,15 @@ jobs:
   migration_tests:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
-    timeout-minutes: 60
+    timeout-minutes: 90
     steps:
     - name: Checkout
       uses: actions/checkout@v4
     - name: Kill Running Containers
       run: |
         [[ -n $(docker ps -q) ]] && docker kill $(docker ps -q) || echo "No running containers to kill."
     - name: Build And Test
-      run: ./tools/migration_test.sh
+      run: ./tools/run_test.sh migration_test
     - name: Create comment from file
       if: ${{ github.event_name != 'push' }}
       uses: actions/github-script@v7

diff --git a/.github/workflows/offline_inference.yml b/.github/workflows/offline_inference.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   cancel_previous_workflows:
     runs-on: ubuntu-latest
-    timeout-minutes: 3
+    timeout-minutes: 1
     steps:
     - uses: styfle/[email protected]
       with:
@@ -20,13 +20,8 @@ jobs:
   offline_inference:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
-    timeout-minutes: 10
+    timeout-minutes: 5
     steps:
     - uses: actions/checkout@v4
     - name: Run offline inference example
-      run: |
-        nvidia-docker run --rm -t --net host --ipc host \
-          -v ${PWD}:/workspace \
-          -w /workspace \
-          registry.cn-beijing.aliyuncs.com/llumnix/llumnix-dev:20240909_action_678a439 \
-          bash -c "pip install -e . > /dev/null && make offline_test"
+      run: ./tools/run_test.sh offline_test
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   cancel_previous_workflows:
     runs-on: ubuntu-latest
-    timeout-minutes: 3
+    timeout-minutes: 1
     steps:
     - uses: styfle/[email protected]
       with:
@@ -20,7 +20,7 @@ jobs:
   pylint_test:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
-    timeout-minutes: 10
+    timeout-minutes: 5
     steps:
     - uses: actions/checkout@v4
     - name: Analysing the code with pylint

diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   cancel_previous_workflows:
     runs-on: ubuntu-latest
-    timeout-minutes: 3
+    timeout-minutes: 1
     steps:
     - uses: styfle/[email protected]
       with:
@@ -20,12 +20,12 @@ jobs:
   unit_tests:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
-    timeout-minutes: 60
+    timeout-minutes: 30
     steps:
     - name: Checkout
       uses: actions/checkout@v4
     - name: Kill Running Containers
       run: |
         [[ -n $(docker ps -q) ]] && docker kill $(docker ps -q) || echo "No running containers to kill."
     - name: Build And Test
-      run: ./tools/unit_test.sh
+      run: ./tools/run_test.sh unit_test
diff --git a/.github/workflows/whl_build.yml b/.github/workflows/whl_build.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   whl_build:
     runs-on: ubuntu-latest
-    timeout-minutes: 10
+    timeout-minutes: 1
 
     steps:
     - name: Checkout

diff --git a/Makefile b/Makefile
@@ -31,9 +31,9 @@ lint: check_pylint_installed check_pytest_installed
 test: check_pytest_installed
 	@pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
 	@python examlpes/offline_inference.py
-	@pytest -v ./tests/e2e_test/test_e2e.py
-	@pytest -v ./tests/e2e_test/test_bench.py
-	@pytest -v ./tests/e2e_test/test_migration.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_bench.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_migration.py
 
 .PHONY: unit_test
 unit_test: check_pytest_installed
@@ -45,15 +45,15 @@ offline_test:
 
 .PHONY: e2e_test
 e2e_test:
-	@pytest -v ./tests/e2e_test/test_e2e.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py
 
 .PHONY: bench_test
 bench_test:
-	@pytest -v ./tests/e2e_test/test_bench.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_bench.py
 
 .PHONY: migration_test
 migration_test:
-	@pytest -v ./tests/e2e_test/test_migration.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_migration.py
 
 #################### pygloo install for gloo migration backend begin ####################
 

diff --git a/configs/base.yml b/configs/base.yml
@@ -1,7 +1,7 @@
 SERVER:
   HOST: '127.0.0.1'
   PORT: 1234
-  QUEUE_TYPE: "rayqueue"
+  REQUEST_OUTPUT_QUEUE_TYPE: "rayqueue"
   RAY_CLUSTER_PORT: 6379
   LAUNCH_RAY_CLUSTER: True
 
@@ -20,6 +20,5 @@ MANAGER:
 
   MIGRATION_BACKEND: 'gloo'
   MIGRATION_BUFFER_BLOCKS: 512
-  MIGRATION_INTERNAL_BUFFER_NUM: 2
 
   ENABLE_SCALING: False
diff --git a/docs/Arguments.md b/docs/Arguments.md
@@ -32,15 +32,14 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
             [--profiling-result-file-path PROFILING_RESULT_FILE_PATH]
             [--gpu-type GPU_TYPE]
             [--polling-interval POLLING_INTERVAL]
-            [--migration-backend {gloo,rpc}]
+            [--migration-backend {gloo,nccl,rpc}]
             [--migration-buffer-blocks MIGRATION_BUFFER_BLOCKS]
             [--migration-backend-init-timeout MIGRATION_BACKEND_INIT_TIMEOUT]
             [--migration-num-layers MIGRATION_NUM_LAYERS]
             [--last-stage-max-blocks LAST_STAGE_MAX_BLOCKS]
             [--max-stages MAX_STAGES]
             [--enable-pd-disagg]
             [--num-dispatch-instances NUM_DISPATCH_INSTANCES]
-            [--migration-internal-buffer-num MIGRATION_INTERNAL_BUFFER_NUM]
             [--log-request-timestamps]
 
 ```
@@ -149,7 +148,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 - Default: "rpc"
 
 `--migration-buffer-blocks`
-- Number of cache blocks in each migration buffer.
+- Number of cache blocks in the migration buffer.
 - Default: 512
 
 `--migration-backend-init-timeout`
@@ -168,10 +167,6 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 - Drop migration if the number of stages > max_stages.
 - Default: 3
 
-`--migration-internal-buffer-num`
-- Number of the buffer in migration backend for sending and receiving
-- Default: 2
-
 `--log-request-timestamps`
 - Enable logging request timestamps.
 

diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py
@@ -47,7 +47,7 @@ def add_argument(self, *args, **kwargs):
 class LlumnixEntrypointsArgs:
     launch_ray_cluster: bool = None
     ray_cluster_port: int = None
-    queue_type: str = None
+    request_output_queue_type: str = None
     request_output_queue_port: int = None
     disable_log_requests_server: bool = None
     log_request_timestamps: bool = None
@@ -82,10 +82,10 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         parser.add_argument("--ray-cluster-port",
                             type=int,
                             help='ray cluster port')
-        parser.add_argument("--queue-type",
+        parser.add_argument("--request-output-queue-type",
                             type=str,
                             choices=['rayqueue', 'zmq'],
-                            help='queue type for request output queue')
+                            help='request output queue type for request output queue')
         parser.add_argument("--request-output-queue-port",
                             type=int,
                             help='port for zmq')
@@ -138,7 +138,6 @@ class EngineManagerArgs:
     migration_num_layers: int = None
     last_stage_max_blocks: int = None
     max_stages: int = None
-    migration_internal_buffer_num: int = None
 
     enable_pd_disagg: bool = None
 
@@ -177,8 +176,7 @@ def create_migration_config(self) -> MigrationConfig:
                                            self.migration_num_layers,
                                            self.last_stage_max_blocks,
                                            self.max_stages,
-                                           self.migration_backend_init_timeout,
-                                           self.migration_internal_buffer_num)
+                                           self.migration_backend_init_timeout)
         return migration_config
 
     @classmethod
@@ -197,9 +195,6 @@ def check_args(cls, args: 'EngineManagerArgs', parser: argparse.ArgumentParser):
             if hasattr(action, 'choices') and action.choices is not None and hasattr(args, action.dest):
                 assert getattr(args, action.dest) in action.choices, f"{action.dest} should be one of {action.choices}."
 
-        assert args.migration_backend != 'nccl', 'NCCL has been temporarily deprecated due to its incompatibility with \
-            concurrent migrations in Llumnix.'
-
         assert args.migration_backend != 'gloo' or (args.migration_backend == 'gloo' \
             and not args.disable_init_instance_by_manager and not args.disable_fixed_node_init_instance), \
             ("When using gloo as migration backend, "
@@ -316,16 +311,13 @@ def add_cli_args(
                             help='timeout(s) for initializing migration backend')
         parser.add_argument('--migration-buffer-blocks',
                             type=int,
-                            help='number of cache blocks in each migration buffer')
+                            help='number of cache blocks in migration')
         parser.add_argument('--migration-num-layers',
                             type=int,
                             help='number of kv-cache layers to transfer in each round during migration')
         parser.add_argument('--last-stage-max-blocks',
                             type=int,
                             help='if the number pf remain blocks < last_stage_max_blocks, do last stage migration')
-        parser.add_argument('--migration-internal-buffer-num',
-                            type=int,
-                            help='number of the buffer in migration backend for sending and receiving')
         parser.add_argument('--max-stages',
                             type=int,
                             help='drop migration if the number of stages > max_stages')

diff --git a/llumnix/backends/migration_backend_interface.py b/llumnix/backends/migration_backend_interface.py
@@ -13,9 +13,7 @@
 
 from abc import ABC, abstractmethod
 from typing import List
-import queue
 
-import torch
 
 class MigrationBackendBase(ABC):
     @abstractmethod
@@ -41,24 +39,3 @@ def do_send(self, dst_handle, blocks: List[int]):
     @abstractmethod
     def do_recv(self, src_handle, blocks: List[int]):
         raise NotImplementedError
-
-class BufferMigrationBackend(MigrationBackendBase):
-    def __init__(self, num_buffer, buffer_shape, buffer_dtype, buffer_device, pin_memory, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        self.num_buffer = num_buffer
-
-        self.dummy_buffer = [
-            torch.empty(size=buffer_shape, dtype=buffer_dtype, device=buffer_device, pin_memory=pin_memory)
-            for _ in range(self.num_buffer)
-        ]
-
-        self.avaiable_buffer_queue = queue.Queue()
-        for i in range(self.num_buffer):
-            self.avaiable_buffer_queue.put_nowait(i)
-
-    def get_available_cache(self):
-        return self.avaiable_buffer_queue.get()
-
-    def put_back_cache(self, buffer_id):
-        self.avaiable_buffer_queue.put_nowait(buffer_id)
diff --git a/llumnix/backends/utils.py b/llumnix/backends/utils.py
@@ -19,16 +19,16 @@
 from llumnix.backends.backend_interface import BackendInterface, BackendType
 from llumnix.queue.queue_type import QueueType
 
-def init_backend_engine(instance_id: str, output_queue_type: QueueType,
+def init_backend_engine(instance_id: str, request_output_queue_type: QueueType,
                         backend_type: BackendType, *args, **kwargs) -> BackendInterface:
     if backend_type == BackendType.VLLM:
         # pylint: disable=import-outside-toplevel
         from llumnix.backends.vllm.llm_engine import BackendVLLM
-        backend_engine = BackendVLLM(instance_id, output_queue_type, *args, **kwargs)
+        backend_engine = BackendVLLM(instance_id, request_output_queue_type, *args, **kwargs)
     elif backend_type == BackendType.SIM_VLLM:
         # pylint: disable=import-outside-toplevel
         from llumnix.backends.vllm.simulator import BackendSimVLLM
-        backend_engine = BackendSimVLLM(instance_id, output_queue_type, *args, **kwargs)
+        backend_engine = BackendSimVLLM(instance_id, request_output_queue_type, *args, **kwargs)
     else:
         raise ValueError(f'Unsupported backend: {backend_type}')
     return backend_engine