Merge branch 'main' into vllm_upgrade

AlibabaPAI · Dec 11, 2024 · 6044d19 · 6044d19
2 parents 1d9c438 + b319b23
commit 6044d19
Show file tree

Hide file tree

Showing 72 changed files with 1,930 additions and 969 deletions.
diff --git a/.github/workflows/bench_test.yml b/.github/workflows/bench_test.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   cancel_previous_workflows:
     runs-on: ubuntu-latest
-    timeout-minutes: 3
+    timeout-minutes: 1
     steps:
     - uses: styfle/[email protected]
       with:
@@ -20,15 +20,15 @@ jobs:
   bench_tests:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
-    timeout-minutes: 60
+    timeout-minutes: 30
     steps:
     - name: Checkout
       uses: actions/checkout@v4
     - name: Kill Running Containers
       run: |
         [[ -n $(docker ps -q) ]] && docker kill $(docker ps -q) || echo "No running containers to kill."
     - name: Build And Test
-      run: ./tools/bench_test.sh
+      run: ./tools/run_test.sh bench_test
     - name: Create comment from file
       if: ${{ github.event_name != 'push' }}
       uses: actions/github-script@v7

diff --git a/.github/workflows/e2e_test.yml b/.github/workflows/e2e_test.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   cancel_previous_workflows:
     runs-on: ubuntu-latest
-    timeout-minutes: 3
+    timeout-minutes: 1
     steps:
     - uses: styfle/[email protected]
       with:
@@ -20,12 +20,12 @@ jobs:
   e2e_tests:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
-    timeout-minutes: 60
+    timeout-minutes: 30
     steps:
     - name: Checkout
       uses: actions/checkout@v4
     - name: Kill Running Containers
       run: |
         [[ -n $(docker ps -q) ]] && docker kill $(docker ps -q) || echo "No running containers to kill."
     - name: Build And Test
-      run: ./tools/e2e_test.sh
+      run: ./tools/run_test.sh e2e_test
diff --git a/.github/workflows/migration_test.yml b/.github/workflows/migration_test.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   cancel_previous_workflows:
     runs-on: ubuntu-latest
-    timeout-minutes: 3
+    timeout-minutes: 1
     steps:
     - uses: styfle/[email protected]
       with:
@@ -20,15 +20,15 @@ jobs:
   migration_tests:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
-    timeout-minutes: 60
+    timeout-minutes: 90
     steps:
     - name: Checkout
       uses: actions/checkout@v4
     - name: Kill Running Containers
       run: |
         [[ -n $(docker ps -q) ]] && docker kill $(docker ps -q) || echo "No running containers to kill."
     - name: Build And Test
-      run: ./tools/migration_test.sh
+      run: ./tools/run_test.sh migration_test
     - name: Create comment from file
       if: ${{ github.event_name != 'push' }}
       uses: actions/github-script@v7

diff --git a/.github/workflows/offline_inference.yml b/.github/workflows/offline_inference.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   cancel_previous_workflows:
     runs-on: ubuntu-latest
-    timeout-minutes: 3
+    timeout-minutes: 1
     steps:
     - uses: styfle/[email protected]
       with:
@@ -20,13 +20,8 @@ jobs:
   offline_inference:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
-    timeout-minutes: 10
+    timeout-minutes: 5
     steps:
     - uses: actions/checkout@v4
     - name: Run offline inference example
-      run: |
-        nvidia-docker run --rm -t --net host --ipc host \
-          -v ${PWD}:/workspace \
-          -w /workspace \
-          registry.cn-beijing.aliyuncs.com/llumnix/llumnix-dev:20240909_action_678a439 \
-          bash -c "pip install -e . > /dev/null && make offline_test"
+      run: ./tools/run_test.sh offline_test
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   cancel_previous_workflows:
     runs-on: ubuntu-latest
-    timeout-minutes: 3
+    timeout-minutes: 1
     steps:
     - uses: styfle/[email protected]
       with:
@@ -20,7 +20,7 @@ jobs:
   pylint_test:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
-    timeout-minutes: 10
+    timeout-minutes: 5
     steps:
     - uses: actions/checkout@v4
     - name: Analysing the code with pylint

diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   cancel_previous_workflows:
     runs-on: ubuntu-latest
-    timeout-minutes: 3
+    timeout-minutes: 1
     steps:
     - uses: styfle/[email protected]
       with:
@@ -20,12 +20,12 @@ jobs:
   unit_tests:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
-    timeout-minutes: 60
+    timeout-minutes: 30
     steps:
     - name: Checkout
       uses: actions/checkout@v4
     - name: Kill Running Containers
       run: |
         [[ -n $(docker ps -q) ]] && docker kill $(docker ps -q) || echo "No running containers to kill."
     - name: Build And Test
-      run: ./tools/unit_test.sh
+      run: ./tools/run_test.sh unit_test
diff --git a/.github/workflows/whl_build.yml b/.github/workflows/whl_build.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   whl_build:
     runs-on: ubuntu-latest
-    timeout-minutes: 10
+    timeout-minutes: 1
 
     steps:
     - name: Checkout

diff --git a/Makefile b/Makefile
@@ -21,38 +21,39 @@ install:
 
 .PHONY: lint
 lint: check_pylint_installed check_pytest_installed
-	@pylint --rcfile=.pylintrc -s n  --jobs=32 ./llumnix
+	@pylint --rcfile=.pylintrc -s n  --jobs=128 ./llumnix
 
 	@pylint --rcfile=.pylintrc \
 			--disable=protected-access,super-init-not-called,unused-argument,redefined-outer-name,invalid-name \
-			-s n --jobs=32 ./tests
+			-s n --jobs=128 ./tests
 
 .PHONY: test
 test: check_pytest_installed
-	@pytest -x -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
+	@pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
 	@python examlpes/offline_inference.py
-	@pytest -v tests/e2e_test/test_e2e.py
-	@pytest -v -x ./tests/e2e_test/test_migration.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_bench.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_migration.py
 
 .PHONY: unit_test
 unit_test: check_pytest_installed
-	@pytest -x -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
+	@pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
 
 .PHONY: offline_test
 offline_test:
 	@python examlpes/offline_inference.py
 
 .PHONY: e2e_test
 e2e_test:
-	@pytest -v tests/e2e_test/test_e2e.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py
 
 .PHONY: bench_test
 bench_test:
-	@pytest -v ./tests/e2e_test/test_bench.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_bench.py
 
 .PHONY: migration_test
 migration_test:
-	@pytest -v -x ./tests/e2e_test/test_migration.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_migration.py
 
 #################### pygloo install for gloo migration backend begin ####################
 

diff --git a/README.md b/README.md
@@ -10,7 +10,8 @@ Efficient and easy <i>multi-instance</i> LLM serving
 
 ## 🔥 Latest News
 
-- [2024.7] We officially released the first version of Llumnix!
+- [2024.11] Llumnix v0.1.0 launched!
+- [2024.7] We officially released the first version of Llumnix.
 - [2024.6] We released our OSDI '24 [research paper](https://arxiv.org/abs/2406.03243) on arxiv.
 
 ## 🚀 Why Llumnix
@@ -22,14 +23,16 @@ Llumnix provides optimized multi-instance serving performance in terms of:
 - *Low latency*
   - **Reduced time-to-first-token** (TTFT) and queuing delays with less memory fragmentation
   - **Reduced time-between-tokens** (TBT) and preemption stalls with better load balancing
-- *High throughput* with integration with state-of-the-art inference engines
+- *High throughput*
+  - Integration with state-of-the-art inference engines
+  - Support for techniques like prefill-decoding disaggregation
 
 Llumnix achieves this with:
 
 - Dynamic, fine-grained, KV-cache-aware scheduling
 - Continuous **rescheduling** across instances
   - Enabled by a KV cache migration mechanism with near-zero overhead
-  - Exploited for continuous load balancing and de-fragmentation
+  - Exploited for continuous load balancing, de-fragmentation, and prefill-decoding disaggregation
 
 Llumnix is easy to use with:
 
@@ -54,23 +57,24 @@ python -m llumnix.entrypoints.vllm.api_server \
 During the serving deployment execution, Llumnix will automatically configure itself and serve as the request scheduling layer on top of the multiple vLLM engine instances.
 
 Visit our [documentation](./docs/) to get started:
-- [QuickStart](./docs/Quickstart.md)
+- [Quick Start](./docs/Quickstart.md)
 - [Supported Models](./docs/Supported_Models.md)
 - [Fault Tolerance](./docs/Fault_Tolerance.md)
 - [Simulator](./docs/Simulator.md)
+- [Prefill-decoding Disaggregation](./docs/Prefill-decoding_Disaggregation.md)
 
 ## Performance
-We evaluate the performance of the KV-cache-aware load-balancing scheduler and migration mechanism of Llumnix with 16 Llama2-7B/Qwen1.5-7B instances, each using an A10 GPU (24GB).
+We evaluate the performance of the KV-cache-aware load-balancing scheduler and migration mechanism of Llumnix with 16 Qwen2.5-7B instances (each using an A10-24GB GPU) and 16 Llama2-13B instances (each using an A800-80GB GPU).
 
 We use Poisson distributions with different request rates to generate request arrivals. For the input/output lengths of requests, we use the ShareGPT dataset.
 
 <div align=center>
-<img src="./docs/performance.png" align="center" width=80%/>
+<img src="./docs/v0.1.0_benchmark.png" align="center" width=80%/>
 </div>
 
-With the KV-cache-aware load-balancing scheduler, Llumnix outperforms a simple load balancing scheduler based on queue sizes in TTFT (prefill) by up to 1.8x and 7.7x for mean and P99, and 1.4x for P99 TBT (decode).
+Llumnix outperforms a simple round-robin scheduler in TTFT (prefill) by up to 6.4x and 12.1x for mean and P99, respectively, and by 12% for P99 TBT (decode). Llumnix also shows significantly shorter average preemption stalls (by two orders of magnitude).
 
-With migration mechanism, Llumnix maintains lower preemption stalls, further outperformers load-balance scheduler in TTFT by up to 1.7x and 3.3x for mean and P99, and 1.3x for P99 TBT.
+With the KV-cache-aware load-balancing scheduler and the migration mechanism, Llumnix also outperforms a simple load-balancing scheduler based on queue sizes in TTFT (prefill) by up to 4.6x and 9.1x for mean and P99, respectively, and by 15% for P99 TBT (decode).
 
 ## Roadmap
 

diff --git a/configs/base.yml b/configs/base.yml
@@ -1,9 +1,7 @@
 SERVER:
   HOST: '127.0.0.1'
   PORT: 1234
-  QUEUE_TYPE: "rayqueue"
-
-RAY:
+  REQUEST_OUTPUT_QUEUE_TYPE: "rayqueue"
   RAY_CLUSTER_PORT: 6379
   LAUNCH_RAY_CLUSTER: True
 
@@ -18,9 +16,9 @@ MANAGER:
 
   ENABLE_MIGRATION: True
   ENABLE_DEFRAG: True
-  REQUEST_MIGRATION_POLICY: 'SJF'
+  REQUEST_MIGRATION_POLICY: 'SR'
 
   MIGRATION_BACKEND: 'gloo'
-  MIGRATION_CACHE_BLOCKS: 512
+  MIGRATION_BUFFER_BLOCKS: 512
 
   ENABLE_SCALING: False
diff --git a/docs/Arguments.md b/docs/Arguments.md
@@ -12,12 +12,12 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
             [--initial-instances INITIAL_INSTANCES]
             [--load-metric {remaining_steps,usage_ratio}]
             [--polling-interval POLLING_INTERVAL]
-            [--dispatch-policy {balanced,load,queue}]
+            [--dispatch-policy {balanced,load,queue,rr}]
             [--enable-migration]
             [--pair-migration-frequency PAIR_MIGRATION_FREQUENCY]
             [--pair-migration-policy {balanced,defrag_constrained,defrag_relaxed}]
             [--migrate-out-threshold MIGRATE_OUT_THRESHOLD]
-            [--request-migration-policy {LCFS,SJF,LJF}]
+            [--request-migration-policy {LCR,SR,LR,FCW,FCWSR}]
             [--enable-defrag ENABLE_DEFRAG]
             [--enable-scaling]
             [--min-instances MIN_INSTANCES]
@@ -33,11 +33,13 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
             [--gpu-type GPU_TYPE]
             [--polling-interval POLLING_INTERVAL]
             [--migration-backend {gloo,nccl,rpc}]
-            [--migration-cache-blocks MIGRATION_CACHE_BLOCKS]
+            [--migration-buffer-blocks MIGRATION_BUFFER_BLOCKS]
             [--migration-backend-init-timeout MIGRATION_BACKEND_INIT_TIMEOUT]
             [--migration-num-layers MIGRATION_NUM_LAYERS]
             [--last-stage-max-blocks LAST_STAGE_MAX_BLOCKS]
             [--max-stages MAX_STAGES]
+            [--enable-pd-disagg]
+            [--num-dispatch-instances NUM_DISPATCH_INSTANCES]
             [--log-request-timestamps]
 
 ```
@@ -66,7 +68,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 
 `--dispatch-policy`
 - Request dispatch policy.
-- Possible choices: balanced, load, queue
+- Possible choices: balanced, load, queue, rr
 - Default: "load"
 
 `--enable-migration`
@@ -87,8 +89,8 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 
 `--request-migration-policy`
 - Request migration policy.
-- Possible choices: LCFS, SJF, LJF
-- Default: "SJF"
+- Possible choices: LCR, SR, LR, FCW, FCWSR
+- Default: "SR"
 
 `--enable-defrag`
 - Enable defragmentation through migration based on virtual usage.
@@ -145,7 +147,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 - Possible choices: gloo, nccl, rpc
 - Default: "rpc"
 
-`--migration-cache-blocks`
+`--migration-buffer-blocks`
 - Number of buffer blocks used in migration.
 - Default: 512
 
@@ -168,6 +170,12 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 `--log-request-timestamps`
 - Enable logging request timestamps.
 
+`--enable-pd-disagg`
+- Enable prefill-decoding disaggregation.
+
+`--num-dispatch-instances`
+- Number of available instances for dispatch.
+
 # Unsupported vLLM feature options
 
 `--device`