[WIP] upgrade vllm to v0.6.3.post1 #69

Open · wants to merge 19 commits into base: main
9 changes: 7 additions & 2 deletions Makefile
@@ -22,7 +22,7 @@ install:
.PHONY: lint
lint: check_pylint_installed check_pytest_installed
	@pylint --rcfile=.pylintrc -s n --jobs=128 ./llumnix

	@pylint --rcfile=.pylintrc \
		--disable=protected-access,super-init-not-called,unused-argument,redefined-outer-name,invalid-name \
		-s n --jobs=128 ./tests
@@ -61,22 +61,27 @@ test: check_pytest_installed

.PHONY: unit_test
unit_test: check_pytest_installed
+	@ray stop
	@pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings

.PHONY: offline_test
offline_test:
+	@ray stop
	@python examlpes/offline_inference.py

.PHONY: e2e_test
e2e_test:
+	@ray stop
	@pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py

.PHONY: bench_test
bench_test:
+	@ray stop
	@pytest -v -x -s --tb=long ./tests/e2e_test/test_bench.py

.PHONY: migration_test
migration_test:
+	@ray stop
	@pytest -v -x -s --tb=long ./tests/e2e_test/test_migration.py

####################################### test end ########################################
16 changes: 7 additions & 9 deletions benchmark/benchmark_serving.py
@@ -84,14 +84,12 @@ async def query_model_vllm(prompt, verbose, ip_ports):

    async with aiohttp.ClientSession(timeout=timeout) as session:
        best_of = 1
-        use_beam_search = False
        output_len = expected_response_len
        request_dict = {
            "prompt": prompt,
            "n": 1,
            "best_of": best_of,
-            "use_beam_search": use_beam_search,
-            "temperature": 0.0 if use_beam_search else 1.0,
+            "temperature": 1.0,
            "top_k": 1,
            "max_tokens": max(output_len, 1),
            "ignore_eos": True,
@@ -815,18 +813,18 @@ def main():
    except FileNotFoundError:
        os.mknod(file_name)
    with open(file_name, 'w') as f:
        results.append({"qps": args.qps,
                        "cv": args.coefficient_variation,
                        "request_ids": request_ids,
                        "request_lens": request_lens,
                        "request_latencies": request_latencies,
                        "prefill_token_latencies": prefill_token_latencies,
                        "decode_token_latencies": decode_token_latencies,
                        "decode_sum_latencies": decode_sum_latencies,
                        "all_decode_token_latencies": all_decode_token_latencies,
                        "inference_latencies": inference_latencies,
                        "per_token_latencies_breakdown_dict": per_token_latencies_breakdown_dict,
                        "throughput": throughput,
                        "instance_num": avg_instance_num})
        json.dump(results, f)
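
Since the benchmark dumps one record per run, the file can be read back for offline analysis; a small sketch using only keys visible in the dict above:

```python
import json

# Hedged sketch: read back the metrics written by the benchmark above.
def load_benchmark_results(file_name: str) -> list:
    with open(file_name) as f:
        results = json.load(f)
    for run in results:
        # Keys mirror the dict dumped above.
        print(run["qps"], run["throughput"], run["instance_num"])
    return results
```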

2 changes: 1 addition & 1 deletion configs/vllm.yml
@@ -18,7 +18,7 @@ MANAGER:
ENABLE_DEFRAG: True
REQUEST_MIGRATION_POLICY: 'SR'

-MIGRATION_BACKEND: 'gloo'
+MIGRATION_BACKEND: 'rayrpc'
MIGRATION_BUFFER_BLOCKS: 512

ENABLE_SCALING: False
3 changes: 1 addition & 2 deletions docs/Quickstart.md
@@ -82,11 +82,10 @@ HEAD_NODE=1 python -m llumnix.entrypoints.vllm.api_server \
    --initial-instances $INITIAL_INSTANCES \
    --launch-ray-cluster \
    --model $MODEL_PATH \
-   --engine-use-ray \
    --worker-use-ray \
    --max-model-len 4096
```
`CONFIG_PATH` is the path to the configuration file for Llumnix, and we give an example configuration file [here](../configs/base.yml). `MODEL_PATH` defines the location of your model. `INITIAL_INSTANCES` determines the number of instances to be launched on the current node.

Second, you can run the benchmark to evaluate the serving performance:

6 changes: 3 additions & 3 deletions examlpes/offline_inference.py
@@ -74,10 +74,10 @@ async def main():
    for request in prompts:
        request_id = random_uuid()
        await manager.generate.remote(request_id=request_id,
                                      server_info=server_info,
                                      prompt=request,
-                                     sampling_params=sampling_params,)
+                                     params=sampling_params,)

    await output_task

asyncio.run(main())
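
The `output_task` awaited above drains generation results. A hypothetical consumer, assuming outputs arrive on an asyncio queue and carry vLLM `RequestOutput`-style fields (both are assumptions, not shown in this diff):

```python
import asyncio

# Hypothetical stand-in for the coroutine behind `output_task` above.
# Queue-based delivery and the RequestOutput-style fields are assumptions.
async def consume_outputs(queue: asyncio.Queue, num_requests: int) -> None:
    finished = 0
    while finished < num_requests:
        output = await queue.get()
        if output.finished:                  # request fully generated
            finished += 1
            print(output.outputs[-1].text)   # last completion's text
```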
11 changes: 6 additions & 5 deletions llumnix/backends/backend_interface.py
@@ -13,7 +13,7 @@

from abc import ABC, abstractmethod
from enum import Enum
-from typing import Iterable, List, Union, Deque
+from typing import Iterable, List, Union, Deque, Tuple

from llumnix.llumlet.request import LlumnixRequest, RequestStatus
from llumnix.server_info import ServerInfo
@@ -71,7 +71,7 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:

    # Methods for migration
    @abstractmethod
-    def get_request_incremental_blocks(self, backend_request: LlumnixRequest, pre_stage_num_blocks: int) -> List[int]:
+    def get_request_incremental_blocks(self, backend_request: LlumnixRequest, pre_stage_num_blocks: int) -> Tuple[List[int], List[int]]:
"""Retrieves the incremental block table for a given request.

This method is used to fetch a list of block numbers that represent the incremental
Expand All @@ -88,7 +88,7 @@ def get_request_incremental_blocks(self, backend_request: LlumnixRequest, pre_st
need to be fetched in the current stage.

Returns:
A list of integers, where each integer represents a block number that indicates
A list of integers and its token ids, where each integer represents a block number that indicates
physical index of kv cache block tensor. These block numbers can then be used
to transfer to dstination instance.
"""
@@ -191,7 +191,8 @@ def pre_alloc(self,
                  request_id: str,
                  request_status: RequestStatus,
                  request_arrival_time: float,
-                 block_num: int) -> List[int]:
+                 block_num: int,
+                 token_ids: List[int]) -> List[int]:
"""Pre-allocates cache blocks for a migrating request.

This method selects a specified number of free cache blocks to be reserved for an incoming
@@ -207,7 +208,7 @@
            request_status: The status (waiting/running) of the request.
            request_arrival_time: The arrival time of the request.
            block_num: The number of cache blocks that need to be pre-allocated for the request.
+            token_ids: The token IDs of the request.
        Returns:
            A list of integers where each integer represents the block table reserved for the migration request.
4 changes: 2 additions & 2 deletions llumnix/backends/migration_backend_interface.py
@@ -33,9 +33,9 @@ def migrate_cache(self, src_handle, src_blocks: List[int], dst_blocks: List[int]
        raise NotImplementedError

    @abstractmethod
-    def do_send(self, dst_handle, blocks: List[int]):
+    def do_send(self, dst_handle, blocks: List[int], virtuel_engine: int):
        raise NotImplementedError

    @abstractmethod
-    def do_recv(self, src_handle, blocks: List[int]):
+    def do_recv(self, src_handle, blocks: List[int], virtuel_engine: int):
        raise NotImplementedError
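
A minimal sketch of a concrete backend honoring the new per-virtual-engine signature; the per-engine `gpu_cache` layout is an assumption, and the parameter spelling follows the interface above:

```python
from typing import List
import torch

# Hypothetical backend: moves KV blocks through a staging tensor.
class StagingMigrationBackend:
    def __init__(self, gpu_cache: List[torch.Tensor]):
        # One KV cache tensor per virtual engine, indexed by block on dim 0
        # (this layout is an assumption for illustration).
        self.gpu_cache = gpu_cache

    def do_send(self, dst_handle, blocks: List[int], virtuel_engine: int) -> torch.Tensor:
        # Gather the requested blocks of this engine's cache into one buffer.
        return self.gpu_cache[virtuel_engine][blocks].clone()

    def do_recv(self, src_handle, blocks: List[int], virtuel_engine: int) -> None:
        # Pull the buffer from the source and scatter it into local slots.
        payload = src_handle.do_send(None, blocks, virtuel_engine)
        self.gpu_cache[virtuel_engine][blocks] = payload
```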