Skip to content

Commit

Permalink
[Misc] Add an example demonstrating how to run llumnix offline (#24)
Browse files Browse the repository at this point in the history
  • Loading branch information
KuilongCui authored Sep 3, 2024
1 parent 77f048d commit 960e4c6
Show file tree
Hide file tree
Showing 9 changed files with 201 additions and 21 deletions.
30 changes: 22 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@ init:
@git submodule update --init --recursive

.PHONY: install
install: cupy
install:
@pip install -e .

.PHONY: lint
lint: check_pylint_installed
@pylint --rcfile=.pylintrc -s n ./llumnix ./tests --exit-zero

.PHONY: test
test:
test: check_pytest_installed
@pytest -q -x --ignore=third_party/ --disable-warnings

#################### pygloo install for gloo migration backend begin ####################
Expand All @@ -40,9 +40,9 @@ pygloo: init

###################################### cupy begin #######################################

.PHONY: cupy
cupy:
@./tools/cupy_install.sh
.PHONY: cupy-cuda
cupy-cuda:
@./tools/cupy_cuda_install.sh

####################################### cupy end ########################################

Expand All @@ -52,12 +52,26 @@ PYLINT_VERSION = 2.12.2

.PHONY: check_pylint_installed
check_pylint_installed:
@command -v pylint >/dev/null 2>&1 || { \
@python3 -m pip show pylint > /dev/null 2>&1 || { \
echo "pylint is not installed. Installing pylint $(PYLINT_VERSION)..."; \
python3 -m pip install pylint==$(PYLINT_VERSION); }

@python3 -c "import pylint_pytest" >/dev/null 2>&1 || { \
###################################### pylint end #######################################

##################################### pytest begin ######################################

.PHONY: check_pytest_installed
check_pytest_installed:
@python3 -m pip show pytest > /dev/null 2>&1 || { \
echo "pytest is not installed. Installing pytest ..."; \
python3 -m pip install pytest; }

@python3 -m pip show pytest-asyncio > /dev/null 2>&1 || { \
echo "pytest-asyncio is not installed. Installing pytest-asyncio ..."; \
python3 -m pip install pytest-asyncio; }

@python3 -m pip show pylint-pytest > /dev/null 2>&1 || { \
echo "pylint-pytest is not installed. Installing pylint-pytest ..."; \
python3 -m pip install pylint-pytest; }

###################################### pylint end #######################################
###################################### pytest end #######################################
15 changes: 11 additions & 4 deletions docs/Quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,14 @@ Llumnix requires python `3.8.1~3.10.0` and is currently built on top of vLLM (ve

[vLLM Installation](https://docs.vllm.ai/en/v0.4.2/getting_started/installation.html)

## Build from Source
### Install from PyPI

You can install Llumnix from PyPI:
```
pip install llumnix
```

### Build from Source

You can build and install Llumnix from source:
```
Expand All @@ -15,11 +22,11 @@ cd llumnix
make install
```

If you want to use gloo as migration backend, please refer to [this link](https://github.com/ZeldaHuang/pygloo/blob/main/.github/workflows/ubuntu_basic.yml#L24C1-L26C1) to install [Bazel](https://github.com/bazelbuild/bazel) >= 5.1.0. Then, run `make pygloo` to install [pygloo](https://github.com/ZeldaHuang/pygloo).
The default migration backend is RPC. If you want to use NCCL as the migration backend, run `make cupy-cuda` to install [cupy-cuda](https://pypi.org/search/?q=cupy-cuda) manually, as it is related to the CUDA version.

Note: Using conda is not recommended, as it cannot properly handle pygloo's dependency on gcc libstdc++.so.6: version GLIBCXX_3.4.30.
If you want to use Gloo as the migration backend, **in addition to installing cupy-cuda**, please refer to [this link](https://github.com/ZeldaHuang/pygloo/blob/main/.github/workflows/ubuntu_basic.yml#L24C1-L26C1) to install [Bazel](https://github.com/bazelbuild/bazel) >= 5.1.0. Then, run `make pygloo` to install [pygloo](https://github.com/ZeldaHuang/pygloo).

We will provide official releases through pypi soon.
Note: Using conda is not recommended, as it cannot properly handle pygloo's dependency on gcc libstdc++.so.6: version GLIBCXX_3.4.30.

After installation, you can follow this guide to use Llumnix for multi-instance LLM serving quickly.

Expand Down
102 changes: 102 additions & 0 deletions examlpes/offline_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from typing import List
import os
import uuid
import asyncio

import ray
from ray.util.queue import Queue as RayQueue
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

from llumnix import launch_ray_cluster, connect_to_ray_cluster, init_manager, init_llumlets
from llumnix import (SamplingParams, ServerInfo, EngineManagerArgs, LLMEngineManager, Llumlet,
EngineArgs, RequestOutput)

# Sample prompts. main() submits one generation request per entry.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create a sampling params object. The same object is passed with every request.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Launch ray cluster.
# NOTE(review): HEAD_NODE / HEAD_NODE_IP are presumably read by launch_ray_cluster
# to start a head node on this machine -- confirm against llumnix.entrypoints.llumnix_utils.
os.environ['HEAD_NODE'] = '1'
os.environ['HEAD_NODE_IP'] = '127.0.0.1'
ray_cluster_port=37000

# Note: launch_ray_cluster will stop the current ray cluster first, then init a new one.
# The same port is used to launch the cluster and to connect to it.
launch_ray_cluster(ray_cluster_port=ray_cluster_port)
connect_to_ray_cluster(port=ray_cluster_port)

# Set manager args and engine args.
manager_args = EngineManagerArgs()
engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True,
                         trust_remote_code=True, max_model_len=370)

# Create llumlets.
# The two pre-declarations below only attach type annotations to the names that
# the tuple-unpacking assignment fills in; the None values are overwritten immediately.
llumlet_ids: List[str] = None
llumlets: List[Llumlet] = None
# node_id is the id of the Ray node this script is running on.
llumlet_ids, llumlets = init_llumlets(manager_args, engine_args,
                                      node_id=ray.get_runtime_context().get_node_id())


# Create a manager. If the manager is created first and the llumlets are created afterwards,
# manager.scale_up needs to be called to add the newly created llumlets to the manager.
manager: LLMEngineManager = init_manager(manager_args)

# The requests' outputs will be put into request_output_queue no matter which instance they run on.
server_id = str(uuid.uuid4().hex)
# The queue actor is given a hard (soft=False) node affinity to the Ray node
# this script is running on.
request_output_queue = RayQueue(actor_options={
    "scheduling_strategy": NodeAffinitySchedulingStrategy(
        node_id=ray.get_runtime_context().get_node_id(),
        soft=False)
})
# Passed along with every request in main().
server_info = ServerInfo(server_id, request_output_queue)

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
async def background_process_outputs(num_tasks):
    """Poll request_output_queue and print the result of every finished request.

    Every 0.1 seconds the queue size is read; if anything is queued, that many
    items are fetched in one batch. For each output whose ``finished`` flag is
    set, the prompt and the text of its first output are printed.

    Args:
        num_tasks: Number of finished requests to wait for before returning.
    """
    finish_task = 0
    # Use `<` rather than `!=`: the counter is only checked between batches, so
    # if it ever steps past num_tasks the loop must still terminate.
    while finish_task < num_tasks:
        await asyncio.sleep(0.1)
        qsize = await request_output_queue.actor.qsize.remote()
        if qsize > 0:
            # Fetch exactly the number of items that were reported as queued.
            request_outputs: List[RequestOutput] = await request_output_queue.actor.get_nowait_batch.remote(qsize)
            for request_output in request_outputs:
                if request_output.finished:
                    finish_task += 1
                    prompt = request_output.prompt
                    generated_text = request_output.outputs[0].text
                    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

async def main():
    """Submit every sample prompt to the manager, then wait for all outputs.

    The output-printing coroutine is started first so that it runs concurrently
    with request submission; each prompt is sent under a fresh random request id.
    """
    printer = asyncio.create_task(background_process_outputs(len(prompts)))

    for prompt_text in prompts:
        generate_kwargs = {
            "request_id": str(uuid.uuid4().hex),
            "server_info": server_info,
            "prompt": prompt_text,
            "sampling_params": sampling_params,
        }
        await manager.generate.remote(**generate_kwargs)

    # Returns once the printer has seen one finished output per prompt.
    await printer

# Run the example. The teardown lives in `finally` so that the actors and the
# ray connection are cleaned up even if main() raises.
try:
    asyncio.run(main())
finally:
    # Kill all named actors explicitly, as detached actors are not killed by ray.shutdown().
    named_actors = ray.util.list_named_actors(True)
    for actor in named_actors:
        try:
            actor_handle = ray.get_actor(actor['name'], namespace=actor['namespace'])
            ray.kill(actor_handle)
        # Best-effort cleanup: skip actors that cannot be looked up or killed.
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit still propagate.
        except Exception:  # pylint: disable=broad-except
            continue

    # Shutdown ray cluster.
    ray.shutdown()
26 changes: 26 additions & 0 deletions llumnix/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,29 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import vllm
from vllm import *

from llumnix.server_info import ServerInfo
from llumnix.entrypoints.llumnix_utils import (launch_ray_cluster, connect_to_ray_cluster,
init_manager, init_llumlets)
from llumnix.arg_utils import EngineManagerArgs
from llumnix.llm_engine_manager import LLMEngineManager
from llumnix.llumlet.llumlet import Llumlet

from .version import __version__

# Names exported by `from llumnix import *`: the package version plus the
# llumnix entry points imported above.
__all__ = [
    "__version__",
    "ServerInfo",
    "launch_ray_cluster",
    "connect_to_ray_cluster",
    "init_manager",
    "init_llumlets",
    "EngineManagerArgs",
    "LLMEngineManager",
    "Llumlet"
]

# vllm is star-imported above; also re-export whatever vllm lists in its own
# __all__ (nothing is added if vllm defines no __all__).
__all__.extend(getattr(vllm, "__all__", []))
4 changes: 3 additions & 1 deletion llumnix/backends/vllm/migration_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

from typing import List
import torch
import cupy
from func_timeout import func_set_timeout, FunctionTimedOut

import ray
Expand Down Expand Up @@ -145,6 +144,9 @@ def __init__(self, migration_config: MigrationConfig, cache_engine: CacheEngine,
scheduling_strategy, is_driver_worker, gpu_cache) -> None:
super().__init__()

# pylint: disable=C0415
import cupy

self.migration_config = migration_config
self.cache_engine = cache_engine
self.backend = migration_config.migration_backend
Expand Down
14 changes: 14 additions & 0 deletions llumnix/version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (c) 2024, Alibaba Group;
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.0.1"
2 changes: 0 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,5 @@ aiohttp
scipy
pandas
matplotlib
pytest-asyncio
cupy-cuda12x # for nccl migration backend
func_timeout
numpy < 1.24.0 # for gloo migration backend's compatibility with numpy.float
29 changes: 23 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,30 @@ def get_requirements() -> List[str]:
requirements = f.read().strip().split("\n")
return requirements

def readme():
    """Return the full text of README.md, decoded as UTF-8.

    The path is relative, so the file is looked up in the current working directory.
    """
    with open('README.md', encoding='utf-8') as readme_file:
        return readme_file.read()

setup(
name='llumnix',
version='0.0.1',
packages=find_packages(),
install_requires=get_requirements(),
author='Llumnix Team',
version='0.0.2',
python_requires='>=3.8.1, <3.11',
description='Efficient and easy multi-instance LLM serving',
license="Apache 2.0",
long_description=readme(),
long_description_content_type="text/markdown",
author='Llumnix Team',
url='https://github.com/AlibabaPAI/llumnix',
)
license="Apache 2.0",
packages=find_packages(),
install_requires=get_requirements(),
platforms=["all"],
classifiers=[
'Programming Language :: Python',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
"License :: OSI Approved :: Apache Software License",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
)
File renamed without changes.

0 comments on commit 960e4c6

Please sign in to comment.