diff --git a/Makefile b/Makefile
index 58305148..5b2ff03d 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,7 @@ init:
 	@git submodule update --init --recursive
 
 .PHONY: install
-install: cupy
+install:
 	@pip install -e .
 
 .PHONY: lint
@@ -24,7 +24,7 @@ lint: check_pylint_installed
 	@pylint --rcfile=.pylintrc -s n ./llumnix ./tests
 
 .PHONY: test
-test:
+test: check_pytest_installed
 	@pytest -q -x --ignore=third_party/ --disable-warnings
 
 #################### pygloo install for gloo migration backend begin ####################
@@ -40,9 +40,9 @@ pygloo: init
 
 ###################################### cupy begin #######################################
 
-.PHONY: cupy
-cupy:
-	@./tools/cupy_install.sh
+.PHONY: cupy-cuda
+cupy-cuda:
+	@./tools/cupy_cuda_install.sh
 
 ####################################### cupy end ########################################
 
@@ -52,12 +52,26 @@ PYLINT_VERSION = 2.12.2
 
 .PHONY: check_pylint_installed
 check_pylint_installed:
-	@command -v pylint >/dev/null 2>&1 || { \
+	@python3 -m pip show pylint > /dev/null 2>&1 || { \
 		echo "pylint is not installed. Installing pylint $(PYLINT_VERSION)..."; \
 		python3 -m pip install pylint==$(PYLINT_VERSION); }
 
-	@python3 -c "import pylint_pytest" >/dev/null 2>&1 || { \
+###################################### pylint end #######################################
+
+##################################### pytest begin ######################################
+
+.PHONY: check_pytest_installed
+check_pytest_installed:
+	@python3 -m pip show pytest > /dev/null 2>&1 || { \
+		echo "pytest is not installed. Installing pytest ..."; \
+		python3 -m pip install pytest; }
+
+	@python3 -m pip show pytest-asyncio > /dev/null 2>&1 || { \
+		echo "pytest-asyncio is not installed. Installing pytest-asyncio ..."; \
+		python3 -m pip install pytest-asyncio; }
+
+	@python3 -m pip show pylint-pytest > /dev/null 2>&1 || { \
 		echo "pylint-pytest is not installed. Installing pylint-pytest ..."; \
 		python3 -m pip install pylint-pytest; }
 
-###################################### pylint end #######################################
+###################################### pytest end #######################################
diff --git a/docs/Quickstart.md b/docs/Quickstart.md
index 56609ffe..98690772 100644
--- a/docs/Quickstart.md
+++ b/docs/Quickstart.md
@@ -6,7 +6,14 @@ Llumnix requires python `3.8.1~3.10.0` and is currently built on top of vLLM (version 0.4.2).
 
 [vLLM Installation](https://docs.vllm.ai/en/v0.4.2/getting_started/installation.html)
 
-## Build from Source
+### Install from PyPI
+
+You can install Llumnix from PyPI:
+```
+pip install llumnix
+```
+
+### Build from Source
 
 You can build and install Llumnix from source:
 ```
@@ -15,11 +22,11 @@ cd llumnix
 make install
 ```
 
-If you want to use gloo as migration backend, please refer to [this link](https://github.com/ZeldaHuang/pygloo/blob/main/.github/workflows/ubuntu_basic.yml#L24C1-L26C1) to install [Bazel](https://github.com/bazelbuild/bazel) >= 5.1.0. Then, run `make pygloo` to install [pygloo](https://github.com/ZeldaHuang/pygloo).
+If you want to use NCCL as the migration backend, run `make cupy-cuda` to install [cupy-cuda](https://pypi.org/search/?q=cupy-cuda) manually, as the right package depends on your CUDA version (e.g., `cupy-cuda12x` for CUDA 12.x).
 
-Note: Using conda is not recommended, as it cannot properly handle pygloo's dependency on gcc libstdc++.so.6: version GLIBCXX_3.4.30.
+If you want to use Gloo as the migration backend, then in addition to installing cupy-cuda, please refer to [this link](https://github.com/ZeldaHuang/pygloo/blob/main/.github/workflows/ubuntu_basic.yml#L24C1-L26C1) to install [Bazel](https://github.com/bazelbuild/bazel) >= 5.1.0. Then, run `make pygloo` to install [pygloo](https://github.com/ZeldaHuang/pygloo).
 
-We will provide official releases through pypi soon.
+Note: Using conda is not recommended, as it cannot properly handle pygloo's dependency on gcc libstdc++.so.6: version GLIBCXX_3.4.30.
 
 After installation, you can follow this guide to use Llumnix for multi-instance LLM serving quickly.
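A quick post-install sanity check — a minimal sketch that assumes only what this patch adds (the `__version__` export and the vLLM re-exports introduced in `llumnix/__init__.py` and `llumnix/version.py` further down):

```python
# Post-install sanity check; relies only on exports added in this patch.
import llumnix

# llumnix/version.py (below) pins the version re-exported by llumnix/__init__.py.
print(llumnix.__version__)  # expected: 0.0.1

# llumnix/__init__.py also re-exports vLLM's public API, so vLLM symbols
# such as SamplingParams can be imported directly from llumnix.
from llumnix import SamplingParams
print(SamplingParams(temperature=0.8, top_p=0.95))
```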
diff --git a/examples/offline_inference.py b/examples/offline_inference.py
new file mode 100644
index 00000000..53ddf25b
--- /dev/null
+++ b/examples/offline_inference.py
@@ -0,0 +1,98 @@
+from typing import List
+import os
+import uuid
+import asyncio
+
+import ray
+from ray.util.queue import Queue as RayQueue
+from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
+
+from llumnix import launch_ray_cluster, connect_to_ray_cluster, init_manager, init_llumlets
+from llumnix import (SamplingParams, ServerInfo, EngineManagerArgs, LLMEngineManager, Llumlet,
+                     EngineArgs, RequestOutput)
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Launch the ray cluster.
+os.environ['HEAD_NODE'] = '1'
+os.environ['HEAD_NODE_IP'] = '127.0.0.1'
+ray_cluster_port = 37000
+
+# Note: launch_ray_cluster stops the current ray cluster first, then inits a new one.
+launch_ray_cluster(ray_cluster_port=ray_cluster_port)
+connect_to_ray_cluster(port=ray_cluster_port)
+
+# Set manager args and engine args.
+manager_args = EngineManagerArgs()
+engine_args = EngineArgs(model="/mnt/cuikuilong.ckl/Qwen-7B", worker_use_ray=True,
+                         trust_remote_code=True, max_model_len=370)
+
+# Create llumlets.
+llumlet_ids: List[str] = None
+llumlets: List[Llumlet] = None
+llumlet_ids, llumlets = init_llumlets(manager_args, engine_args,
+                                      node_id=ray.get_runtime_context().get_node_id())
+
+# Create a manager. If the manager is created first and the llumlets are created
+# afterwards, manager.scale_up needs to be called to put the newly created llumlets
+# under the manager's management.
+manager: LLMEngineManager = init_manager(manager_args)
+
+# The requests' outputs will be put to the request_output_queue no matter which
+# instance they run on.
+server_id = str(uuid.uuid4().hex)
+request_output_queue = RayQueue(actor_options={
+    "scheduling_strategy": NodeAffinitySchedulingStrategy(
+        node_id=ray.get_runtime_context().get_node_id(),
+        soft=False)
+})
+server_info = ServerInfo(server_id, request_output_queue)
+
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+async def background_process_outputs(num_tasks):
+    finish_task = 0
+    while finish_task != num_tasks:
+        await asyncio.sleep(0.1)
+        qsize = await request_output_queue.actor.qsize.remote()
+        if qsize > 0:
+            request_outputs: List[RequestOutput] = await request_output_queue.actor.get_nowait_batch.remote(qsize)
+            for request_output in request_outputs:
+                if request_output.finished:
+                    finish_task += 1
+                    prompt = request_output.prompt
+                    generated_text = request_output.outputs[0].text
+                    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+async def main():
+    output_task = asyncio.create_task(background_process_outputs(len(prompts)))
+
+    for request in prompts:
+        request_id = str(uuid.uuid4().hex)
+        await manager.generate.remote(request_id=request_id,
+                                      server_info=server_info,
+                                      prompt=request,
+                                      sampling_params=sampling_params)
+
+    await output_task
+
+asyncio.run(main())
+
+# Kill all actors, as detached actors will not be killed by ray.shutdown.
+named_actors = ray.util.list_named_actors(True)
+for actor in named_actors:
+    try:
+        actor_handle = ray.get_actor(actor['name'], namespace=actor['namespace'])
+    except ValueError:
+        continue
+    ray.kill(actor_handle)
+
+# Shutdown the ray cluster.
+ray.shutdown()
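The creation-order comment in the example above deserves a concrete illustration. Below is a hedged sketch of the alternative ordering it describes; the `scale_up` argument list and the placeholder model name are assumptions, not taken from this patch:

```python
# Hedged sketch: create the manager BEFORE the llumlets. Per the comment in
# the example above, the late-created llumlets must then be registered with
# the manager via scale_up. The scale_up signature below is an assumption.
import ray
from llumnix import (EngineArgs, EngineManagerArgs,
                     init_llumlets, init_manager)

manager_args = EngineManagerArgs()
engine_args = EngineArgs(model="facebook/opt-125m",  # placeholder model
                         worker_use_ray=True)

manager = init_manager(manager_args)  # the manager exists first this time
llumlet_ids, llumlets = init_llumlets(
    manager_args, engine_args,
    node_id=ray.get_runtime_context().get_node_id())

# Without this call, the manager would not know about the new llumlets.
ray.get(manager.scale_up.remote(llumlet_ids, llumlets))
```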
diff --git a/llumnix/__init__.py b/llumnix/__init__.py
index 4638bd9c..4ea77baf 100644
--- a/llumnix/__init__.py
+++ b/llumnix/__init__.py
@@ -10,3 +10,29 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import vllm
+from vllm import *
+
+from llumnix.server_info import ServerInfo
+from llumnix.entrypoints.llumnix_utils import (launch_ray_cluster, connect_to_ray_cluster,
+                                               init_manager, init_llumlets)
+from llumnix.arg_utils import EngineManagerArgs
+from llumnix.llm_engine_manager import LLMEngineManager
+from llumnix.llumlet.llumlet import Llumlet
+
+from .version import __version__
+
+__all__ = [
+    "__version__",
+    "ServerInfo",
+    "launch_ray_cluster",
+    "connect_to_ray_cluster",
+    "init_manager",
+    "init_llumlets",
+    "EngineManagerArgs",
+    "LLMEngineManager",
+    "Llumlet"
+]
+
+__all__.extend(getattr(vllm, "__all__", []))
diff --git a/llumnix/backends/vllm/migration_backend.py b/llumnix/backends/vllm/migration_backend.py
index a05c6cda..5fb00fed 100644
--- a/llumnix/backends/vllm/migration_backend.py
+++ b/llumnix/backends/vllm/migration_backend.py
@@ -13,7 +13,6 @@
 from typing import List
 
 import torch
-import cupy
 from func_timeout import func_set_timeout, FunctionTimedOut
 import ray
 
@@ -145,6 +144,8 @@ def __init__(self, migration_config: MigrationConfig, cache_engine: CacheEngine,
                  scheduling_strategy, is_driver_worker, gpu_cache) -> None:
         super().__init__()
 
+        import cupy
+
         self.migration_config = migration_config
         self.cache_engine = cache_engine
         self.backend = migration_config.migration_backend
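The `migration_backend.py` hunk above moves `import cupy` from module scope into the constructor, so importing llumnix no longer fails when cupy is absent; this pairs with dropping `cupy-cuda12x` from `requirements.txt` below. A minimal sketch of the same lazy-import pattern, with an illustrative class name (not the actual llumnix backend):

```python
# Illustrative class showing the lazy-import pattern the hunk above applies.
class NcclLikeBackend:
    def __init__(self) -> None:
        # Importing here instead of at module scope means the enclosing module
        # always imports cleanly; an ImportError surfaces only when someone
        # actually constructs this CUDA-dependent backend.
        import cupy
        self._cupy = cupy  # kept for later use by this backend

try:
    backend = NcclLikeBackend()
except ImportError:
    print("cupy is not installed; the NCCL migration backend is unavailable")
```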
diff --git a/llumnix/version.py b/llumnix/version.py
new file mode 100644
index 00000000..f102a9ca
--- /dev/null
+++ b/llumnix/version.py
@@ -0,0 +1 @@
+__version__ = "0.0.1"
diff --git a/requirements.txt b/requirements.txt
index 55f38b5d..e203bf6b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,5 @@ aiohttp
 scipy
 pandas
 matplotlib
-pytest-asyncio
-cupy-cuda12x # for nccl migration backend
 func_timeout
 numpy < 1.24.0 # for gloo migration backend's compatibility with numpy.float
diff --git a/setup.py b/setup.py
index d7d275dd..aabc8be7 100644
--- a/setup.py
+++ b/setup.py
@@ -26,13 +26,30 @@ def get_requirements() -> List[str]:
         requirements = f.read().strip().split("\n")
     return requirements
 
+def readme():
+    with open('README.md', encoding='utf-8') as f:
+        content = f.read()
+    return content
+
 setup(
     name='llumnix',
     version='0.0.1',
-    packages=find_packages(),
-    install_requires=get_requirements(),
-    author='Llumnix Team',
+    python_requires='>=3.8.1, <=3.10',
     description='Efficient and easy multi-instance LLM serving',
-    license="Apache 2.0",
+    long_description=readme(),
+    long_description_content_type="text/markdown",
+    author='Llumnix Team',
     url='https://github.com/AlibabaPAI/llumnix',
-)
\ No newline at end of file
+    license="Apache 2.0",
+    packages=find_packages(),
+    install_requires=get_requirements(),
+    platforms=["all"],
+    classifiers=[
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
+        "License :: OSI Approved :: Apache Software License",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+)
diff --git a/tools/cupy_install.sh b/tools/cupy_cuda_install.sh
similarity index 100%
rename from tools/cupy_install.sh
rename to tools/cupy_cuda_install.sh
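A small sketch for reading back the metadata that the reworked `setup.py` declares, using only the standard library; it assumes llumnix has been installed (e.g. via `make install` or `pip install llumnix`):

```python
# Inspect the installed distribution's metadata (Python 3.8+ stdlib).
from importlib.metadata import metadata, version

print(version("llumnix"))          # expected: 0.0.1

md = metadata("llumnix")
print(md["Requires-Python"])       # expected: >=3.8.1, <=3.10
print(md["License"])               # expected: Apache 2.0
print(md.get_all("Classifier"))    # the classifiers listed in setup.py above
```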