From 7ff05faa30d42b4e32d6f43cf018baa4b30f0c19 Mon Sep 17 00:00:00 2001
From: xianyu
Date: Wed, 28 Aug 2024 13:14:08 +0000
Subject: [PATCH] [Misc] pypi

---
 Makefile                                      | 30 ++++--
 docs/Quickstart.md                            | 15 ++-
 examples/offline_inference.py                 | 98 +++++++++++++++++++
 llumnix/__init__.py                           | 26 +++++
 llumnix/backends/vllm/migration_backend.py    |  3 +-
 llumnix/version.py                            |  1 +
 requirements.txt                              |  2 -
 setup.py                                      | 27 ++++-
 .../{cupy_install.sh => cupy_cuda_install.sh} |  0
 9 files changed, 182 insertions(+), 20 deletions(-)
 create mode 100644 examples/offline_inference.py
 create mode 100644 llumnix/version.py
 rename tools/{cupy_install.sh => cupy_cuda_install.sh} (100%)

diff --git a/Makefile b/Makefile
index 58305148..5b2ff03d 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,7 @@ init:
 	@git submodule update --init --recursive
 
 .PHONY: install
-install: cupy
+install:
 	@pip install -e .
 
 .PHONY: lint
@@ -24,7 +24,7 @@ lint: check_pylint_installed
 	@pylint --rcfile=.pylintrc -s n ./llumnix ./tests
 
 .PHONY: test
-test:
+test: check_pytest_installed
 	@pytest -q -x --ignore=third_party/ --disable-warnings
 
 #################### pygloo install for gloo migration backend begin ####################
@@ -40,9 +40,9 @@ pygloo: init
 
 ###################################### cupy begin #######################################
 
-.PHONY: cupy
-cupy:
-	@./tools/cupy_install.sh
+.PHONY: cupy-cuda
+cupy-cuda:
+	@./tools/cupy_cuda_install.sh
 
 ####################################### cupy end ########################################
 
@@ -52,12 +52,26 @@ PYLINT_VERSION = 2.12.2
 
 .PHONY: check_pylint_installed
 check_pylint_installed:
-	@command -v pylint >/dev/null 2>&1 || { \
+	@python3 -m pip show pylint > /dev/null 2>&1 || { \
 		echo "pylint is not installed. Installing pylint $(PYLINT_VERSION)..."; \
 		python3 -m pip install pylint==$(PYLINT_VERSION); }
 
-	@python3 -c "import pylint_pytest" >/dev/null 2>&1 || { \
+###################################### pylint end #######################################
+
+##################################### pytest begin ######################################
+
+.PHONY: check_pytest_installed
+check_pytest_installed:
+	@python3 -m pip show pytest > /dev/null 2>&1 || { \
+		echo "pytest is not installed. Installing pytest ..."; \
+		python3 -m pip install pytest; }
+
+	@python3 -m pip show pytest-asyncio > /dev/null 2>&1 || { \
+		echo "pytest-asyncio is not installed. Installing pytest-asyncio ..."; \
+		python3 -m pip install pytest-asyncio; }
+
+	@python3 -m pip show pylint-pytest > /dev/null 2>&1 || { \
 		echo "pylint-pytest is not installed. Installing pylint-pytest ..."; \
 		python3 -m pip install pylint-pytest; }
 
-###################################### pylint end #######################################
+###################################### pytest end #######################################

diff --git a/docs/Quickstart.md b/docs/Quickstart.md
index 56609ffe..98690772 100644
--- a/docs/Quickstart.md
+++ b/docs/Quickstart.md
@@ -6,7 +6,14 @@ Llumnix requires python `3.8.1~3.10.0` and is currently built on top of vLLM (ve
 
 [vLLM Installation](https://docs.vllm.ai/en/v0.4.2/getting_started/installation.html)
 
-## Build from Source
+### Install from PyPI
+
+You can install Llumnix from PyPI:
+```
+pip install llumnix
+```
+
+### Build from Source
 
 You can build and install Llumnix from source:
 ```
@@ -15,11 +22,11 @@ cd llumnix
 make install
 ```
 
-If you want to use gloo as migration backend, please refer to [this link](https://github.com/ZeldaHuang/pygloo/blob/main/.github/workflows/ubuntu_basic.yml#L24C1-L26C1) to install [Bazel](https://github.com/bazelbuild/bazel) >= 5.1.0. Then, run `make pygloo` to install [pygloo](https://github.com/ZeldaHuang/pygloo).
+If you want to use NCCL as the migration backend, run `make cupy-cuda` to install [cupy-cuda](https://pypi.org/search/?q=cupy-cuda) manually, as the exact package to install depends on your CUDA version.
 
-Note: Using conda is not recommended, as it cannot properly handle pygloo's dependency on gcc libstdc++.so.6: version GLIBCXX_3.4.30.
+If you want to use Gloo as the migration backend, in addition to installing cupy-cuda, please refer to [this link](https://github.com/ZeldaHuang/pygloo/blob/main/.github/workflows/ubuntu_basic.yml#L24C1-L26C1) to install [Bazel](https://github.com/bazelbuild/bazel) >= 5.1.0. Then, run `make pygloo` to install [pygloo](https://github.com/ZeldaHuang/pygloo).
 
-We will provide official releases through pypi soon.
+Note: Using conda is not recommended, as it cannot properly handle pygloo's dependency on gcc libstdc++.so.6: version GLIBCXX_3.4.30.
 
 After installation, you can follow this guide to use Llumnix for multi-instance LLM serving quickly.

diff --git a/examples/offline_inference.py b/examples/offline_inference.py
new file mode 100644
index 00000000..53ddf25b
--- /dev/null
+++ b/examples/offline_inference.py
@@ -0,0 +1,98 @@
+from typing import List
+import os
+import uuid
+import asyncio
+
+import ray
+from ray.util.queue import Queue as RayQueue
+from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
+
+from llumnix import launch_ray_cluster, connect_to_ray_cluster, init_manager, init_llumlets
+from llumnix import (SamplingParams, ServerInfo, EngineManagerArgs, LLMEngineManager, Llumlet,
+                     EngineArgs, RequestOutput)
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Launch the ray cluster.
+os.environ['HEAD_NODE'] = '1'
+os.environ['HEAD_NODE_IP'] = '127.0.0.1'
+ray_cluster_port = 37000
+
+# Note: launch_ray_cluster will stop the current ray cluster first, then init a new one.
+launch_ray_cluster(ray_cluster_port=ray_cluster_port)
+connect_to_ray_cluster(port=ray_cluster_port)
+
+# Set manager args and engine args.
+manager_args = EngineManagerArgs()
+engine_args = EngineArgs(model="/mnt/cuikuilong.ckl/Qwen-7B", worker_use_ray=True,
+                         trust_remote_code=True, max_model_len=370)
+
+# Create llumlets.
+llumlet_ids: List[str] = None
+llumlets: List[Llumlet] = None
+llumlet_ids, llumlets = init_llumlets(manager_args, engine_args,
+                                      node_id=ray.get_runtime_context().get_node_id())
+
+
+# Create a manager. If the manager is created first and the llumlets are created afterwards,
+# manager.scale_up needs to be called to put the newly created llumlets under the manager's control.
+manager: LLMEngineManager = init_manager(manager_args)
+
+# The requests' outputs will be put into the request_output_queue no matter which instance they run on.
+server_id = str(uuid.uuid4().hex)
+request_output_queue = RayQueue(actor_options={
+    "scheduling_strategy": NodeAffinitySchedulingStrategy(
+        node_id=ray.get_runtime_context().get_node_id(),
+        soft=False)
+})
+server_info = ServerInfo(server_id, request_output_queue)
+
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+async def background_process_outputs(num_tasks):
+    finish_task = 0
+    while finish_task != num_tasks:
+        await asyncio.sleep(0.1)
+        qsize = await request_output_queue.actor.qsize.remote()
+        if qsize > 0:
+            request_outputs: List[RequestOutput] = await request_output_queue.actor.get_nowait_batch.remote(qsize)
+            for request_output in request_outputs:
+                if request_output.finished:
+                    finish_task += 1
+                    prompt = request_output.prompt
+                    generated_text = request_output.outputs[0].text
+                    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+async def main():
+    output_task = asyncio.create_task(background_process_outputs(len(prompts)))
+
+    for request in prompts:
+        request_id = str(uuid.uuid4().hex)
+        await manager.generate.remote(request_id=request_id,
+                                      server_info=server_info,
+                                      prompt=request,
+                                      sampling_params=sampling_params)
+
+    await output_task
+
+asyncio.run(main())
+
+# Kill all actors, as detached actors will not be killed by ray.shutdown.
+named_actors = ray.util.list_named_actors(True)
+for actor in named_actors:
+    try:
+        ray.kill(ray.get_actor(actor['name'], namespace=actor['namespace']))
+    except Exception:
+        continue
+
+# Shut down the ray cluster.
+ray.shutdown()

diff --git a/llumnix/__init__.py b/llumnix/__init__.py
index 4638bd9c..4ea77baf 100644
--- a/llumnix/__init__.py
+++ b/llumnix/__init__.py
@@ -10,3 +10,29 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import vllm
+from vllm import *
+
+from llumnix.server_info import ServerInfo
+from llumnix.entrypoints.llumnix_utils import (launch_ray_cluster, connect_to_ray_cluster,
+                                               init_manager, init_llumlets)
+from llumnix.arg_utils import EngineManagerArgs
+from llumnix.llm_engine_manager import LLMEngineManager
+from llumnix.llumlet.llumlet import Llumlet
+
+from .version import __version__
+
+__all__ = [
+    "__version__",
+    "ServerInfo",
+    "launch_ray_cluster",
+    "connect_to_ray_cluster",
+    "init_manager",
+    "init_llumlets",
+    "EngineManagerArgs",
+    "LLMEngineManager",
+    "Llumlet"
+]
+
+__all__.extend(getattr(vllm, "__all__", []))

diff --git a/llumnix/backends/vllm/migration_backend.py b/llumnix/backends/vllm/migration_backend.py
index a05c6cda..5fb00fed 100644
--- a/llumnix/backends/vllm/migration_backend.py
+++ b/llumnix/backends/vllm/migration_backend.py
@@ -13,7 +13,6 @@
 from typing import List
 
 import torch
-import cupy
 from func_timeout import func_set_timeout, FunctionTimedOut
 import ray
 
@@ -145,6 +144,8 @@ def __init__(self, migration_config: MigrationConfig, cache_engine: CacheEngine,
                  scheduling_strategy, is_driver_worker, gpu_cache) -> None:
         super().__init__()
 
+        import cupy
+
         self.migration_config = migration_config
         self.cache_engine = cache_engine
         self.backend = migration_config.migration_backend

diff --git a/llumnix/version.py b/llumnix/version.py
new file mode 100644
index 00000000..f102a9ca
--- /dev/null
+++ b/llumnix/version.py
@@ -0,0 +1 @@
+__version__ = "0.0.1"

diff --git a/requirements.txt b/requirements.txt
index 55f38b5d..e203bf6b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,5 @@ aiohttp
 scipy
 pandas
 matplotlib
-pytest-asyncio
-cupy-cuda12x # for nccl migration backend
 func_timeout
 numpy < 1.24.0 # for gloo migration backend's compatibility with numpy.float

diff --git a/setup.py b/setup.py
index d7d275dd..aabc8be7 100644
--- a/setup.py
+++ b/setup.py
@@ -26,13 +26,30 @@ def get_requirements() -> List[str]:
         requirements = f.read().strip().split("\n")
     return requirements
 
+def readme():
+    with open('README.md', encoding='utf-8') as f:
+        content = f.read()
+    return content
+
 setup(
     name='llumnix',
     version='0.0.1',
-    packages=find_packages(),
-    install_requires=get_requirements(),
-    author='Llumnix Team',
+    python_requires='>=3.8.1, <=3.10',
     description='Efficient and easy multi-instance LLM serving',
-    license="Apache 2.0",
+    long_description=readme(),
+    long_description_content_type="text/markdown",
+    author='Llumnix Team',
     url='https://github.com/AlibabaPAI/llumnix',
-)
\ No newline at end of file
+    license="Apache 2.0",
+    packages=find_packages(),
+    install_requires=get_requirements(),
+    platforms=["all"],
+    classifiers=[
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
+        "License :: OSI Approved :: Apache Software License",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+)

diff --git a/tools/cupy_install.sh b/tools/cupy_cuda_install.sh
similarity index 100%
rename from tools/cupy_install.sh
rename to tools/cupy_cuda_install.sh
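
For reviewers who want to sanity-check the new packaging end to end, here is a minimal smoke test. It is a sketch, not part of the patch; it assumes the wheel built from this commit has been installed (e.g. via `pip install llumnix`) and that vLLM v0.4.2 is importable, since `llumnix/__init__.py` does `from vllm import *`.

```python
# Minimal post-install smoke test for the packaging changes in this patch.
# Assumes `pip install llumnix` succeeded and vLLM v0.4.2 is available.
import llumnix

# llumnix/version.py pins the first PyPI release.
assert llumnix.__version__ == "0.0.1"

# The first nine entries of __all__ are Llumnix's own entry points
# (ServerInfo, launch_ray_cluster, init_manager, ...).
print(llumnix.__all__[:9])

# __all__ is then extended with vLLM's exports, so vLLM types such as
# SamplingParams are reachable through the llumnix namespace as well.
sampling_params = llumnix.SamplingParams(temperature=0.8, top_p=0.95)
print(type(sampling_params))
```

Because `__init__.py` mirrors both surfaces, scripts like the new `examples/offline_inference.py` can pull `SamplingParams`, `EngineArgs`, and the Llumnix entry points from a single `llumnix` import.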