diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..35514f9
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,15 @@
+venv*/
+Dockerfile
+.gitignore
+.env
+.git
+.github/
+.ruff_cache/
+.pre-commit-config.yaml
+docs/
+*.md
+LICENSE
+MANIFEST.in
+__pycache__/
+*.egg-info/
+*log
diff --git a/DEVELOPING.md b/DEVELOPING.md
index b230366..dbb3ed6 100644
--- a/DEVELOPING.md
+++ b/DEVELOPING.md
@@ -33,13 +33,30 @@ cd guidellm
 pip install -e .[dev]
 ```
 
-If you work with `deepsparse` backend, etc it has some other software limitations. In order to install dependencies for the specific backend, run:
+If you work with the `deepsparse` backend, it has additional software requirements. To install the dependencies for that specific backend, run:
 
 ```sh
 pip install -e .[deepsparse]
 # or pip install -e '.[deepsparse]'
 ```
 
+If you work with the `vllm` backend, it also has additional software requirements. To install the dependencies for that specific backend, run:
+
+```sh
+pip install -e .[vllm]
+# or pip install -e '.[vllm]'
+```
+
+According to the [installation guide](https://docs.vllm.ai/en/v0.4.0.post1/getting_started/installation.html), `vllm` is supported only on **Linux**, which means that running the application and its tests on other platforms will fail.
+
+A workaround is to use Docker:
+
+```sh
+cd guidellm/
+docker build -t guidellm:latest .
+docker run --rm guidellm:latest python -m pytest -s -v tests/unit/backend/test_vllm.py
+```
+
 ## Project Structure
 
 The project follows a standard Python project structure:
@@ -163,6 +180,19 @@ The end-to-end tests are located in the `tests/e2e` directory. To run the end-to-
 tox -e test-e2e
 ```
 
+### Running unsupported tests
+
+Some tests might not be supported on your system (_for instance, `vllm` is not supported on macOS yet_). To run them on Linux you can use technologies such as **WSL** on Windows, or **Docker** on Windows or macOS.
+
+To run them under Docker, run the commands below:
+
+```sh
+docker build --platform linux/amd64 --tag guidellm:latest .
+docker run --rm --env-file .env guidellm:latest pytest tests/
+```
+
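+### Manually exercising the vLLM backend
+
+For a quick manual smoke check outside of `pytest`, the snippet below drives the new backend directly. This is only a minimal sketch: it assumes the `vllm` extra is installed on a Linux machine and that the example model (taken from `VllmBackend.available_models()`) can be downloaded and loaded locally.
+
+```python
+import asyncio
+
+from guidellm.backend.vllm import VllmBackend
+from guidellm.core import TextGenerationRequest
+
+
+async def main() -> None:
+    # Loads the chosen model into vLLM; any model accepted by vLLM should work here.
+    backend = VllmBackend(model="mistralai/Mistral-7B-Instruct-v0.3")
+    request = TextGenerationRequest(prompt="Hello, world!", output_token_count=16)
+
+    # ``make_request`` yields token-by-token responses followed by a final summary.
+    async for response in backend.make_request(request=request):
+        if response.type_ == "token_iter" and response.add_token:
+            print(response.add_token, end="")
+
+
+asyncio.run(main())
+```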
+
+
 ## Formatting, Linting, and Type Checking
 
 ### Running Quality Checks (Linting)
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5db54bd
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,23 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+# Environment variables
+ENV PYTHONUNBUFFERED=1
+
+RUN : \
+    && apt-get update \
+    # dependencies for building Python packages && cleaning up unused files
+    && apt-get install -y \
+        build-essential \
+        libcurl4-openssl-dev \
+        libssl-dev \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && pip install --upgrade \
+        pip \
+        setuptools
+
+WORKDIR /app
+
+# Install project dependencies
+COPY ./ ./
+RUN pip install -e .[dev,deepsparse,vllm]
diff --git a/pyproject.toml b/pyproject.toml
index 942c1cd..407260a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,6 +74,9 @@ dev = [
 deepsparse = [
     "deepsparse; python_version < '3.12'",
 ]
+vllm = [
+    "vllm; sys_platform == 'linux'",
+]
 
 
 [project.entry-points.console_scripts]
@@ -108,7 +111,7 @@ exclude = ["venv", ".tox"]
 follow_imports = 'silent'
 
 [[tool.mypy.overrides]]
-module = ["deepsparse.*", "transformers.*"]
+module = ["deepsparse.*", "transformers.*", "vllm.*"]
 ignore_missing_imports=true
 
 
diff --git a/src/guidellm/backend/base.py b/src/guidellm/backend/base.py
index 010cdd2..8500369 100644
--- a/src/guidellm/backend/base.py
+++ b/src/guidellm/backend/base.py
@@ -15,7 +15,7 @@
 
 __all__ = ["Backend", "BackendEngine", "BackendEnginePublic", "GenerativeResponse"]
 
-BackendEnginePublic = Literal["openai_server", "deepsparse"]
+BackendEnginePublic = Literal["openai_server", "deepsparse", "vllm"]
 BackendEngine = Union[BackendEnginePublic, Literal["test"]]
 
 
diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py
index b5cbc12..8a12c92 100644
--- a/src/guidellm/backend/openai.py
+++ b/src/guidellm/backend/openai.py
@@ -10,7 +10,7 @@
 __all__ = ["OpenAIBackend"]
 
 
-@Backend.register("openai_server")
+@Backend.register(backend_type="openai_server")
 class OpenAIBackend(Backend):
     """
     An OpenAI backend implementation for generative AI results.
diff --git a/src/guidellm/backend/vllm/__init__.py b/src/guidellm/backend/vllm/__init__.py
new file mode 100644
index 0000000..b4f0504
--- /dev/null
+++ b/src/guidellm/backend/vllm/__init__.py
@@ -0,0 +1,26 @@
+"""
+This package encapsulates the "vLLM Backend" implementation.
+
+ref: https://github.com/vllm-project/vllm
+
+The `vllm` package supports Python 3.8 through Python 3.11,
+while `guidellm` itself starts from Python 3.8.
+
+The safe range of versions for the vLLM Backend implementation
+is therefore Python 3.8..Python 3.11.
+
+Finally, ensure that the `vllm` package is installed.
+"""
+
+from guidellm.utils import check_python_version, module_is_available
+
+check_python_version(min_version="3.8", max_version="3.12")
+
+module_is_available(
+    module="vllm",
+    helper=("`vllm` package is not available. Try: `pip install -e '.[vllm]'`"),
+)
+
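+# NOTE: this import intentionally comes after the guards above, since importing
+# ``.backend`` pulls in the ``vllm`` package itself; that is also why E402
+# (module level import not at top of file) is silenced on the import line.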
+from .backend import VllmBackend  # noqa: E402
+
+__all__ = ["VllmBackend"]
diff --git a/src/guidellm/backend/vllm/backend.py b/src/guidellm/backend/vllm/backend.py
new file mode 100644
index 0000000..a048db4
--- /dev/null
+++ b/src/guidellm/backend/vllm/backend.py
@@ -0,0 +1,122 @@
+from typing import Any, AsyncGenerator, Dict, List, Optional
+
+from loguru import logger
+from vllm import LLM, CompletionOutput, SamplingParams
+
+from guidellm.backend import Backend, GenerativeResponse
+from guidellm.config import settings
+from guidellm.core import TextGenerationRequest
+
+
+@Backend.register(backend_type="vllm")
+class VllmBackend(Backend):
+    """
+    A vLLM Backend implementation for generative AI results.
+    """
+
+    def __init__(self, model: Optional[str] = None, **request_args):
+        _model = self._get_model(model)
+        self._request_args: Dict[str, Any] = request_args
+        self.llm = LLM(_model)
+
+        # NOTE: Must be after all the parameters since ``self.llm`` is going to be used
+        # by ``make_request`` within ``Backend.test_connection()``
+        super().__init__(type_="vllm", model=_model, target="not used")
+
+        logger.info(f"vLLM Backend uses model '{_model}'")
+
+    def _get_model(self, model_from_cli: Optional[str] = None) -> str:
+        """Provides the model according to the following priority list:
+        1. from the function argument (comes from the CLI)
+        2. from the environment variable
+        3. `self.default_model` from `self.available_models`
+        """
+
+        if model_from_cli is not None:
+            return model_from_cli
+        elif settings.llm_model is not None:
+            logger.info(
+                "Using vLLM model from environment variable: " f"{settings.llm_model}"
+            )
+            return settings.llm_model
+        else:
+            logger.info(f"Using default vLLM model: {self.default_model}")
+            return self.default_model
+
+    async def make_request(
+        self, request: TextGenerationRequest
+    ) -> AsyncGenerator[GenerativeResponse, None]:
+        """
+        Make a request to the vLLM Python API client.
+
+        :param request: The result request to submit.
+        :type request: TextGenerationRequest
+        :return: An iterator over the generative responses.
+        :rtype: Iterator[GenerativeResponse]
+        """
+
+        logger.debug(f"Making request to vLLM backend with prompt: {request.prompt}")
+
+        token_count = 0
+        request_args = {
+            **self._request_args,
+            "inputs": [request.prompt],
+            "sampling_params": SamplingParams(max_tokens=request.output_token_count),
+        }
+
+        final_response = GenerativeResponse(
+            type_="final",
+            prompt=request.prompt,
+            prompt_token_count=request.prompt_token_count,
+            output_token_count=token_count,
+        )
+
+        if not (result := self.llm.generate(**request_args)):
+            yield final_response
+            return
+
+        try:
+            generations: List[CompletionOutput] = result[0].outputs
+        except IndexError:
+            yield final_response
+            return
+
+        for generation in generations:
+            if not (token := generation.text):
+                break
+            else:
+                token_count += 1
+                yield GenerativeResponse(
+                    type_="token_iter",
+                    add_token=token,
+                    prompt=request.prompt,
+                    prompt_token_count=request.prompt_token_count,
+                    output_token_count=token_count,
+                )
+
+        yield GenerativeResponse(
+            type_="final",
+            prompt=request.prompt,
+            prompt_token_count=request.prompt_token_count,
+            output_token_count=token_count,
+        )
+
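+    # NOTE: the list below is a small, hand-picked set of example models rather than
+    # an exhaustive catalogue; see the supported-models page referenced in the
+    # docstring for the full set accepted by vLLM.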
+    def available_models(self) -> List[str]:
+        """
+        Get the available models for the backend.
+
+        ref: https://docs.vllm.ai/en/v0.4.1/models/supported_models.html
+
+        :return: A list of available models.
+        :rtype: List[str]
+        """
+
+        return [
+            "mistralai/Mistral-7B-Instruct-v0.3",
+            "meta-llama/Meta-Llama-3-8B-Instruct",
+        ]
+
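+    # NOTE: a rough, whitespace-based approximation of the token count, not a
+    # tokenizer-accurate value.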
- """ - for _ in range(n): - yield Model( - id=str(uuid.uuid4()), - created=int(time.time()), - object="model", - owned_by="neuralmagic", - ) diff --git a/tests/dummy/vllm.py b/tests/dummy/vllm.py new file mode 100644 index 0000000..2210b80 --- /dev/null +++ b/tests/dummy/vllm.py @@ -0,0 +1,81 @@ +""" +This module includes data models factories for the `vllm` 3-rd party package +""" + +import random +from typing import Generator, List, Optional + +from pydantic import BaseModel, ConfigDict + +from guidellm.utils import random_strings + +__all__ = ["TestLLM", "CompletionOutput"] + + +class CompletionOutput(BaseModel): + """Test interface of `vllm.CompletionOutput`.""" + + text: str + + +class SamplingParams(BaseModel): + """Test interface of `vllm.SamplingParams`.""" + + max_tokens: Optional[int] = 16 + + +class CompletionOutputs(BaseModel): + outputs: List[CompletionOutput] + + +class TestLLM(BaseModel): + """Test interface of `vllm.LLM`. + + Args: + _outputs_number(int | None): the number of generated tokens per output. + Should be used only for testing purposes. + Default: randint(10..20) + _generations: dynamic representation of generated responses + from deepsparse interface. + + """ + + model_config = ConfigDict( + extra="allow", + validate_assignment=True, + arbitrary_types_allowed=True, + from_attributes=True, + ) + + model: str + max_num_batched_tokens: int + + def _generate_completion_outputs( + self, max_tokens: Optional[int] + ) -> Generator[CompletionOutput, None, None]: + + # NOTE: This value is used only for testing purposes + self._expected_outputs: List[CompletionOutput] = [] + + for text in random_strings( + min_chars=5, + max_chars=random.randint(10, 20), + n=max_tokens or random.randint(10, 20), + ): + instance = CompletionOutput(text=text) + self._expected_outputs.append(instance) + + yield instance + + def generate( + self, inputs: List[str], sampling_params: SamplingParams + ) -> List[CompletionOutputs]: + return [ + CompletionOutputs( + outputs=list( + self._generate_completion_outputs( + max_tokens=sampling_params.max_tokens + ) + ) + ) + ] diff --git a/tests/unit/backend/test_deepsparse_backend.py b/tests/unit/backend/test_deepsparse.py similarity index 85% rename from tests/unit/backend/test_deepsparse_backend.py rename to tests/unit/backend/test_deepsparse.py index 58e5761..244db47 100644 --- a/tests/unit/backend/test_deepsparse_backend.py +++ b/tests/unit/backend/test_deepsparse.py @@ -1,5 +1,12 @@ +""" +This module includes unit tests for the Deepsparse backend. + +Notes: tests from this module are going to be skipped in case + the Python version is >= 3.12 according to the deepsparse limitation. +""" + import sys -from typing import Any, Dict, Generator, List, Optional +from typing import Any, Generator, List, Optional import pytest from pydantic import BaseModel @@ -38,7 +45,7 @@ class TestTextGenerationPipeline: Method `__call__` allows to mock the result object that comes from `deepsparse.pipeline.Pipeline()` so everything is encapsulated right here. - :param self._generation: dynamic representation of generated responses + :param self._generations: dynamic representation of generated responses from deepsparse interface. """ @@ -89,7 +96,7 @@ def mock_deepsparse_pipeline(mocker): {"model": "test/custom_llm"}, ], ) -def test_backend_creation(create_payload: Dict, backend_class): +def test_backend_creation(create_payload, backend_class): """Test the "Deepspaarse Backend" class with defaults and custom input parameters. 
""" @@ -132,9 +139,7 @@ def test_backend_model_from_env(mocker, backend_class): ], ) @pytest.mark.asyncio() -async def test_make_request( - text_generation_request_create_payload: Dict, backend_class -): +async def test_make_request(text_generation_request_create_payload, backend_class): backend = backend_class() output_tokens: List[str] = [] @@ -153,23 +158,21 @@ async def test_make_request( @pytest.mark.smoke() @pytest.mark.parametrize( - ("text_generation_request_create_payload", "error"), + ("text_generation_request", "error"), [ ( - {"prompt": "Test prompt", "output_token_count": -1}, + TextGenerationRequest(prompt="Test prompt", output_token_count=-1), ValueError, ), ], ) @pytest.mark.asyncio() async def test_make_request_invalid_request_payload( - text_generation_request_create_payload: Dict, error, backend_class + text_generation_request, error, backend_class ): backend = backend_class() with pytest.raises(error): [ respnose - async for respnose in backend.make_request( - request=TextGenerationRequest(**text_generation_request_create_payload) - ) + async for respnose in backend.make_request(request=text_generation_request) ] diff --git a/tests/unit/backend/test_openai_backend.py b/tests/unit/backend/test_openai.py similarity index 100% rename from tests/unit/backend/test_openai_backend.py rename to tests/unit/backend/test_openai.py diff --git a/tests/unit/backend/test_vllm.py b/tests/unit/backend/test_vllm.py new file mode 100644 index 0000000..16e83b0 --- /dev/null +++ b/tests/unit/backend/test_vllm.py @@ -0,0 +1,157 @@ +""" +This module includes unit tests for the vLLM backend. + +Notes: tests from this module are going to be skipped in case + the rimtime platform is not a Linux / WSL according to vllm documentation. +""" + +import sys +from typing import Callable, List, Optional + +import pytest + +from guidellm.backend import Backend +from guidellm.config import reload_settings, settings +from guidellm.core import TextGenerationRequest +from tests import dummy + +pytestmark = pytest.mark.skipif( + sys.platform != "linux", + reason="Unsupported Platform. Try using Linux or WSL instead.", +) + + +@pytest.fixture(scope="session") +def backend_class(): + from guidellm.backend.vllm import VllmBackend + + return VllmBackend + + +@pytest.fixture() +def vllm_patch_factory(mocker) -> Callable[[str], dummy.vllm.TestLLM]: + """ + Skip VLLM initializer due to external calls. + Replace VllmBackend.llm object with mock representation. + + This vllm patch is injected into each test automatically. If you need + to override the Mock object - use this fixture. + """ + + def inner(model: Optional[str] = None, max_tokens: Optional[int] = None): + + return mocker.patch( + "vllm.LLM.__new__", + return_value=dummy.vllm.TestLLM( + model=model or settings.llm_model, + max_num_batched_tokens=max_tokens or 4096, + ), + ) + + return inner + + +@pytest.fixture(autouse=True) +def vllm_auto_patch(vllm_patch_factory): + """ + Automatically patch the ``vllm.LLM`` with defaults. + """ + + return vllm_patch_factory() + + +@pytest.mark.smoke() +@pytest.mark.parametrize( + "create_payload", + [ + {}, + {"model": "test/custom_llm"}, + ], +) +def test_backend_creation(create_payload, backend_class, vllm_patch_factory): + """Test the "Deepspaarse Backend" class + with defaults and custom input parameters. 
+ """ + + vllm_patch_factory(model=create_payload.get("model")) + + backends = [ + Backend.create("vllm", **create_payload), + backend_class(**create_payload), + ] + + for backend in backends: + assert backend.llm + ( + backend.model == custom_model + if (custom_model := create_payload.get("model")) + else backend.default_model + ) + + +@pytest.mark.smoke() +def test_backend_model_from_env(mocker, backend_class): + mocker.patch.dict( + "os.environ", + {"GUIDELLM__LLM_MODEL": "test_backend_model_from_env"}, + ) + + reload_settings() + + backends = [Backend.create("vllm"), backend_class()] + + for backend in backends: + assert backend.model == "test_backend_model_from_env" + + +@pytest.mark.smoke() +@pytest.mark.parametrize( + "text_generation_request_create_payload", + [ + {"prompt": "Test prompt"}, + {"prompt": "Test prompt", "output_token_count": 20}, + ], +) +@pytest.mark.asyncio() +async def test_make_request(text_generation_request_create_payload, backend_class): + backend = backend_class() + + output_tokens: List[str] = [] + async for response in backend.make_request( + request=TextGenerationRequest(**text_generation_request_create_payload) + ): + if response.add_token: + output_tokens.append(response.add_token) + + assert "".join(output_tokens) == "".join( + generation.text for generation in backend.llm._expected_outputs + ) + + if max_tokens := text_generation_request_create_payload.get("output_token_count"): + assert len(backend.llm._expected_outputs) == max_tokens + + +@pytest.mark.smoke() +@pytest.mark.parametrize( + ("text_generation_request", "error"), + [ + ( + TextGenerationRequest(prompt="Test prompt", output_token_count=-1), + ValueError, + ), + ( + TextGenerationRequest(prompt="Test prompt", output_token_count=0), + ValueError, + ), + ], +) +@pytest.mark.asyncio() +async def test_make_request_invalid_request_payload( + text_generation_request, error, backend_class +): + backend = backend_class() + with pytest.raises(error): + [ + respnose + async for respnose in backend.make_request(request=text_generation_request) + ] diff --git a/tox.ini b/tox.ini index 40611c5..76b04cc 100644 --- a/tox.ini +++ b/tox.ini @@ -6,7 +6,7 @@ env_list = py38,py39,py310,py311,py312 [testenv] description = Run all tests deps = - .[dev,deepsparse] + .[dev,deepsparse,vllm] commands = pytest tests/ {posargs} @@ -14,7 +14,7 @@ commands = [testenv:test-unit] description = Run unit tests deps = - .[dev,deepsparse] + .[dev,deepsparse,vllm] commands = python -m pytest tests/unit {posargs} @@ -22,7 +22,7 @@ commands = [testenv:test-integration] description = Run integration tests deps = - .[dev,deepsparse] + .[dev,deepsparse,vllm] commands = python -m pytest tests/integration {posargs} @@ -30,7 +30,7 @@ commands = [testenv:test-e2e] description = Run end-to-end tests deps = - .[dev,deepsparse] + .[dev,deepsparse,vllm] commands = python -m pytest tests/e2e {posargs}