From 5e93c1f1c4dcce4b3cf208679e52b18f582ed8bb Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Thu, 29 Aug 2024 16:40:48 +0300 Subject: [PATCH 01/19] =?UTF-8?q?=F0=9F=9A=9A=20Better=20naming=20is=20pro?= =?UTF-8?q?vided?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * backend/test_openai_backend.py -> backend/test_openai.py * backend/test_deepsparse_backend.py -> backend/test_deepsparse.py --- .../backend/{test_deepsparse_backend.py => test_deepsparse.py} | 0 tests/unit/backend/{test_openai_backend.py => test_openai.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/unit/backend/{test_deepsparse_backend.py => test_deepsparse.py} (100%) rename tests/unit/backend/{test_openai_backend.py => test_openai.py} (100%) diff --git a/tests/unit/backend/test_deepsparse_backend.py b/tests/unit/backend/test_deepsparse.py similarity index 100% rename from tests/unit/backend/test_deepsparse_backend.py rename to tests/unit/backend/test_deepsparse.py diff --git a/tests/unit/backend/test_openai_backend.py b/tests/unit/backend/test_openai.py similarity index 100% rename from tests/unit/backend/test_openai_backend.py rename to tests/unit/backend/test_openai.py From cea679e9ec6b2439566c0bdbb775d281a7ed81cc Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Fri, 30 Aug 2024 13:22:38 +0300 Subject: [PATCH 02/19] =?UTF-8?q?=E2=9C=A8=20vllm=20backend=20integration?= =?UTF-8?q?=20is=20added=20not=20tested?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 5 +- src/guidellm/backend/__init__.py | 16 ++++ src/guidellm/backend/base.py | 2 +- src/guidellm/backend/vllm/__init__.py | 21 +++++ src/guidellm/backend/vllm/backend.py | 122 ++++++++++++++++++++++++++ tests/unit/backend/test_vllm.py | 49 +++++++++++ 6 files changed, 213 insertions(+), 2 deletions(-) create mode 100644 src/guidellm/backend/vllm/__init__.py create mode 100644 src/guidellm/backend/vllm/backend.py create mode 100644 tests/unit/backend/test_vllm.py diff --git a/pyproject.toml b/pyproject.toml index 942c1cd..62bb64b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,9 @@ dev = [ deepsparse = [ "deepsparse; python_version < '3.12'", ] +vllm = [ + "vllm", +] [project.entry-points.console_scripts] @@ -108,7 +111,7 @@ exclude = ["venv", ".tox"] follow_imports = 'silent' [[tool.mypy.overrides]] -module = ["deepsparse.*", "transformers.*"] +module = ["deepsparse.*", "transformers.*", "vllm.*"] ignore_missing_imports=true diff --git a/src/guidellm/backend/__init__.py b/src/guidellm/backend/__init__.py index b6d1b9d..4498498 100644 --- a/src/guidellm/backend/__init__.py +++ b/src/guidellm/backend/__init__.py @@ -1,3 +1,19 @@ from .base import Backend, BackendEngine, BackendEnginePublic, GenerativeResponse +<<<<<<< HEAD __all__ = ["Backend", "BackendEngine", "BackendEnginePublic", "GenerativeResponse"] +======= +from .deepsparse.backend import DeepsparseBackend +from .openai import OpenAIBackend +from .vllm.backend import VllmBackend + +__all__ = [ + "Backend", + "BackendEngine", + "BackendEnginePublic", + "GenerativeResponse", + "OpenAIBackend", + "DeepsparseBackend", + "VllmBackend", +] +>>>>>>> 8a8e2ff (✨ vllm backend integration is added) diff --git a/src/guidellm/backend/base.py b/src/guidellm/backend/base.py index 010cdd2..8500369 100644 --- a/src/guidellm/backend/base.py +++ b/src/guidellm/backend/base.py @@ -15,7 +15,7 @@ __all__ = ["Backend", "BackendEngine", "BackendEnginePublic", "GenerativeResponse"] 
-BackendEnginePublic = Literal["openai_server", "deepsparse"] +BackendEnginePublic = Literal["openai_server", "deepsparse", "vllm"] BackendEngine = Union[BackendEnginePublic, Literal["test"]] diff --git a/src/guidellm/backend/vllm/__init__.py b/src/guidellm/backend/vllm/__init__.py new file mode 100644 index 0000000..e9da0f4 --- /dev/null +++ b/src/guidellm/backend/vllm/__init__.py @@ -0,0 +1,21 @@ +""" +This package encapsulates the "vLLM Backend" implementation. +ref: https://github.com/vllm-project/vllm + +The `vllm` package supports Python3.8..Python3.11, +when the `guidellm` start from Python3.8. + +Safe range of versions is Python3.8..Python3.11 +for the vLLM Backend implementation. +""" + +from guidellm.utils import check_python_version, module_is_available + +# Ensure that python is in valid range +check_python_version(min_version="3.8", max_version="3.11") + +# Ensure that vllm is installed +module_is_available( + module="vllm", + helper=("`vllm` package is not available. Try run: `pip install -e '.[vllm]'`"), +) diff --git a/src/guidellm/backend/vllm/backend.py b/src/guidellm/backend/vllm/backend.py new file mode 100644 index 0000000..db5180b --- /dev/null +++ b/src/guidellm/backend/vllm/backend.py @@ -0,0 +1,122 @@ +from typing import Any, AsyncGenerator, Dict, List, Optional + +from loguru import logger +from vllm import LLM, CompletionOutput, SamplingParams + +from guidellm.backend import Backend, GenerativeResponse +from guidellm.config import settings +from guidellm.core import TextGenerationRequest + + +@Backend.register(backend_type="deepsparse") +class VllmBackend(Backend): + """ + An vLLM Backend implementation for the generative AI result. + """ + + def __init__(self, model: Optional[str] = None, **request_args): + super().__init__( + type_="vllm", + model=self._get_model(model), + target="not used", + ) + + self._request_args: Dict[str, Any] = request_args + self.llm = LLM(self._model) + + logger.info(f"vLLM Backend uses model '{self._model}'") + + def _get_model(self, model_from_cli: Optional[str] = None) -> str: + """Provides the model by the next priority list: + 1. from function argument (comes from CLI) + 1. from environment variable + 2. `self.default_model` from `self.available_models` + """ + + if model_from_cli is not None: + return model_from_cli + elif settings.llm_model is not None: + logger.info( + "Using vLLM model from environment variable: " f"{settings.llm_model}" + ) + return settings.llm_model + else: + logger.info(f"Using default vLLM model: {self.default_model}") + return self.default_model + + async def make_request( + self, request: TextGenerationRequest + ) -> AsyncGenerator[GenerativeResponse, None]: + """ + Make a request to the vLLM Python API client. + + :param request: The result request to submit. + :type request: TextGenerationRequest + :return: An iterator over the generative responses. 
+ :rtype: Iterator[GenerativeResponse] + """ + + logger.debug(f"Making request to vLLM backend with prompt: {request.prompt}") + + token_count = 0 + request_args = { + **self._request_args, + "inputs": [request.prompt], + "sampling_params": SamplingParams(max_tokens=request.output_token_count), + } + + final_response = GenerativeResponse( + type_="final", + prompt=request.prompt, + prompt_token_count=request.prompt_token_count, + output_token_count=token_count, + ) + + if not (result := self.llm.generate(**request_args)): + yield final_response + return + + try: + generations: List[CompletionOutput] = result[0].outputs + except IndexError: + yield final_response + return + + for generation in generations: + if not (token := generation.text): + yield GenerativeResponse( + type_="final", + prompt=request.prompt, + prompt_token_count=request.prompt_token_count, + output_token_count=token_count, + ) + break + else: + token_count += 1 + yield GenerativeResponse( + type_="token_iter", + add_token=token, + prompt=request.prompt, + prompt_token_count=request.prompt_token_count, + output_token_count=token_count, + ) + + def available_models(self) -> List[str]: + """ + Get the available models for the backend. + + ref: https://docs.vllm.ai/en/v0.4.1/models/supported_models.html + + :return: A list of available models. + :rtype: List[str] + """ + + return [ + "mistralai/Mistral-7B-Instruct-v0.3", + "meta-llama/Meta-Llama-3-8B-Instruct", + ] + + def _token_count(self, text: str) -> int: + token_count = len(text.split()) + logger.debug(f"Token count for text '{text}': {token_count}") + return token_count diff --git a/tests/unit/backend/test_vllm.py b/tests/unit/backend/test_vllm.py new file mode 100644 index 0000000..13be052 --- /dev/null +++ b/tests/unit/backend/test_vllm.py @@ -0,0 +1,49 @@ +from typing import Dict, List, cast + +import pytest +from vllm import LLM + +from guidellm.backend import Backend, VllmBackend + + +@pytest.fixture(autouse=True) +def mock_vllm_llm(mocker): + llm = LLM( + model="facebook/opt-125m", + max_num_batched_tokens=4096, + tensor_parallel_size=1, + gpu_memory_utilization=0.10, + enforce_eager=True, + ) + + return mocker.patch("vllm.LLM", return_value=llm) + + +@pytest.mark.smoke() +@pytest.mark.parametrize( + "create_payload", + [ + {}, + {"model": "test/custom_llm"}, + ], +) +def test_backend_creation(create_payload: Dict): + """Test the "Deepspaarse Backend" class + with defaults and custom input parameters. 
+ """ + + backends: List[VllmBackend] = cast( + List[VllmBackend], + [ + Backend.create("vllm", **create_payload), + VllmBackend(**create_payload), + ], + ) + + for backend in backends: + assert backend.llm + ( + backend.model == custom_model + if (custom_model := create_payload.get("model")) + else backend.default_model + ) From 440d4beb1dc372530dba0b35367877d905b12335 Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Fri, 30 Aug 2024 13:52:08 +0300 Subject: [PATCH 03/19] =?UTF-8?q?=E2=9C=85=20vllm=20tests=20are=20skipped?= =?UTF-8?q?=20if=20platform=20is=20not=20Linux?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/guidellm/backend/__init__.py | 16 ---------- src/guidellm/backend/openai.py | 2 +- src/guidellm/backend/vllm/__init__.py | 9 ++++-- src/guidellm/backend/vllm/backend.py | 2 +- tests/unit/backend/test_deepsparse.py | 7 +++++ tests/unit/backend/test_vllm.py | 42 +++++++++++++++++++-------- 6 files changed, 46 insertions(+), 32 deletions(-) diff --git a/src/guidellm/backend/__init__.py b/src/guidellm/backend/__init__.py index 4498498..b6d1b9d 100644 --- a/src/guidellm/backend/__init__.py +++ b/src/guidellm/backend/__init__.py @@ -1,19 +1,3 @@ from .base import Backend, BackendEngine, BackendEnginePublic, GenerativeResponse -<<<<<<< HEAD __all__ = ["Backend", "BackendEngine", "BackendEnginePublic", "GenerativeResponse"] -======= -from .deepsparse.backend import DeepsparseBackend -from .openai import OpenAIBackend -from .vllm.backend import VllmBackend - -__all__ = [ - "Backend", - "BackendEngine", - "BackendEnginePublic", - "GenerativeResponse", - "OpenAIBackend", - "DeepsparseBackend", - "VllmBackend", -] ->>>>>>> 8a8e2ff (✨ vllm backend integration is added) diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index b5cbc12..8a12c92 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -10,7 +10,7 @@ __all__ = ["OpenAIBackend"] -@Backend.register("openai_server") +@Backend.register(backend_type="openai_server") class OpenAIBackend(Backend): """ An OpenAI backend implementation for generative AI results. diff --git a/src/guidellm/backend/vllm/__init__.py b/src/guidellm/backend/vllm/__init__.py index e9da0f4..d3b06b4 100644 --- a/src/guidellm/backend/vllm/__init__.py +++ b/src/guidellm/backend/vllm/__init__.py @@ -1,5 +1,6 @@ """ This package encapsulates the "vLLM Backend" implementation. + ref: https://github.com/vllm-project/vllm The `vllm` package supports Python3.8..Python3.11, @@ -7,15 +8,19 @@ Safe range of versions is Python3.8..Python3.11 for the vLLM Backend implementation. + +In the end ensure that the `vllm` package is installed. """ from guidellm.utils import check_python_version, module_is_available -# Ensure that python is in valid range check_python_version(min_version="3.8", max_version="3.11") -# Ensure that vllm is installed module_is_available( module="vllm", helper=("`vllm` package is not available. 
Try run: `pip install -e '.[vllm]'`"), ) + +from .backend import VllmBackend # noqa: E402 + +__all__ = ["VllmBackend"] diff --git a/src/guidellm/backend/vllm/backend.py b/src/guidellm/backend/vllm/backend.py index db5180b..fd99b28 100644 --- a/src/guidellm/backend/vllm/backend.py +++ b/src/guidellm/backend/vllm/backend.py @@ -8,7 +8,7 @@ from guidellm.core import TextGenerationRequest -@Backend.register(backend_type="deepsparse") +@Backend.register(backend_type="vllm") class VllmBackend(Backend): """ An vLLM Backend implementation for the generative AI result. diff --git a/tests/unit/backend/test_deepsparse.py b/tests/unit/backend/test_deepsparse.py index 58e5761..cac49ca 100644 --- a/tests/unit/backend/test_deepsparse.py +++ b/tests/unit/backend/test_deepsparse.py @@ -1,3 +1,10 @@ +""" +This module includes unit tests for the Deepsparse backend. + +Notes: tests from this module are going to be skipped in case + the Python version is >= 3.12 according to the deepsparse limitation. +""" + import sys from typing import Any, Dict, Generator, List, Optional diff --git a/tests/unit/backend/test_vllm.py b/tests/unit/backend/test_vllm.py index 13be052..de432f6 100644 --- a/tests/unit/backend/test_vllm.py +++ b/tests/unit/backend/test_vllm.py @@ -1,14 +1,35 @@ -from typing import Dict, List, cast +""" +This module includes unit tests for the vLLM backend. + +Notes: tests from this module are going to be skipped in case + the rimtime platform is not a Linux / WSL according to vllm documentation. +""" + +import importlib +import sys +from typing import Dict import pytest -from vllm import LLM -from guidellm.backend import Backend, VllmBackend +from guidellm.backend import Backend + +pytestmark = pytest.mark.skipif( + sys.platform != "linux", + reason="Unsupported Platform. Try using Linux or WSL instead.", +) + + +@pytest.fixture(scope="module") +def backend_class(): + from guidellm.backend.vllm import VllmBackend + + return VllmBackend @pytest.fixture(autouse=True) def mock_vllm_llm(mocker): - llm = LLM( + module = importlib.import_module("vllm") + llm = module.LLM( model="facebook/opt-125m", max_num_batched_tokens=4096, tensor_parallel_size=1, @@ -27,18 +48,15 @@ def mock_vllm_llm(mocker): {"model": "test/custom_llm"}, ], ) -def test_backend_creation(create_payload: Dict): +def test_backend_creation(create_payload: Dict, backend_class): """Test the "Deepspaarse Backend" class with defaults and custom input parameters. 
""" - backends: List[VllmBackend] = cast( - List[VllmBackend], - [ - Backend.create("vllm", **create_payload), - VllmBackend(**create_payload), - ], - ) + backends = [ + Backend.create("vllm", **create_payload), + backend_class(**create_payload), + ] for backend in backends: assert backend.llm From 1a715a676b51f62d9d3474389bbdd6004e1d11c1 Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Fri, 30 Aug 2024 13:55:34 +0300 Subject: [PATCH 04/19] =?UTF-8?q?=F0=9F=93=8C=20vLLM=20python=20version=20?= =?UTF-8?q?is=20increased=20to=203.12?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/guidellm/backend/vllm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/backend/vllm/__init__.py b/src/guidellm/backend/vllm/__init__.py index d3b06b4..b4f0504 100644 --- a/src/guidellm/backend/vllm/__init__.py +++ b/src/guidellm/backend/vllm/__init__.py @@ -14,7 +14,7 @@ from guidellm.utils import check_python_version, module_is_available -check_python_version(min_version="3.8", max_version="3.11") +check_python_version(min_version="3.8", max_version="3.12") module_is_available( module="vllm", From 14f4c42d20a73e8619ccb69e96637be86ae9afbf Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Fri, 30 Aug 2024 14:02:39 +0300 Subject: [PATCH 05/19] =?UTF-8?q?=F0=9F=93=8C=20`vllm`=20dependency=20is?= =?UTF-8?q?=20skipped=20if=20platform=20IS=20NOT=20Linux?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- tox.ini | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 62bb64b..407260a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,7 @@ deepsparse = [ "deepsparse; python_version < '3.12'", ] vllm = [ - "vllm", + "vllm; sys_platform == 'linux'", ] diff --git a/tox.ini b/tox.ini index 40611c5..76b04cc 100644 --- a/tox.ini +++ b/tox.ini @@ -6,7 +6,7 @@ env_list = py38,py39,py310,py311,py312 [testenv] description = Run all tests deps = - .[dev,deepsparse] + .[dev,deepsparse,vllm] commands = pytest tests/ {posargs} @@ -14,7 +14,7 @@ commands = [testenv:test-unit] description = Run unit tests deps = - .[dev,deepsparse] + .[dev,deepsparse,vllm] commands = python -m pytest tests/unit {posargs} @@ -22,7 +22,7 @@ commands = [testenv:test-integration] description = Run integration tests deps = - .[dev,deepsparse] + .[dev,deepsparse,vllm] commands = python -m pytest tests/integration {posargs} @@ -30,7 +30,7 @@ commands = [testenv:test-e2e] description = Run end-to-end tests deps = - .[dev,deepsparse] + .[dev,deepsparse,vllm] commands = python -m pytest tests/e2e {posargs} From 6e3087027217bf66e405fbc98a8b819eb2fa46b1 Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Mon, 2 Sep 2024 08:58:33 +0300 Subject: [PATCH 06/19] =?UTF-8?q?=F0=9F=9A=A7=20WIP?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 20 +++++++++ tests/unit/backend/test_vllm.py | 78 ++++++++++++++++++++++++++++++--- 2 files changed, 92 insertions(+), 6 deletions(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..61aaac4 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,20 @@ +FROM --platform=linux/amd64 python:3.8-slim + +# Environment variables +ENV PYTHONUNBUFFERED=1 + +RUN apt-get update \ + # dependencies for building Python packages && cleaning up unused files + && apt-get install -y build-essential \ + libcurl4-openssl-dev 
libssl-dev \ + && rm -rf /var/lib/apt/lists/* + + +# Python dependencies +RUN pip install --upgrade pip setuptools + +WORKDIR /app/ + +COPY ./ ./ + +RUN pip install -e '.[dev,deepsparse,vllm]' diff --git a/tests/unit/backend/test_vllm.py b/tests/unit/backend/test_vllm.py index de432f6..148c37c 100644 --- a/tests/unit/backend/test_vllm.py +++ b/tests/unit/backend/test_vllm.py @@ -7,16 +7,18 @@ import importlib import sys -from typing import Dict +from typing import Dict, List import pytest from guidellm.backend import Backend +from guidellm.config import reload_settings +from guidellm.core import TextGenerationRequest -pytestmark = pytest.mark.skipif( - sys.platform != "linux", - reason="Unsupported Platform. Try using Linux or WSL instead.", -) +# pytestmark = pytest.mark.skipif( +# sys.platform != "linux", +# reason="Unsupported Platform. Try using Linux or WSL instead.", +# ) @pytest.fixture(scope="module") @@ -29,7 +31,7 @@ def backend_class(): @pytest.fixture(autouse=True) def mock_vllm_llm(mocker): module = importlib.import_module("vllm") - llm = module.LLM( + llm = getattr(module, "LLM")( model="facebook/opt-125m", max_num_batched_tokens=4096, tensor_parallel_size=1, @@ -65,3 +67,67 @@ def test_backend_creation(create_payload: Dict, backend_class): if (custom_model := create_payload.get("model")) else backend.default_model ) + + +@pytest.mark.smoke() +def test_backend_model_from_env(mocker, backend_class): + mocker.patch.dict( + "os.environ", + {"GUIDELLM__LLM_MODEL": "test_backend_model_from_env"}, + ) + + reload_settings() + + backends = [Backend.create("vllm"), backend_class()] + + for backend in backends: + assert backend.model == "test_backend_model_from_env" + + +@pytest.mark.smoke() +@pytest.mark.parametrize( + "text_generation_request_create_payload", + [ + {"prompt": "Test prompt"}, + {"prompt": "Test prompt", "output_token_count": 20}, + ], +) +@pytest.mark.asyncio() +async def test_make_request( + text_generation_request_create_payload: Dict, backend_class +): + backend = backend_class() + + output_tokens: List[str] = [] + async for response in backend.make_request( + request=TextGenerationRequest(**text_generation_request_create_payload) + ): + if response.add_token: + output_tokens.append(response.add_token) + assert "".join(output_tokens) == "".join( + generation.text for generation in backend.pipeline._generations + ) + + if max_tokens := text_generation_request_create_payload.get("output_token_count"): + assert len(backend.pipeline._generations) == max_tokens + + +@pytest.mark.smoke() +@pytest.mark.parametrize( + ("text_generation_request_create_payload", "error"), + [ + ({"prompt": "Test prompt"}, ValueError), + ], +) +@pytest.mark.asyncio() +async def test_make_request_invalid_request_payload( + text_generation_request_create_payload: Dict, error, backend_class +): + backend = backend_class() + with pytest.raises(error): + [ + respnose + async for respnose in backend.make_request( + request=TextGenerationRequest(**text_generation_request_create_payload) + ) + ] From b0c0acb733ab98516c101a23c8434324da53516a Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Mon, 2 Sep 2024 23:25:24 +0300 Subject: [PATCH 07/19] =?UTF-8?q?=F0=9F=9A=A7=20WIP?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/guidellm/backend/vllm/__init__.py | 8 ++-- tests/dummy/__init__.py | 2 +- tests/dummy/data/__init__.py | 3 +- tests/dummy/vllm.py | 65 +++++++++++++++++++++++++++ tests/unit/backend/test_vllm.py | 9 +--- 5 files changed, 74 
insertions(+), 13 deletions(-) create mode 100644 tests/dummy/vllm.py diff --git a/src/guidellm/backend/vllm/__init__.py b/src/guidellm/backend/vllm/__init__.py index b4f0504..18040a7 100644 --- a/src/guidellm/backend/vllm/__init__.py +++ b/src/guidellm/backend/vllm/__init__.py @@ -16,10 +16,10 @@ check_python_version(min_version="3.8", max_version="3.12") -module_is_available( - module="vllm", - helper=("`vllm` package is not available. Try run: `pip install -e '.[vllm]'`"), -) +# module_is_available( +# module="vllm", +# helper=("`vllm` package is not available. Try run: `pip install -e '.[vllm]'`"), +# ) from .backend import VllmBackend # noqa: E402 diff --git a/tests/dummy/__init__.py b/tests/dummy/__init__.py index a0cccdb..dc04765 100644 --- a/tests/dummy/__init__.py +++ b/tests/dummy/__init__.py @@ -5,4 +5,4 @@ test.dummy.data.openai_completion_factory - openai.types.Completion test factory """ -from . import data, services # noqa: F401 +from . import data, services, vllm # noqa: F401 diff --git a/tests/dummy/data/__init__.py b/tests/dummy/data/__init__.py index 95a2c94..4e5b820 100644 --- a/tests/dummy/data/__init__.py +++ b/tests/dummy/data/__init__.py @@ -1,3 +1,4 @@ +from . import vllm from .openai import openai_completion_factory, openai_model_factory -__all__ = ["openai_completion_factory", "openai_model_factory"] +__all__ = ["openai_completion_factory", "openai_model_factory", "vllm"] diff --git a/tests/dummy/vllm.py b/tests/dummy/vllm.py new file mode 100644 index 0000000..7874c43 --- /dev/null +++ b/tests/dummy/vllm.py @@ -0,0 +1,65 @@ +""" +This module includes data models factories for the `vllm` 3-rd party package +""" + +import random +from functools import partial +from typing import List, Optional + +from pydantic import BaseModel, ConfigDict, Field + +from guidellm.utils import random_strings + +__all__ = ["TestLLM", "CompletionOutput"] + + +class CompletionOutput(BaseModel): + """Test interface of `vllm.CompletionOutput`.""" + + text: str + + +class SamplingParams(BaseModel): + """Test interface of `vllm.SamplingParams`.""" + + max_tokens: int + + +class TestLLM(BaseModel): + """Test interface of `vllm.LLM`. + + Args: + _outputs_number(int | None): the number of generated tokens per output. + Should be used only for testing purposes. + Default: randint(10..20) + + """ + + model_config = ConfigDict( + extra="allow", + validate_assignment=True, + arbitrary_types_allowed=True, + from_attributes=True, + ) + + model: str + max_num_batched_tokens: int + + _outputs_number: int = Field(default_factory=partial(random.randint, 10, 20)) + + def _generate_completion_outputs(self, max_tokens: int) -> List[CompletionOutput]: + self._outputs_number = random.randint(10, 20) + + return [ + CompletionOutput(text=text) + for text in random_strings( + min_chars=0, max_chars=max_tokens, n=self._outputs_number + ) + ] + + def generate( + self, inputs: List[str], sampling_params: SamplingParams + ) -> Optional[List[List[CompletionOutput]]]: + return [ + self._generate_completion_outputs(max_tokens=sampling_params.max_tokens) + ] diff --git a/tests/unit/backend/test_vllm.py b/tests/unit/backend/test_vllm.py index 148c37c..a77e5fd 100644 --- a/tests/unit/backend/test_vllm.py +++ b/tests/unit/backend/test_vllm.py @@ -5,8 +5,6 @@ the rimtime platform is not a Linux / WSL according to vllm documentation. 
""" -import importlib -import sys from typing import Dict, List import pytest @@ -14,6 +12,7 @@ from guidellm.backend import Backend from guidellm.config import reload_settings from guidellm.core import TextGenerationRequest +from tests import dummy # pytestmark = pytest.mark.skipif( # sys.platform != "linux", @@ -30,13 +29,9 @@ def backend_class(): @pytest.fixture(autouse=True) def mock_vllm_llm(mocker): - module = importlib.import_module("vllm") - llm = getattr(module, "LLM")( + llm = dummy.vllm.TestLLM( model="facebook/opt-125m", max_num_batched_tokens=4096, - tensor_parallel_size=1, - gpu_memory_utilization=0.10, - enforce_eager=True, ) return mocker.patch("vllm.LLM", return_value=llm) From 9f431ea8beb81091e13229bf82ede6174b1cba3c Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Mon, 2 Sep 2024 23:35:01 +0300 Subject: [PATCH 08/19] dummy.data.openai is removed --- tests/dummy/__init__.py | 3 -- tests/dummy/data/__init__.py | 4 --- tests/dummy/data/openai.py | 54 ------------------------------------ tests/dummy/vllm.py | 3 +- 4 files changed, 2 insertions(+), 62 deletions(-) delete mode 100644 tests/dummy/data/openai.py diff --git a/tests/dummy/__init__.py b/tests/dummy/__init__.py index dc04765..216b702 100644 --- a/tests/dummy/__init__.py +++ b/tests/dummy/__init__.py @@ -1,8 +1,5 @@ """ The tests.dummy package package represents dummy data factories and test services. - -test.dummy.data.openai_model_factory - openai.types.Model test factory -test.dummy.data.openai_completion_factory - openai.types.Completion test factory """ from . import data, services, vllm # noqa: F401 diff --git a/tests/dummy/data/__init__.py b/tests/dummy/data/__init__.py index 4e5b820..e69de29 100644 --- a/tests/dummy/data/__init__.py +++ b/tests/dummy/data/__init__.py @@ -1,4 +0,0 @@ -from . import vllm -from .openai import openai_completion_factory, openai_model_factory - -__all__ = ["openai_completion_factory", "openai_model_factory", "vllm"] diff --git a/tests/dummy/data/openai.py b/tests/dummy/data/openai.py deleted file mode 100644 index 6e16865..0000000 --- a/tests/dummy/data/openai.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -This module includes data models factories for openai 3-rd party package -""" - -import random -import string -import time -import uuid -from typing import Generator - -from openai.types import Completion, Model - - -def words(n: int = 1) -> Generator[str, None, None]: - for _ in range(n): - yield "".join( - random.choice(string.ascii_letters) for _ in range(random.randint(3, 10)) - ) - - -def openai_completion_factory( - n: int = 3, - **kwargs, -) -> Generator[Completion, None, None]: - """ - The factory that yields the openai Completion instance. - """ - - for i in range(1, n + 1): - payload = { - "id": str(uuid.uuid4()), - "choices": [], - "stop": not i < n, - "content": " ".join(words(random.randint(3, 10))) if i < n else "", - "object": "text_completion", - "model": "mock-model", - "created": int(time.time()), - } - payload.update(kwargs) - - yield Completion(**payload) # type: ignore - - -def openai_model_factory(n: int = 3) -> Generator[Model, None, None]: - """ - The factory that yields the random openai Model instance. 
- """ - for _ in range(n): - yield Model( - id=str(uuid.uuid4()), - created=int(time.time()), - object="model", - owned_by="neuralmagic", - ) diff --git a/tests/dummy/vllm.py b/tests/dummy/vllm.py index 7874c43..6428b7e 100644 --- a/tests/dummy/vllm.py +++ b/tests/dummy/vllm.py @@ -45,7 +45,8 @@ class TestLLM(BaseModel): model: str max_num_batched_tokens: int - _outputs_number: int = Field(default_factory=partial(random.randint, 10, 20)) + # NOTE: This value is used only for testing purposes + outputs_number: int = Field(default_factory=partial(random.randint, 10, 20)) def _generate_completion_outputs(self, max_tokens: int) -> List[CompletionOutput]: self._outputs_number = random.randint(10, 20) From 832b316ebbe9a1703dcbb6076e769e4f09112d9d Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Wed, 4 Sep 2024 13:55:36 +0300 Subject: [PATCH 09/19] WIP Docker tests --- ' | 26 ++++ .dockerignore | 14 ++ DEVELOPING.md | 19 ++- Dockerfile | 6 +- requirements.txt | 184 +++++++++++++++++++++++++++ src/guidellm/backend/vllm/backend.py | 4 +- 6 files changed, 247 insertions(+), 6 deletions(-) create mode 100644 ' create mode 100644 .dockerignore create mode 100644 requirements.txt diff --git a/' b/' new file mode 100644 index 0000000..bce016a --- /dev/null +++ b/' @@ -0,0 +1,26 @@ +BUILDPLATFORM=linux/amd64 +PYTHONPATH=/Users/dmytroparfeniuk/Projects/neuralmagic/vllm/ + +# commands +# https://huggingface.co/datasets/openai/openai_humaneval +alias act="source .tox/py39/bin/activate" +alias install="python -m pip install -e '.[dev,deepsparse]'" +alias run='python -m src.guidellm.main --data=openai_humaneval --max-requests=1 --max-seconds=20 --rate-type=constant --rate=1.0 --backend=deepsparse --model=' +alias e2e='python -m pytest -vvv -s tests/e2e' + +alias test='docker run --rm --env-file='.env' -v ./:/app/guidellm guidellm:latest pytest -s -vvv tests/unit/backend/test_vllm.py::test_backend_creation' + +alias unit='python -m pytest -vvv -s tests/unit' +alias fix='python -m ruff check --fix' +alias check='python -m ruff check . && python -m mypy --check-untyped-defs && python -m mdformat --check README.md DEVELOPING.md docs/ src/ tests/' +alias types='python -m mypy --check-untyped-defs' +alias clean='rm -rf build dist .mypy_cache .pytest_cache .tox .ruff_cache .coverage && find . -type f- name ".pyc" | xargs rm && find . -type d -name "__pycache__" -exec rm -r {} + && rm-rf *.egg-info' + +# Application +GUIDELLM__LOGGING__CONSOLE_LOG_LEVEL=WARNING + +# OpenAI +# GUIDELLM__OPENAI__BASE_URL=http://127.0.0.1:8080 +GUIDELLM__OPENAI__BASE_URL=http://192.168.50.36:7070 +GUIDELLM__OPENAI__API_KEY=invalid + diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..5002a2b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,14 @@ +venv*/ +Dockerfile +.gitignore +.env +.git +.github/ +.ruff_cache/ +.pre-commit-config.yaml +docs/ +*.md +LICENSE +MANIFEST.in +__pycache__/ +*.egg-info/ diff --git a/DEVELOPING.md b/DEVELOPING.md index b230366..f19079a 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -33,13 +33,30 @@ cd guidellm pip install -e .[dev] ``` -If you work with `deepsparse` backend, etc it has some other software limitations. In order to install dependencies for the specific backend, run: +In case of working with `deepsparse` backend, etc it has some other software limitations. 
In order to install dependencies for the specific backend, run: ```sh pip install -e .[deepsparse] # or pip install -e '.[deepsparse]' ``` +In case of working with `vllm` backend, etc it has some other software limitations. In order to install dependencies for the specific backend, run: + +```sh +pip install -e .[vllm] +# or pip install -e '.[vllm]' +``` + +According to the [installation guide](https://docs.vllm.ai/en/v0.4.0.post1/getting_started/installation.html) `vllm` is supported only on **Linux**. It means that running the application and tests will fail. + +Workaround with Docker: +```sh +cd guidellm/ +docker build -t guidellm:latest . +docker run -v ./:./ guidellm:latest python -m pytest -s -v src/unit/backend/test_vllm.py +``` + + ## Project Structure The project follows a standard Python project structure: diff --git a/Dockerfile b/Dockerfile index 61aaac4..c880287 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,9 +12,9 @@ RUN apt-get update \ # Python dependencies RUN pip install --upgrade pip setuptools +COPY requirements.txt ./ +RUN pip install -r requirements.txt -WORKDIR /app/ +WORKDIR /app/ COPY ./ ./ - -RUN pip install -e '.[dev,deepsparse,vllm]' diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4a05ce7 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,184 @@ +aiohappyeyeballs==2.4.0 +aiohttp==3.10.5 +aiosignal==1.3.1 +alabaster==0.7.13 +annotated-types==0.7.0 +anyio==4.4.0 +async-timeout==4.0.3 +attrs==24.2.0 +audioread==3.0.1 +babel==2.16.0 +cachetools==5.5.0 +certifi==2024.8.30 +cffi==1.17.0 +cfgv==3.4.0 +chardet==5.2.0 +charset-normalizer==3.3.2 +click==8.1.7 +cloudpickle==3.0.0 +colorama==0.4.6 +coverage==7.6.1 +datasets==2.21.0 +decorator==5.1.1 +dill==0.3.8 +diskcache==5.6.3 +distlib==0.3.8 +distro==1.9.0 +docutils==0.20.1 +exceptiongroup==1.2.2 +fastapi==0.112.2 +filelock==3.15.4 +frozenlist==1.4.1 +fsspec==2024.6.1 +ftfy==6.2.3 +gguf==0.9.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.2 +huggingface-hub==0.24.6 +identify==2.6.0 +idna==3.8 +imagesize==1.4.1 +importlib_metadata==8.4.0 +importlib_resources==6.4.4 +iniconfig==2.0.0 +interegular==0.3.3 +Jinja2==3.1.4 +jiter==0.5.0 +joblib==1.4.2 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.2.2 +lazy_loader==0.4 +librosa==0.10.2.post1 +linkify-it-py==2.0.3 +llvmlite==0.41.1 +lm-format-enforcer==0.10.6 +loguru==0.7.2 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +mdformat==0.7.17 +mdformat-gfm==0.3.6 +mdformat_footnote==0.1.1 +mdformat_frontmatter==2.0.8 +mdformat_tables==1.0.0 +mdit-py-plugins==0.4.1 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +msgspec==0.18.6 +multidict==6.0.5 +multiprocess==0.70.16 +mypy==1.10.1 +mypy-extensions==1.0.0 +nest-asyncio==1.6.0 +networkx==3.1 +nodeenv==1.9.1 +numba==0.58.1 +numpy==1.24.4 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==12.560.30 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.68 +nvidia-nvtx-cu12==12.1.105 +openai==1.43.0 +outlines==0.0.46 +packaging==24.1 +pandas==2.0.3 +pillow==10.4.0 +pkgutil_resolve_name==1.3.10 +platformdirs==4.2.2 +pluggy==1.5.0 +pooch==1.8.2 +pre-commit==3.5.0 +prometheus-fastapi-instrumentator==7.0.0 +prometheus_client==0.20.0 +protobuf==5.28.0 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 
+pycountry==24.6.1 +pycparser==2.22 +pydantic==2.8.2 +pydantic-settings==2.4.0 +pydantic_core==2.20.1 +Pygments==2.18.0 +pyproject-api==1.7.1 +pytest==8.2.2 +pytest-asyncio==0.23.8 +pytest-cov==5.0.0 +pytest-mock==3.14.0 +pytest-rerunfailures==14.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +pytz==2024.1 +PyYAML==6.0.2 +pyzmq==26.2.0 +ray==2.10.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +requests-mock==1.12.1 +rich==13.8.0 +rpds-py==0.20.0 +ruamel.yaml==0.18.6 +ruamel.yaml.clib==0.2.8 +ruff==0.5.7 +safetensors==0.4.4 +scikit-learn==1.3.2 +scipy==1.10.1 +sentencepiece==0.2.0 +six==1.16.0 +sniffio==1.3.1 +snowballstemmer==2.2.0 +soundfile==0.12.1 +soxr==0.3.7 +Sphinx==7.1.2 +sphinxcontrib-applehelp==1.0.4 +sphinxcontrib-devhelp==1.0.2 +sphinxcontrib-htmlhelp==2.0.1 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.3 +sphinxcontrib-serializinghtml==1.1.5 +starlette==0.38.4 +sympy==1.13.2 +threadpoolctl==3.5.0 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +torch==2.4.0 +torchvision==0.19.0 +tox==4.16.0 +tqdm==4.66.5 +transformers==4.44.2 +triton==3.0.0 +types-click==7.1.8 +types-PyYAML==6.0.12.20240808 +types-requests==2.32.0.20240712 +types-toml==0.10.8.20240310 +typing_extensions==4.12.2 +tzdata==2024.1 +uc-micro-py==1.0.3 +urllib3==2.2.2 +uvicorn==0.30.6 +uvloop==0.20.0 +virtualenv==20.26.3 +vllm==0.5.5 +vllm-flash-attn==2.6.1 +watchfiles==0.24.0 +wcwidth==0.2.13 +websockets==13.0.1 +xformers==0.0.27.post2 +xxhash==3.5.0 +yarl==1.9.7 +zipp==3.20.1 diff --git a/src/guidellm/backend/vllm/backend.py b/src/guidellm/backend/vllm/backend.py index fd99b28..434c12f 100644 --- a/src/guidellm/backend/vllm/backend.py +++ b/src/guidellm/backend/vllm/backend.py @@ -15,15 +15,15 @@ class VllmBackend(Backend): """ def __init__(self, model: Optional[str] = None, **request_args): + self._request_args: Dict[str, Any] = request_args + super().__init__( type_="vllm", model=self._get_model(model), target="not used", ) - self._request_args: Dict[str, Any] = request_args self.llm = LLM(self._model) - logger.info(f"vLLM Backend uses model '{self._model}'") def _get_model(self, model_from_cli: Optional[str] = None) -> str: From 3d8c80f0f985caef465fda8dd76c384088263cf1 Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Wed, 4 Sep 2024 21:17:14 +0300 Subject: [PATCH 10/19] removed tmp file --- ' | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 ' diff --git a/' b/' deleted file mode 100644 index bce016a..0000000 --- a/' +++ /dev/null @@ -1,26 +0,0 @@ -BUILDPLATFORM=linux/amd64 -PYTHONPATH=/Users/dmytroparfeniuk/Projects/neuralmagic/vllm/ - -# commands -# https://huggingface.co/datasets/openai/openai_humaneval -alias act="source .tox/py39/bin/activate" -alias install="python -m pip install -e '.[dev,deepsparse]'" -alias run='python -m src.guidellm.main --data=openai_humaneval --max-requests=1 --max-seconds=20 --rate-type=constant --rate=1.0 --backend=deepsparse --model=' -alias e2e='python -m pytest -vvv -s tests/e2e' - -alias test='docker run --rm --env-file='.env' -v ./:/app/guidellm guidellm:latest pytest -s -vvv tests/unit/backend/test_vllm.py::test_backend_creation' - -alias unit='python -m pytest -vvv -s tests/unit' -alias fix='python -m ruff check --fix' -alias check='python -m ruff check . 
&& python -m mypy --check-untyped-defs && python -m mdformat --check README.md DEVELOPING.md docs/ src/ tests/' -alias types='python -m mypy --check-untyped-defs' -alias clean='rm -rf build dist .mypy_cache .pytest_cache .tox .ruff_cache .coverage && find . -type f- name ".pyc" | xargs rm && find . -type d -name "__pycache__" -exec rm -r {} + && rm-rf *.egg-info' - -# Application -GUIDELLM__LOGGING__CONSOLE_LOG_LEVEL=WARNING - -# OpenAI -# GUIDELLM__OPENAI__BASE_URL=http://127.0.0.1:8080 -GUIDELLM__OPENAI__BASE_URL=http://192.168.50.36:7070 -GUIDELLM__OPENAI__API_KEY=invalid - From 7836d452a27dd4ba53da519c248d54c004071479 Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Fri, 6 Sep 2024 14:59:30 +0300 Subject: [PATCH 11/19] Dockerfile remove COPY --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index c880287..ae9335c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,4 +17,3 @@ RUN pip install -r requirements.txt WORKDIR /app/ -COPY ./ ./ From defe53d41d746e70b3887d8c78cb774beb78e91b Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Mon, 9 Sep 2024 12:45:15 +0300 Subject: [PATCH 12/19] =?UTF-8?q?=F0=9F=9A=A7=20WIP?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- src/guidellm/backend/base.py | 1 + src/guidellm/backend/vllm/backend.py | 12 ++++----- tests/dummy/vllm.py | 1 + tests/unit/backend/test_vllm.py | 39 ++++++++++++++++++---------- 5 files changed, 34 insertions(+), 21 deletions(-) diff --git a/Dockerfile b/Dockerfile index ae9335c..779d835 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM --platform=linux/amd64 python:3.8-slim # Environment variables ENV PYTHONUNBUFFERED=1 +ENV PYTHONPATH=/app/guidellm/src/ RUN apt-get update \ # dependencies for building Python packages && cleaning up unused files @@ -15,5 +16,4 @@ RUN pip install --upgrade pip setuptools COPY requirements.txt ./ RUN pip install -r requirements.txt - WORKDIR /app/ diff --git a/src/guidellm/backend/base.py b/src/guidellm/backend/base.py index 8500369..becf1b2 100644 --- a/src/guidellm/backend/base.py +++ b/src/guidellm/backend/base.py @@ -228,6 +228,7 @@ async def submit(self, request: TextGenerationRequest) -> TextGenerationResult: result.start(request.prompt) received_final = False + breakpoint() # TODO: remove async for response in self.make_request(request): logger.debug("Received response: {}", response) if response.type_ == "token_iter": diff --git a/src/guidellm/backend/vllm/backend.py b/src/guidellm/backend/vllm/backend.py index 434c12f..2899311 100644 --- a/src/guidellm/backend/vllm/backend.py +++ b/src/guidellm/backend/vllm/backend.py @@ -14,16 +14,13 @@ class VllmBackend(Backend): An vLLM Backend implementation for the generative AI result. 
""" - def __init__(self, model: Optional[str] = None, **request_args): + def __init__(self, model: str = settings.llm_model, **request_args): + _model = self._get_model(model) self._request_args: Dict[str, Any] = request_args + self.llm = LLM(_model) - super().__init__( - type_="vllm", - model=self._get_model(model), - target="not used", - ) + super().__init__(type_="vllm", model=_model, target="not used") - self.llm = LLM(self._model) logger.info(f"vLLM Backend uses model '{self._model}'") def _get_model(self, model_from_cli: Optional[str] = None) -> str: @@ -72,6 +69,7 @@ async def make_request( output_token_count=token_count, ) + breakpoint() # TODO: remove if not (result := self.llm.generate(**request_args)): yield final_response return diff --git a/tests/dummy/vllm.py b/tests/dummy/vllm.py index 6428b7e..e82f9cc 100644 --- a/tests/dummy/vllm.py +++ b/tests/dummy/vllm.py @@ -61,6 +61,7 @@ def _generate_completion_outputs(self, max_tokens: int) -> List[CompletionOutput def generate( self, inputs: List[str], sampling_params: SamplingParams ) -> Optional[List[List[CompletionOutput]]]: + breakpoint() # TODO: remove return [ self._generate_completion_outputs(max_tokens=sampling_params.max_tokens) ] diff --git a/tests/unit/backend/test_vllm.py b/tests/unit/backend/test_vllm.py index a77e5fd..fc2bc50 100644 --- a/tests/unit/backend/test_vllm.py +++ b/tests/unit/backend/test_vllm.py @@ -5,19 +5,20 @@ the rimtime platform is not a Linux / WSL according to vllm documentation. """ -from typing import Dict, List +import sys +from typing import Callable, Dict, List, Optional import pytest from guidellm.backend import Backend -from guidellm.config import reload_settings +from guidellm.config import reload_settings, settings from guidellm.core import TextGenerationRequest from tests import dummy -# pytestmark = pytest.mark.skipif( -# sys.platform != "linux", -# reason="Unsupported Platform. Try using Linux or WSL instead.", -# ) +pytestmark = pytest.mark.skipif( + sys.platform != "linux", + reason="Unsupported Platform. Try using Linux or WSL instead.", +) @pytest.fixture(scope="module") @@ -28,13 +29,23 @@ def backend_class(): @pytest.fixture(autouse=True) -def mock_vllm_llm(mocker): - llm = dummy.vllm.TestLLM( - model="facebook/opt-125m", - max_num_batched_tokens=4096, - ) +def vllm_patch_factory(mocker) -> Callable[[str], dummy.vllm.TestLLM]: + """ + Skip VLLM initializer due to external calls. + Replace VllmBackend.llm object with mock representation. + """ - return mocker.patch("vllm.LLM", return_value=llm) + def inner(model: Optional[str] = None, max_tokens: Optional[int] = None): + + return mocker.patch( + "vllm.LLM.__new__", + return_value=dummy.vllm.TestLLM( + model=model or settings.llm_model, + max_num_batched_tokens=max_tokens or 4096, + ), + ) + + return inner @pytest.mark.smoke() @@ -45,11 +56,13 @@ def mock_vllm_llm(mocker): {"model": "test/custom_llm"}, ], ) -def test_backend_creation(create_payload: Dict, backend_class): +def test_backend_creation(create_payload: Dict, backend_class, vllm_patch_factory): """Test the "Deepspaarse Backend" class with defaults and custom input parameters. 
""" + vllm_patch_factory(model=create_payload.get("model")) + backends = [ Backend.create("vllm", **create_payload), backend_class(**create_payload), From 907135848ac43a6cb295cc365f2c4bf62a1617a4 Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Mon, 9 Sep 2024 20:40:59 +0300 Subject: [PATCH 13/19] dockerfile is improved --- Dockerfile | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 779d835..d9c0bbe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,10 +4,14 @@ FROM --platform=linux/amd64 python:3.8-slim ENV PYTHONUNBUFFERED=1 ENV PYTHONPATH=/app/guidellm/src/ -RUN apt-get update \ +RUN : \ + && apt-get update \ # dependencies for building Python packages && cleaning up unused files - && apt-get install -y build-essential \ - libcurl4-openssl-dev libssl-dev \ + && apt-get install -y --no-insatll-recommend \ + build-essential \ + libcurl4-openssl-dev \ + libssl-dev \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/* From 59e4cc655db25bb8a59327a1bda982e420c95973 Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Mon, 9 Sep 2024 20:41:21 +0300 Subject: [PATCH 14/19] tests are comlete --- src/guidellm/backend/base.py | 1 - src/guidellm/backend/vllm/backend.py | 16 +++++------ tests/dummy/vllm.py | 40 +++++++++++++++++---------- tests/unit/backend/test_deepsparse.py | 2 +- tests/unit/backend/test_vllm.py | 23 +++++++++++---- 5 files changed, 52 insertions(+), 30 deletions(-) diff --git a/src/guidellm/backend/base.py b/src/guidellm/backend/base.py index becf1b2..8500369 100644 --- a/src/guidellm/backend/base.py +++ b/src/guidellm/backend/base.py @@ -228,7 +228,6 @@ async def submit(self, request: TextGenerationRequest) -> TextGenerationResult: result.start(request.prompt) received_final = False - breakpoint() # TODO: remove async for response in self.make_request(request): logger.debug("Received response: {}", response) if response.type_ == "token_iter": diff --git a/src/guidellm/backend/vllm/backend.py b/src/guidellm/backend/vllm/backend.py index 2899311..f161055 100644 --- a/src/guidellm/backend/vllm/backend.py +++ b/src/guidellm/backend/vllm/backend.py @@ -14,7 +14,7 @@ class VllmBackend(Backend): An vLLM Backend implementation for the generative AI result. """ - def __init__(self, model: str = settings.llm_model, **request_args): + def __init__(self, model: Optional[str] = None, **request_args): _model = self._get_model(model) self._request_args: Dict[str, Any] = request_args self.llm = LLM(_model) @@ -69,7 +69,6 @@ async def make_request( output_token_count=token_count, ) - breakpoint() # TODO: remove if not (result := self.llm.generate(**request_args)): yield final_response return @@ -82,12 +81,6 @@ async def make_request( for generation in generations: if not (token := generation.text): - yield GenerativeResponse( - type_="final", - prompt=request.prompt, - prompt_token_count=request.prompt_token_count, - output_token_count=token_count, - ) break else: token_count += 1 @@ -99,6 +92,13 @@ async def make_request( output_token_count=token_count, ) + yield GenerativeResponse( + type_="final", + prompt=request.prompt, + prompt_token_count=request.prompt_token_count, + output_token_count=token_count, + ) + def available_models(self) -> List[str]: """ Get the available models for the backend. 
diff --git a/tests/dummy/vllm.py b/tests/dummy/vllm.py index e82f9cc..a7b2052 100644 --- a/tests/dummy/vllm.py +++ b/tests/dummy/vllm.py @@ -3,8 +3,7 @@ """ import random -from functools import partial -from typing import List, Optional +from typing import Generator, List, Optional from pydantic import BaseModel, ConfigDict, Field @@ -25,6 +24,10 @@ class SamplingParams(BaseModel): max_tokens: int +class CompletionOutputs(BaseModel): + outputs: List[CompletionOutput] + + class TestLLM(BaseModel): """Test interface of `vllm.LLM`. @@ -32,6 +35,8 @@ class TestLLM(BaseModel): _outputs_number(int | None): the number of generated tokens per output. Should be used only for testing purposes. Default: randint(10..20) + _generations: dynamic representation of generated responses + from deepsparse interface. """ @@ -45,23 +50,28 @@ class TestLLM(BaseModel): model: str max_num_batched_tokens: int - # NOTE: This value is used only for testing purposes - outputs_number: int = Field(default_factory=partial(random.randint, 10, 20)) + def _generate_completion_outputs( + self, max_tokens: int + ) -> Generator[CompletionOutputs, None, None]: - def _generate_completion_outputs(self, max_tokens: int) -> List[CompletionOutput]: - self._outputs_number = random.randint(10, 20) + # NOTE: This value is used only for testing purposes + self._expected_outputs: List[CompletionOutput] = [] - return [ - CompletionOutput(text=text) - for text in random_strings( - min_chars=0, max_chars=max_tokens, n=self._outputs_number - ) - ] + for text in random_strings( + min_chars=0, max_chars=random.randint(10, 20), n=max_tokens + ): + instance = CompletionOutput(text=text) + self._expected_outputs.append(instance) + + yield instance def generate( self, inputs: List[str], sampling_params: SamplingParams - ) -> Optional[List[List[CompletionOutput]]]: - breakpoint() # TODO: remove + ) -> List[CompletionOutputs]: return [ - self._generate_completion_outputs(max_tokens=sampling_params.max_tokens) + CompletionOutputs( + outputs=self._generate_completion_outputs( + max_tokens=sampling_params.max_tokens + ) + ) ] diff --git a/tests/unit/backend/test_deepsparse.py b/tests/unit/backend/test_deepsparse.py index cac49ca..0d667ea 100644 --- a/tests/unit/backend/test_deepsparse.py +++ b/tests/unit/backend/test_deepsparse.py @@ -45,7 +45,7 @@ class TestTextGenerationPipeline: Method `__call__` allows to mock the result object that comes from `deepsparse.pipeline.Pipeline()` so everything is encapsulated right here. - :param self._generation: dynamic representation of generated responses + :param self._generations: dynamic representation of generated responses from deepsparse interface. """ diff --git a/tests/unit/backend/test_vllm.py b/tests/unit/backend/test_vllm.py index fc2bc50..b8adbe4 100644 --- a/tests/unit/backend/test_vllm.py +++ b/tests/unit/backend/test_vllm.py @@ -21,18 +21,21 @@ ) -@pytest.fixture(scope="module") +@pytest.fixture(scope="session") def backend_class(): from guidellm.backend.vllm import VllmBackend return VllmBackend -@pytest.fixture(autouse=True) +@pytest.fixture() def vllm_patch_factory(mocker) -> Callable[[str], dummy.vllm.TestLLM]: """ Skip VLLM initializer due to external calls. Replace VllmBackend.llm object with mock representation. + + This vllm patch is injected into each test automatically. If you need + to override the Mock object - use this fixture. 
""" def inner(model: Optional[str] = None, max_tokens: Optional[int] = None): @@ -48,6 +51,15 @@ def inner(model: Optional[str] = None, max_tokens: Optional[int] = None): return inner +@pytest.fixture(autouse=True) +def vllm_auto_patch(vllm_patch_factory): + """ + Automatically patch the ``vllm.LLM`` with defaults. + """ + + return vllm_patch_factory() + + @pytest.mark.smoke() @pytest.mark.parametrize( "create_payload", @@ -96,7 +108,7 @@ def test_backend_model_from_env(mocker, backend_class): @pytest.mark.parametrize( "text_generation_request_create_payload", [ - {"prompt": "Test prompt"}, + # {"prompt": "Test prompt"}, {"prompt": "Test prompt", "output_token_count": 20}, ], ) @@ -112,12 +124,13 @@ async def test_make_request( ): if response.add_token: output_tokens.append(response.add_token) + assert "".join(output_tokens) == "".join( - generation.text for generation in backend.pipeline._generations + generation.text for generation in getattr(backend.llm, "_expected_outputs") ) if max_tokens := text_generation_request_create_payload.get("output_token_count"): - assert len(backend.pipeline._generations) == max_tokens + assert len(getattr(backend.llm, "_expected_outputs")) == max_tokens @pytest.mark.smoke() From bd76806a62cded71ad6a89be694c86255a94b57a Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Mon, 9 Sep 2024 20:41:33 +0300 Subject: [PATCH 15/19] docker testing guide is added --- DEVELOPING.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/DEVELOPING.md b/DEVELOPING.md index f19079a..aa6c7a9 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -50,13 +50,13 @@ pip install -e .[vllm] According to the [installation guide](https://docs.vllm.ai/en/v0.4.0.post1/getting_started/installation.html) `vllm` is supported only on **Linux**. It means that running the application and tests will fail. Workaround with Docker: + ```sh cd guidellm/ docker build -t guidellm:latest . docker run -v ./:./ guidellm:latest python -m pytest -s -v src/unit/backend/test_vllm.py ``` - ## Project Structure The project follows a standard Python project structure: @@ -180,6 +180,21 @@ The end-to-end tests are located in the `tests/e2e` directory. To run the end-to tox -e test-e2e ``` +### Running unsopported tests + +Some of the test might be not supported on your system (_for instance `vllm` is not supported on MacOS yet_). In order to run them on Linux Operating System you might use technologies like **WSL** on Windows, or **Docker** on Windows or MacOS. + +In order to run under the Docker just run the command below: + +```sh +docker build -t guidellm:latest --shm-size=1024m . +docker run --rm --env-file .env -v ./:/app/guidellm -it guidellm:latest pytest tests/ +``` + +Using `--shm-size=1024m` is recommended due to potential local Docker configuration. Increase or decrease this value depending on your needs. + +
+ ## Formatting, Linting, and Type Checking ### Running Quality Checks (Linting) From 92c88192bf4a4f2d50eef1879e9a6f0c9a66213a Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Mon, 9 Sep 2024 21:05:34 +0300 Subject: [PATCH 16/19] =?UTF-8?q?=F0=9F=92=9A=20Code=20quality=20is=20prov?= =?UTF-8?q?ided?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/guidellm/backend/vllm/__init__.py | 8 ++++---- src/guidellm/backend/vllm/backend.py | 2 ++ tests/dummy/vllm.py | 12 +++++++----- tests/unit/backend/test_vllm.py | 6 +++--- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/guidellm/backend/vllm/__init__.py b/src/guidellm/backend/vllm/__init__.py index 18040a7..b4f0504 100644 --- a/src/guidellm/backend/vllm/__init__.py +++ b/src/guidellm/backend/vllm/__init__.py @@ -16,10 +16,10 @@ check_python_version(min_version="3.8", max_version="3.12") -# module_is_available( -# module="vllm", -# helper=("`vllm` package is not available. Try run: `pip install -e '.[vllm]'`"), -# ) +module_is_available( + module="vllm", + helper=("`vllm` package is not available. Try run: `pip install -e '.[vllm]'`"), +) from .backend import VllmBackend # noqa: E402 diff --git a/src/guidellm/backend/vllm/backend.py b/src/guidellm/backend/vllm/backend.py index f161055..a048db4 100644 --- a/src/guidellm/backend/vllm/backend.py +++ b/src/guidellm/backend/vllm/backend.py @@ -19,6 +19,8 @@ def __init__(self, model: Optional[str] = None, **request_args): self._request_args: Dict[str, Any] = request_args self.llm = LLM(_model) + # NOTE: Must be after all the parameters since ``self.llm`` is going to be used + # by ``make_request`` within ``Backend.test_connection()`` super().__init__(type_="vllm", model=_model, target="not used") logger.info(f"vLLM Backend uses model '{self._model}'") diff --git a/tests/dummy/vllm.py b/tests/dummy/vllm.py index a7b2052..f86e61d 100644 --- a/tests/dummy/vllm.py +++ b/tests/dummy/vllm.py @@ -3,9 +3,9 @@ """ import random -from typing import Generator, List, Optional +from typing import Generator, List -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict from guidellm.utils import random_strings @@ -52,7 +52,7 @@ class TestLLM(BaseModel): def _generate_completion_outputs( self, max_tokens: int - ) -> Generator[CompletionOutputs, None, None]: + ) -> Generator[CompletionOutput, None, None]: # NOTE: This value is used only for testing purposes self._expected_outputs: List[CompletionOutput] = [] @@ -70,8 +70,10 @@ def generate( ) -> List[CompletionOutputs]: return [ CompletionOutputs( - outputs=self._generate_completion_outputs( - max_tokens=sampling_params.max_tokens + outputs=list( + self._generate_completion_outputs( + max_tokens=sampling_params.max_tokens + ) ) ) ] diff --git a/tests/unit/backend/test_vllm.py b/tests/unit/backend/test_vllm.py index b8adbe4..ccc0e87 100644 --- a/tests/unit/backend/test_vllm.py +++ b/tests/unit/backend/test_vllm.py @@ -108,7 +108,7 @@ def test_backend_model_from_env(mocker, backend_class): @pytest.mark.parametrize( "text_generation_request_create_payload", [ - # {"prompt": "Test prompt"}, + {"prompt": "Test prompt"}, {"prompt": "Test prompt", "output_token_count": 20}, ], ) @@ -126,11 +126,11 @@ async def test_make_request( output_tokens.append(response.add_token) assert "".join(output_tokens) == "".join( - generation.text for generation in getattr(backend.llm, "_expected_outputs") + generation.text for generation in backend.llm._expected_outputs ) if 
max_tokens := text_generation_request_create_payload.get("output_token_count"):
-        assert len(getattr(backend.llm, "_expected_outputs")) == max_tokens
+        assert len(backend.llm._expected_outputs) == max_tokens
 
 
 @pytest.mark.smoke()

From e25be2ffe842aef587e19776e0c8a243754ff987 Mon Sep 17 00:00:00 2001
From: Dmytro Parfeniuk
Date: Mon, 9 Sep 2024 21:44:48 +0300
Subject: [PATCH 17/19] Dockerfile improved. Removed unused parts

---
 .dockerignore | 1 +
 DEVELOPING.md | 6 +-
 Dockerfile | 17 +++-
 requirements.txt | 184 -----------------------------------------------------
 4 files changed, 13 insertions(+), 195 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/.dockerignore b/.dockerignore
index 5002a2b..35514f9 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -12,3 +12,4 @@ LICENSE
 MANIFEST.in
 __pycache__/
 *.egg-info/
+*log

diff --git a/DEVELOPING.md b/DEVELOPING.md
index aa6c7a9..b65bc09 100644
--- a/DEVELOPING.md
+++ b/DEVELOPING.md
@@ -187,12 +187,10 @@ Some of the tests might not be supported on your system (_for instance, `vllm`
 
 To run the tests under Docker, run the command below:
 
 ```sh
-docker build -t guidellm:latest --shm-size=1024m .
-docker run --rm --env-file .env -v ./:/app/guidellm -it guidellm:latest pytest tests/
+docker build -t guidellm:latest .
+docker run --rm --env-file .env guidellm:latest pytest tests/
 ```
 
-Using `--shm-size=1024m` is recommended because Docker's default shared-memory size can be too small for these tests. Increase or decrease this value depending on your needs.
-
## Formatting, Linting, and Type Checking diff --git a/Dockerfile b/Dockerfile index d9c0bbe..2b88eee 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,17 +7,20 @@ ENV PYTHONPATH=/app/guidellm/src/ RUN : \ && apt-get update \ # dependencies for building Python packages && cleaning up unused files - && apt-get install -y --no-insatll-recommend \ + && apt-get install -y \ build-essential \ libcurl4-openssl-dev \ libssl-dev \ && apt-get clean \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* \ + && pip install --upgrade \ + pip \ + setuptools -# Python dependencies -RUN pip install --upgrade pip setuptools -COPY requirements.txt ./ -RUN pip install -r requirements.txt +WORKDIR /app + +# Install project dependencies +COPY ./ ./ +RUN pip install -e .[dev,deepsparse,vllm] -WORKDIR /app/ diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 4a05ce7..0000000 --- a/requirements.txt +++ /dev/null @@ -1,184 +0,0 @@ -aiohappyeyeballs==2.4.0 -aiohttp==3.10.5 -aiosignal==1.3.1 -alabaster==0.7.13 -annotated-types==0.7.0 -anyio==4.4.0 -async-timeout==4.0.3 -attrs==24.2.0 -audioread==3.0.1 -babel==2.16.0 -cachetools==5.5.0 -certifi==2024.8.30 -cffi==1.17.0 -cfgv==3.4.0 -chardet==5.2.0 -charset-normalizer==3.3.2 -click==8.1.7 -cloudpickle==3.0.0 -colorama==0.4.6 -coverage==7.6.1 -datasets==2.21.0 -decorator==5.1.1 -dill==0.3.8 -diskcache==5.6.3 -distlib==0.3.8 -distro==1.9.0 -docutils==0.20.1 -exceptiongroup==1.2.2 -fastapi==0.112.2 -filelock==3.15.4 -frozenlist==1.4.1 -fsspec==2024.6.1 -ftfy==6.2.3 -gguf==0.9.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.2 -huggingface-hub==0.24.6 -identify==2.6.0 -idna==3.8 -imagesize==1.4.1 -importlib_metadata==8.4.0 -importlib_resources==6.4.4 -iniconfig==2.0.0 -interegular==0.3.3 -Jinja2==3.1.4 -jiter==0.5.0 -joblib==1.4.2 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.2.2 -lazy_loader==0.4 -librosa==0.10.2.post1 -linkify-it-py==2.0.3 -llvmlite==0.41.1 -lm-format-enforcer==0.10.6 -loguru==0.7.2 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -mdformat==0.7.17 -mdformat-gfm==0.3.6 -mdformat_footnote==0.1.1 -mdformat_frontmatter==2.0.8 -mdformat_tables==1.0.0 -mdit-py-plugins==0.4.1 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -msgspec==0.18.6 -multidict==6.0.5 -multiprocess==0.70.16 -mypy==1.10.1 -mypy-extensions==1.0.0 -nest-asyncio==1.6.0 -networkx==3.1 -nodeenv==1.9.1 -numba==0.58.1 -numpy==1.24.4 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==9.1.0.70 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==12.560.30 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.6.68 -nvidia-nvtx-cu12==12.1.105 -openai==1.43.0 -outlines==0.0.46 -packaging==24.1 -pandas==2.0.3 -pillow==10.4.0 -pkgutil_resolve_name==1.3.10 -platformdirs==4.2.2 -pluggy==1.5.0 -pooch==1.8.2 -pre-commit==3.5.0 -prometheus-fastapi-instrumentator==7.0.0 -prometheus_client==0.20.0 -protobuf==5.28.0 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pycparser==2.22 -pydantic==2.8.2 -pydantic-settings==2.4.0 -pydantic_core==2.20.1 -Pygments==2.18.0 -pyproject-api==1.7.1 -pytest==8.2.2 -pytest-asyncio==0.23.8 -pytest-cov==5.0.0 -pytest-mock==3.14.0 -pytest-rerunfailures==14.0 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -pytz==2024.1 -PyYAML==6.0.2 -pyzmq==26.2.0 -ray==2.10.0 -referencing==0.35.1 
-regex==2024.7.24 -requests==2.32.3 -requests-mock==1.12.1 -rich==13.8.0 -rpds-py==0.20.0 -ruamel.yaml==0.18.6 -ruamel.yaml.clib==0.2.8 -ruff==0.5.7 -safetensors==0.4.4 -scikit-learn==1.3.2 -scipy==1.10.1 -sentencepiece==0.2.0 -six==1.16.0 -sniffio==1.3.1 -snowballstemmer==2.2.0 -soundfile==0.12.1 -soxr==0.3.7 -Sphinx==7.1.2 -sphinxcontrib-applehelp==1.0.4 -sphinxcontrib-devhelp==1.0.2 -sphinxcontrib-htmlhelp==2.0.1 -sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.3 -sphinxcontrib-serializinghtml==1.1.5 -starlette==0.38.4 -sympy==1.13.2 -threadpoolctl==3.5.0 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -torch==2.4.0 -torchvision==0.19.0 -tox==4.16.0 -tqdm==4.66.5 -transformers==4.44.2 -triton==3.0.0 -types-click==7.1.8 -types-PyYAML==6.0.12.20240808 -types-requests==2.32.0.20240712 -types-toml==0.10.8.20240310 -typing_extensions==4.12.2 -tzdata==2024.1 -uc-micro-py==1.0.3 -urllib3==2.2.2 -uvicorn==0.30.6 -uvloop==0.20.0 -virtualenv==20.26.3 -vllm==0.5.5 -vllm-flash-attn==2.6.1 -watchfiles==0.24.0 -wcwidth==0.2.13 -websockets==13.0.1 -xformers==0.0.27.post2 -xxhash==3.5.0 -yarl==1.9.7 -zipp==3.20.1 From 809694cd107f8359ae43a80af37a34d5b351723f Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Tue, 10 Sep 2024 08:55:13 +0300 Subject: [PATCH 18/19] =?UTF-8?q?=E2=9C=85=20tests=20are=20fixed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/guidellm/utils/progress.py | 8 +++++--- tests/dummy/vllm.py | 10 ++++++---- tests/unit/backend/test_deepsparse.py | 18 +++++++----------- tests/unit/backend/test_vllm.py | 25 ++++++++++++++----------- 4 files changed, 32 insertions(+), 29 deletions(-) diff --git a/src/guidellm/utils/progress.py b/src/guidellm/utils/progress.py index 5c7a845..5ae8416 100644 --- a/src/guidellm/utils/progress.py +++ b/src/guidellm/utils/progress.py @@ -162,9 +162,11 @@ def update_benchmark( total=completed_total, completed=completed_count if not completed else completed_total, req_per_sec=(f"{req_per_sec:.2f}" if req_per_sec else "#.##"), - start_time_str=datetime.fromtimestamp(start_time).strftime("%H:%M:%S") - if start_time - else "--:--:--", + start_time_str=( + datetime.fromtimestamp(start_time).strftime("%H:%M:%S") + if start_time + else "--:--:--" + ), ) logger.debug( "Updated benchmark task at index {}: {}% complete", diff --git a/tests/dummy/vllm.py b/tests/dummy/vllm.py index f86e61d..2210b80 100644 --- a/tests/dummy/vllm.py +++ b/tests/dummy/vllm.py @@ -3,7 +3,7 @@ """ import random -from typing import Generator, List +from typing import Generator, List, Optional from pydantic import BaseModel, ConfigDict @@ -21,7 +21,7 @@ class CompletionOutput(BaseModel): class SamplingParams(BaseModel): """Test interface of `vllm.SamplingParams`.""" - max_tokens: int + max_tokens: Optional[int] = 16 class CompletionOutputs(BaseModel): @@ -51,14 +51,16 @@ class TestLLM(BaseModel): max_num_batched_tokens: int def _generate_completion_outputs( - self, max_tokens: int + self, max_tokens: Optional[int] ) -> Generator[CompletionOutput, None, None]: # NOTE: This value is used only for testing purposes self._expected_outputs: List[CompletionOutput] = [] for text in random_strings( - min_chars=0, max_chars=random.randint(10, 20), n=max_tokens + min_chars=5, + max_chars=random.randint(10, 20), + n=max_tokens or random.randint(10, 20), ): instance = CompletionOutput(text=text) self._expected_outputs.append(instance) diff --git a/tests/unit/backend/test_deepsparse.py b/tests/unit/backend/test_deepsparse.py index 
0d667ea..244db47 100644 --- a/tests/unit/backend/test_deepsparse.py +++ b/tests/unit/backend/test_deepsparse.py @@ -6,7 +6,7 @@ """ import sys -from typing import Any, Dict, Generator, List, Optional +from typing import Any, Generator, List, Optional import pytest from pydantic import BaseModel @@ -96,7 +96,7 @@ def mock_deepsparse_pipeline(mocker): {"model": "test/custom_llm"}, ], ) -def test_backend_creation(create_payload: Dict, backend_class): +def test_backend_creation(create_payload, backend_class): """Test the "Deepspaarse Backend" class with defaults and custom input parameters. """ @@ -139,9 +139,7 @@ def test_backend_model_from_env(mocker, backend_class): ], ) @pytest.mark.asyncio() -async def test_make_request( - text_generation_request_create_payload: Dict, backend_class -): +async def test_make_request(text_generation_request_create_payload, backend_class): backend = backend_class() output_tokens: List[str] = [] @@ -160,23 +158,21 @@ async def test_make_request( @pytest.mark.smoke() @pytest.mark.parametrize( - ("text_generation_request_create_payload", "error"), + ("text_generation_request", "error"), [ ( - {"prompt": "Test prompt", "output_token_count": -1}, + TextGenerationRequest(prompt="Test prompt", output_token_count=-1), ValueError, ), ], ) @pytest.mark.asyncio() async def test_make_request_invalid_request_payload( - text_generation_request_create_payload: Dict, error, backend_class + text_generation_request, error, backend_class ): backend = backend_class() with pytest.raises(error): [ respnose - async for respnose in backend.make_request( - request=TextGenerationRequest(**text_generation_request_create_payload) - ) + async for respnose in backend.make_request(request=text_generation_request) ] diff --git a/tests/unit/backend/test_vllm.py b/tests/unit/backend/test_vllm.py index ccc0e87..16e83b0 100644 --- a/tests/unit/backend/test_vllm.py +++ b/tests/unit/backend/test_vllm.py @@ -6,7 +6,7 @@ """ import sys -from typing import Callable, Dict, List, Optional +from typing import Callable, List, Optional import pytest @@ -68,7 +68,7 @@ def vllm_auto_patch(vllm_patch_factory): {"model": "test/custom_llm"}, ], ) -def test_backend_creation(create_payload: Dict, backend_class, vllm_patch_factory): +def test_backend_creation(create_payload, backend_class, vllm_patch_factory): """Test the "Deepspaarse Backend" class with defaults and custom input parameters. 
""" @@ -113,9 +113,7 @@ def test_backend_model_from_env(mocker, backend_class): ], ) @pytest.mark.asyncio() -async def test_make_request( - text_generation_request_create_payload: Dict, backend_class -): +async def test_make_request(text_generation_request_create_payload, backend_class): backend = backend_class() output_tokens: List[str] = [] @@ -135,20 +133,25 @@ async def test_make_request( @pytest.mark.smoke() @pytest.mark.parametrize( - ("text_generation_request_create_payload", "error"), + ("text_generation_request", "error"), [ - ({"prompt": "Test prompt"}, ValueError), + ( + TextGenerationRequest(prompt="Test prompt", output_token_count=-1), + ValueError, + ), + ( + TextGenerationRequest(prompt="Test prompt", output_token_count=0), + ValueError, + ), ], ) @pytest.mark.asyncio() async def test_make_request_invalid_request_payload( - text_generation_request_create_payload: Dict, error, backend_class + text_generation_request, error, backend_class ): backend = backend_class() with pytest.raises(error): [ respnose - async for respnose in backend.make_request( - request=TextGenerationRequest(**text_generation_request_create_payload) - ) + async for respnose in backend.make_request(request=text_generation_request) ] From 78b78ed5d730c158e0b72d46eba31aa4e56264ba Mon Sep 17 00:00:00 2001 From: Dmytro Parfeniuk Date: Tue, 10 Sep 2024 10:43:00 +0300 Subject: [PATCH 19/19] =?UTF-8?q?=F0=9F=90=B3=20--platform=20is=20removed?= =?UTF-8?q?=20from=20Dockerfile?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DEVELOPING.md | 2 +- Dockerfile | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/DEVELOPING.md b/DEVELOPING.md index b65bc09..dbb3ed6 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -187,7 +187,7 @@ Some of the test might be not supported on your system (_for instance `vllm` is In order to run under the Docker just run the command below: ```sh -docker build -t guidellm:latest . +docker build --platform linux/amd64 --tag guidellm:latest . docker run --rm --env-file .env guidellm:latest pytest tests/ ``` diff --git a/Dockerfile b/Dockerfile index 2b88eee..5db54bd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,6 @@ FROM --platform=linux/amd64 python:3.8-slim # Environment variables ENV PYTHONUNBUFFERED=1 -ENV PYTHONPATH=/app/guidellm/src/ RUN : \ && apt-get update \ @@ -17,10 +16,8 @@ RUN : \ pip \ setuptools - WORKDIR /app # Install project dependencies COPY ./ ./ RUN pip install -e .[dev,deepsparse,vllm] -