From 4312cb42788949c0c7a8a57d5fc65ceed6c4c0ea Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 4 Sep 2024 22:09:59 +0000 Subject: [PATCH 01/45] Add mean and percentile info as computed_field properties such that they become serializable --- src/guidellm/core/report.py | 26 ++++------- src/guidellm/core/result.py | 92 ++++++++++++++++++++++++++++++++++++- 2 files changed, 99 insertions(+), 19 deletions(-) diff --git a/src/guidellm/core/report.py b/src/guidellm/core/report.py index b6791e4..c48eed5 100644 --- a/src/guidellm/core/report.py +++ b/src/guidellm/core/report.py @@ -147,19 +147,15 @@ def _create_benchmark_report_data_tokens_summary( for benchmark in report.benchmarks_sorted: table.add_row( _benchmark_rate_id(benchmark), - f"{benchmark.prompt_token_distribution.mean:.2f}", + f"{benchmark.prompt_token:.2f}", ", ".join( f"{percentile:.1f}" - for percentile in benchmark.prompt_token_distribution.percentiles( - [1, 5, 50, 95, 99] - ) + for percentile in benchmark.prompt_token_percentiles ), - f"{benchmark.output_token_distribution.mean:.2f}", + f"{benchmark.output_token:.2f}", ", ".join( f"{percentile:.1f}" - for percentile in benchmark.output_token_distribution.percentiles( - [1, 5, 50, 95, 99] - ) + for percentile in benchmark.output_token_percentiles ), ) logger.debug("Created data tokens summary table for the report.") @@ -181,7 +177,7 @@ def _create_benchmark_report_dist_perf_summary( "Benchmark", "Request Latency [1%, 5%, 10%, 50%, 90%, 95%, 99%] (sec)", "Time to First Token [1%, 5%, 10%, 50%, 90%, 95%, 99%] (ms)", - "Inter Token Latency [1%, 5%, 10%, 50%, 90% 95%, 99%] (ms)", + "Inter Token Latency [1%, 5%, 10%, 50%, 90%, 95%, 99%] (ms)", title="[magenta]Performance Stats by Benchmark[/magenta]", title_style="bold", title_justify="left", @@ -193,21 +189,15 @@ def _create_benchmark_report_dist_perf_summary( _benchmark_rate_id(benchmark), ", ".join( f"{percentile:.2f}" - for percentile in benchmark.request_latency_distribution.percentiles( - [1, 5, 10, 50, 90, 95, 99] - ) + for percentile in benchmark.request_latency_percentiles ), ", ".join( f"{percentile * 1000:.1f}" - for percentile in benchmark.ttft_distribution.percentiles( - [1, 5, 10, 50, 90, 95, 99] - ) + for percentile in benchmark.time_to_first_token_percentiles ), ", ".join( f"{percentile * 1000:.1f}" - for percentile in benchmark.itl_distribution.percentiles( - [1, 5, 10, 50, 90, 95, 99] - ) + for percentile in benchmark.inter_token_latency_percentiles ), ) logger.debug("Created distribution performance summary table for the report.") diff --git a/src/guidellm/core/result.py b/src/guidellm/core/result.py index f218784..5fd29a8 100644 --- a/src/guidellm/core/result.py +++ b/src/guidellm/core/result.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List, Literal, Optional, Union from loguru import logger -from pydantic import Field +from pydantic import Field, computed_field from guidellm.core.distribution import Distribution from guidellm.core.request import TextGenerationRequest @@ -221,6 +221,7 @@ def __iter__(self): """ return iter(self.results) + @computed_field @property def request_count(self) -> int: """ @@ -231,6 +232,7 @@ def request_count(self) -> int: """ return len(self.results) + @computed_field @property def error_count(self) -> int: """ @@ -241,6 +243,7 @@ def error_count(self) -> int: """ return len(self.errors) + @computed_field @property def total_count(self) -> int: """ @@ -251,6 +254,7 @@ def total_count(self) -> int: """ return self.request_count + self.error_count + @computed_field 
@property def start_time(self) -> Optional[float]: """ @@ -264,6 +268,7 @@ def start_time(self) -> Optional[float]: return self.results[0].start_time + @computed_field @property def end_time(self) -> Optional[float]: """ @@ -277,6 +282,7 @@ def end_time(self) -> Optional[float]: return self.results[-1].end_time + @computed_field @property def duration(self) -> float: """ @@ -290,6 +296,7 @@ def duration(self) -> float: return self.end_time - self.start_time + @computed_field @property def completed_request_rate(self) -> float: """ @@ -303,6 +310,7 @@ def completed_request_rate(self) -> float: return len(self.results) / self.duration + @computed_field @property def request_latency(self) -> float: """ @@ -332,6 +340,19 @@ def request_latency_distribution(self) -> Distribution: ] ) + @computed_field + @property + def request_latency_percentiles(self) -> List[float]: + """ + Get standard percentiles of request latency in seconds. + + :return: List of percentile request latency in seconds + :rtype: List[float] + """ + return self.request_latency_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) + + + @computed_field @property def time_to_first_token(self) -> float: """ @@ -360,7 +381,19 @@ def ttft_distribution(self) -> Distribution: if result.first_token_time is not None ] ) + + @computed_field + @property + def time_to_first_token_percentiles(self) -> List[float]: + """ + Get standard percentiles for time taken to decode the first token in milliseconds. + + :return: List of percentile time taken to decode the first token in milliseconds. + :rtype: List[float] + """ + return self.ttft_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) + @computed_field @property def inter_token_latency(self) -> float: """ @@ -387,7 +420,19 @@ def itl_distribution(self) -> Distribution: decode for result in self.results for decode in result.decode_times.data ] ) + + @computed_field + @property + def inter_token_latency_percentiles(self) -> List[float]: + """ + Get standard percentiles for the time between tokens in milliseconds. + :return: List of percentiles for the average time between tokens. + :rtype: List[float] + """ + return self.itl_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) + + @computed_field @property def output_token_throughput(self) -> float: """ @@ -403,6 +448,17 @@ def output_token_throughput(self) -> float: return total_tokens / self.duration + @computed_field + @property + def prompt_token(self) -> float: + """ + Get the average number of prompt tokens. + + :return: The average number of prompt tokens. + :rtype: float + """ + return self.prompt_token_distribution.mean + @property def prompt_token_distribution(self) -> Distribution: """ @@ -413,6 +469,28 @@ def prompt_token_distribution(self) -> Distribution: """ return Distribution(data=[result.prompt_token_count for result in self.results]) + @computed_field + @property + def prompt_token_percentiles(self) -> List[float]: + """ + Get standard percentiles for number of prompt tokens. + + :return: List of percentiles of number of prompt tokens. + :rtype: List[float] + """ + return self.prompt_token_distribution.percentiles([1, 5, 50, 95, 99]) + + @computed_field + @property + def output_token(self) -> float: + """ + Get the average number of output tokens. + + :return: The average number of output tokens. 
+ :rtype: float + """ + return self.output_token_distribution.mean + @property def output_token_distribution(self) -> Distribution: """ @@ -423,6 +501,18 @@ def output_token_distribution(self) -> Distribution: """ return Distribution(data=[result.output_token_count for result in self.results]) + @computed_field + @property + def output_token_percentiles(self) -> List[float]: + """ + Get standard percentiles for number of output tokens. + + :return: List of percentiles of number of output tokens. + :rtype: List[float] + """ + return self.output_token_distribution.percentiles([1, 5, 50, 95, 99]) + + @computed_field @property def overloaded(self) -> bool: if ( From 46e10764262ac05cbc7605bc65e4a2fad5b597a2 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Sep 2024 01:19:42 +0000 Subject: [PATCH 02/45] quality fixes --- src/guidellm/core/result.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/guidellm/core/result.py b/src/guidellm/core/result.py index 5fd29a8..90906b7 100644 --- a/src/guidellm/core/result.py +++ b/src/guidellm/core/result.py @@ -381,7 +381,7 @@ def ttft_distribution(self) -> Distribution: if result.first_token_time is not None ] ) - + @computed_field @property def time_to_first_token_percentiles(self) -> List[float]: @@ -390,7 +390,7 @@ def time_to_first_token_percentiles(self) -> List[float]: :return: List of percentile time taken to decode the first token in milliseconds. :rtype: List[float] - """ + """ return self.ttft_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) @computed_field @@ -420,7 +420,7 @@ def itl_distribution(self) -> Distribution: decode for result in self.results for decode in result.decode_times.data ] ) - + @computed_field @property def inter_token_latency_percentiles(self) -> List[float]: From 65fafdefc154a7184926c162fbe90a676cf5870e Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Sep 2024 01:23:25 +0000 Subject: [PATCH 03/45] quality fix --- src/guidellm/core/result.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/guidellm/core/result.py b/src/guidellm/core/result.py index 90906b7..95a4230 100644 --- a/src/guidellm/core/result.py +++ b/src/guidellm/core/result.py @@ -386,9 +386,11 @@ def ttft_distribution(self) -> Distribution: @property def time_to_first_token_percentiles(self) -> List[float]: """ - Get standard percentiles for time taken to decode the first token in milliseconds. + Get standard percentiles for time taken to decode the first token + in milliseconds. - :return: List of percentile time taken to decode the first token in milliseconds. + :return: List of percentile time taken to decode the first token + in milliseconds. 
:rtype: List[float] """ return self.ttft_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) From cc8d2c67c33affb6d99c2e86a090fb803388e098 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 6 Sep 2024 14:28:54 +0000 Subject: [PATCH 04/45] Quality fixes --- src/guidellm/core/result.py | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/guidellm/core/result.py b/src/guidellm/core/result.py index 95a4230..aebd176 100644 --- a/src/guidellm/core/result.py +++ b/src/guidellm/core/result.py @@ -221,7 +221,7 @@ def __iter__(self): """ return iter(self.results) - @computed_field + @computed_field # type: ignore[misc] @property def request_count(self) -> int: """ @@ -232,7 +232,7 @@ def request_count(self) -> int: """ return len(self.results) - @computed_field + @computed_field # type: ignore[misc] @property def error_count(self) -> int: """ @@ -243,7 +243,7 @@ def error_count(self) -> int: """ return len(self.errors) - @computed_field + @computed_field # type: ignore[misc] @property def total_count(self) -> int: """ @@ -254,7 +254,7 @@ def total_count(self) -> int: """ return self.request_count + self.error_count - @computed_field + @computed_field # type: ignore[misc] @property def start_time(self) -> Optional[float]: """ @@ -268,7 +268,7 @@ def start_time(self) -> Optional[float]: return self.results[0].start_time - @computed_field + @computed_field # type: ignore[misc] @property def end_time(self) -> Optional[float]: """ @@ -282,7 +282,7 @@ def end_time(self) -> Optional[float]: return self.results[-1].end_time - @computed_field + @computed_field # type: ignore[misc] @property def duration(self) -> float: """ @@ -296,7 +296,7 @@ def duration(self) -> float: return self.end_time - self.start_time - @computed_field + @computed_field # type: ignore[misc] @property def completed_request_rate(self) -> float: """ @@ -310,7 +310,7 @@ def completed_request_rate(self) -> float: return len(self.results) / self.duration - @computed_field + @computed_field # type: ignore[misc] @property def request_latency(self) -> float: """ @@ -340,7 +340,7 @@ def request_latency_distribution(self) -> Distribution: ] ) - @computed_field + @computed_field # type: ignore[misc] @property def request_latency_percentiles(self) -> List[float]: """ @@ -352,7 +352,7 @@ def request_latency_percentiles(self) -> List[float]: return self.request_latency_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) - @computed_field + @computed_field # type: ignore[misc] @property def time_to_first_token(self) -> float: """ @@ -382,7 +382,7 @@ def ttft_distribution(self) -> Distribution: ] ) - @computed_field + @computed_field # type: ignore[misc] @property def time_to_first_token_percentiles(self) -> List[float]: """ @@ -395,7 +395,7 @@ def time_to_first_token_percentiles(self) -> List[float]: """ return self.ttft_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) - @computed_field + @computed_field # type: ignore[misc] @property def inter_token_latency(self) -> float: """ @@ -423,7 +423,7 @@ def itl_distribution(self) -> Distribution: ] ) - @computed_field + @computed_field # type: ignore[misc] @property def inter_token_latency_percentiles(self) -> List[float]: """ @@ -434,7 +434,7 @@ def inter_token_latency_percentiles(self) -> List[float]: """ return self.itl_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) - @computed_field + @computed_field # type: ignore[misc] @property def output_token_throughput(self) -> float: """ @@ -450,7 +450,7 @@ def 
output_token_throughput(self) -> float: return total_tokens / self.duration - @computed_field + @computed_field # type: ignore[misc] @property def prompt_token(self) -> float: """ @@ -471,7 +471,7 @@ def prompt_token_distribution(self) -> Distribution: """ return Distribution(data=[result.prompt_token_count for result in self.results]) - @computed_field + @computed_field # type: ignore[misc] @property def prompt_token_percentiles(self) -> List[float]: """ @@ -482,7 +482,7 @@ def prompt_token_percentiles(self) -> List[float]: """ return self.prompt_token_distribution.percentiles([1, 5, 50, 95, 99]) - @computed_field + @computed_field # type: ignore[misc] @property def output_token(self) -> float: """ @@ -503,7 +503,7 @@ def output_token_distribution(self) -> Distribution: """ return Distribution(data=[result.output_token_count for result in self.results]) - @computed_field + @computed_field # type: ignore[misc] @property def output_token_percentiles(self) -> List[float]: """ @@ -514,7 +514,7 @@ def output_token_percentiles(self) -> List[float]: """ return self.output_token_distribution.percentiles([1, 5, 50, 95, 99]) - @computed_field + @computed_field # type: ignore[misc] @property def overloaded(self) -> bool: if ( From bb9bc0c1b8d72fc34d2acce8421e15a00077e4c2 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 19:07:22 +0000 Subject: [PATCH 05/45] Add class to describe image samples and loading logic for images from url --- src/guidellm/utils/images.py | 69 ++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 src/guidellm/utils/images.py diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py new file mode 100644 index 0000000..5e96ce1 --- /dev/null +++ b/src/guidellm/utils/images.py @@ -0,0 +1,69 @@ +from PIL import Image +from bs4 import BeautifulSoup +from urllib.parse import urljoin, urlparse +from pydantic import Field, ConfigDict +from typing import List, Optional +from io import BytesIO + +from loguru import logger + +import requests + +from guidellm.config import settings +from guidellm.core.serializable import Serializable + +__all__ = ["load_images", "ImageDescriptor"] + +class ImageDescriptor(Serializable): + """ + A class to represent image data in serializable format. 
+ """ + model_config = ConfigDict(arbitrary_types_allowed=True) + + url: Optional[str] = Field(description="url address for image.") + image: Image.Image = Field(description="PIL image", exclude=True) + filename: Optional[int] = Field( + default=None, + description="Image filename.", + ) + + +def load_images(data: str) -> List[ImageDescriptor]: + """ + Load an HTML file from a path or URL + + :param data: the path or URL to load the HTML file from + :type data: Union[str, Path] + :return: Descriptor containing image url and the data in PIL.Image.Image format + :rtype: ImageDescriptor + """ + + images = [] + if not data: + return None + if isinstance(data, str) and data.startswith("http"): + response = requests.get(data, timeout=settings.request_timeout) + response.raise_for_status() + + soup = BeautifulSoup(response.text, 'html.parser') + for img_tag in soup.find_all("img"): + img_url = img_tag.get("src") + + if img_url: + # Handle relative URLs + img_url = urljoin(data, img_url) + + # Download the image + logger.debug("Loading image: {}", img_url) + img_response = requests.get(img_url) + img_response.raise_for_status() + + # Load image into Pillow + images.append( + ImageDescriptor( + url=img_url, + image=Image.open(BytesIO(img_response.content)), + ) + ) + + return images \ No newline at end of file From 59002b511339a22833a20a51023c40b680a1a3f5 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 19:07:49 +0000 Subject: [PATCH 06/45] Add class to describe image samples and loading logic for images from url --- src/guidellm/utils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 2fdd8ca..6f2f669 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -12,6 +12,7 @@ split_lines_by_punctuation, split_text, ) +from .images import load_images, ImageDescriptor from .transformers import ( load_transformers_dataset, resolve_transformers_dataset, From cb1f244ac5ce15dad230bbf84f541868b3ffa393 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 19:10:01 +0000 Subject: [PATCH 07/45] Add url used to download images from for emulated requests --- src/guidellm/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/guidellm/config.py b/src/guidellm/config.py index c3d950e..df750ea 100644 --- a/src/guidellm/config.py +++ b/src/guidellm/config.py @@ -90,6 +90,7 @@ class EmulatedDataSettings(BaseModel): "force_new_line_punctuation": True, } ) + image_source: List[str] = "https://www.gutenberg.org/cache/epub/1342/pg1342-images.html" class OpenAISettings(BaseModel): From 24e652721ef3fe8268754f5802cb49e878145384 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 19:10:24 +0000 Subject: [PATCH 08/45] Add support to images in requests --- src/guidellm/backend/openai.py | 25 ++++++++++++++++++++++--- src/guidellm/core/request.py | 15 ++++++++++++++- src/guidellm/request/emulated.py | 17 +++++++++++++++-- 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index 8c83f91..c740b34 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -1,4 +1,5 @@ from typing import AsyncGenerator, Dict, List, Optional +import io, base64 from loguru import logger from openai import AsyncOpenAI, OpenAI @@ -103,11 +104,11 @@ async def make_request( request_args.update(self._request_args) + messages = self._build_messages(request) + stream = await 
self._async_client.chat.completions.create( model=self.model, - messages=[ - {"role": "user", "content": request.prompt}, - ], + messages=messages, stream=True, **request_args, ) @@ -167,3 +168,21 @@ def validate_connection(self): except Exception as error: logger.error("Failed to validate OpenAI connection: {}", error) raise error + + def _build_messages(self, request: TextGenerationRequest) -> Dict: + if request.number_images == 0: + messages = [{"role": "user", "content": request.prompt}] + else: + content = [] + for image in request.images: + stream = io.BytesIO() + im_format = image.image.format or "PNG" + image.image.save(stream, format=im_format) + im_b64 = base64.b64encode(stream.getvalue()).decode("ascii") + image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"} + content.append({"type": "image_url", "image_url": image_url}) + + content.append({"type": "text", "text": request.prompt}) + messages = [{"role": "user", "content": content}] + + return messages diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index 4f7315c..8f93b56 100644 --- a/src/guidellm/core/request.py +++ b/src/guidellm/core/request.py @@ -1,9 +1,10 @@ import uuid -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, List from pydantic import Field from guidellm.core.serializable import Serializable +from guidellm.utils import ImageDescriptor class TextGenerationRequest(Serializable): @@ -16,6 +17,10 @@ class TextGenerationRequest(Serializable): description="The unique identifier for the request.", ) prompt: str = Field(description="The input prompt for the text generation.") + images: Optional[List[ImageDescriptor]] = Field( + default=None, + description="Input images.", + ) prompt_token_count: Optional[int] = Field( default=None, description="The number of tokens in the input prompt.", @@ -29,6 +34,13 @@ class TextGenerationRequest(Serializable): description="The parameters for the text generation request.", ) + @property + def number_images(self) -> int: + if self.images is None: + return 0 + else: + return len(self.images) + def __str__(self) -> str: prompt_short = ( self.prompt[:32] + "..." @@ -41,4 +53,5 @@ def __str__(self) -> str: f"prompt={prompt_short}, prompt_token_count={self.prompt_token_count}, " f"output_token_count={self.output_token_count}, " f"params={self.params})" + f"images={self.number_images}" ) diff --git a/src/guidellm/request/emulated.py b/src/guidellm/request/emulated.py index 7d481cb..b7053de 100644 --- a/src/guidellm/request/emulated.py +++ b/src/guidellm/request/emulated.py @@ -11,7 +11,7 @@ from guidellm.config import settings from guidellm.core.request import TextGenerationRequest from guidellm.request.base import GenerationMode, RequestGenerator -from guidellm.utils import clean_text, filter_text, load_text, split_text +from guidellm.utils import clean_text, filter_text, load_text, split_text, load_images __all__ = ["EmulatedConfig", "EmulatedRequestGenerator", "EndlessTokens"] @@ -30,6 +30,7 @@ class EmulatedConfig: generated_tokens_variance (Optional[int]): Variance for generated tokens. generated_tokens_min (Optional[int]): Minimum number of generated tokens. generated_tokens_max (Optional[int]): Maximum number of generated tokens. + images (Optional[int]): Number of input images. 
""" @staticmethod @@ -47,7 +48,7 @@ def create_config(config: Optional[Union[str, Path, Dict]]) -> "EmulatedConfig": """ if not config: logger.debug("Creating default configuration") - return EmulatedConfig(prompt_tokens=1024, generated_tokens=256) + return EmulatedConfig(prompt_tokens=1024, generated_tokens=256, images=0) if isinstance(config, dict): logger.debug("Loading configuration from dict: {}", config) @@ -105,6 +106,8 @@ def create_config(config: Optional[Union[str, Path, Dict]]) -> "EmulatedConfig": generated_tokens_min: Optional[int] = None generated_tokens_max: Optional[int] = None + images: int = 0 + @property def prompt_tokens_range(self) -> Tuple[int, int]: """ @@ -327,6 +330,8 @@ def __init__( settings.emulated_data.filter_start, settings.emulated_data.filter_end, ) + if self._config.images > 0: + self._images = load_images(settings.emulated_data.image_source) self._rng = np.random.default_rng(random_seed) # NOTE: Must be after all the parameters since the queue population @@ -355,6 +360,7 @@ def create_item(self) -> TextGenerationRequest: logger.debug("Creating new text generation request") target_prompt_token_count = self._config.sample_prompt_tokens(self._rng) prompt = self.sample_prompt(target_prompt_token_count) + images = self.sample_images() prompt_token_count = len(self.tokenizer.tokenize(prompt)) output_token_count = self._config.sample_output_tokens(self._rng) logger.debug("Generated prompt: {}", prompt) @@ -363,6 +369,7 @@ def create_item(self) -> TextGenerationRequest: prompt=prompt, prompt_token_count=prompt_token_count, output_token_count=output_token_count, + images=images, ) def sample_prompt(self, tokens: int) -> str: @@ -395,3 +402,9 @@ def sample_prompt(self, tokens: int) -> str: right = mid return self._tokens.create_text(start_line_index, left) + + + def sample_images(self): + image_indices = self._rng.choice(len(self._images), size=self._config.images, replace=False) + + return [self._images[i] for i in image_indices] \ No newline at end of file From 394670999785536b696978eae411cbcf7c4583cd Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 19:45:56 +0000 Subject: [PATCH 09/45] quality fixes --- src/guidellm/backend/openai.py | 7 ++++--- src/guidellm/core/request.py | 2 +- src/guidellm/request/emulated.py | 8 ++++---- src/guidellm/utils/__init__.py | 2 +- src/guidellm/utils/images.py | 27 +++++++++++++-------------- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index c740b34..f75bb3b 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -1,5 +1,6 @@ +import base64 +import io from typing import AsyncGenerator, Dict, List, Optional -import io, base64 from loguru import logger from openai import AsyncOpenAI, OpenAI @@ -182,7 +183,7 @@ def _build_messages(self, request: TextGenerationRequest) -> Dict: image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"} content.append({"type": "image_url", "image_url": image_url}) - content.append({"type": "text", "text": request.prompt}) + content.append({"type": "text", "text": request.prompt}) messages = [{"role": "user", "content": content}] - + return messages diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index 8f93b56..a1ff199 100644 --- a/src/guidellm/core/request.py +++ b/src/guidellm/core/request.py @@ -1,5 +1,5 @@ import uuid -from typing import Any, Dict, Optional, List +from typing import Any, Dict, List, Optional from pydantic import 
Field diff --git a/src/guidellm/request/emulated.py b/src/guidellm/request/emulated.py index b7053de..9dc3825 100644 --- a/src/guidellm/request/emulated.py +++ b/src/guidellm/request/emulated.py @@ -11,7 +11,7 @@ from guidellm.config import settings from guidellm.core.request import TextGenerationRequest from guidellm.request.base import GenerationMode, RequestGenerator -from guidellm.utils import clean_text, filter_text, load_text, split_text, load_images +from guidellm.utils import clean_text, filter_text, load_images, load_text, split_text __all__ = ["EmulatedConfig", "EmulatedRequestGenerator", "EndlessTokens"] @@ -402,9 +402,9 @@ def sample_prompt(self, tokens: int) -> str: right = mid return self._tokens.create_text(start_line_index, left) - - + + def sample_images(self): image_indices = self._rng.choice(len(self._images), size=self._config.images, replace=False) - return [self._images[i] for i in image_indices] \ No newline at end of file + return [self._images[i] for i in image_indices] diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 6f2f669..1e51f22 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -1,3 +1,4 @@ +from .images import ImageDescriptor, load_images from .injector import create_report, inject_data from .progress import BenchmarkReportProgress from .text import ( @@ -12,7 +13,6 @@ split_lines_by_punctuation, split_text, ) -from .images import load_images, ImageDescriptor from .transformers import ( load_transformers_dataset, resolve_transformers_dataset, diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index 5e96ce1..5d73bc0 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -1,13 +1,12 @@ -from PIL import Image -from bs4 import BeautifulSoup -from urllib.parse import urljoin, urlparse -from pydantic import Field, ConfigDict -from typing import List, Optional from io import BytesIO - -from loguru import logger +from typing import List, Optional +from urllib.parse import urljoin import requests +from bs4 import BeautifulSoup +from loguru import logger +from PIL import Image +from pydantic import ConfigDict, Field from guidellm.config import settings from guidellm.core.serializable import Serializable @@ -19,14 +18,14 @@ class ImageDescriptor(Serializable): A class to represent image data in serializable format. 
""" model_config = ConfigDict(arbitrary_types_allowed=True) - + url: Optional[str] = Field(description="url address for image.") image: Image.Image = Field(description="PIL image", exclude=True) filename: Optional[int] = Field( default=None, description="Image filename.", ) - + def load_images(data: str) -> List[ImageDescriptor]: """ @@ -45,25 +44,25 @@ def load_images(data: str) -> List[ImageDescriptor]: response = requests.get(data, timeout=settings.request_timeout) response.raise_for_status() - soup = BeautifulSoup(response.text, 'html.parser') + soup = BeautifulSoup(response.text, "html.parser") for img_tag in soup.find_all("img"): img_url = img_tag.get("src") if img_url: # Handle relative URLs img_url = urljoin(data, img_url) - + # Download the image logger.debug("Loading image: {}", img_url) img_response = requests.get(img_url) img_response.raise_for_status() - + # Load image into Pillow images.append( ImageDescriptor( - url=img_url, + url=img_url, image=Image.open(BytesIO(img_response.content)), ) ) - return images \ No newline at end of file + return images From 7d93b020d34e28ebdf04e7bc39a13c867fa6ef97 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 19:53:57 +0000 Subject: [PATCH 10/45] Quality fixes --- src/guidellm/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/guidellm/__init__.py b/src/guidellm/__init__.py index e562018..b10b445 100644 --- a/src/guidellm/__init__.py +++ b/src/guidellm/__init__.py @@ -6,6 +6,7 @@ # flake8: noqa import os + import transformers # type: ignore os.environ["TOKENIZERS_PARALLELISM"] = "false" # Silence warnings for tokenizers From a441dade284aa9e682c652430a739d7298c3e82e Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 20:01:26 +0000 Subject: [PATCH 11/45] Quality fixes --- src/guidellm/request/emulated.py | 4 +++- src/guidellm/utils/__init__.py | 2 ++ src/guidellm/utils/images.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/guidellm/request/emulated.py b/src/guidellm/request/emulated.py index 9dc3825..f15387e 100644 --- a/src/guidellm/request/emulated.py +++ b/src/guidellm/request/emulated.py @@ -405,6 +405,8 @@ def sample_prompt(self, tokens: int) -> str: def sample_images(self): - image_indices = self._rng.choice(len(self._images), size=self._config.images, replace=False) + image_indices = self._rng.choice( + len(self._images), size=self._config.images, replace=False, + ) return [self._images[i] for i in image_indices] diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 1e51f22..eb4931b 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -38,4 +38,6 @@ "resolve_transformers_dataset_split", "split_lines_by_punctuation", "split_text", + "ImageDescriptor", + "load_images", ] diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index 5d73bc0..5c5a727 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -65,4 +65,4 @@ def load_images(data: str) -> List[ImageDescriptor]: ) ) - return images + return images From 570670b6c2a24869a40635f0112af7d92da0e73c Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 5 Nov 2024 01:11:06 +0000 Subject: [PATCH 12/45] Quality fixes --- src/guidellm/backend/openai.py | 2 +- src/guidellm/utils/images.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index f75bb3b..90d2791 100644 --- a/src/guidellm/backend/openai.py +++ 
b/src/guidellm/backend/openai.py @@ -179,7 +179,7 @@ def _build_messages(self, request: TextGenerationRequest) -> Dict: stream = io.BytesIO() im_format = image.image.format or "PNG" image.image.save(stream, format=im_format) - im_b64 = base64.b64encode(stream.getvalue()).decode("ascii") + im_b64 = base64.b64encode(stream.getvalue()).decode("utf-8") image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"} content.append({"type": "image_url", "image_url": image_url}) diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index 5c5a727..5d73bc0 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -65,4 +65,4 @@ def load_images(data: str) -> List[ImageDescriptor]: ) ) - return images + return images From 984da28e4423f1726888b6a37de30621029d3622 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 5 Nov 2024 02:43:04 +0000 Subject: [PATCH 13/45] Add new dependencies --- .pre-commit-config.yaml | 3 +++ pyproject.toml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a085bb..6bcf150 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,6 +27,9 @@ repos: pyyaml, requests, rich, + pillow, + base64, + io, transformers, # dev dependencies diff --git a/pyproject.toml b/pyproject.toml index 6ab2c6e..b83abfd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,9 @@ dependencies = [ "pyyaml>=6.0.0", "requests", "rich", + "pillow", + "base64", + "io", "transformers", ] From 355f368b559d2b35e8d95fc7e761b53fce5cf15d Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Dec 2024 22:54:54 +0000 Subject: [PATCH 14/45] Allow images to be resized to specific resolution --- src/guidellm/core/request.py | 12 ++++++++++-- src/guidellm/request/emulated.py | 10 ++++++++-- src/guidellm/utils/images.py | 6 +++++- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index a1ff199..8585979 100644 --- a/src/guidellm/core/request.py +++ b/src/guidellm/core/request.py @@ -1,5 +1,5 @@ import uuid -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple from pydantic import Field @@ -41,6 +41,14 @@ def number_images(self) -> int: else: return len(self.images) + @property + def image_resolution(self) -> Tuple[int]: + if self.images is None: + return None + else: + return [im.size for im in self.images] + + def __str__(self) -> str: prompt_short = ( self.prompt[:32] + "..." @@ -53,5 +61,5 @@ def __str__(self) -> str: f"prompt={prompt_short}, prompt_token_count={self.prompt_token_count}, " f"output_token_count={self.output_token_count}, " f"params={self.params})" - f"images={self.number_images}" + f"image_resolution={self.image_resolution}" ) diff --git a/src/guidellm/request/emulated.py b/src/guidellm/request/emulated.py index f15387e..8818ff9 100644 --- a/src/guidellm/request/emulated.py +++ b/src/guidellm/request/emulated.py @@ -30,7 +30,8 @@ class EmulatedConfig: generated_tokens_variance (Optional[int]): Variance for generated tokens. generated_tokens_min (Optional[int]): Minimum number of generated tokens. generated_tokens_max (Optional[int]): Maximum number of generated tokens. - images (Optional[int]): Number of input images. + images (Optional[int]): Number of images. + image_resultion (Optional[List[int]]): Resolution of images. 
""" @staticmethod @@ -107,6 +108,11 @@ def create_config(config: Optional[Union[str, Path, Dict]]) -> "EmulatedConfig": generated_tokens_max: Optional[int] = None images: int = 0 + image_resolution = None + + def __post_init__(self): + if self.images is not None and self.image_resultion is not None and self.images > 0: + assert len(self.image_resolution) == 2 @property def prompt_tokens_range(self) -> Tuple[int, int]: @@ -331,7 +337,7 @@ def __init__( settings.emulated_data.filter_end, ) if self._config.images > 0: - self._images = load_images(settings.emulated_data.image_source) + self._images = load_images(settings.emulated_data.image_source, self._config.image_resolution) self._rng = np.random.default_rng(random_seed) # NOTE: Must be after all the parameters since the queue population diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index 5d73bc0..569fe75 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -27,7 +27,7 @@ class ImageDescriptor(Serializable): ) -def load_images(data: str) -> List[ImageDescriptor]: +def load_images(data: str, image_resolution: Optional[List[int]]) -> List[ImageDescriptor]: """ Load an HTML file from a path or URL @@ -56,6 +56,10 @@ def load_images(data: str) -> List[ImageDescriptor]: logger.debug("Loading image: {}", img_url) img_response = requests.get(img_url) img_response.raise_for_status() + image = Image.open(BytesIO(img_response.content)) + + if image_resolution is not None: + image = image.resize(image_resolution) # Load image into Pillow images.append( From 43f14d4febc467daf8ef2028ec10e40e9d4c5f37 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 4 Dec 2024 18:07:22 +0000 Subject: [PATCH 15/45] Ignore EOS --- src/guidellm/backend/openai.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index b5cbc12..8ae18c5 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -92,6 +92,7 @@ async def make_request( { "max_tokens": request.output_token_count, "stop": None, + "ignore_eos": True, } ) elif settings.openai.max_gen_tokens and settings.openai.max_gen_tokens > 0: From d9819e94e87c2855214eda586861886f34c1e61d Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 4 Dec 2024 18:47:05 +0000 Subject: [PATCH 16/45] Ignore EOS --- src/guidellm/backend/openai.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index 8ae18c5..6f420ad 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -92,7 +92,9 @@ async def make_request( { "max_tokens": request.output_token_count, "stop": None, - "ignore_eos": True, + "extra_body": { + "ignore_eos": True, + } } ) elif settings.openai.max_gen_tokens and settings.openai.max_gen_tokens > 0: From 503a56c6c4e4feb3d669cf9d6cc1ff095f175062 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Dec 2024 22:41:44 +0000 Subject: [PATCH 17/45] Add image processing dependencies --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index b83abfd..caaeef0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,8 @@ dependencies = [ "base64", "io", "transformers", + "pillow", + "bs4", ] [project.optional-dependencies] From ffcb28ded1762b309fa46bebae94eec645dcb07f Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Dec 2024 22:43:35 +0000 Subject: [PATCH 18/45] Fix support to images --- 
src/guidellm/core/request.py | 4 ++-- src/guidellm/request/emulated.py | 8 +++++--- src/guidellm/utils/images.py | 10 +++++++++- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index 8585979..cc82659 100644 --- a/src/guidellm/core/request.py +++ b/src/guidellm/core/request.py @@ -1,5 +1,5 @@ import uuid -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional from pydantic import Field @@ -42,7 +42,7 @@ def number_images(self) -> int: return len(self.images) @property - def image_resolution(self) -> Tuple[int]: + def image_resolution(self) -> List[int]: if self.images is None: return None else: diff --git a/src/guidellm/request/emulated.py b/src/guidellm/request/emulated.py index 8818ff9..43c3389 100644 --- a/src/guidellm/request/emulated.py +++ b/src/guidellm/request/emulated.py @@ -31,7 +31,8 @@ class EmulatedConfig: generated_tokens_min (Optional[int]): Minimum number of generated tokens. generated_tokens_max (Optional[int]): Maximum number of generated tokens. images (Optional[int]): Number of images. - image_resultion (Optional[List[int]]): Resolution of images. + width (Optional[int]): Width of images. + height (Optional[int]): Height of images. """ @staticmethod @@ -108,7 +109,8 @@ def create_config(config: Optional[Union[str, Path, Dict]]) -> "EmulatedConfig": generated_tokens_max: Optional[int] = None images: int = 0 - image_resolution = None + width: int = None + height: int = None def __post_init__(self): if self.images is not None and self.image_resultion is not None and self.images > 0: @@ -337,7 +339,7 @@ def __init__( settings.emulated_data.filter_end, ) if self._config.images > 0: - self._images = load_images(settings.emulated_data.image_source, self._config.image_resolution) + self._images = load_images(settings.emulated_data.image_source, [self._config.width, self._config.height]) self._rng = np.random.default_rng(random_seed) # NOTE: Must be after all the parameters since the queue population diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index 569fe75..72846d7 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -6,7 +6,7 @@ from bs4 import BeautifulSoup from loguru import logger from PIL import Image -from pydantic import ConfigDict, Field +from pydantic import ConfigDict, Field, computed_field from guidellm.config import settings from guidellm.core.serializable import Serializable @@ -26,6 +26,14 @@ class ImageDescriptor(Serializable): description="Image filename.", ) + @computed_field # type: ignore[misc] + @property + def image_resolution(self) -> List[int]: + if self.images is None: + return None + else: + return [im.size for im in self.images] + def load_images(data: str, image_resolution: Optional[List[int]]) -> List[ImageDescriptor]: """ From 6106a719c5b64f4f6a834879a3512432b45bb92f Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Dec 2024 22:51:42 +0000 Subject: [PATCH 19/45] Fix serialization --- src/guidellm/core/request.py | 4 ++-- src/guidellm/utils/images.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index cc82659..8ace3d1 100644 --- a/src/guidellm/core/request.py +++ b/src/guidellm/core/request.py @@ -1,5 +1,5 @@ import uuid -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple from pydantic import Field @@ -42,7 +42,7 @@ def 
number_images(self) -> int: return len(self.images) @property - def image_resolution(self) -> List[int]: + def image_resolution(self) -> List[Tuple[int]]: if self.images is None: return None else: diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index 72846d7..ed025db 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -1,5 +1,5 @@ from io import BytesIO -from typing import List, Optional +from typing import List, Optional, Tuple from urllib.parse import urljoin import requests @@ -28,11 +28,11 @@ class ImageDescriptor(Serializable): @computed_field # type: ignore[misc] @property - def image_resolution(self) -> List[int]: - if self.images is None: + def image_resolution(self) -> Tuple[int]: + if self.image is None: return None else: - return [im.size for im in self.images] + return self.image.size def load_images(data: str, image_resolution: Optional[List[int]]) -> List[ImageDescriptor]: From 81718204b74c649486e91c900b05a48b053944e7 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Dec 2024 22:52:30 +0000 Subject: [PATCH 20/45] Fix image registration --- src/guidellm/request/emulated.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/guidellm/request/emulated.py b/src/guidellm/request/emulated.py index 43c3389..02f564a 100644 --- a/src/guidellm/request/emulated.py +++ b/src/guidellm/request/emulated.py @@ -112,10 +112,6 @@ def create_config(config: Optional[Union[str, Path, Dict]]) -> "EmulatedConfig": width: int = None height: int = None - def __post_init__(self): - if self.images is not None and self.image_resultion is not None and self.images > 0: - assert len(self.image_resolution) == 2 - @property def prompt_tokens_range(self) -> Tuple[int, int]: """ From e845510888c212c4395da2e40b5d58113c03ea97 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Dec 2024 22:55:43 +0000 Subject: [PATCH 21/45] Fix pydantic format --- src/guidellm/core/request.py | 2 +- src/guidellm/utils/images.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index 8ace3d1..06d0f37 100644 --- a/src/guidellm/core/request.py +++ b/src/guidellm/core/request.py @@ -42,7 +42,7 @@ def number_images(self) -> int: return len(self.images) @property - def image_resolution(self) -> List[Tuple[int]]: + def image_resolution(self) -> List[Tuple[int, int]]: if self.images is None: return None else: diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index ed025db..ed1b30a 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -28,7 +28,7 @@ class ImageDescriptor(Serializable): @computed_field # type: ignore[misc] @property - def image_resolution(self) -> Tuple[int]: + def image_resolution(self) -> Tuple[int, int]: if self.image is None: return None else: From d1ad0f89a53c4c8d09269e8df0ff613a2a18355a Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Dec 2024 23:20:48 +0000 Subject: [PATCH 22/45] Use resized image --- src/guidellm/utils/images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index ed1b30a..fb66d43 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -73,7 +73,7 @@ def load_images(data: str, image_resolution: Optional[List[int]]) -> List[ImageD images.append( ImageDescriptor( url=img_url, - image=Image.open(BytesIO(img_response.content)), + image=image, ) ) From 
40e8e92d39f73f01e6c5bf8a069c00ee5ca4d221 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 6 Dec 2024 13:54:21 -0500 Subject: [PATCH 23/45] Update pyproject.toml --- pyproject.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index caaeef0..b83abfd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,8 +40,6 @@ dependencies = [ "base64", "io", "transformers", - "pillow", - "bs4", ] [project.optional-dependencies] From 0d8eb2f03aa61b9dd22a787b90ee3da9de35a8cf Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 6 Dec 2024 14:06:33 -0500 Subject: [PATCH 24/45] Update pyproject.toml --- pyproject.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b83abfd..6ab2c6e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,9 +36,6 @@ dependencies = [ "pyyaml>=6.0.0", "requests", "rich", - "pillow", - "base64", - "io", "transformers", ] From 511d3cb56997543265607011fc4af468624c66d0 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 6 Dec 2024 14:07:04 -0500 Subject: [PATCH 25/45] Update .pre-commit-config.yaml --- .pre-commit-config.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6bcf150..2a085bb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,9 +27,6 @@ repos: pyyaml, requests, rich, - pillow, - base64, - io, transformers, # dev dependencies From bca2614c0da83fba8a12ab76453e40e464e44997 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 6 Dec 2024 21:54:52 +0000 Subject: [PATCH 26/45] Adds aiohttp backend --- src/guidellm/backend/__init__.py | 2 + src/guidellm/backend/aiohttp.py | 160 +++++++++++++++++++++++++++++++ src/guidellm/backend/base.py | 2 +- src/guidellm/config.py | 4 + 4 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 src/guidellm/backend/aiohttp.py diff --git a/src/guidellm/backend/__init__.py b/src/guidellm/backend/__init__.py index 875e319..1391018 100644 --- a/src/guidellm/backend/__init__.py +++ b/src/guidellm/backend/__init__.py @@ -1,5 +1,6 @@ from .base import Backend, BackendEngine, BackendEnginePublic, GenerativeResponse from .openai import OpenAIBackend +from .aiohttp import AiohttpBackend __all__ = [ "Backend", @@ -7,4 +8,5 @@ "BackendEnginePublic", "GenerativeResponse", "OpenAIBackend", + "AiohttpBackend" ] diff --git a/src/guidellm/backend/aiohttp.py b/src/guidellm/backend/aiohttp.py new file mode 100644 index 0000000..138f45a --- /dev/null +++ b/src/guidellm/backend/aiohttp.py @@ -0,0 +1,160 @@ +from typing import AsyncGenerator, Dict, List, Optional +from loguru import logger + +import aiohttp +import json + +from guidellm.backend.base import Backend, GenerativeResponse +from guidellm.config import settings +from guidellm.core import TextGenerationRequest + +__all__ = ["AiohttpBackend"] + +@Backend.register("aiohttp_server") +class AiohttpBackend(Backend): + """ + An aiohttp-based backend implementation for LLM requests. + + This class provides an interface to communicate with a server hosting + an LLM API using aiohttp for asynchronous requests. 
+ """ + + def __init__( + self, + openai_api_key: Optional[str] = None, + target: Optional[str] = None, + model: Optional[str] = None, + timeout: Optional[float] = None, + **request_args, + ): + self._request_args: Dict = request_args + self._api_key: str = openai_api_key or settings.aiohttp.api_key + + if not self._api_key: + err = ValueError( + "`GUIDELLM__AIOHTTP__API_KEY` environment variable or " + "--openai-api-key CLI parameter must be specified for the " + "aiohttp backend." + ) + logger.error("{}", err) + raise err + + base_url = target or settings.aiohttp.base_url + self._api_url = f"{base_url}/chat/completions" + + if not base_url: + err = ValueError( + "`GUIDELLM__AIOHTTP__BASE_URL` environment variable or " + "target parameter must be specified for the OpenAI backend." + ) + logger.error("{}", err) + raise err + + self._timeout = aiohttp.ClientTimeout(total=timeout or settings.request_timeout) + self._model = model + + super().__init__(type_="aiohttp_backend", target=base_url, model=self._model) + logger.info("aiohttp {} Backend listening on {}", self._model, base_url) + + async def make_request( + self, + request: TextGenerationRequest, + ) -> AsyncGenerator[GenerativeResponse, None]: + """ + Make a request to the aiohttp backend. + + Sends a prompt to the LLM server and streams the response tokens. + + :param request: The text generation request to submit. + :type request: TextGenerationRequest + :yield: A stream of GenerativeResponse objects. + :rtype: AsyncGenerator[GenerativeResponse, None] + """ + + async with aiohttp.ClientSession(timeout=self._timeout) as session: + logger.debug("Making request to aiohttp backend with prompt: {}", request.prompt) + + request_args = {} + if request.output_token_count is not None: + request_args.update( + { + "max_completion_tokens": request.output_token_count, + "stop": None, + "ignore_eos": True, + } + ) + elif settings.aiohttp.max_gen_tokens and settings.aiohttp.max_gen_tokens > 0: + request_args.update( + { + "max_tokens": settings.aiohttp.max_gen_tokens, + } + ) + + request_args.update(self._request_args) + + payload = { + "model": self._model, + "messages": [ + {"role": "user", "content": request.prompt}, + ], + "stream": True, + **request_args, + } + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self._api_key}", + } + + try: + async with session.post(url=self._api_url, json=payload, headers=headers) as response: + if response.status != 200: + error_message = await response.text() + logger.error("Request failed: {} - {}", response.status, error_message) + raise Exception(f"Failed to generate response: {error_message}") + + token_count = 0 + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") + if chunk == "[DONE]": + # Final response + yield GenerativeResponse( + type_="final", + prompt=request.prompt, + output_token_count=token_count, + prompt_token_count=request.prompt_token_count, + ) + else: + # Intermediate token response + token_count += 1 + data = json.loads(chunk) + delta = data["choices"][0]["delta"] + token = delta["content"] + yield GenerativeResponse( + type_="token_iter", + add_token=token, + prompt=request.prompt, + output_token_count=token_count, + prompt_token_count=request.prompt_token_count, + ) + except Exception as e: + logger.error("Error while making request: {}", e) + raise + + def available_models(self) -> List[str]: + """ + Retrieve a list of available 
models from the server. + """ + # This could include an API call to `self._api_url/models` if the server supports it. + logger.warning("Fetching available models is not implemented for aiohttp backend.") + return [] + + def validate_connection(self): + """ + Validate the connection to the backend server. + """ + logger.info("Connection validation is not explicitly implemented for aiohttp backend.") diff --git a/src/guidellm/backend/base.py b/src/guidellm/backend/base.py index d71c5f6..a165859 100644 --- a/src/guidellm/backend/base.py +++ b/src/guidellm/backend/base.py @@ -15,7 +15,7 @@ __all__ = ["Backend", "BackendEngine", "BackendEnginePublic", "GenerativeResponse"] -BackendEnginePublic = Literal["openai_server"] +BackendEnginePublic = Literal["openai_server", "aiohttp_server"] BackendEngine = Union[BackendEnginePublic, Literal["test"]] diff --git a/src/guidellm/config.py b/src/guidellm/config.py index c3d950e..0594709 100644 --- a/src/guidellm/config.py +++ b/src/guidellm/config.py @@ -108,6 +108,9 @@ class OpenAISettings(BaseModel): max_gen_tokens: int = 4096 +class AiohttpSettings(OpenAISettings): + pass + class ReportGenerationSettings(BaseModel): """ Report generation settings for the application @@ -152,6 +155,7 @@ class Settings(BaseSettings): # Request settings openai: OpenAISettings = OpenAISettings() + aiohttp: AiohttpSettings = AiohttpSettings() # Report settings report_generation: ReportGenerationSettings = ReportGenerationSettings() From 72db6a4e87093fcf9ca56cf2186af6fa69842250 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 11 Dec 2024 04:22:39 +0000 Subject: [PATCH 27/45] Add support for aiohttp backend --- src/guidellm/backend/aiohttp.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/guidellm/backend/aiohttp.py b/src/guidellm/backend/aiohttp.py index 138f45a..fbbd971 100644 --- a/src/guidellm/backend/aiohttp.py +++ b/src/guidellm/backend/aiohttp.py @@ -1,3 +1,5 @@ +import base64 +import io from typing import AsyncGenerator, Dict, List, Optional from loguru import logger @@ -92,11 +94,11 @@ async def make_request( request_args.update(self._request_args) + messages = self._build_messages(request) + payload = { "model": self._model, - "messages": [ - {"role": "user", "content": request.prompt}, - ], + "messages": messages, "stream": True, **request_args, } @@ -158,3 +160,21 @@ def validate_connection(self): Validate the connection to the backend server. 
""" logger.info("Connection validation is not explicitly implemented for aiohttp backend.") + + def _build_messages(self, request: TextGenerationRequest) -> Dict: + if request.number_images == 0: + messages = [{"role": "user", "content": request.prompt}] + else: + content = [] + for image in request.images: + stream = io.BytesIO() + im_format = image.image.format or "PNG" + image.image.save(stream, format=im_format) + im_b64 = base64.b64encode(stream.getvalue()).decode("utf-8") + image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"} + content.append({"type": "image_url", "image_url": image_url}) + + content.append({"type": "text", "text": request.prompt}) + messages = [{"role": "user", "content": content}] + + return messages From ba4187dea12888c7e74cfe0acfb6049fa460f4e7 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 4 Sep 2024 22:09:59 +0000 Subject: [PATCH 28/45] Add mean and percentile info as computed_field properties such that they become serializable --- src/guidellm/core/report.py | 26 ++++------- src/guidellm/core/result.py | 92 ++++++++++++++++++++++++++++++++++++- 2 files changed, 99 insertions(+), 19 deletions(-) diff --git a/src/guidellm/core/report.py b/src/guidellm/core/report.py index b6791e4..c48eed5 100644 --- a/src/guidellm/core/report.py +++ b/src/guidellm/core/report.py @@ -147,19 +147,15 @@ def _create_benchmark_report_data_tokens_summary( for benchmark in report.benchmarks_sorted: table.add_row( _benchmark_rate_id(benchmark), - f"{benchmark.prompt_token_distribution.mean:.2f}", + f"{benchmark.prompt_token:.2f}", ", ".join( f"{percentile:.1f}" - for percentile in benchmark.prompt_token_distribution.percentiles( - [1, 5, 50, 95, 99] - ) + for percentile in benchmark.prompt_token_percentiles ), - f"{benchmark.output_token_distribution.mean:.2f}", + f"{benchmark.output_token:.2f}", ", ".join( f"{percentile:.1f}" - for percentile in benchmark.output_token_distribution.percentiles( - [1, 5, 50, 95, 99] - ) + for percentile in benchmark.output_token_percentiles ), ) logger.debug("Created data tokens summary table for the report.") @@ -181,7 +177,7 @@ def _create_benchmark_report_dist_perf_summary( "Benchmark", "Request Latency [1%, 5%, 10%, 50%, 90%, 95%, 99%] (sec)", "Time to First Token [1%, 5%, 10%, 50%, 90%, 95%, 99%] (ms)", - "Inter Token Latency [1%, 5%, 10%, 50%, 90% 95%, 99%] (ms)", + "Inter Token Latency [1%, 5%, 10%, 50%, 90%, 95%, 99%] (ms)", title="[magenta]Performance Stats by Benchmark[/magenta]", title_style="bold", title_justify="left", @@ -193,21 +189,15 @@ def _create_benchmark_report_dist_perf_summary( _benchmark_rate_id(benchmark), ", ".join( f"{percentile:.2f}" - for percentile in benchmark.request_latency_distribution.percentiles( - [1, 5, 10, 50, 90, 95, 99] - ) + for percentile in benchmark.request_latency_percentiles ), ", ".join( f"{percentile * 1000:.1f}" - for percentile in benchmark.ttft_distribution.percentiles( - [1, 5, 10, 50, 90, 95, 99] - ) + for percentile in benchmark.time_to_first_token_percentiles ), ", ".join( f"{percentile * 1000:.1f}" - for percentile in benchmark.itl_distribution.percentiles( - [1, 5, 10, 50, 90, 95, 99] - ) + for percentile in benchmark.inter_token_latency_percentiles ), ) logger.debug("Created distribution performance summary table for the report.") diff --git a/src/guidellm/core/result.py b/src/guidellm/core/result.py index f218784..5fd29a8 100644 --- a/src/guidellm/core/result.py +++ b/src/guidellm/core/result.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List, Literal, Optional, 
Union from loguru import logger -from pydantic import Field +from pydantic import Field, computed_field from guidellm.core.distribution import Distribution from guidellm.core.request import TextGenerationRequest @@ -221,6 +221,7 @@ def __iter__(self): """ return iter(self.results) + @computed_field @property def request_count(self) -> int: """ @@ -231,6 +232,7 @@ def request_count(self) -> int: """ return len(self.results) + @computed_field @property def error_count(self) -> int: """ @@ -241,6 +243,7 @@ def error_count(self) -> int: """ return len(self.errors) + @computed_field @property def total_count(self) -> int: """ @@ -251,6 +254,7 @@ def total_count(self) -> int: """ return self.request_count + self.error_count + @computed_field @property def start_time(self) -> Optional[float]: """ @@ -264,6 +268,7 @@ def start_time(self) -> Optional[float]: return self.results[0].start_time + @computed_field @property def end_time(self) -> Optional[float]: """ @@ -277,6 +282,7 @@ def end_time(self) -> Optional[float]: return self.results[-1].end_time + @computed_field @property def duration(self) -> float: """ @@ -290,6 +296,7 @@ def duration(self) -> float: return self.end_time - self.start_time + @computed_field @property def completed_request_rate(self) -> float: """ @@ -303,6 +310,7 @@ def completed_request_rate(self) -> float: return len(self.results) / self.duration + @computed_field @property def request_latency(self) -> float: """ @@ -332,6 +340,19 @@ def request_latency_distribution(self) -> Distribution: ] ) + @computed_field + @property + def request_latency_percentiles(self) -> List[float]: + """ + Get standard percentiles of request latency in seconds. + + :return: List of percentile request latency in seconds + :rtype: List[float] + """ + return self.request_latency_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) + + + @computed_field @property def time_to_first_token(self) -> float: """ @@ -360,7 +381,19 @@ def ttft_distribution(self) -> Distribution: if result.first_token_time is not None ] ) + + @computed_field + @property + def time_to_first_token_percentiles(self) -> List[float]: + """ + Get standard percentiles for time taken to decode the first token in milliseconds. + + :return: List of percentile time taken to decode the first token in milliseconds. + :rtype: List[float] + """ + return self.ttft_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) + @computed_field @property def inter_token_latency(self) -> float: """ @@ -387,7 +420,19 @@ def itl_distribution(self) -> Distribution: decode for result in self.results for decode in result.decode_times.data ] ) + + @computed_field + @property + def inter_token_latency_percentiles(self) -> List[float]: + """ + Get standard percentiles for the time between tokens in milliseconds. + :return: List of percentiles for the average time between tokens. + :rtype: List[float] + """ + return self.itl_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) + + @computed_field @property def output_token_throughput(self) -> float: """ @@ -403,6 +448,17 @@ def output_token_throughput(self) -> float: return total_tokens / self.duration + @computed_field + @property + def prompt_token(self) -> float: + """ + Get the average number of prompt tokens. + + :return: The average number of prompt tokens. 
+ :rtype: float + """ + return self.prompt_token_distribution.mean + @property def prompt_token_distribution(self) -> Distribution: """ @@ -413,6 +469,28 @@ def prompt_token_distribution(self) -> Distribution: """ return Distribution(data=[result.prompt_token_count for result in self.results]) + @computed_field + @property + def prompt_token_percentiles(self) -> List[float]: + """ + Get standard percentiles for number of prompt tokens. + + :return: List of percentiles of number of prompt tokens. + :rtype: List[float] + """ + return self.prompt_token_distribution.percentiles([1, 5, 50, 95, 99]) + + @computed_field + @property + def output_token(self) -> float: + """ + Get the average number of output tokens. + + :return: The average number of output tokens. + :rtype: float + """ + return self.output_token_distribution.mean + @property def output_token_distribution(self) -> Distribution: """ @@ -423,6 +501,18 @@ def output_token_distribution(self) -> Distribution: """ return Distribution(data=[result.output_token_count for result in self.results]) + @computed_field + @property + def output_token_percentiles(self) -> List[float]: + """ + Get standard percentiles for number of output tokens. + + :return: List of percentiles of number of output tokens. + :rtype: List[float] + """ + return self.output_token_distribution.percentiles([1, 5, 50, 95, 99]) + + @computed_field @property def overloaded(self) -> bool: if ( From 1566348a6b9f91762d8fb6f94b621416ad166b1f Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Sep 2024 01:19:42 +0000 Subject: [PATCH 29/45] quality fixes --- src/guidellm/core/result.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/guidellm/core/result.py b/src/guidellm/core/result.py index 5fd29a8..90906b7 100644 --- a/src/guidellm/core/result.py +++ b/src/guidellm/core/result.py @@ -381,7 +381,7 @@ def ttft_distribution(self) -> Distribution: if result.first_token_time is not None ] ) - + @computed_field @property def time_to_first_token_percentiles(self) -> List[float]: @@ -390,7 +390,7 @@ def time_to_first_token_percentiles(self) -> List[float]: :return: List of percentile time taken to decode the first token in milliseconds. :rtype: List[float] - """ + """ return self.ttft_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) @computed_field @@ -420,7 +420,7 @@ def itl_distribution(self) -> Distribution: decode for result in self.results for decode in result.decode_times.data ] ) - + @computed_field @property def inter_token_latency_percentiles(self) -> List[float]: From e37166f68b029cafc56b097b444e5324971dd3d7 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Sep 2024 01:23:25 +0000 Subject: [PATCH 30/45] quality fix --- src/guidellm/core/result.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/guidellm/core/result.py b/src/guidellm/core/result.py index 90906b7..95a4230 100644 --- a/src/guidellm/core/result.py +++ b/src/guidellm/core/result.py @@ -386,9 +386,11 @@ def ttft_distribution(self) -> Distribution: @property def time_to_first_token_percentiles(self) -> List[float]: """ - Get standard percentiles for time taken to decode the first token in milliseconds. + Get standard percentiles for time taken to decode the first token + in milliseconds. - :return: List of percentile time taken to decode the first token in milliseconds. + :return: List of percentile time taken to decode the first token + in milliseconds. 
:rtype: List[float] """ return self.ttft_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) From 9039845f9ffa1c0bb84855704845883a43d00c61 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 6 Sep 2024 14:28:54 +0000 Subject: [PATCH 31/45] Quality fixes --- src/guidellm/core/result.py | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/guidellm/core/result.py b/src/guidellm/core/result.py index 95a4230..aebd176 100644 --- a/src/guidellm/core/result.py +++ b/src/guidellm/core/result.py @@ -221,7 +221,7 @@ def __iter__(self): """ return iter(self.results) - @computed_field + @computed_field # type: ignore[misc] @property def request_count(self) -> int: """ @@ -232,7 +232,7 @@ def request_count(self) -> int: """ return len(self.results) - @computed_field + @computed_field # type: ignore[misc] @property def error_count(self) -> int: """ @@ -243,7 +243,7 @@ def error_count(self) -> int: """ return len(self.errors) - @computed_field + @computed_field # type: ignore[misc] @property def total_count(self) -> int: """ @@ -254,7 +254,7 @@ def total_count(self) -> int: """ return self.request_count + self.error_count - @computed_field + @computed_field # type: ignore[misc] @property def start_time(self) -> Optional[float]: """ @@ -268,7 +268,7 @@ def start_time(self) -> Optional[float]: return self.results[0].start_time - @computed_field + @computed_field # type: ignore[misc] @property def end_time(self) -> Optional[float]: """ @@ -282,7 +282,7 @@ def end_time(self) -> Optional[float]: return self.results[-1].end_time - @computed_field + @computed_field # type: ignore[misc] @property def duration(self) -> float: """ @@ -296,7 +296,7 @@ def duration(self) -> float: return self.end_time - self.start_time - @computed_field + @computed_field # type: ignore[misc] @property def completed_request_rate(self) -> float: """ @@ -310,7 +310,7 @@ def completed_request_rate(self) -> float: return len(self.results) / self.duration - @computed_field + @computed_field # type: ignore[misc] @property def request_latency(self) -> float: """ @@ -340,7 +340,7 @@ def request_latency_distribution(self) -> Distribution: ] ) - @computed_field + @computed_field # type: ignore[misc] @property def request_latency_percentiles(self) -> List[float]: """ @@ -352,7 +352,7 @@ def request_latency_percentiles(self) -> List[float]: return self.request_latency_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) - @computed_field + @computed_field # type: ignore[misc] @property def time_to_first_token(self) -> float: """ @@ -382,7 +382,7 @@ def ttft_distribution(self) -> Distribution: ] ) - @computed_field + @computed_field # type: ignore[misc] @property def time_to_first_token_percentiles(self) -> List[float]: """ @@ -395,7 +395,7 @@ def time_to_first_token_percentiles(self) -> List[float]: """ return self.ttft_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) - @computed_field + @computed_field # type: ignore[misc] @property def inter_token_latency(self) -> float: """ @@ -423,7 +423,7 @@ def itl_distribution(self) -> Distribution: ] ) - @computed_field + @computed_field # type: ignore[misc] @property def inter_token_latency_percentiles(self) -> List[float]: """ @@ -434,7 +434,7 @@ def inter_token_latency_percentiles(self) -> List[float]: """ return self.itl_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) - @computed_field + @computed_field # type: ignore[misc] @property def output_token_throughput(self) -> float: """ @@ -450,7 +450,7 @@ def 
output_token_throughput(self) -> float: return total_tokens / self.duration - @computed_field + @computed_field # type: ignore[misc] @property def prompt_token(self) -> float: """ @@ -471,7 +471,7 @@ def prompt_token_distribution(self) -> Distribution: """ return Distribution(data=[result.prompt_token_count for result in self.results]) - @computed_field + @computed_field # type: ignore[misc] @property def prompt_token_percentiles(self) -> List[float]: """ @@ -482,7 +482,7 @@ def prompt_token_percentiles(self) -> List[float]: """ return self.prompt_token_distribution.percentiles([1, 5, 50, 95, 99]) - @computed_field + @computed_field # type: ignore[misc] @property def output_token(self) -> float: """ @@ -503,7 +503,7 @@ def output_token_distribution(self) -> Distribution: """ return Distribution(data=[result.output_token_count for result in self.results]) - @computed_field + @computed_field # type: ignore[misc] @property def output_token_percentiles(self) -> List[float]: """ @@ -514,7 +514,7 @@ def output_token_percentiles(self) -> List[float]: """ return self.output_token_distribution.percentiles([1, 5, 50, 95, 99]) - @computed_field + @computed_field # type: ignore[misc] @property def overloaded(self) -> bool: if ( From 036917e543cdbd61bde487121ed26a2b000ef8b7 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 4 Dec 2024 18:07:22 +0000 Subject: [PATCH 32/45] Ignore EOS --- src/guidellm/backend/openai.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index 90d2791..4ea1396 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -94,6 +94,7 @@ async def make_request( { "max_tokens": request.output_token_count, "stop": None, + "ignore_eos": True, } ) elif settings.openai.max_gen_tokens and settings.openai.max_gen_tokens > 0: From 6e6691cb319dbd49866061357251fa410a418589 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 4 Dec 2024 18:47:05 +0000 Subject: [PATCH 33/45] Ignore EOS --- src/guidellm/backend/openai.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index 4ea1396..9843fc1 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -94,7 +94,9 @@ async def make_request( { "max_tokens": request.output_token_count, "stop": None, - "ignore_eos": True, + "extra_body": { + "ignore_eos": True, + } } ) elif settings.openai.max_gen_tokens and settings.openai.max_gen_tokens > 0: From ff742f16df70952be2f79d24b19b95fbefcc671c Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Dec 2024 22:41:44 +0000 Subject: [PATCH 34/45] Add image processing dependencies --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index b83abfd..caaeef0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,8 @@ dependencies = [ "base64", "io", "transformers", + "pillow", + "bs4", ] [project.optional-dependencies] From 2b1706f452598b489a2dfdccca532d6ee9be0eff Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Dec 2024 22:43:35 +0000 Subject: [PATCH 35/45] Fix support to images --- src/guidellm/core/request.py | 4 ++-- src/guidellm/request/emulated.py | 8 +++++--- src/guidellm/utils/images.py | 10 +++++++++- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index 8585979..cc82659 100644 --- a/src/guidellm/core/request.py +++ 
b/src/guidellm/core/request.py @@ -1,5 +1,5 @@ import uuid -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional from pydantic import Field @@ -42,7 +42,7 @@ def number_images(self) -> int: return len(self.images) @property - def image_resolution(self) -> Tuple[int]: + def image_resolution(self) -> List[int]: if self.images is None: return None else: diff --git a/src/guidellm/request/emulated.py b/src/guidellm/request/emulated.py index 8818ff9..43c3389 100644 --- a/src/guidellm/request/emulated.py +++ b/src/guidellm/request/emulated.py @@ -31,7 +31,8 @@ class EmulatedConfig: generated_tokens_min (Optional[int]): Minimum number of generated tokens. generated_tokens_max (Optional[int]): Maximum number of generated tokens. images (Optional[int]): Number of images. - image_resultion (Optional[List[int]]): Resolution of images. + width (Optional[int]): Width of images. + height (Optional[int]): Height of images. """ @staticmethod @@ -108,7 +109,8 @@ def create_config(config: Optional[Union[str, Path, Dict]]) -> "EmulatedConfig": generated_tokens_max: Optional[int] = None images: int = 0 - image_resolution = None + width: int = None + height: int = None def __post_init__(self): if self.images is not None and self.image_resultion is not None and self.images > 0: @@ -337,7 +339,7 @@ def __init__( settings.emulated_data.filter_end, ) if self._config.images > 0: - self._images = load_images(settings.emulated_data.image_source, self._config.image_resolution) + self._images = load_images(settings.emulated_data.image_source, [self._config.width, self._config.height]) self._rng = np.random.default_rng(random_seed) # NOTE: Must be after all the parameters since the queue population diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index 569fe75..72846d7 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -6,7 +6,7 @@ from bs4 import BeautifulSoup from loguru import logger from PIL import Image -from pydantic import ConfigDict, Field +from pydantic import ConfigDict, Field, computed_field from guidellm.config import settings from guidellm.core.serializable import Serializable @@ -26,6 +26,14 @@ class ImageDescriptor(Serializable): description="Image filename.", ) + @computed_field # type: ignore[misc] + @property + def image_resolution(self) -> List[int]: + if self.images is None: + return None + else: + return [im.size for im in self.images] + def load_images(data: str, image_resolution: Optional[List[int]]) -> List[ImageDescriptor]: """ From 01610325c976dc13082c693cf4f3bd2137d64120 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Dec 2024 22:51:42 +0000 Subject: [PATCH 36/45] Fix serialization --- src/guidellm/core/request.py | 4 ++-- src/guidellm/utils/images.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index cc82659..8ace3d1 100644 --- a/src/guidellm/core/request.py +++ b/src/guidellm/core/request.py @@ -1,5 +1,5 @@ import uuid -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple from pydantic import Field @@ -42,7 +42,7 @@ def number_images(self) -> int: return len(self.images) @property - def image_resolution(self) -> List[int]: + def image_resolution(self) -> List[Tuple[int]]: if self.images is None: return None else: diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index 72846d7..ed025db 100644 --- a/src/guidellm/utils/images.py 
+++ b/src/guidellm/utils/images.py @@ -1,5 +1,5 @@ from io import BytesIO -from typing import List, Optional +from typing import List, Optional, Tuple from urllib.parse import urljoin import requests @@ -28,11 +28,11 @@ class ImageDescriptor(Serializable): @computed_field # type: ignore[misc] @property - def image_resolution(self) -> List[int]: - if self.images is None: + def image_resolution(self) -> Tuple[int]: + if self.image is None: return None else: - return [im.size for im in self.images] + return self.image.size def load_images(data: str, image_resolution: Optional[List[int]]) -> List[ImageDescriptor]: From 35379d369af504498a9fe55fd23be5b24ea2a71c Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Dec 2024 22:52:30 +0000 Subject: [PATCH 37/45] Fix image registration --- src/guidellm/request/emulated.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/guidellm/request/emulated.py b/src/guidellm/request/emulated.py index 43c3389..02f564a 100644 --- a/src/guidellm/request/emulated.py +++ b/src/guidellm/request/emulated.py @@ -112,10 +112,6 @@ def create_config(config: Optional[Union[str, Path, Dict]]) -> "EmulatedConfig": width: int = None height: int = None - def __post_init__(self): - if self.images is not None and self.image_resultion is not None and self.images > 0: - assert len(self.image_resolution) == 2 - @property def prompt_tokens_range(self) -> Tuple[int, int]: """ From d47887cebfe8564c2842a8c641be3153cac5729f Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Dec 2024 22:55:43 +0000 Subject: [PATCH 38/45] Fix pydantic format --- src/guidellm/core/request.py | 2 +- src/guidellm/utils/images.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index 8ace3d1..06d0f37 100644 --- a/src/guidellm/core/request.py +++ b/src/guidellm/core/request.py @@ -42,7 +42,7 @@ def number_images(self) -> int: return len(self.images) @property - def image_resolution(self) -> List[Tuple[int]]: + def image_resolution(self) -> List[Tuple[int, int]]: if self.images is None: return None else: diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index ed025db..ed1b30a 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -28,7 +28,7 @@ class ImageDescriptor(Serializable): @computed_field # type: ignore[misc] @property - def image_resolution(self) -> Tuple[int]: + def image_resolution(self) -> Tuple[int, int]: if self.image is None: return None else: From 33c13ecb93947e417051e252872c1725d773b9e4 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 5 Dec 2024 23:20:48 +0000 Subject: [PATCH 39/45] Use resized image --- src/guidellm/utils/images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index ed1b30a..fb66d43 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -73,7 +73,7 @@ def load_images(data: str, image_resolution: Optional[List[int]]) -> List[ImageD images.append( ImageDescriptor( url=img_url, - image=Image.open(BytesIO(img_response.content)), + image=image, ) ) From cb324d8b38b1793beb56f460670932f44d133146 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 6 Dec 2024 13:54:21 -0500 Subject: [PATCH 40/45] Update pyproject.toml --- pyproject.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index caaeef0..b83abfd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,8 +40,6 @@ 
dependencies = [ "base64", "io", "transformers", - "pillow", - "bs4", ] [project.optional-dependencies] From 4d3bc8dfb51314b90790390865324c023e8c96f5 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 6 Dec 2024 14:06:33 -0500 Subject: [PATCH 41/45] Update pyproject.toml --- pyproject.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b83abfd..6ab2c6e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,9 +36,6 @@ dependencies = [ "pyyaml>=6.0.0", "requests", "rich", - "pillow", - "base64", - "io", "transformers", ] From 30e874e0a0a9aa0ad217fb8c0f77ca433b82ecc4 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 6 Dec 2024 14:07:04 -0500 Subject: [PATCH 42/45] Update .pre-commit-config.yaml --- .pre-commit-config.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6bcf150..2a085bb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,9 +27,6 @@ repos: pyyaml, requests, rich, - pillow, - base64, - io, transformers, # dev dependencies From 4426822db7ebb5fac1ed4e8e3237ea2bd79e507d Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 6 Dec 2024 21:54:52 +0000 Subject: [PATCH 43/45] Adds aiohttp backend --- src/guidellm/backend/__init__.py | 2 + src/guidellm/backend/aiohttp.py | 160 +++++++++++++++++++++++++++++++ src/guidellm/backend/base.py | 2 +- src/guidellm/config.py | 4 + 4 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 src/guidellm/backend/aiohttp.py diff --git a/src/guidellm/backend/__init__.py b/src/guidellm/backend/__init__.py index 875e319..1391018 100644 --- a/src/guidellm/backend/__init__.py +++ b/src/guidellm/backend/__init__.py @@ -1,5 +1,6 @@ from .base import Backend, BackendEngine, BackendEnginePublic, GenerativeResponse from .openai import OpenAIBackend +from .aiohttp import AiohttpBackend __all__ = [ "Backend", @@ -7,4 +8,5 @@ "BackendEnginePublic", "GenerativeResponse", "OpenAIBackend", + "AiohttpBackend" ] diff --git a/src/guidellm/backend/aiohttp.py b/src/guidellm/backend/aiohttp.py new file mode 100644 index 0000000..138f45a --- /dev/null +++ b/src/guidellm/backend/aiohttp.py @@ -0,0 +1,160 @@ +from typing import AsyncGenerator, Dict, List, Optional +from loguru import logger + +import aiohttp +import json + +from guidellm.backend.base import Backend, GenerativeResponse +from guidellm.config import settings +from guidellm.core import TextGenerationRequest + +__all__ = ["AiohttpBackend"] + +@Backend.register("aiohttp_server") +class AiohttpBackend(Backend): + """ + An aiohttp-based backend implementation for LLM requests. + + This class provides an interface to communicate with a server hosting + an LLM API using aiohttp for asynchronous requests. + """ + + def __init__( + self, + openai_api_key: Optional[str] = None, + target: Optional[str] = None, + model: Optional[str] = None, + timeout: Optional[float] = None, + **request_args, + ): + self._request_args: Dict = request_args + self._api_key: str = openai_api_key or settings.aiohttp.api_key + + if not self._api_key: + err = ValueError( + "`GUIDELLM__AIOHTTP__API_KEY` environment variable or " + "--openai-api-key CLI parameter must be specified for the " + "aiohttp backend." 
+ ) + logger.error("{}", err) + raise err + + base_url = target or settings.aiohttp.base_url + self._api_url = f"{base_url}/chat/completions" + + if not base_url: + err = ValueError( + "`GUIDELLM__AIOHTTP__BASE_URL` environment variable or " + "target parameter must be specified for the OpenAI backend." + ) + logger.error("{}", err) + raise err + + self._timeout = aiohttp.ClientTimeout(total=timeout or settings.request_timeout) + self._model = model + + super().__init__(type_="aiohttp_backend", target=base_url, model=self._model) + logger.info("aiohttp {} Backend listening on {}", self._model, base_url) + + async def make_request( + self, + request: TextGenerationRequest, + ) -> AsyncGenerator[GenerativeResponse, None]: + """ + Make a request to the aiohttp backend. + + Sends a prompt to the LLM server and streams the response tokens. + + :param request: The text generation request to submit. + :type request: TextGenerationRequest + :yield: A stream of GenerativeResponse objects. + :rtype: AsyncGenerator[GenerativeResponse, None] + """ + + async with aiohttp.ClientSession(timeout=self._timeout) as session: + logger.debug("Making request to aiohttp backend with prompt: {}", request.prompt) + + request_args = {} + if request.output_token_count is not None: + request_args.update( + { + "max_completion_tokens": request.output_token_count, + "stop": None, + "ignore_eos": True, + } + ) + elif settings.aiohttp.max_gen_tokens and settings.aiohttp.max_gen_tokens > 0: + request_args.update( + { + "max_tokens": settings.aiohttp.max_gen_tokens, + } + ) + + request_args.update(self._request_args) + + payload = { + "model": self._model, + "messages": [ + {"role": "user", "content": request.prompt}, + ], + "stream": True, + **request_args, + } + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self._api_key}", + } + + try: + async with session.post(url=self._api_url, json=payload, headers=headers) as response: + if response.status != 200: + error_message = await response.text() + logger.error("Request failed: {} - {}", response.status, error_message) + raise Exception(f"Failed to generate response: {error_message}") + + token_count = 0 + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") + if chunk == "[DONE]": + # Final response + yield GenerativeResponse( + type_="final", + prompt=request.prompt, + output_token_count=token_count, + prompt_token_count=request.prompt_token_count, + ) + else: + # Intermediate token response + token_count += 1 + data = json.loads(chunk) + delta = data["choices"][0]["delta"] + token = delta["content"] + yield GenerativeResponse( + type_="token_iter", + add_token=token, + prompt=request.prompt, + output_token_count=token_count, + prompt_token_count=request.prompt_token_count, + ) + except Exception as e: + logger.error("Error while making request: {}", e) + raise + + def available_models(self) -> List[str]: + """ + Retrieve a list of available models from the server. + """ + # This could include an API call to `self._api_url/models` if the server supports it. + logger.warning("Fetching available models is not implemented for aiohttp backend.") + return [] + + def validate_connection(self): + """ + Validate the connection to the backend server. 
+        """
+        logger.info("Connection validation is not explicitly implemented for aiohttp backend.")
diff --git a/src/guidellm/backend/base.py b/src/guidellm/backend/base.py
index d71c5f6..a165859 100644
--- a/src/guidellm/backend/base.py
+++ b/src/guidellm/backend/base.py
@@ -15,7 +15,7 @@
 __all__ = ["Backend", "BackendEngine", "BackendEnginePublic", "GenerativeResponse"]
 
-BackendEnginePublic = Literal["openai_server"]
+BackendEnginePublic = Literal["openai_server", "aiohttp_server"]
 BackendEngine = Union[BackendEnginePublic, Literal["test"]]
 
diff --git a/src/guidellm/config.py b/src/guidellm/config.py
index df750ea..a19a624 100644
--- a/src/guidellm/config.py
+++ b/src/guidellm/config.py
@@ -109,6 +109,9 @@ class OpenAISettings(BaseModel):
     max_gen_tokens: int = 4096
 
+class AiohttpSettings(OpenAISettings):
+    pass
+
 class ReportGenerationSettings(BaseModel):
     """
     Report generation settings for the application
@@ -153,6 +156,7 @@ class Settings(BaseSettings):
 
     # Request settings
     openai: OpenAISettings = OpenAISettings()
+    aiohttp: AiohttpSettings = AiohttpSettings()
 
     # Report settings
     report_generation: ReportGenerationSettings = ReportGenerationSettings()

From b7845157d102a44f261129d08e05d580374d9ce3 Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Wed, 11 Dec 2024 04:22:39 +0000
Subject: [PATCH 44/45] Add support for aiohttp backend

---
 src/guidellm/backend/aiohttp.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/src/guidellm/backend/aiohttp.py b/src/guidellm/backend/aiohttp.py
index 138f45a..fbbd971 100644
--- a/src/guidellm/backend/aiohttp.py
+++ b/src/guidellm/backend/aiohttp.py
@@ -1,3 +1,5 @@
+import base64
+import io
 from typing import AsyncGenerator, Dict, List, Optional
 from loguru import logger
 
@@ -92,11 +94,11 @@ async def make_request(
 
             request_args.update(self._request_args)
 
+            messages = self._build_messages(request)
+
             payload = {
                 "model": self._model,
-                "messages": [
-                    {"role": "user", "content": request.prompt},
-                ],
+                "messages": messages,
                 "stream": True,
                 **request_args,
             }
@@ -158,3 +160,21 @@ def validate_connection(self):
         Validate the connection to the backend server.
         """
         logger.info("Connection validation is not explicitly implemented for aiohttp backend.")
+
+    def _build_messages(self, request: TextGenerationRequest) -> Dict:
+        if request.number_images == 0:
+            messages = [{"role": "user", "content": request.prompt}]
+        else:
+            content = []
+            for image in request.images:
+                stream = io.BytesIO()
+                im_format = image.image.format or "PNG"
+                image.image.save(stream, format=im_format)
+                im_b64 = base64.b64encode(stream.getvalue()).decode("utf-8")
+                image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"}
+                content.append({"type": "image_url", "image_url": image_url})
+
+            content.append({"type": "text", "text": request.prompt})
+            messages = [{"role": "user", "content": content}]
+
+        return messages

From b5bac800215da1794cfa6d2217dd544a67000ad5 Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Sat, 8 Feb 2025 01:34:38 +0000
Subject: [PATCH 45/45] Refactor generate_benchmark_report to set default values for parameters

---
 src/guidellm/main.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/guidellm/main.py b/src/guidellm/main.py
index 4016ece..4748b12 100644
--- a/src/guidellm/main.py
+++ b/src/guidellm/main.py
@@ -186,17 +186,17 @@ def generate_benchmark_report_cli(
 
 def generate_benchmark_report(
     target: str,
-    backend: BackendEnginePublic,
-    model: Optional[str],
     data: Optional[str],
     data_type: Literal["emulated", "file", "transformers"],
-    tokenizer: Optional[str],
-    rate_type: ProfileGenerationMode,
-    rate: Optional[float],
-    max_seconds: Optional[int],
-    max_requests: Union[Literal["dataset"], int, None],
-    output_path: str,
-    cont_refresh_table: bool,
+    backend: BackendEnginePublic="openai_server",
+    model: Optional[str]=None,
+    tokenizer: Optional[str]=None,
+    rate_type: ProfileGenerationMode="sweep",
+    rate: Optional[float]=None,
+    max_seconds: Optional[int]=120,
+    max_requests: Union[Literal["dataset"], int, None]=None,
+    output_path: str=None,
+    cont_refresh_table: bool=False,
+) -> GuidanceReport:
     """
     Generate a benchmark report for a specified backend and dataset.
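With the defaults introduced in PATCH 45, generate_benchmark_report only needs a target and a data source; backend, model, tokenizer, rate settings, and output options all fall back to defaults, and PATCH 43 makes "aiohttp_server" a valid value for the backend argument. Below is a minimal sketch of a programmatic call, assuming a local OpenAI-compatible server; the URL, model name, and emulated-data string are illustrative placeholders, not values taken from the patches.

    from guidellm.main import generate_benchmark_report

    # Benchmark a locally hosted model through the new aiohttp backend.
    # target, model, and data are placeholders; adjust them to the server
    # under test and to the emulated-data format expected by EmulatedConfig.
    report = generate_benchmark_report(
        target="http://localhost:8000/v1",
        data="prompt_tokens=256,generated_tokens=128",
        data_type="emulated",
        backend="aiohttp_server",
        model="my-model",
        max_seconds=60,
        output_path="benchmark-report.json",
    )
    print(report)

Whether the emulated-data string above matches the exact format parsed by EmulatedConfig.create_config should be checked against src/guidellm/request/emulated.py; the point of the sketch is that, after PATCH 45, every other parameter can simply be omitted.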