[MISC] Add lora requests to metrics #9477

Merged (10 commits) on Oct 18, 2024
24 changes: 22 additions & 2 deletions vllm/engine/llm_engine.py
@@ -7,7 +7,7 @@
Iterable, List, Mapping, NamedTuple, Optional)
from typing import Sequence as GenericSequence
from typing import Set, Type, Union, cast, overload

from collections import Counter as collectionsCounter
import torch
from typing_extensions import TypeVar

@@ -1616,7 +1616,24 @@ def _get_stats(self,
num_generation_tokens_requests: List[int] = []
n_requests: List[int] = []
finished_reason_requests: List[str] = []


# Lora requests
running_adapters = dict(collectionsCounter([
running_request.lora_request.lora_name
for scheduler in self.scheduler
for running_request in scheduler.running
if running_request.lora_request
]))
waiting_adapters = dict(collectionsCounter([
waiting_request.lora_request.lora_name
for scheduler in self.scheduler
for waiting_request in scheduler.waiting
if waiting_request.lora_request
]))
max_lora_stat = "0"
if self.lora_config:
max_lora_stat = str(self.lora_config.max_loras)
Comment on lines +1636 to +1638

Collaborator:

This seems to always be fixed? In that case, can we skip dumping this value?

Contributor (Author):

Across multiple deployments it's hard to get this value, and it helps determine how many LoRAs can be fitted on the server. You are right that it's definitely static right now: initialised at runtime and that's it. I considered moving it to a separate Info metric like the cache config info, but I think in the future there may be value in enabling dynamic adjustment of max_loras, like base_model, which is static right now.
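For reference, a minimal sketch of the alternative the author mentions: publishing the static value through a prometheus_client Info metric, the way the cache config info is emitted. The metric name and value below are illustrative, not part of this PR.

import prometheus_client

# Hypothetical alternative (not what this PR does): expose max_loras as a
# static Info metric instead of a label on the per-iteration gauge.
lora_config_info = prometheus_client.Info(
    name="vllm:lora_config",
    documentation="Static LoRA configuration.")
lora_config_info.info({"max_loras": "8"})

One practical wrinkle with this alternative: prometheus_client's Info type is not supported in multiprocess mode, which makes the Gauge-set-to-1 pattern the PR uses a safer fit for vLLM's multiprocess metrics.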


# NOTE: This loop assumes prefill seq_groups are before
# decode seq_groups in scheduled_seq_groups.
if scheduler_outputs is not None:
@@ -1738,6 +1755,9 @@ def _get_stats(self,
num_generation_tokens_requests=num_generation_tokens_requests,
n_requests=n_requests,
finished_reason_requests=finished_reason_requests,
max_lora=max_lora_stat,
waiting_adapters=list(waiting_adapters.keys()),
running_adapters=list(running_adapters.keys())
)

def add_lora(self, lora_request: LoRARequest) -> bool:
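To see what the new aggregation produces, here is a self-contained sketch. The Scheduler and SeqGroup classes are hypothetical stand-ins for vLLM's scheduler and sequence groups; only the Counter-based aggregation mirrors the diff above.

from collections import Counter
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class LoRARequest:
    lora_name: str

@dataclass
class SeqGroup:
    lora_request: Optional[LoRARequest] = None

@dataclass
class Scheduler:
    running: List[SeqGroup]
    waiting: List[SeqGroup]

schedulers = [
    Scheduler(
        running=[SeqGroup(LoRARequest("sql_adapter")),
                 SeqGroup(LoRARequest("sql_adapter")),
                 SeqGroup(None)],  # request without LoRA is skipped
        waiting=[SeqGroup(LoRARequest("chat_adapter"))]),
]

# Same pattern as the PR: count requests per adapter name across all
# schedulers, ignoring requests that do not use a LoRA adapter.
running_adapters = dict(Counter(
    sg.lora_request.lora_name
    for sched in schedulers
    for sg in sched.running
    if sg.lora_request))

print(running_adapters)  # {'sql_adapter': 2}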
24 changes: 23 additions & 1 deletion vllm/engine/metrics.py
@@ -4,6 +4,7 @@

import numpy as np
import prometheus_client
import time

from vllm.engine.metrics_types import (StatLoggerBase, Stats,
SupportsMetricsInfo)
@@ -35,9 +36,13 @@ class Metrics:
details on limitations.
"""
labelname_finish_reason = "finished_reason"
labelname_waiting_adapters = "waiting_adapters"
labelname_running_adapters = "running_adapters"
labelname_max_lora = "max_lora"
_gauge_cls = prometheus_client.Gauge
_counter_cls = prometheus_client.Counter
_histogram_cls = prometheus_client.Histogram
_info_cls = prometheus_client.Info

def __init__(self, labelnames: List[str], max_model_len: int):
# Unregister any existing vLLM collectors (for CI/CD)
@@ -55,6 +60,15 @@ def __init__(self, labelnames: List[str], max_model_len: int):
documentation="Number of requests waiting to be processed.",
labelnames=labelnames,
multiprocess_mode="sum")
self.gauge_lora_info = self._gauge_cls(
name="vllm:lora_requests_info",
documentation="Running stats on lora requests waiting and under process.",
labelnames=[
self.labelname_running_adapters,
self.labelname_max_lora,
self.labelname_waiting_adapters],
multiprocess_mode="livemostrecent"
)
self.gauge_scheduler_swapped = self._gauge_cls(
name="vllm:num_requests_swapped",
documentation="Number of requests swapped to CPU.",
@@ -425,6 +439,9 @@ def _log_histogram(self, histogram, data: Union[List[int],
# Convenience function for logging list to histogram.
for datum in data:
histogram.labels(**self.labels).observe(datum)

def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None:
# Info-style gauge: the payload is carried in the label values and the
# sample value is a constant 1.
gauge.labels(**data).set(1)

def _log_prometheus(self, stats: Stats) -> None:
# System state data
@@ -442,7 +459,12 @@ def _log_prometheus(self, stats: Stats) -> None:
stats.cpu_prefix_cache_hit_rate)
self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
stats.gpu_prefix_cache_hit_rate)

lora_info = {
self.metrics.labelname_running_adapters: ','.join(stats.running_adapters),
self.metrics.labelname_waiting_adapters: ','.join(stats.waiting_adapters),
self.metrics.labelname_max_lora: stats.max_lora,
}
self._log_gauge_string(self.metrics.gauge_lora_info, lora_info)
# Iteration level data
self._log_counter(self.metrics.counter_num_preemption,
stats.num_preemption_iter)
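Putting the pieces together, a small sketch of what the gauge looks like on the /metrics endpoint. The adapter names and max_lora value are made up; the label scheme matches the PR.

from prometheus_client import CollectorRegistry, Gauge, generate_latest

registry = CollectorRegistry()
gauge_lora_info = Gauge(
    "vllm:lora_requests_info",
    "Running stats on LoRA requests.",
    labelnames=["running_adapters", "max_lora", "waiting_adapters"],
    registry=registry)

# Same pattern as _log_gauge_string above: payload in the labels,
# constant 1 as the sample value.
gauge_lora_info.labels(**{
    "running_adapters": "sql_adapter,chat_adapter",
    "waiting_adapters": "",
    "max_lora": "8",
}).set(1)

print(generate_latest(registry).decode())
# Alongside the HELP/TYPE lines, this prints roughly:
# vllm:lora_requests_info{running_adapters="sql_adapter,chat_adapter",max_lora="8",waiting_adapters=""} 1.0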
3 changes: 3 additions & 0 deletions vllm/engine/metrics_types.py
@@ -51,6 +51,9 @@ class Stats:
num_generation_tokens_requests: List[int]
n_requests: List[int]
finished_reason_requests: List[str]
waiting_adapters: List[str]
running_adapters: List[str]
max_lora: str

spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None

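Since the adapter lists are comma-joined into single label values, a consumer has to split them back out. A hypothetical decoding helper, not part of the PR:

from typing import List

def parse_adapters(label_value: str) -> List[str]:
    # Inverse of the ','.join(...) in _log_prometheus; an empty label
    # value means no adapters are active.
    return [name for name in label_value.split(",") if name]

assert parse_adapters("sql_adapter,chat_adapter") == ["sql_adapter", "chat_adapter"]
assert parse_adapters("") == []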