From 1895ff2b5795f941440c2e41e15b36b4d8b4c64c Mon Sep 17 00:00:00 2001
From: Kunjan
Date: Fri, 18 Oct 2024 13:50:18 -0700
Subject: [PATCH] [MISC] Add lora requests to metrics (#9477)

Co-authored-by: Kunjan Patel
Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
---
 vllm/engine/llm_engine.py    | 24 +++++++++++++++++++++++-
 vllm/engine/metrics.py       | 29 ++++++++++++++++++++++++++++-
 vllm/engine/metrics_types.py |  3 +++
 3 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index eede3486e5e8f..a90bfce8491fb 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,4 +1,5 @@
 import time
+from collections import Counter as collectionsCounter
 from collections import deque
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -1617,6 +1618,25 @@ def _get_stats(self,
         n_requests: List[int] = []
         finished_reason_requests: List[str] = []
 
+        # LoRA requests
+        running_lora_adapters = dict(
+            collectionsCounter([
+                running_request.lora_request.lora_name
+                for scheduler in self.scheduler
+                for running_request in scheduler.running
+                if running_request.lora_request
+            ]))
+        waiting_lora_adapters = dict(
+            collectionsCounter([
+                waiting_request.lora_request.lora_name
+                for scheduler in self.scheduler
+                for waiting_request in scheduler.waiting
+                if waiting_request.lora_request
+            ]))
+        max_lora_stat = "0"
+        if self.lora_config:
+            max_lora_stat = str(self.lora_config.max_loras)
+
         # NOTE: This loop assumes prefill seq_groups are before
         # decode seq_groups in scheduled_seq_groups.
         if scheduler_outputs is not None:
@@ -1738,7 +1758,9 @@ def _get_stats(self,
             num_generation_tokens_requests=num_generation_tokens_requests,
             n_requests=n_requests,
             finished_reason_requests=finished_reason_requests,
-        )
+            max_lora=str(max_lora_stat),
+            waiting_lora_adapters=list(waiting_lora_adapters.keys()),
+            running_lora_adapters=list(running_lora_adapters.keys()))
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_executor.add_lora(lora_request)
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 98bf59be3469d..a46625eff1e4a 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -34,7 +34,11 @@ class Metrics:
     See https://prometheus.github.io/client_python/multiprocess/ for more
     details on limitations.
     """
+
     labelname_finish_reason = "finished_reason"
+    labelname_waiting_lora_adapters = "waiting_lora_adapters"
+    labelname_running_lora_adapters = "running_lora_adapters"
+    labelname_max_lora = "max_lora"
     _gauge_cls = prometheus_client.Gauge
     _counter_cls = prometheus_client.Counter
     _histogram_cls = prometheus_client.Histogram
@@ -55,6 +59,16 @@ def __init__(self, labelnames: List[str], max_model_len: int):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames,
             multiprocess_mode="sum")
+        self.gauge_lora_info = self._gauge_cls(
+            name="vllm:lora_requests_info",
+            documentation="Running stats on lora requests.",
+            labelnames=[
+                self.labelname_running_lora_adapters,
+                self.labelname_max_lora,
+                self.labelname_waiting_lora_adapters,
+            ],
+            multiprocess_mode="livemostrecent",
+        )
         self.gauge_scheduler_swapped = self._gauge_cls(
             name="vllm:num_requests_swapped",
             documentation="Number of requests swapped to CPU.",
@@ -426,6 +440,9 @@ def _log_histogram(self, histogram, data: Union[List[int],
         for datum in data:
             histogram.labels(**self.labels).observe(datum)
 
+    def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None:
+        gauge.labels(**data).set(1)
+
     def _log_prometheus(self, stats: Stats) -> None:
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
@@ -442,7 +459,17 @@ def _log_prometheus(self, stats: Stats) -> None:
                         stats.cpu_prefix_cache_hit_rate)
         self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
                         stats.gpu_prefix_cache_hit_rate)
-
+        # Including max-lora in the metric; in the future this property
+        # of the lora config may be extended to be dynamic.
+        lora_info = {
+            self.metrics.labelname_running_lora_adapters:
+            ",".join(stats.running_lora_adapters),
+            self.metrics.labelname_waiting_lora_adapters:
+            ",".join(stats.waiting_lora_adapters),
+            self.metrics.labelname_max_lora:
+            stats.max_lora,
+        }
+        self._log_gauge_string(self.metrics.gauge_lora_info, lora_info)
         # Iteration level data
         self._log_counter(self.metrics.counter_num_preemption,
                           stats.num_preemption_iter)
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index bafd5fa1a8a82..e9a5bd3b586be 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -51,6 +51,9 @@ class Stats:
     num_generation_tokens_requests: List[int]
     n_requests: List[int]
     finished_reason_requests: List[str]
+    waiting_lora_adapters: List[str]
+    running_lora_adapters: List[str]
+    max_lora: str
 
     spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
 