[MISC] Add lora requests to metrics #9477

Merged (10 commits) on Oct 18, 2024
24 changes: 22 additions & 2 deletions vllm/engine/llm_engine.py
@@ -7,7 +7,7 @@
Iterable, List, Mapping, NamedTuple, Optional)
from typing import Sequence as GenericSequence
from typing import Set, Type, Union, cast, overload

from collections import Counter as collectionsCounter
import torch
from typing_extensions import TypeVar

@@ -1616,7 +1616,24 @@ def _get_stats(self,
num_generation_tokens_requests: List[int] = []
n_requests: List[int] = []
finished_reason_requests: List[str] = []


# Lora requests
running_adapters = dict(collectionsCounter([
running_request.lora_request.lora_name
for scheduler in self.scheduler
for running_request in scheduler.running
if running_request.lora_request
]))
waiting_adapters = dict(collectionsCounter([
waiting_request.lora_request.lora_name
for scheduler in self.scheduler
for waiting_request in scheduler.waiting
if waiting_request.lora_request
]))
max_lora_stat = "0"
if self.lora_config:
max_lora_stat = str(self.lora_config.max_loras)
Comment on lines +1636 to +1638

Collaborator:

This seems to always be fixed? In that case, can we skip dumping this value?

Contributor (Author):

Across multiple deployments it's hard to get this value, and it helps determine how many LoRAs can be fitted on the server. You are right that it's definitely static right now: initialised at runtime and that's it. I considered moving it to a separate Info metric like the cache config info, but I think in the future there may be value in enabling dynamic adjustment of max_loras, like base_model, which is static right now.
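For reference, a minimal sketch of the alternative the author mentions: publishing the static value through a prometheus_client Info metric, the way the cache config info is emitted. The metric name and value below are illustrative, not part of this PR.

import prometheus_client

# Hypothetical alternative (not what this PR does): expose max_loras as a
# static Info metric instead of a label on the per-iteration gauge.
lora_config_info = prometheus_client.Info(
    name="vllm:lora_config",
    documentation="Static LoRA configuration.")
lora_config_info.info({"max_loras": "8"})

One practical wrinkle with this alternative: prometheus_client's Info type is not supported in multiprocess mode, which makes the Gauge-set-to-1 pattern the PR uses a safer fit for vLLM's multiprocess metrics.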


# NOTE: This loop assumes prefill seq_groups are before
# decode seq_groups in scheduled_seq_groups.
if scheduler_outputs is not None:
@@ -1738,6 +1755,9 @@ def _get_stats(self,
num_generation_tokens_requests=num_generation_tokens_requests,
n_requests=n_requests,
finished_reason_requests=finished_reason_requests,
max_lora=max_lora_stat,
waiting_adapters=list(waiting_adapters.keys()),
running_adapters=list(running_adapters.keys())
)

def add_lora(self, lora_request: LoRARequest) -> bool:
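To see what the new aggregation produces, here is a self-contained sketch. The Scheduler and SeqGroup classes are hypothetical stand-ins for vLLM's scheduler and sequence groups; only the Counter-based aggregation mirrors the diff above.

from collections import Counter
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class LoRARequest:
    lora_name: str

@dataclass
class SeqGroup:
    lora_request: Optional[LoRARequest] = None

@dataclass
class Scheduler:
    running: List[SeqGroup]
    waiting: List[SeqGroup]

schedulers = [
    Scheduler(
        running=[SeqGroup(LoRARequest("sql_adapter")),
                 SeqGroup(LoRARequest("sql_adapter")),
                 SeqGroup(None)],  # request without LoRA is skipped
        waiting=[SeqGroup(LoRARequest("chat_adapter"))]),
]

# Same pattern as the PR: count requests per adapter name across all
# schedulers, ignoring requests that do not use a LoRA adapter.
running_adapters = dict(Counter(
    sg.lora_request.lora_name
    for sched in schedulers
    for sg in sched.running
    if sg.lora_request))

print(running_adapters)  # {'sql_adapter': 2}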
24 changes: 23 additions & 1 deletion vllm/engine/metrics.py
@@ -4,6 +4,7 @@

import numpy as np
import prometheus_client
import time

from vllm.engine.metrics_types import (StatLoggerBase, Stats,
SupportsMetricsInfo)
@@ -35,9 +36,13 @@ class Metrics:
details on limitations.
"""
labelname_finish_reason = "finished_reason"
labelname_waiting_adapters = "waiting_adapters"
labelname_running_adapters = "running_adapters"
labelname_max_lora = "max_lora"
_gauge_cls = prometheus_client.Gauge
_counter_cls = prometheus_client.Counter
_histogram_cls = prometheus_client.Histogram
_info_cls = prometheus_client.Info

def __init__(self, labelnames: List[str], max_model_len: int):
# Unregister any existing vLLM collectors (for CI/CD)
@@ -55,6 +60,15 @@ def __init__(self, labelnames: List[str], max_model_len: int):
documentation="Number of requests waiting to be processed.",
labelnames=labelnames,
multiprocess_mode="sum")
self.gauge_lora_info = self._gauge_cls(
name="vllm:lora_requests_info",
documentation="Running stats on lora requests waiting and under process.",
labelnames=[
self.labelname_running_adapters,
self.labelname_max_lora,
self.labelname_waiting_adapters],
multiprocess_mode="livemostrecent"
)
self.gauge_scheduler_swapped = self._gauge_cls(
name="vllm:num_requests_swapped",
documentation="Number of requests swapped to CPU.",
@@ -425,6 +439,9 @@ def _log_histogram(self, histogram, data: Union[List[int],
# Convenience function for logging list to histogram.
for datum in data:
histogram.labels(**self.labels).observe(datum)

def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None:
# Info-style gauge: the payload is carried in the label values and the
# sample value is a constant 1.
gauge.labels(**data).set(1)

def _log_prometheus(self, stats: Stats) -> None:
# System state data
@@ -442,7 +459,12 @@ def _log_prometheus(self, stats: Stats) -> None:
stats.cpu_prefix_cache_hit_rate)
self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
stats.gpu_prefix_cache_hit_rate)

lora_info = {
self.metrics.labelname_running_adapters: ','.join(stats.running_adapters),
self.metrics.labelname_waiting_adapters: ','.join(stats.waiting_adapters),
self.metrics.labelname_max_lora: stats.max_lora,
}
self._log_gauge_string(self.metrics.gauge_lora_info, lora_info)
# Iteration level data
self._log_counter(self.metrics.counter_num_preemption,
stats.num_preemption_iter)
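Putting the pieces together, a small sketch of what the gauge looks like on the /metrics endpoint. The adapter names and max_lora value are made up; the label scheme matches the PR.

from prometheus_client import CollectorRegistry, Gauge, generate_latest

registry = CollectorRegistry()
gauge_lora_info = Gauge(
    "vllm:lora_requests_info",
    "Running stats on LoRA requests.",
    labelnames=["running_adapters", "max_lora", "waiting_adapters"],
    registry=registry)

# Same pattern as _log_gauge_string above: payload in the labels,
# constant 1 as the sample value.
gauge_lora_info.labels(**{
    "running_adapters": "sql_adapter,chat_adapter",
    "waiting_adapters": "",
    "max_lora": "8",
}).set(1)

print(generate_latest(registry).decode())
# Alongside the HELP/TYPE lines, this prints roughly:
# vllm:lora_requests_info{running_adapters="sql_adapter,chat_adapter",max_lora="8",waiting_adapters=""} 1.0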
3 changes: 3 additions & 0 deletions vllm/engine/metrics_types.py
@@ -51,6 +51,9 @@ class Stats:
num_generation_tokens_requests: List[int]
n_requests: List[int]
finished_reason_requests: List[str]
waiting_adapters: List[str]
running_adapters: List[str]
max_lora: str

spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None

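Since the adapter lists are comma-joined into single label values, a consumer has to split them back out. A hypothetical decoding helper, not part of the PR:

from typing import List

def parse_adapters(label_value: str) -> List[str]:
    # Inverse of the ','.join(...) in _log_prometheus; an empty label
    # value means no adapters are active.
    return [name for name in label_value.split(",") if name]

assert parse_adapters("sql_adapter,chat_adapter") == ["sql_adapter", "chat_adapter"]
assert parse_adapters("") == []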