Skip to content

Commit

Permalink
[Frontend] Add max_tokens prometheus metric (vllm-project#9881)
Browse files Browse the repository at this point in the history
Signed-off-by: Tomer Asida <[email protected]>
  • Loading branch information
tomeras91 authored Nov 4, 2024
1 parent 9a5664d commit ac04a97
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 2 deletions.
11 changes: 9 additions & 2 deletions tests/entrypoints/openai/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,14 @@ async def client(server):
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_max_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:prompt_tokens": [("_total",
_NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens":
[("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens": [
("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
}

Expand Down Expand Up @@ -149,6 +153,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
"vllm:request_params_n_sum",
"vllm:request_params_n_bucket",
"vllm:request_params_n_count",
"vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count",
"vllm:num_preemptions_total",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
Expand Down
1 change: 1 addition & 0 deletions tests/metrics/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
"vllm:request_prompt_tokens",
"vllm:request_generation_tokens",
"vllm:request_params_n",
"vllm:request_params_max_tokens",
]
for metric_name in request_histogram_metrics:
metric_value = REGISTRY.get_sample_value(f"{metric_name}_count",
Expand Down
4 changes: 4 additions & 0 deletions vllm/engine/llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -1685,6 +1685,7 @@ def _get_stats(self,
num_prompt_tokens_requests: List[int] = []
num_generation_tokens_requests: List[int] = []
n_requests: List[int] = []
max_tokens_requests: List[int] = []
finished_reason_requests: List[str] = []

# Lora requests
Expand Down Expand Up @@ -1792,6 +1793,8 @@ def _get_stats(self,
])
if seq_group.sampling_params is not None:
n_requests.append(seq_group.sampling_params.n)
max_tokens_requests.append(
seq_group.sampling_params.max_tokens)
finished_reason_requests.extend([
SequenceStatus.get_finished_reason(seq.status)
for seq in seq_group.get_finished_seqs()
Expand Down Expand Up @@ -1847,6 +1850,7 @@ def _get_stats(self,
num_prompt_tokens_requests=num_prompt_tokens_requests,
num_generation_tokens_requests=num_generation_tokens_requests,
n_requests=n_requests,
max_tokens_requests=max_tokens_requests,
finished_reason_requests=finished_reason_requests,
max_lora=str(max_lora_stat),
waiting_lora_adapters=list(waiting_lora_adapters.keys()),
Expand Down
8 changes: 8 additions & 0 deletions vllm/engine/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,12 @@ def __init__(self, labelnames: List[str], max_model_len: int):
labelnames=labelnames,
buckets=[1, 2, 5, 10, 20],
)
self.histogram_max_tokens_request = self._histogram_cls(
name="vllm:request_params_max_tokens",
documentation="Histogram of the max_tokens request parameter.",
labelnames=labelnames,
buckets=build_1_2_5_buckets(max_model_len),
)
self.counter_request_success = self._counter_cls(
name="vllm:request_success_total",
documentation="Count of successfully processed requests.",
Expand Down Expand Up @@ -547,6 +553,8 @@ def _log_prometheus(self, stats: Stats) -> None:
self.metrics.histogram_num_generation_tokens_request,
stats.num_generation_tokens_requests)
self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
self._log_histogram(self.metrics.histogram_max_tokens_request,
stats.max_tokens_requests)

def _log_prometheus_interval(self, prompt_throughput: float,
generation_throughput: float) -> None:
Expand Down
1 change: 1 addition & 0 deletions vllm/engine/metrics_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class Stats:
num_prompt_tokens_requests: List[int]
num_generation_tokens_requests: List[int]
n_requests: List[int]
max_tokens_requests: List[int]
finished_reason_requests: List[str]
waiting_lora_adapters: List[str]
running_lora_adapters: List[str]
Expand Down

0 comments on commit ac04a97

Please sign in to comment.