use Counter and Histogram, remove postprocess
AllentDan committed Nov 28, 2024
1 parent 66cb69b commit e70f478
Showing 2 changed files with 29 additions and 63 deletions.
2 changes: 0 additions & 2 deletions lmdeploy/serve/async_engine.py
@@ -605,7 +605,6 @@ async def get_inputs_genconfig(gen_config):
         start_ids_offset = state.ids_offset
         response = ''
         async for outputs in iterator:
-            start_frame = self.metrics.insert_frame()
             is_first_token = state.prev_tokens is None
             # decode res
             if is_error(outputs.status):
@@ -626,7 +625,6 @@ async def get_inputs_genconfig(gen_config):
             if outputs.logprobs:
                 log_offset = ids_offset - start_ids_offset
                 logprobs = outputs.logprobs[log_offset:]
-            self.metrics.update_postprocess(start_frame)
             if is_first_token:
                 self.metrics.update_FTL(arrival_frame)
             # response, history token len,
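
For orientation, and not part of the commit: the metrics hooks bracket the token stream, with insert_frame() stamping a start time and update_FTL() observing the elapsed time once the first token arrives. A minimal sketch, assuming a simplified MetricsStub in place of lmdeploy's Metrics class:

import asyncio
import time

class MetricsStub:
    """Simplified stand-in for lmdeploy.serve.metrics.Metrics."""

    def insert_frame(self):
        # a "frame" is just a wall-clock start timestamp
        return time.time()

    def update_FTL(self, start_frame):
        print(f'first token latency: {time.time() - start_frame:.3f}s')

async def token_stream():
    for token in ('Hello', ' world'):
        await asyncio.sleep(0.05)  # stand-in for a decode step
        yield token

async def main():
    metrics = MetricsStub()
    arrival_frame = metrics.insert_frame()  # stamped at request arrival
    is_first_token = True
    async for _ in token_stream():
        if is_first_token:
            metrics.update_FTL(arrival_frame)
            is_first_token = False

asyncio.run(main())
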
90 changes: 29 additions & 61 deletions lmdeploy/serve/metrics.py
@@ -2,12 +2,13 @@
 import dataclasses
 import threading
 import time
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Dict, List, Optional
 
 import psutil
 import pynvml
-from prometheus_client import REGISTRY, Gauge, Info, disable_created_metrics
+from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
+                               disable_created_metrics)
 
 disable_created_metrics()
 
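
For context on the import change (a sketch, not part of the diff): the three prometheus_client metric types behave differently, which is what motivates the swaps below. A Counter only increases, a Gauge can be set to arbitrary values, and a Histogram records each observation into buckets plus _sum/_count series:

from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram

registry = CollectorRegistry()

# Counter: monotonically increasing via .inc() -- the right type for
# request totals, which the old code tracked with Gauges.
requests = Counter('demo_requests', 'Total requests.', registry=registry)
requests.inc()

# Gauge: a value that may go up or down via .set()/.inc()/.dec().
in_flight = Gauge('demo_in_flight', 'Requests in flight.', registry=registry)
in_flight.set(3)

# Histogram: .observe() records every sample, so averages and quantiles
# can be derived at query time instead of being pre-averaged in process.
latency = Histogram('demo_latency_seconds', 'Latency in s.',
                    registry=registry)
latency.observe(0.25)
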
@@ -57,11 +58,11 @@ class Stats:
     request_waiting: int = 0
 
     # latency stats
-    duration_queue: float = 0
-    duration_infer: float = 0
-    duration_preprocess: float = 0
-    duration_postprocess: float = 0
-    first_token_latency: float = 0
+    duration_queue: list = field(default_factory=list)
+    duration_infer: list = field(default_factory=list)
+    duration_preprocess: list = field(default_factory=list)
+    duration_postprocess: list = field(default_factory=list)
+    first_token_latency: list = field(default_factory=list)
 
     # system status
     cpu_utilization: Optional[float] = None
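
Not part of the diff, but the reason the new fields need field(default_factory=list): dataclasses reject plain mutable defaults, because a single list object would otherwise be shared across every Stats instance. A minimal illustration:

from dataclasses import dataclass, field

# @dataclass
# class Bad:
#     samples: list = []  # ValueError: mutable default is not allowed

@dataclass
class Good:
    samples: list = field(default_factory=list)  # fresh list per instance

a, b = Good(), Good()
a.samples.append(1.0)
assert b.samples == []  # instances do not share state
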
@@ -134,43 +135,38 @@ def __init__(self,
             labelnames=labelnames)
 
         # requests
-        self.gauge_request_success = Gauge(
+        self.counter_request_success = Counter(
             name='lmdeploy:request_success',
             documentation='Number of successful requests.',
             labelnames=labelnames)
-        self.gauge_request_failure = Gauge(
+        self.counter_request_failure = Counter(
             name='lmdeploy:request_failure',
             documentation='Number of failed requests.',
             labelnames=labelnames)
-        self.gauge_request_total = Gauge(
+        self.counter_request_total = Counter(
             name='lmdeploy:request_total',
             documentation='Number of total requests.',
             labelnames=labelnames)
 
         # latency metrics
-        self.gauge_duration_queue = Gauge(
-            name='lmdeploy:duration_queue',
+        self.histogram_duration_queue = Histogram(
+            name='lmdeploy:duration_queue_seconds',
             documentation=  # noqa
             'Average duration waiting in the queue of requests in s.',
             labelnames=labelnames,
         )
-        self.gauge_duration_infer = Gauge(
-            name='lmdeploy:duration_infer',
+        self.histogram_duration_infer = Histogram(
+            name='lmdeploy:duration_infer_seconds',
             documentation='Average inference time in s.',
             labelnames=labelnames,
         )
-        self.gauge_duration_preprocess = Gauge(
-            name='lmdeploy:duration_preprocess',
+        self.histogram_duration_preprocess = Histogram(
+            name='lmdeploy:duration_preprocess_seconds',
             documentation='Average duration of processing inputs in s.',
             labelnames=labelnames,
        )
-        self.gauge_duration_postprocess = Gauge(
-            name='lmdeploy:duration_postprocess',
-            documentation='Average duration of processing outputs in s.',
-            labelnames=labelnames,
-        )
-        self.gauge_first_token_latency = Gauge(
-            name='lmdeploy:first_token_latency',
+        self.histogram_first_token_latency = Histogram(
+            name='lmdeploy:first_token_latency_seconds',
             documentation='Average first token latency in s.',
             labelnames=labelnames,
         )
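
A usage note on these definitions (a sketch; 'instance' is a hypothetical label, the real labelnames are built earlier in the file): once labelnames is set, a metric must be bound with .labels() before .inc() or .observe() can be called:

from prometheus_client import CollectorRegistry, Histogram

registry = CollectorRegistry()
queue_wait = Histogram('demo_duration_queue_seconds',
                       'Duration waiting in the queue in s.',
                       labelnames=['instance'],  # hypothetical label
                       registry=registry)
# queue_wait.observe(0.07) would raise ValueError: no label values bound
queue_wait.labels(instance='node0').observe(0.07)
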
@@ -191,63 +187,35 @@ def info(self, backend_config: object) -> None:
     def failure_frame(self):
         """log the failure frame."""
         if self.applied:
-            self.stats.request_failure += 1
-            self.stats.request_total += 1
+            self.counter_request_failure.inc()
+            self.counter_request_total.inc()
 
     def last_token_frame(self, iterator):
         """log the last token frame."""
         if self.applied:
-            self.stats.duration_infer += iterator.get_duration()
-            self.stats.request_success += 1
-            self.stats.request_total += 1
-            self.log()
+            self.histogram_duration_infer.observe(iterator.get_duration())
+            self.counter_request_success.inc()
+            self.counter_request_total.inc()
 
     def insert_frame(self):
         """Insert a frame."""
         if self.applied:
             return time.time()
         return None
 
-    def update_postprocess(self, start_frame):
-        """Update postprocess duration."""
-        if self.applied:
-            self.stats.duration_postprocess += time.time() - start_frame
-
     def update_preprocess(self, start_frame):
         """Update preprocess duration."""
         if self.applied:
-            self.stats.duration_preprocess += time.time() - start_frame
+            self.histogram_duration_preprocess.observe(time.time() -
+                                                       start_frame)
 
     def update_queue_waiting(self, start_frame):
         """Update queue waiting time."""
         if self.applied:
-            self.stats.duration_queue += time.time() - start_frame
+            self.histogram_duration_queue.observe(time.time() - start_frame)
 
     def update_FTL(self, start_frame):
         """Update first token latency."""
         if self.applied:
-            self.stats.first_token_latency += time.time() - start_frame
-
-    def log(self) -> None:
-        """Called by LLMEngine.
-        Logs to prometheus and tracked stats every iteration. Logs to Stdout
-        every self.local_interval seconds.
-        """
-        stats = self.stats
-        # Add to request counters.
-        self.gauge_request_total.set(stats.request_total)
-        self.gauge_request_success.set(stats.request_success)
-        self.gauge_request_failure.set(stats.request_failure)
-
-        # duration gauges
-        self.gauge_duration_infer.set(stats.duration_infer /
-                                      stats.request_total)
-        self.gauge_duration_queue.set(stats.duration_queue /
-                                      stats.request_total)
-        self.gauge_duration_preprocess.set(stats.duration_preprocess /
-                                           stats.request_total)
-        self.gauge_duration_postprocess.set(stats.duration_postprocess /
-                                            stats.request_total)
-        self.gauge_first_token_latency.set(stats.first_token_latency /
-                                           stats.request_total)
+            self.histogram_first_token_latency.observe(time.time() -
+                                                       start_frame)
 
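With log() gone, nothing pre-averages in process: each Histogram exports _bucket, _sum, and _count series, and the old per-request averages are recovered at query time, e.g. in PromQL as rate(lmdeploy:duration_infer_seconds_sum[5m]) / rate(lmdeploy:duration_infer_seconds_count[5m]). A minimal sketch (not part of the commit) of what the exposition format now carries:

from prometheus_client import CollectorRegistry, Histogram, generate_latest

registry = CollectorRegistry()
infer = Histogram('lmdeploy:duration_infer_seconds',
                  'Inference time in s.', registry=registry)
infer.observe(0.12)
infer.observe(0.34)
# prints the _bucket series plus _sum (0.46) and _count (2.0) samples
print(generate_latest(registry).decode())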