feat(continuous-test): Add histograms to measure latency of read and write requests (#8583)

This will allow us to better assess the performance of the underlying Mimir cluster/instance.

Signed-off-by: julien.girard <[email protected]>
bubu11e authored Jul 5, 2024
1 parent 1365650 commit c1c561d
Showing 5 changed files with 63 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -71,6 +71,7 @@
* [CHANGE] Use test metrics that do not pass through 0 to make identifying incorrect results easier. #8630
* [ENHANCEMENT] Include human-friendly timestamps in diffs logged when a test fails. #8630
* [BUGFIX] Initialize test result metrics to 0 at startup so that alerts can correctly identify the first failure after startup. #8630
* [ENHANCEMENT] Add histograms to measure latency of read and write requests. #8583

### Query-tee

33 changes: 33 additions & 0 deletions docs/sources/mimir/manage/tools/mimir-continuous-test.md
@@ -78,6 +78,18 @@ mimir_continuous_test_writes_total{test="<name>"}
# TYPE mimir_continuous_test_writes_failed_total counter
mimir_continuous_test_writes_failed_total{test="<name>",status_code="<code>"}

# HELP mimir_continuous_test_writes_request_duration_seconds Duration of the write requests.
# TYPE mimir_continuous_test_writes_request_duration_seconds histogram
mimir_continuous_test_writes_request_duration_seconds_bucket{test="<name>",le="0.001"}
mimir_continuous_test_writes_request_duration_seconds_bucket{test="<name>",le="0.004"}
mimir_continuous_test_writes_request_duration_seconds_bucket{test="<name>",le="0.016"}
mimir_continuous_test_writes_request_duration_seconds_bucket{test="<name>",le="0.064"}
mimir_continuous_test_writes_request_duration_seconds_bucket{test="<name>",le="0.256"}
mimir_continuous_test_writes_request_duration_seconds_bucket{test="<name>",le="1.024"}
mimir_continuous_test_writes_request_duration_seconds_bucket{test="<name>",le="+Inf"}
mimir_continuous_test_writes_request_duration_seconds_sum{test="<name>"}
mimir_continuous_test_writes_request_duration_seconds_count{test="<name>"}

# HELP mimir_continuous_test_queries_total Total number of attempted query requests.
# TYPE mimir_continuous_test_queries_total counter
mimir_continuous_test_queries_total{test="<name>"}
@@ -93,6 +105,27 @@ mimir_continuous_test_query_result_checks_total{test="<name>"}
# HELP mimir_continuous_test_query_result_checks_failed_total Total number of query results failed when checking for correctness.
# TYPE mimir_continuous_test_query_result_checks_failed_total counter
mimir_continuous_test_query_result_checks_failed_total{test="<name>"}

# HELP mimir_continuous_test_queries_request_duration_seconds Duration of the read requests.
# TYPE mimir_continuous_test_queries_request_duration_seconds histogram
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="<name>",le="0.001"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="<name>",le="0.004"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="<name>",le="0.016"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="<name>",le="0.064"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="<name>",le="0.256"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="<name>",le="1.024"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="<name>",le="+Inf"}
mimir_continuous_test_queries_request_duration_seconds_sum{results_cache="false",test="<name>"}
mimir_continuous_test_queries_request_duration_seconds_count{results_cache="false",test="<name>"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="<name>",le="0.001"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="<name>",le="0.004"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="<name>",le="0.016"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="<name>",le="0.064"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="<name>",le="0.256"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="<name>",le="1.024"}
mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="<name>",le="+Inf"}
mimir_continuous_test_queries_request_duration_seconds_sum{results_cache="true",test="<name>"}
mimir_continuous_test_queries_request_duration_seconds_count{results_cache="true",test="<name>"}
```
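
The new latency metrics behave like any other classic Prometheus histogram, so percentiles can be computed with `histogram_quantile` over the `_bucket` series; for read latency, keep or aggregate away the `results_cache` label depending on whether cached and uncached queries should be separated. As a minimal sketch (not part of this change, and assuming a Prometheus-compatible query endpoint at a placeholder address), the following Go program computes the 99th percentile write latency over the last five minutes:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Placeholder address: any Prometheus-compatible query endpoint that scrapes
	// mimir-continuous-test (for example, the Mimir query frontend).
	client, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
	if err != nil {
		panic(err)
	}
	promAPI := v1.NewAPI(client)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// p99 write latency over the last 5 minutes, from the classic buckets.
	query := `histogram_quantile(0.99, sum by (le) (rate(mimir_continuous_test_writes_request_duration_seconds_bucket[5m])))`

	result, warnings, err := promAPI.Query(ctx, query, time.Now())
	if err != nil {
		panic(err)
	}
	if len(warnings) > 0 {
		fmt.Println("warnings:", warnings)
	}
	fmt.Println(result)
}
```

Because the collectors also set the `NativeHistogram*` options (see `pkg/continuoustest/metrics.go` below), scrapers configured for native histograms can apply `histogram_quantile` directly to `rate(...)` of the histogram series instead of the `_bucket` series.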

### Alerts
20 changes: 20 additions & 0 deletions pkg/continuoustest/metrics.go
@@ -3,6 +3,8 @@
package continuoustest

import (
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
@@ -12,8 +14,10 @@ import (
type TestMetrics struct {
writesTotal *prometheus.CounterVec
writesFailedTotal *prometheus.CounterVec
writesLatency *prometheus.HistogramVec
queriesTotal *prometheus.CounterVec
queriesFailedTotal *prometheus.CounterVec
queriesLatency *prometheus.HistogramVec
queryResultChecksTotal *prometheus.CounterVec
queryResultChecksFailedTotal *prometheus.CounterVec
}
@@ -30,6 +34,14 @@ func NewTestMetrics(testName string, reg prometheus.Registerer) *TestMetrics {
Help: "Total number of failed write requests.",
ConstLabels: map[string]string{"test": testName},
}, []string{"status_code", "type"}),
writesLatency: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
Name: "mimir_continuous_test_writes_request_duration_seconds",
Help: "Duration of the write requests.",
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: time.Hour,
ConstLabels: map[string]string{"test": testName},
}, []string{"type"}),
queriesTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Name: "mimir_continuous_test_queries_total",
Help: "Total number of attempted query requests.",
@@ -40,6 +52,14 @@ func NewTestMetrics(testName string, reg prometheus.Registerer) *TestMetrics {
Help: "Total number of failed query requests.",
ConstLabels: map[string]string{"test": testName},
}, []string{"type"}),
queriesLatency: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
Name: "mimir_continuous_test_queries_request_duration_seconds",
Help: "Duration of the read requests.",
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: time.Hour,
ConstLabels: map[string]string{"test": testName},
}, []string{"type", "results_cache"}),
queryResultChecksTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Name: "mimir_continuous_test_query_result_checks_total",
Help: "Total number of query results checked for correctness.",
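
For context on the options used above, here is a self-contained, hedged sketch of the same pattern: a single `HistogramVec` that publishes classic buckets and, when the scrape side supports it, a native (exponential) histogram. The metric name, label value, and explicit `Buckets` layout are illustrative assumptions (the hunk above does not show whether classic buckets are set explicitly, only that the documented `le=` values run from 0.001 s to 1.024 s in powers of four); the `NativeHistogram*` fields mirror the commit.

```go
package main

import (
	"log"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	reg := prometheus.NewRegistry()

	// One HistogramVec carrying both representations. Buckets (classic) is an
	// assumption chosen to match the documented le= values; the NativeHistogram*
	// fields are the ones added by this commit.
	latency := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
		Name:                            "example_request_duration_seconds",
		Help:                            "Duration of the requests.",
		Buckets:                         prometheus.ExponentialBuckets(0.001, 4, 6), // 0.001s .. 1.024s
		NativeHistogramBucketFactor:     1.1,
		NativeHistogramMaxBucketNumber:  100,
		NativeHistogramMinResetDuration: time.Hour,
		ConstLabels:                     map[string]string{"test": "example"},
	}, []string{"type"})

	// Time a (simulated) request and record the elapsed seconds.
	start := time.Now()
	time.Sleep(25 * time.Millisecond)                                     // stand-in for a real write request
	latency.WithLabelValues("float").Observe(time.Since(start).Seconds()) // "float" is a placeholder type label

	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```

`NativeHistogramBucketFactor: 1.1` keeps the relative width of the sparse buckets around 10%, `NativeHistogramMaxBucketNumber` caps how many of them a single series may hold, and `NativeHistogramMinResetDuration` sets the minimum time between resets when that cap would be exceeded.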
8 changes: 7 additions & 1 deletion pkg/continuoustest/write_read_series.go
@@ -183,9 +183,11 @@ func (t *WriteReadSeriesTest) writeSamples(ctx context.Context, typeLabel string
defer sp.Finish()
logger := log.With(sp, "timestamp", timestamp.String(), "num_series", t.cfg.NumSeries)

start := time.Now()
statusCode, err := t.client.WriteSeries(ctx, series)

t.metrics.writesLatency.WithLabelValues(typeLabel).Observe(time.Since(start).Seconds())
t.metrics.writesTotal.WithLabelValues(typeLabel).Inc()

if statusCode/100 != 2 {
t.metrics.writesFailedTotal.WithLabelValues(strconv.Itoa(statusCode), typeLabel).Inc()
level.Warn(logger).Log("msg", "Failed to remote write series", "status_code", statusCode, "err", err)
@@ -295,7 +297,9 @@ func (t *WriteReadSeriesTest) runRangeQueryAndVerifyResult(ctx context.Context,
level.Debug(logger).Log("msg", "Running range query")

t.metrics.queriesTotal.WithLabelValues(typeLabel).Inc()
queryStart := time.Now()
matrix, err := t.client.QueryRange(ctx, metricSumQuery, start, end, step, WithResultsCacheEnabled(resultsCacheEnabled))
t.metrics.queriesLatency.WithLabelValues(typeLabel, strconv.FormatBool(resultsCacheEnabled)).Observe(time.Since(queryStart).Seconds())
if err != nil {
t.metrics.queriesFailedTotal.WithLabelValues(typeLabel).Inc()
level.Warn(logger).Log("msg", "Failed to execute range query", "err", err)
@@ -327,7 +331,9 @@ func (t *WriteReadSeriesTest) runInstantQueryAndVerifyResult(ctx context.Context
level.Debug(logger).Log("msg", "Running instant query")

t.metrics.queriesTotal.WithLabelValues(typeLabel).Inc()
queryStart := time.Now()
vector, err := t.client.Query(ctx, metricSumQuery, ts, WithResultsCacheEnabled(resultsCacheEnabled))
t.metrics.queriesLatency.WithLabelValues(typeLabel, strconv.FormatBool(resultsCacheEnabled)).Observe(time.Since(queryStart).Seconds())
if err != nil {
t.metrics.queriesFailedTotal.WithLabelValues(typeLabel).Inc()
level.Warn(logger).Log("msg", "Failed to execute instant query", "err", err)
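
One design point worth noting in the hunks above: the elapsed time is observed before the status-code or error check, so failed writes and queries are included in the latency distributions rather than silently dropped. A hedged sketch of that pattern as a reusable helper (the helper name and signature are hypothetical, not part of this commit):

```go
package main

import (
	"errors"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// observeLatency is a hypothetical helper: it times an arbitrary request and
// records the elapsed seconds whether or not the request succeeded.
func observeLatency(hist *prometheus.HistogramVec, do func() error, labelValues ...string) error {
	start := time.Now()
	err := do()
	hist.WithLabelValues(labelValues...).Observe(time.Since(start).Seconds())
	return err
}

func main() {
	hist := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name: "example_queries_request_duration_seconds",
		Help: "Duration of the read requests.",
	}, []string{"type", "results_cache"})

	// Equivalent in spirit to wrapping t.client.QueryRange above; the label
	// values ("float", "true") are placeholders.
	err := observeLatency(hist, func() error {
		time.Sleep(10 * time.Millisecond) // stand-in for a real range query
		return errors.New("simulated failure")
	}, "float", "true")

	fmt.Println("request error:", err) // latency was still recorded
}
```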
2 changes: 2 additions & 0 deletions pkg/continuoustest/write_read_series_test.go
@@ -85,8 +85,10 @@ func init() {
emCtx = util_test.NewExpectedMetricsContext()
emCtx.Add("mimir_continuous_test_writes_total", "Total number of attempted write requests.", "counter")
emCtx.Add("mimir_continuous_test_writes_failed_total", "Total number of failed write requests.", "counter")
emCtx.Add("mimir_continuous_test_writes_request_duration_seconds", "Duration of the write requests.", "histogram")
emCtx.Add("mimir_continuous_test_queries_total", "Total number of attempted query requests.", "counter")
emCtx.Add("mimir_continuous_test_queries_failed_total", "Total number of failed query requests.", "counter")
emCtx.Add("mimir_continuous_test_queries_request_duration_seconds", "Duration of the read requests.", "histogram")
emCtx.Add("mimir_continuous_test_query_result_checks_total", "Total number of query results checked for correctness.", "counter")
emCtx.Add("mimir_continuous_test_query_result_checks_failed_total", "Total number of query results failed when checking for correctness.", "counter")
}
