diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5098fcd1892..3dff9a86a37 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -71,6 +71,7 @@
 * [CHANGE] Use test metrics that do not pass through 0 to make identifying incorrect results easier. #8630
 * [ENHANCEMENT] Include human-friendly timestamps in diffs logged when a test fails. #8630
 * [BUGFIX] Initialize test result metrics to 0 at startup so that alerts can correctly identify the first failure after startup. #8630
+* [ENHANCEMENT] Add histograms to measure latency of read and write requests. #8583

 ### Query-tee

diff --git a/docs/sources/mimir/manage/tools/mimir-continuous-test.md b/docs/sources/mimir/manage/tools/mimir-continuous-test.md
index 1f9e9273cd0..c504115f9b0 100644
--- a/docs/sources/mimir/manage/tools/mimir-continuous-test.md
+++ b/docs/sources/mimir/manage/tools/mimir-continuous-test.md
@@ -78,6 +78,18 @@ mimir_continuous_test_writes_total{test=""}

 # TYPE mimir_continuous_test_writes_failed_total counter
 mimir_continuous_test_writes_failed_total{test="",status_code=""}
+# HELP mimir_continuous_test_writes_request_duration_seconds Duration of the write requests.
+# TYPE mimir_continuous_test_writes_request_duration_seconds histogram
+mimir_continuous_test_writes_request_duration_seconds_bucket{test="",le="0.001"}
+mimir_continuous_test_writes_request_duration_seconds_bucket{test="",le="0.004"}
+mimir_continuous_test_writes_request_duration_seconds_bucket{test="",le="0.016"}
+mimir_continuous_test_writes_request_duration_seconds_bucket{test="",le="0.064"}
+mimir_continuous_test_writes_request_duration_seconds_bucket{test="",le="0.256"}
+mimir_continuous_test_writes_request_duration_seconds_bucket{test="",le="1.024"}
+mimir_continuous_test_writes_request_duration_seconds_bucket{test="",le="+Inf"}
+mimir_continuous_test_writes_request_duration_seconds_sum{test=""}
+mimir_continuous_test_writes_request_duration_seconds_count{test=""}
+
 # HELP mimir_continuous_test_queries_total Total number of attempted query requests.
 # TYPE mimir_continuous_test_queries_total counter
 mimir_continuous_test_queries_total{test=""}
@@ -93,6 +105,27 @@ mimir_continuous_test_query_result_checks_total{test=""}

 # HELP mimir_continuous_test_query_result_checks_failed_total Total number of query results failed when checking for correctness.
 # TYPE mimir_continuous_test_query_result_checks_failed_total counter
 mimir_continuous_test_query_result_checks_failed_total{test=""}
+
+# HELP mimir_continuous_test_queries_request_duration_seconds Duration of the read requests.
+# TYPE mimir_continuous_test_queries_request_duration_seconds histogram
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="",le="0.001"}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="",le="0.004"}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="",le="0.016"}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="",le="0.064"}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="",le="0.256"}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="",le="1.024"}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="false",test="",le="+Inf"}
+mimir_continuous_test_queries_request_duration_seconds_sum{results_cache="false",test=""}
+mimir_continuous_test_queries_request_duration_seconds_count{results_cache="false",test=""}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="",le="0.001"}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="",le="0.004"}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="",le="0.016"}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="",le="0.064"}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="",le="0.256"}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="",le="1.024"}
+mimir_continuous_test_queries_request_duration_seconds_bucket{results_cache="true",test="",le="+Inf"}
+mimir_continuous_test_queries_request_duration_seconds_sum{results_cache="true",test=""}
+mimir_continuous_test_queries_request_duration_seconds_count{results_cache="true",test=""}
 ```

 ### Alerts
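Editorial aside, not part of the patch: the classic bucket boundaries documented above form a factor-4 exponential series starting at 1 ms. If the same series is ever needed in code, client_golang's `prometheus.ExponentialBuckets` helper reproduces it; the snippet below is an illustration only and does not appear in this diff.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Factor-4 exponential series starting at 0.001s with six boundaries;
	// Prometheus adds the +Inf bucket implicitly.
	fmt.Println(prometheus.ExponentialBuckets(0.001, 4, 6))
	// Output: [0.001 0.004 0.016 0.064 0.256 1.024]
}
```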
diff --git a/pkg/continuoustest/metrics.go b/pkg/continuoustest/metrics.go
index 2a7dba02ffa..d349faad1bf 100644
--- a/pkg/continuoustest/metrics.go
+++ b/pkg/continuoustest/metrics.go
@@ -3,6 +3,8 @@
 package continuoustest

 import (
+	"time"
+
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
 )
@@ -12,8 +14,10 @@
 type TestMetrics struct {
 	writesTotal                  *prometheus.CounterVec
 	writesFailedTotal            *prometheus.CounterVec
+	writesLatency                *prometheus.HistogramVec
 	queriesTotal                 *prometheus.CounterVec
 	queriesFailedTotal           *prometheus.CounterVec
+	queriesLatency               *prometheus.HistogramVec
 	queryResultChecksTotal       *prometheus.CounterVec
 	queryResultChecksFailedTotal *prometheus.CounterVec
 }
@@ -30,6 +34,14 @@ func NewTestMetrics(testName string, reg prometheus.Registerer) *TestMetrics {
 			Help:        "Total number of failed write requests.",
 			ConstLabels: map[string]string{"test": testName},
 		}, []string{"status_code", "type"}),
+		writesLatency: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
+			Name:                            "mimir_continuous_test_writes_request_duration_seconds",
+			Help:                            "Duration of the write requests.",
+			NativeHistogramBucketFactor:     1.1,
+			NativeHistogramMaxBucketNumber:  100,
+			NativeHistogramMinResetDuration: time.Hour,
+			ConstLabels:                     map[string]string{"test": testName},
+		}, []string{"type"}),
 		queriesTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
 			Name:        "mimir_continuous_test_queries_total",
 			Help:        "Total number of attempted query requests.",
@@ -40,6 +52,14 @@ func NewTestMetrics(testName string, reg prometheus.Registerer) *TestMetrics {
 			Help:        "Total number of failed query requests.",
 			ConstLabels: map[string]string{"test": testName},
 		}, []string{"type"}),
+		queriesLatency: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
+			Name:                            "mimir_continuous_test_queries_request_duration_seconds",
+			Help:                            "Duration of the read requests.",
+			NativeHistogramBucketFactor:     1.1,
+			NativeHistogramMaxBucketNumber:  100,
+			NativeHistogramMinResetDuration: time.Hour,
+			ConstLabels:                     map[string]string{"test": testName},
+		}, []string{"type", "results_cache"}),
 		queryResultChecksTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
 			Name: "mimir_continuous_test_query_result_checks_total",
 			Help: "Total number of query results checked for correctness.",
diff --git a/pkg/continuoustest/write_read_series.go b/pkg/continuoustest/write_read_series.go
index 46cd62d81b6..d7ba6076c02 100644
--- a/pkg/continuoustest/write_read_series.go
+++ b/pkg/continuoustest/write_read_series.go
@@ -183,9 +183,11 @@ func (t *WriteReadSeriesTest) writeSamples(ctx context.Context, typeLabel string
 	defer sp.Finish()
 	logger := log.With(sp, "timestamp", timestamp.String(), "num_series", t.cfg.NumSeries)

+	start := time.Now()
 	statusCode, err := t.client.WriteSeries(ctx, series)
-
+	t.metrics.writesLatency.WithLabelValues(typeLabel).Observe(time.Since(start).Seconds())
 	t.metrics.writesTotal.WithLabelValues(typeLabel).Inc()
+
 	if statusCode/100 != 2 {
 		t.metrics.writesFailedTotal.WithLabelValues(strconv.Itoa(statusCode), typeLabel).Inc()
 		level.Warn(logger).Log("msg", "Failed to remote write series", "status_code", statusCode, "err", err)
@@ -295,7 +297,9 @@ func (t *WriteReadSeriesTest) runRangeQueryAndVerifyResult(ctx context.Context,
 	level.Debug(logger).Log("msg", "Running range query")

 	t.metrics.queriesTotal.WithLabelValues(typeLabel).Inc()
+	queryStart := time.Now()
 	matrix, err := t.client.QueryRange(ctx, metricSumQuery, start, end, step, WithResultsCacheEnabled(resultsCacheEnabled))
+	t.metrics.queriesLatency.WithLabelValues(typeLabel, strconv.FormatBool(resultsCacheEnabled)).Observe(time.Since(queryStart).Seconds())
 	if err != nil {
 		t.metrics.queriesFailedTotal.WithLabelValues(typeLabel).Inc()
 		level.Warn(logger).Log("msg", "Failed to execute range query", "err", err)
@@ -327,7 +331,9 @@ func (t *WriteReadSeriesTest) runInstantQueryAndVerifyResult(ctx context.Context
 	level.Debug(logger).Log("msg", "Running instant query")

 	t.metrics.queriesTotal.WithLabelValues(typeLabel).Inc()
+	queryStart := time.Now()
 	vector, err := t.client.Query(ctx, metricSumQuery, ts, WithResultsCacheEnabled(resultsCacheEnabled))
+	t.metrics.queriesLatency.WithLabelValues(typeLabel, strconv.FormatBool(resultsCacheEnabled)).Observe(time.Since(queryStart).Seconds())
 	if err != nil {
 		t.metrics.queriesFailedTotal.WithLabelValues(typeLabel).Inc()
 		level.Warn(logger).Log("msg", "Failed to execute instant query", "err", err)
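The instrumentation above records latency by capturing `time.Now()` before the request and calling `Observe(time.Since(start).Seconds())` after it. As an alternative sketch (not what this patch does), the same pattern can be written with client_golang's `prometheus.NewTimer`; the metric name below is hypothetical.

```go
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Hypothetical metric for illustration; the real metrics live in TestMetrics.
	latency := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name: "example_request_duration_seconds",
		Help: "Duration of example requests.",
	}, []string{"type"})

	// NewTimer records the elapsed time when ObserveDuration is called.
	timer := prometheus.NewTimer(latency.WithLabelValues("write"))
	time.Sleep(10 * time.Millisecond) // stand-in for the remote-write call
	timer.ObserveDuration()
}
```

The explicit start/Observe form used in the diff keeps the measured window obvious and the observation adjacent to the error handling, so it is a reasonable choice here.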
emCtx.Add("mimir_continuous_test_writes_failed_total", "Total number of failed write requests.", "counter") + emCtx.Add("mimir_continuous_test_writes_request_duration_seconds", "Duration of the write requests.", "histogram") emCtx.Add("mimir_continuous_test_queries_total", "Total number of attempted query requests.", "counter") emCtx.Add("mimir_continuous_test_queries_failed_total", "Total number of failed query requests.", "counter") + emCtx.Add("mimir_continuous_test_queries_request_duration_seconds", "Duration of the read requests.", "histogram") emCtx.Add("mimir_continuous_test_query_result_checks_total", "Total number of query results checked for correctness.", "counter") emCtx.Add("mimir_continuous_test_query_result_checks_failed_total", "Total number of query results failed when checking for correctness.", "counter") }
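A minimal sketch of how the new histograms could be exercised in a test. This test is not part of the patch: it assumes placement inside package continuoustest (the `writesLatency` field is unexported), and the test name and the `"float"` label value are illustrative.

```go
package continuoustest

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"
)

func TestWritesLatencyHistogramIsObserved(t *testing.T) {
	reg := prometheus.NewPedanticRegistry()
	m := NewTestMetrics("smoke", reg)

	// Record one observation; "float" stands in for a real type label value.
	m.writesLatency.WithLabelValues("float").Observe(0.042)

	// Exactly one child series should now be exported for the writes histogram.
	count, err := testutil.GatherAndCount(reg, "mimir_continuous_test_writes_request_duration_seconds")
	require.NoError(t, err)
	require.Equal(t, 1, count)
}
```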