Skip to content

Commit

Permalink
Write dashboard: allow using cortex_request_duration_seconds native histogram (#8757)
Browse files Browse the repository at this point in the history

* Write dashboard: qps and latency w/ cortex_request_duration_seconds

Signed-off-by: Yuri Nikolic <[email protected]>

* Fix the instance label

* Distributor and Ingester panels

Signed-off-by: Yuri Nikolic <[email protected]>

* Fix review findings

Signed-off-by: Yuri Nikolic <[email protected]>

---------

Signed-off-by: Yuri Nikolic <[email protected]>
  • Loading branch information
duricanikolic authored Jul 18, 2024
1 parent 8dd215d commit e10027d
Show file tree
Hide file tree
Showing 8 changed files with 351 additions and 80 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@
### Mixin

* [CHANGE] Dashboards: set default auto-refresh rate to 5m. #8758
* [ENHANCEMENT] Dashboards: allow switching between using classic or native histograms in dashboards. #7674 #8502
* Overview dashboard: status, read/write latency and queries/ingestion per sec panels, `cortex_request_duration_seconds` metric.
* [ENHANCEMENT] Dashboards: allow switching between using classic or native histograms in dashboards.
* Overview dashboard: status, read/write latency and queries/ingestion per sec panels, `cortex_request_duration_seconds` metric. #7674 #8502
* Writes dashboard: `cortex_request_duration_seconds` metric. #8757
* [ENHANCEMENT] Alerts: `MimirRunningIngesterReceiveDelayTooHigh` alert has been tuned to be more reactive to high receive delay. #8538
* [ENHANCEMENT] Dashboards: improve end-to-end latency and strong read consistency panels when experimental ingest storage is enabled. #8543
* [ENHANCEMENT] Dashboards: Add panels for monitoring ingester autoscaling when not using ingest-storage. These panels are disabled by default, but can be enabled using the `autoscaling.ingester.enabled: true` config option. #8484
Expand Down
5 changes: 3 additions & 2 deletions operations/helm/charts/mimir-distributed/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ Entries should include a reference to the Pull Request that introduced the chang

## main / unreleased

* [ENHANCEMENT] Dashboards: allow switching between using classic or native histograms in dashboards. #7674
* Overview dashboard: status, read/write latency and queries/ingestion per sec panels, `cortex_request_duration_seconds` metric.
* [ENHANCEMENT] Dashboards: allow switching between using classic or native histograms in dashboards.
* Overview dashboard: status, read/write latency and queries/ingestion per sec panels, `cortex_request_duration_seconds` metric. #7674
* Writes dashboard: `cortex_request_duration_seconds` metric. #8757
* [ENHANCEMENT] Memcached: Update to Memcached 1.6.28 and memcached-exporter 0.14.4. #8557
* [ENHANCEMENT] Add missing fields in multiple topology spread constraints. #8533
* [ENHANCEMENT] Add support for setting the image pull secrets, node selectors, tolerations and topology spread constraints for the Grafana Agent pods used for metamonitoring. #8670
Expand Down

Large diffs are not rendered by default.

123 changes: 107 additions & 16 deletions operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json

Large diffs are not rendered by default.

123 changes: 107 additions & 16 deletions operations/mimir-mixin-compiled/dashboards/mimir-writes.json

Large diffs are not rendered by default.

17 changes: 9 additions & 8 deletions operations/mimir-mixin/dashboards/dashboard-queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
local variables = {
gatewayMatcher: $.jobMatcher($._config.job_names.gateway),
distributorMatcher: $.jobMatcher($._config.job_names.distributor),
ingesterMatcher: $.jobMatcher($._config.job_names.ingester),
queryFrontendMatcher: $.jobMatcher($._config.job_names.query_frontend),
rulerMatcher: $.jobMatcher($._config.job_names.ruler),
alertmanagerMatcher: $.jobMatcher($._config.job_names.alertmanager),
namespaceMatcher: $.namespaceMatcher(),
writeHTTPRoutesRegex: $.queries.write_http_routes_regex,
writeGRPCRoutesRegex: $.queries.write_grpc_routes_regex,
writeGRPCDistributorRoutesRegex: $.queries.write_grpc_distributor_routes_regex,
writeGRPCIngesterRoute: $.queries.write_grpc_ingester_route,
readHTTPRoutesRegex: $.queries.read_http_routes_regex,
perClusterLabel: $._config.per_cluster_label,
recordingRulePrefix: $.recordingRulePrefix($.jobSelector('any')), // The job name does not matter here.
Expand All @@ -47,13 +49,12 @@ local utils = import 'mixin-utils/utils.libsonnet';
},

write_http_routes_regex: 'api_(v1|prom)_push|otlp_v1_metrics',
write_grpc_routes_regex: '/distributor.Distributor/Push|/httpgrpc.*',
write_grpc_distributor_routes_regex: '/distributor.Distributor/Push|/httpgrpc.*',
write_grpc_ingester_route: '/cortex.Ingester/Push',
read_http_routes_regex: '(prometheus|api_prom)_api_v1_.+',
query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?',

gateway: {
// deprecated, will be removed
writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}' % variables,
readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables,

local p = self,
Expand All @@ -69,12 +70,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
},

distributor: {
// deprecated, will be removed
writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables,

local p = self,
requestsPerSecondMetric: 'cortex_request_duration_seconds',
writeRequestsPerSecondSelector: '%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"' % variables,
writeRequestsPerSecondSelector: '%(distributorMatcher)s, route=~"%(writeGRPCDistributorRoutesRegex)s|%(writeHTTPRoutesRegex)s"' % variables,
samplesPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_samples:rate5m{%(distributorMatcher)s})' % variables,
exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables,

Expand Down Expand Up @@ -237,6 +235,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
},

ingester: {
requestsPerSecondMetric: 'cortex_request_duration_seconds',
writeRequestsPerSecondSelector: '%(ingesterMatcher)s, route="%(writeGRPCIngesterRoute)s"' % variables,

ingestOrClassicDeduplicatedQuery(perIngesterQuery, groupByLabels=''):: |||
( # Classic storage
sum by (%(groupByCluster)s, %(groupByLabels)s) (%(perIngesterQuery)s)
Expand Down
27 changes: 11 additions & 16 deletions operations/mimir-mixin/dashboards/writes.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ local filename = 'mimir-writes.json';
assert std.md5(filename) == '8280707b8f16e7b87b840fc1cc92d4c5' : 'UID of the dashboard has changed, please update references to dashboard.';
($.dashboard('Writes') + { uid: std.md5(filename) })
.addClusterSelectorTemplates()
.addShowNativeLatencyVariable()
.addRowIf(
$._config.show_dashboard_descriptions.writes,
($.row('Writes dashboard description') { height: '125px', showTitle: false })
Expand Down Expand Up @@ -97,25 +98,23 @@ local filename = 'mimir-writes.json';
.addPanelIf(
$._config.gateway_enabled,
$.panel('Requests / sec') +
$.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"%s"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.gateway), $.queries.write_http_routes_regex], format='reqps')
$.statPanel(utils.ncHistogramSumBy(utils.ncHistogramCountRate($.queries.gateway.requestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector)), format='reqps')
)
)
.addRowIf(
$._config.gateway_enabled,
$.row('Gateway')
.addPanel(
$.timeseriesPanel('Requests / sec') +
$.qpsPanel($.queries.gateway.writeRequestsPerSecond)
$.qpsPanelNativeHistogram($.queries.gateway.requestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector)
)
.addPanel(
$.timeseriesPanel('Latency') +
$.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.write_http_routes_regex)])
$.latencyRecordingRulePanelNativeHistogram($.queries.gateway.requestsPerSecondMetric, $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.write_http_routes_regex)])
)
.addPanel(
$.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway), $.queries.write_http_routes_regex], ''
)
$.perInstanceLatencyPanelNativeHistogram('0.99', $.queries.gateway.requestsPerSecondMetric, $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.write_http_routes_regex)])
)
)
.addRow(
Expand All @@ -131,7 +130,7 @@ local filename = 'mimir-writes.json';
When distributor is not configured to use "early" request rejection, then rejected requests are also counted as "errors".
|||
) +
$.qpsPanel($.queries.distributor.writeRequestsPerSecond) +
$.qpsPanelNativeHistogram($.queries.distributor.requestsPerSecondMetric, $.queries.distributor.writeRequestsPerSecondSelector) +
if $._config.show_rejected_requests_on_writes_dashboard then
{
targets: [
Expand All @@ -151,13 +150,11 @@ local filename = 'mimir-writes.json';
)
.addPanel(
$.timeseriesPanel('Latency') +
$.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|%s' % $.queries.write_http_routes_regex)])
$.latencyRecordingRulePanelNativeHistogram($.queries.distributor.requestsPerSecondMetric, $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|%s' % $.queries.write_http_routes_regex)])
)
.addPanel(
$.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor), $.queries.write_http_routes_regex], ''
)
$.perInstanceLatencyPanelNativeHistogram('0.99', $.queries.distributor.requestsPerSecondMetric, $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|%s' % $.queries.write_http_routes_regex)])
)
.addPanelIf(
$._config.show_ingest_storage_panels,
Expand Down Expand Up @@ -203,7 +200,7 @@ local filename = 'mimir-writes.json';
When ingester is not configured to use "early" request rejection, then rejected requests are also counted as "errors".
|||
) +
$.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) +
$.qpsPanelNativeHistogram($.queries.ingester.requestsPerSecondMetric, $.queries.ingester.writeRequestsPerSecondSelector) +
if $._config.show_rejected_requests_on_writes_dashboard then
{
targets: [
Expand All @@ -223,13 +220,11 @@ local filename = 'mimir-writes.json';
)
.addPanel(
$.timeseriesPanel('Latency') +
$.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')])
$.latencyRecordingRulePanelNativeHistogram($.queries.ingester.requestsPerSecondMetric, $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')])
)
.addPanel(
$.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], ''
)
$.perInstanceLatencyPanelNativeHistogram('0.99', $.queries.ingester.requestsPerSecondMetric, $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')])
)
)
.addRowIf(
Expand Down
8 changes: 4 additions & 4 deletions operations/mimir-mixin/jsonnetfile.lock.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
"subdir": "grafana-builder"
}
},
"version": "bf12954197422f36f0803ee217e378ad055f3837",
"sum": "EEPwMLfUIJT9iEUI/gCW9x6PxWoTBPSJOfabTF4rp1M="
"version": "ea6f2601969aa12c02dbca761ce4316aff036af2",
"sum": "udZaafkbKYMGodLqsFhEe+Oy/St2p0edrK7hiMPEey0="
},
{
"source": {
Expand All @@ -18,8 +18,8 @@
"subdir": "mixin-utils"
}
},
"version": "bf12954197422f36f0803ee217e378ad055f3837",
"sum": "Qg992ZB0jkrS+YLq0Q7RV1fSHa8+hQT0jbpTyCGE2NI="
"version": "ea6f2601969aa12c02dbca761ce4316aff036af2",
"sum": "mzLmCv9n3ldLChVGPfyRJOVKoBw+dfK40vU9792aHIM="
}
],
"legacyImports": false
Expand Down

0 comments on commit e10027d

Please sign in to comment.