From 53ed4126e05aea7a53e3c8078eb659e90db19366 Mon Sep 17 00:00:00 2001 From: nolouch Date: Wed, 22 May 2024 14:39:03 +0800 Subject: [PATCH] metrics: update grafana template to add heartbeat latency overview Signed-off-by: nolouch --- metrics/grafana/pd.json | 309 ++++++++++++++++++++++++++++------------ 1 file changed, 221 insertions(+), 88 deletions(-) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 54a047e612e..a2c3d31a4b0 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -11170,10 +11170,15 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The region heartbeat handle duration in .99", + "description": "The region heartbeat handle duration by levels", "editable": true, "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "fill": 0, + "fillGradient": 0, "grid": {}, "gridPos": { "h": 8, @@ -11181,7 +11186,8 @@ "x": 12, "y": 23 }, - "id": 1302, + "hiddenSeries": false, + "id": 1610, "legend": { "alignAsTable": true, "avg": false, @@ -11199,8 +11205,12 @@ "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "paceLength": 10, "percentage": false, + "pluginVersion": "7.5.17", "pointradius": 5, "points": false, "renderer": "flot", @@ -11210,20 +11220,46 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])) by (address, store, le))", + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])) by (le))", "format": "time_series", "hide": false, + "interval": "", "intervalFactor": 2, - "legendFormat": "{{address}}-store-{{store}}", + "legendFormat": "0.99", "refId": "A", "step": 4 + }, + { + 
"exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "0.9", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.8, sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "0.8", + "refId": "C" + }, + { + "exemplar": true, + "expr": "sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])) / sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m]))", + "hide": false, + "interval": "", + "legendFormat": "avg", + "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "99% Region heartbeat handle latency", + "title": "Region heartbeat handle latency overview", "tooltip": { "msResolution": false, "shared": true, @@ -11381,15 +11417,14 @@ }, { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The breakdown metric about heartbeat", + "description": "The region heartbeat handle duration in .99 by store", "editable": true, "error": false, "fill": 0, - "fillGradient": 0, "grid": {}, "gridPos": { "h": 8, @@ -11397,77 +11432,49 @@ "x": 12, "y": 31 }, - "hiddenSeries": false, - "id": 1335, + "id": 1302, "legend": { "alignAsTable": true, - "avg": true, + "avg": false, "current": true, "hideEmpty": true, - "hideZero": true, + "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", - "sortDesc": true, "total": false, 
"values": true }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", - "options": { - "alertThreshold": true - }, "paceLength": 10, "percentage": false, - "pluginVersion": "8.5.27", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "WaitRegionsLock", - "bars": false, - "lines": true, - "linewidth": 2, - "stack": false - }, - { - "alias": "WaitSubRegionsLock", - "bars": false, - "lines": true, - "linewidth": 2, - "stack": false - } - ], + "seriesOverrides": [], "spaceLength": 10, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(pd_core_region_heartbeat_breakdown_handle_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=~\"$tidb_cluster.*\"}[1m])) by (name)", + "expr": "histogram_quantile(0.99, sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])) by (address, store, le))", "format": "time_series", "hide": false, "intervalFactor": 2, - "legendFormat": "{{name}}", - "range": true, + "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 - }, - { - "expr": "sum(rate(pd_core_acquire_regions_lock_wait_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=~\"$tidb_cluster.*\"}[1m])) by (type)", - "hide": false, - "legendFormat": "{{type}}", - "range": true, - "refId": "B" } ], "thresholds": [], + "timeFrom": null, "timeRegions": [], - "title": "Heartbeat Performance Duration BreakDown (Accumulation)", + "timeShift": null, + "title": "99% Region heartbeat handle latency by store", "tooltip": { "msResolution": false, "shared": true, @@ -11476,25 +11483,33 @@ }, "type": "graph", "xaxis": { + "buckets": null, "mode": "time", + "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", + "label": null, "logBase": 1, + "max": null, "min": "0", "show": true }, { "format": "s", + "label": null, 
"logBase": 1, + "max": null, + "min": null, "show": true } ], "yaxis": { - "align": false + "align": false, + "alignLevel": null } }, { @@ -11594,6 +11609,124 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The breakdown metric about heartbeat", + "editable": true, + "error": false, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 39 + }, + "hiddenSeries": false, + "id": 1335, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "8.5.27", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "WaitRegionsLock", + "bars": false, + "lines": true, + "linewidth": 2, + "stack": false + }, + { + "alias": "WaitSubRegionsLock", + "bars": false, + "lines": true, + "linewidth": 2, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(pd_core_region_heartbeat_breakdown_handle_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=~\"$tidb_cluster.*\"}[1m])) by (name)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "range": true, + "refId": "A", + "step": 4 + }, + { + "expr": "sum(rate(pd_core_acquire_regions_lock_wait_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=~\"$tidb_cluster.*\"}[1m])) by (type)", + "hide": false, + "legendFormat": "{{type}}", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": 
[], + "title": "Heartbeat Performance Duration BreakDown (Accumulation)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "s", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, { "aliasColors": {}, "bars": false, @@ -11613,11 +11746,11 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 39 + "x": 0, + "y": 47 }, "hiddenSeries": false, - "id": 1608, + "id": 1609, "legend": { "alignAsTable": true, "avg": true, @@ -11644,7 +11777,15 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "/max-wait-duration.*/", + "bars": true, + "lines": false, + "transform": "negative-Y", + "yaxis": 2 + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, @@ -11659,13 +11800,21 @@ "legendFormat": "{{task_type}}_{{runner_name}}", "refId": "A", "step": 4 + }, + { + "exemplar": true, + "expr": "pd_ratelimit_runner_task_max_waiting_duration_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", + "hide": false, + "interval": "", + "legendFormat": "max-wait-duration-{{runner_name}}", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Heartbeat Runner Pending Task", + "title": "Concurrent Runner Pending Task", "tooltip": { "msResolution": false, "shared": true, @@ -11682,8 +11831,9 @@ }, "yaxes": [ { - "format": "opm", - "label": null, + "decimals": null, + "format": "none", + "label": "", "logBase": 1, "max": null, "min": "0", @@ -11722,11 +11872,11 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 47 }, "hiddenSeries": false, - "id": 1609, + "id": 1608, "legend": { "alignAsTable": true, "avg": true, @@ -11753,15 +11903,7 @@ "pointradius": 5, "points": false, 
"renderer": "flot", - "seriesOverrides": [ - { - "alias": "/max-wait-duration.*/", - "bars": true, - "lines": false, - "transform": "negative-Y", - "yaxis": 2 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, @@ -11776,14 +11918,6 @@ "legendFormat": "failed-tasks-{{runner_name}}", "refId": "A", "step": 4 - }, - { - "exemplar": true, - "expr": "pd_ratelimit_runner_task_max_waiting_duration_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", - "hide": false, - "interval": "", - "legendFormat": "max-wait-duration-{{runner_name}}", - "refId": "B" } ], "thresholds": [], @@ -11807,9 +11941,8 @@ }, "yaxes": [ { - "decimals": null, "format": "opm", - "label": "", + "label": null, "logBase": 1, "max": null, "min": "0", @@ -11843,8 +11976,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 47 + "x": 0, + "y": 55 }, "id": 1305, "legend": { @@ -11937,7 +12070,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 55 }, "id": 1306, @@ -12027,8 +12160,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 55 + "x": 0, + "y": 63 }, "id": 1307, "legend": { @@ -12120,7 +12253,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 63 }, "id": 1308, @@ -12217,8 +12350,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 63 + "x": 0, + "y": 71 }, "id": 1309, "legend": { @@ -12314,7 +12447,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 71 }, "id": 1310, @@ -12411,8 +12544,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 71 + "x": 0, + "y": 79 }, "id": 1311, "legend": { @@ -12508,7 +12641,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 79 }, "id": 1312,