From 466290840cc57ee089da4d7f4a7d8ab57d923529 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 26 Mar 2024 21:23:15 +0000 Subject: [PATCH] Add other queues to queue graphs --- .../prometheus/flytepropeller-dashboard.json | 371 +++++++++++++++--- stats/flytepropeller.dashboard.py | 82 +++- 2 files changed, 398 insertions(+), 55 deletions(-) diff --git a/deployment/stats/prometheus/flytepropeller-dashboard.json b/deployment/stats/prometheus/flytepropeller-dashboard.json index 81ef6aaa517..c9f33331b31 100644 --- a/deployment/stats/prometheus/flytepropeller-dashboard.json +++ b/deployment/stats/prometheus/flytepropeller-dashboard.json @@ -1524,12 +1524,42 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "main", "metric": "", "query": "rate(flyte:propeller:all:main_adds[5m])", "refId": "A", "step": 10, "target": "" + }, + { + "datasource": null, + "expr": "rate(flyte:propeller:all:sub_adds[5m])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "sub", + "metric": "", + "query": "rate(flyte:propeller:all:sub_adds[5m])", + "refId": "B", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "rate(flyte:propeller:all:admin_launcher:_adds[5m])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "admin_launcher", + "metric": "", + "query": "rate(flyte:propeller:all:admin_launcher:_adds[5m])", + "refId": "C", + "step": 10, + "target": "" } ], "thresholds": [], @@ -1651,12 +1681,42 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "main", "metric": "", "query": "flyte:propeller:all:main_depth", "refId": "A", "step": 10, "target": "" + }, + { + "datasource": null, + "expr": "flyte:propeller:all:sub_depth", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "sub", + "metric": "", + "query": "flyte:propeller:all:sub_depth", + "refId": "B", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "flyte:propeller:all:admin_launcher:_depth", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "admin_launcher", + "metric": "", + "query": "flyte:propeller:all:admin_launcher:_depth", + "refId": "C", + "step": 10, + "target": "" } ], "thresholds": [], @@ -1778,12 +1838,42 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "main", "metric": "", "query": "rate(flyte:propeller:all:main_retries[5m])", "refId": "A", "step": 10, "target": "" + }, + { + "datasource": null, + "expr": "rate(flyte:propeller:all:sub_retries[5m])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "sub", + "metric": "", + "query": "rate(flyte:propeller:all:sub_retries[5m])", + "refId": "B", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "rate(flyte:propeller:all:admin_launcher:_retries[5m])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "admin_launcher", + "metric": "", + "query": "rate(flyte:propeller:all:admin_launcher:_retries[5m])", + "refId": "C", + "step": 10, + "target": "" } ], "thresholds": [], @@ -1905,12 +1995,42 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "main", "metric": "", "query": "flyte:propeller:all:main_unfinished_work_s", "refId": "A", "step": 10, "target": "" + }, + { + "datasource": null, + "expr": "flyte:propeller:all:sub_unfinished_work_s", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "sub", + "metric": "", + "query": "flyte:propeller:all:sub_unfinished_work_s", + "refId": "B", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "flyte:propeller:all:admin_launcher:_unfinished_work_s", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "admin_launcher", + "metric": "", + "query": "flyte:propeller:all:admin_launcher:_unfinished_work_s", + "refId": "C", + "step": 10, + "target": "" } ], "thresholds": [], @@ -1956,6 +2076,163 @@ "align": false, "alignLevel": 0 } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 16, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "flyte:propeller:all:main_work_duration_us_sum/(flyte:propeller:all:main_work_duration_us_count*1000000)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "main", + "metric": "", + "query": "flyte:propeller:all:main_work_duration_us_sum/(flyte:propeller:all:main_work_duration_us_count*1000000)", + "refId": "A", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "flyte:propeller:all:sub_work_duration_us_sum/(flyte:propeller:all:sub_work_duration_us_count*1000000)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "sub", + "metric": "", + "query": "flyte:propeller:all:sub_work_duration_us_sum/(flyte:propeller:all:sub_work_duration_us_count*1000000)", + "refId": "B", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "flyte:propeller:all:admin_launcher:_queue_latency_us_sum/(flyte:propeller:all:admin_launcher:_queue_latency_us_count*1000000)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "admin_launcher", + "metric": "", + "query": "flyte:propeller:all:admin_launcher:_queue_latency_us_sum/(flyte:propeller:all:admin_launcher:_queue_latency_us_count*1000000)", + "refId": "C", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Average time before item being requested from work queue", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } } ], "repeat": null, @@ -1993,7 +2270,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 16, + "id": 17, "interval": null, "isNew": true, "legend": { @@ -2120,7 +2397,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 17, + "id": 18, "interval": null, "isNew": true, "legend": { @@ -2317,7 +2594,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 18, + "id": 19, "interval": null, "isNew": true, "legend": { @@ -2444,7 +2721,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 19, + "id": 20, "interval": null, "isNew": true, "legend": { @@ -2571,7 +2848,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 20, + "id": 21, "interval": null, "isNew": true, "legend": { @@ -2698,7 +2975,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 21, + "id": 22, "interval": null, "isNew": true, "legend": { @@ -2825,7 +3102,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 22, + "id": 23, "interval": null, "isNew": true, "legend": { @@ -2977,7 +3254,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 23, + "id": 24, "interval": null, "isNew": true, "legend": { @@ -3104,7 +3381,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 24, + "id": 25, "interval": null, "isNew": true, "legend": { @@ -3231,7 +3508,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 25, + "id": 26, "interval": null, "isNew": true, "legend": { @@ -3373,7 +3650,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 26, + "id": 27, "interval": null, "isNew": true, "legend": { @@ -3540,7 +3817,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 27, + "id": 28, "interval": null, "isNew": true, "legend": { @@ -3667,7 +3944,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 28, + "id": 29, "interval": null, "isNew": true, "legend": { @@ -3809,7 +4086,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 29, + "id": 30, "interval": null, "isNew": true, "legend": { @@ -3936,7 +4213,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 30, + "id": 31, "interval": null, "isNew": true, "legend": { @@ -4078,7 +4355,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 31, + "id": 32, "interval": null, "isNew": true, "legend": { @@ -4205,7 +4482,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 32, + "id": 33, "interval": null, "isNew": true, "legend": { @@ -4347,7 +4624,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 33, + "id": 34, "interval": null, "isNew": true, "legend": { @@ -4474,7 +4751,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 34, + "id": 35, "interval": null, "isNew": true, "legend": { @@ -4601,7 +4878,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 35, + "id": 36, "interval": null, "isNew": true, "legend": { @@ -4761,7 +5038,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 36, + "id": 37, "interval": null, "links": [], "maxDataPoints": 100, @@ -4871,7 +5148,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 37, + "id": 38, "interval": null, "isNew": true, "legend": { @@ -4998,7 +5275,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 38, + "id": 39, "interval": null, "isNew": true, "legend": { @@ -5125,7 +5402,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 39, + "id": 40, "interval": null, "isNew": true, "legend": { @@ -5252,7 +5529,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 40, + "id": 41, "interval": null, "isNew": true, "legend": { @@ -5379,7 +5656,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 41, + "id": 42, "interval": null, "isNew": true, "legend": { @@ -5506,7 +5783,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 42, + "id": 43, "interval": null, "isNew": true, "legend": { @@ -5633,7 +5910,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 43, + "id": 44, "interval": null, "isNew": true, "legend": { @@ -5760,7 +6037,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 44, + "id": 45, "interval": null, "isNew": true, "legend": { @@ -5897,7 +6174,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 45, + "id": 46, "interval": null, "isNew": true, "legend": { @@ -6024,7 +6301,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 46, + "id": 47, "interval": null, "isNew": true, "legend": { @@ -6151,7 +6428,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 47, + "id": 48, "interval": null, "isNew": true, "legend": { @@ -6278,7 +6555,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 48, + "id": 49, "interval": null, "isNew": true, "legend": { @@ -6405,7 +6682,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 49, + "id": 50, "interval": null, "isNew": true, "legend": { @@ -6542,7 +6819,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 50, + "id": 51, "interval": null, "isNew": true, "legend": { @@ -6669,7 +6946,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 51, + "id": 52, "interval": null, "isNew": true, "legend": { @@ -6806,7 +7083,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 52, + "id": 53, "interval": null, "isNew": true, "legend": { @@ -6933,7 +7210,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 53, + "id": 54, "interval": null, "isNew": true, "legend": { @@ -7060,7 +7337,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 54, + "id": 55, "interval": null, "isNew": true, "legend": { @@ -7197,7 +7474,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 55, + "id": 56, "interval": null, "isNew": true, "legend": { @@ -7324,7 +7601,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 56, + "id": 57, "interval": null, "isNew": true, "legend": { @@ -7451,7 +7728,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 57, + "id": 58, "interval": null, "isNew": true, "legend": { @@ -7578,7 +7855,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 58, + "id": 59, "interval": null, "isNew": true, "legend": { diff --git a/stats/flytepropeller.dashboard.py b/stats/flytepropeller.dashboard.py index 410bdca01bc..67ef3a37df4 100644 --- a/stats/flytepropeller.dashboard.py +++ b/stats/flytepropeller.dashboard.py @@ -2,8 +2,9 @@ from grafanalib.core import (MILLISECONDS_FORMAT, NO_FORMAT, OPS_FORMAT, PERCENT_FORMAT, SECONDS_FORMAT, SHORT_FORMAT, - Dashboard, DataSourceInput, Gauge, Graph, Row, - Stat, Target, YAxes, YAxis, single_y_axis, BarGauge) + BarGauge, Dashboard, DataSourceInput, Gauge, + Graph, Row, Stat, Target, YAxes, YAxis, + single_y_axis) # ------------------------------ # For Gostats we recommend using @@ -93,7 +94,7 @@ def streak_length() -> Graph: YAxis(format=SHORT_FORMAT), ), ) - + @staticmethod def skipped_rounds() -> Graph: return Graph( @@ -110,7 +111,6 @@ def skipped_rounds() -> Graph: YAxis(format=SHORT_FORMAT), ), ) - @staticmethod def system_errors() -> Graph: @@ -680,9 +680,20 @@ def queue_metrics(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='rate(flyte:propeller:all:main_adds[5m])', + expr="rate(flyte:propeller:all:main_adds[5m])", + legendFormat="main", refId="A", ), + Target( + expr="rate(flyte:propeller:all:sub_adds[5m])", + legendFormat="sub", + refId="B", + ), + Target( + expr="rate(flyte:propeller:all:admin_launcher:_adds[5m])", + legendFormat="admin_launcher", + refId="C", + ), ], yAxes=single_y_axis(format=SHORT_FORMAT), ), @@ -691,9 +702,20 @@ def queue_metrics(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='flyte:propeller:all:main_depth', + expr="flyte:propeller:all:main_depth", + legendFormat="main", refId="A", ), + Target( + expr="flyte:propeller:all:sub_depth", + legendFormat="sub", + refId="B", + ), + Target( + expr="flyte:propeller:all:admin_launcher:_depth", + legendFormat="admin_launcher", + refId="C", + ), ], yAxes=single_y_axis(format=SHORT_FORMAT), ), @@ -702,9 +724,20 @@ def queue_metrics(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='rate(flyte:propeller:all:main_retries[5m])', + expr="rate(flyte:propeller:all:main_retries[5m])", + legendFormat="main", refId="A", ), + Target( + expr="rate(flyte:propeller:all:sub_retries[5m])", + legendFormat="sub", + refId="B", + ), + Target( + expr="rate(flyte:propeller:all:admin_launcher:_retries[5m])", + legendFormat="admin_launcher", + refId="C", + ), ], yAxes=single_y_axis(format=SHORT_FORMAT), ), @@ -713,9 +746,42 @@ def queue_metrics(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='flyte:propeller:all:main_unfinished_work_s', + expr="flyte:propeller:all:main_unfinished_work_s", + legendFormat="main", refId="A", ), + Target( + expr="flyte:propeller:all:sub_unfinished_work_s", + legendFormat="sub", + refId="B", + ), + Target( + expr="flyte:propeller:all:admin_launcher:_unfinished_work_s", + legendFormat="admin_launcher", + refId="C", + ), + ], + yAxes=single_y_axis(format=SECONDS_FORMAT), + ), + Graph( + title="Average time before item being requested from work queue", + dataSource=DATASOURCE, + targets=[ + Target( + expr="flyte:propeller:all:main_work_duration_us_sum/(flyte:propeller:all:main_work_duration_us_count*1000000)", + legendFormat="main", + refId="A", + ), + Target( + expr="flyte:propeller:all:sub_work_duration_us_sum/(flyte:propeller:all:sub_work_duration_us_count*1000000)", + legendFormat="sub", + refId="B", + ), + Target( + expr="flyte:propeller:all:admin_launcher:_queue_latency_us_sum/(flyte:propeller:all:admin_launcher:_queue_latency_us_count*1000000)", + legendFormat="admin_launcher", + refId="C", + ), ], yAxes=single_y_axis(format=SECONDS_FORMAT), ),