From 9e46d1b02d670b9df994b268eedbbff475b3d3ed Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 24 Nov 2023 13:07:02 +0000 Subject: [PATCH] Add more stats on workflow store --- .../prometheus/flytepropeller-dashboard.json | 548 +++++++++++++++++- stats/flytepropeller.dashboard.py | 62 +- 2 files changed, 591 insertions(+), 19 deletions(-) diff --git a/deployment/stats/prometheus/flytepropeller-dashboard.json b/deployment/stats/prometheus/flytepropeller-dashboard.json index d2520e41511..385eea5f705 100644 --- a/deployment/stats/prometheus/flytepropeller-dashboard.json +++ b/deployment/stats/prometheus/flytepropeller-dashboard.json @@ -4462,6 +4462,133 @@ "align": false, "alignLevel": 0 } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 34, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:wf_too_large[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "sum(rate(flyte:propeller:all:wf_too_large[5m]))", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "etcD write too large", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } } ], "repeat": null, @@ -4499,7 +4626,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 34, + "id": 35, "interval": null, "isNew": true, "legend": { @@ -4626,7 +4753,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 35, + "id": 36, "interval": null, "isNew": true, "legend": { @@ -4753,7 +4880,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 36, + "id": 37, "interval": null, "isNew": true, "legend": { @@ -4880,7 +5007,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 37, + "id": 38, "interval": null, "isNew": true, "legend": { @@ -5017,7 +5144,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 38, + "id": 39, "interval": null, "isNew": true, "legend": { @@ -5144,7 +5271,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 39, + "id": 40, "interval": null, "isNew": true, "legend": { @@ -5271,7 +5398,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 40, + "id": 41, "interval": null, "isNew": true, "legend": { @@ -5398,7 +5525,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 41, + "id": 42, "interval": null, "isNew": true, "legend": { @@ -5525,7 +5652,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 42, + "id": 43, "interval": null, "isNew": true, "legend": { @@ -5652,7 +5779,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 43, + "id": 44, "interval": null, "isNew": true, "legend": { @@ -5779,7 +5906,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 44, + "id": 45, "interval": null, "isNew": true, "legend": { @@ -5906,7 +6033,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 45, + "id": 46, "interval": null, "isNew": true, "legend": { @@ -6043,7 +6170,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 46, + "id": 47, "interval": null, "isNew": true, "legend": { @@ -6170,7 +6297,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 47, + "id": 48, "interval": null, "isNew": true, "legend": { @@ -6274,7 +6401,398 @@ ], "repeat": null, "showTitle": true, - "title": "Informer stats" + "title": "K8s Pod Informer stats" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 49, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:wf_stale_unlabeled[5m])) by (quantile)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "sum(rate(flyte:propeller:all:wf_stale_unlabeled[5m])) by (quantile)", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Stale workflows rate", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 50, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:wf_stale_unlabeled[5m])) by (quantile)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "sum(rate(flyte:propeller:all:wf_stale_unlabeled[5m])) by (quantile)", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Evict workflows rate", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 51, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:wf_redundant_unlabeled[5m])) by (quantile)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "sum(rate(flyte:propeller:all:wf_redundant_unlabeled[5m])) by (quantile)", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Workflow redundant updates rate", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + } + ], + "repeat": null, + "showTitle": true, + "title": "Workflow store" } ], "schemaVersion": 12, diff --git a/stats/flytepropeller.dashboard.py b/stats/flytepropeller.dashboard.py index 7ba46375367..6b75b6e98b7 100644 --- a/stats/flytepropeller.dashboard.py +++ b/stats/flytepropeller.dashboard.py @@ -554,6 +554,17 @@ def wf_store_latency(collapse: bool) -> Row: ], yAxes=single_y_axis(format=NO_FORMAT), ), + Graph( + title="etcD write too large", + dataSource=DATASOURCE, + targets=[ + Target( + expr=f"sum(rate(flyte:propeller:all:wf_too_large[5m]))", + refId="A", + ), + ], + yAxes=single_y_axis(format=NO_FORMAT), + ), ], ) @@ -738,11 +749,11 @@ def workflow_latencies(collapse: bool) -> Row: for panel in panels ], ) - + @staticmethod - def informers(collapse: bool) -> Row: + def k8s_pod_informers(collapse: bool) -> Row: return Row( - title="Informer stats", + title="K8s Pod Informer stats", collapse=collapse, panels=[ Graph( @@ -770,6 +781,48 @@ def informers(collapse: bool) -> Row: ], ) + @staticmethod + def workflowstore(collapse: bool) -> Row: + return Row( + title="Workflow store", + collapse=collapse, + panels=[ + Graph( + title="Stale workflows rate", + dataSource=DATASOURCE, + targets=[ + Target( + expr=f"sum(rate(flyte:propeller:all:wf_stale_unlabeled[5m])) by (quantile)", + refId="A", + ), + ], + yAxes=single_y_axis(format=MILLISECONDS_FORMAT), + ), + Graph( + title="Evict workflows rate", + dataSource=DATASOURCE, + targets=[ + Target( + expr=f"sum(rate(flyte:propeller:all:wf_stale_unlabeled[5m])) by (quantile)", + refId="A", + ), + ], + yAxes=single_y_axis(format=MILLISECONDS_FORMAT), + ), + Graph( + title="Workflow redundant updates rate", + dataSource=DATASOURCE, + targets=[ + Target( + expr=f"sum(rate(flyte:propeller:all:wf_redundant_unlabeled[5m])) by (quantile)", + refId="A", + ), + ], + yAxes=single_y_axis(format=MILLISECONDS_FORMAT), + ), + ], + ) + @staticmethod def create_all_rows(interval: int = 5) -> typing.List[Row]: return [ @@ -781,7 +834,8 @@ def create_all_rows(interval: int = 5) -> typing.List[Row]: FlytePropeller.wf_store_latency(False), FlytePropeller.queue_metrics(True), FlytePropeller.workflow_latencies(False), - FlytePropeller.informers(False), + FlytePropeller.k8s_pod_informers(False), + FlytePropeller.workflowstore(False), ]