diff --git a/deployment/stats/prometheus/flytepropeller-dashboard.json b/deployment/stats/prometheus/flytepropeller-dashboard.json index b702e6a935e..72c2a6fb0e4 100644 --- a/deployment/stats/prometheus/flytepropeller-dashboard.json +++ b/deployment/stats/prometheus/flytepropeller-dashboard.json @@ -90,394 +90,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 1, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "sum(flyte:propeller:all:free_workers_count) by (kubernetes_pod_name)", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "sum(flyte:propeller:all:free_workers_count) by (kubernetes_pod_name)", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Free workers count", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - } - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 2, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 1, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "sum(rate(flyte:propeller:all:round:success_count[5m]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "sum(rate(flyte:propeller:all:round:success_count[5m]))", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Round success rate", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - } - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 3, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 1, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "sum(rate(flyte:propeller:all:round:error_count[5m]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "sum(rate(flyte:propeller:all:round:error_count[5m]))", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Round error rate", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - } - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 4, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 1, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:round:abort_error_unlabeled[5m]))", + "expr": "sum(flyte:propeller:all:free_workers_count) by (kubernetes_pod_name)", "format": "time_series", "hide": false, "instant": false, @@ -485,7 +104,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(rate(flyte:propeller:all:round:abort_error_unlabeled[5m]))", + "query": "sum(flyte:propeller:all:free_workers_count) by (kubernetes_pod_name)", "refId": "A", "step": 10, "target": "" @@ -494,7 +113,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Round abort errors", + "title": "Free workers count", "tooltip": { "msResolution": true, "shared": true, @@ -561,7 +180,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 5, + "id": 2, "interval": null, "isNew": true, "legend": { @@ -598,30 +217,75 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 1, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:round:system_error_unlabeled[5m]))", + "expr": "sum(rate(flyte:propeller:all:round:success_count[5m]))", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "success", "metric": "", - "query": "sum(rate(flyte:propeller:all:round:system_error_unlabeled[5m]))", + "query": "sum(rate(flyte:propeller:all:round:success_count[5m]))", "refId": "A", "step": 10, "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:round:error_count[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "error", + "metric": "", + "query": "sum(rate(flyte:propeller:all:round:error_count[5m]))", + "refId": "B", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:round:round_total_ms_count[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "total", + "metric": "", + "query": "sum(rate(flyte:propeller:all:round:round_total_ms_count[5m]))", + "refId": "C", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:round:round_time_unlabeled_ms_count[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "other_total", + "metric": "", + "query": "sum(rate(flyte:propeller:all:round:round_time_unlabeled_ms_count[5m]))", + "refId": "D", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Round system errors", + "title": "Round success/error rate", "tooltip": { "msResolution": true, "shared": true, @@ -688,7 +352,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 6, + "id": 3, "interval": null, "isNew": true, "legend": { @@ -725,10 +389,40 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 1, + "span": 2, "stack": false, "steppedLine": false, "targets": [ + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:round:system_error_unlabeled[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "system", + "metric": "", + "query": "sum(rate(flyte:propeller:all:round:system_error_unlabeled[5m]))", + "refId": "A", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:round:abort_error_unlabeled[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "abort", + "metric": "", + "query": "sum(rate(flyte:propeller:all:round:abort_error_unlabeled[5m]))", + "refId": "B", + "step": 10, + "target": "" + }, { "datasource": null, "expr": "sum(rate(flyte:propeller:all:round:panic_unlabeled[5m]))", @@ -737,10 +431,25 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "panic", "metric": "", "query": "sum(rate(flyte:propeller:all:round:panic_unlabeled[5m]))", - "refId": "A", + "refId": "C", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:round:not_found[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "abort", + "metric": "", + "query": "sum(rate(flyte:propeller:all:round:not_found[5m]))", + "refId": "D", "step": 10, "target": "" } @@ -748,7 +457,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Round panic", + "title": "Error rate breakdown", "tooltip": { "msResolution": true, "shared": true, @@ -815,7 +524,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 7, + "id": 4, "interval": null, "isNew": true, "legend": { @@ -852,7 +561,7 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 1, + "span": 2, "stack": false, "steppedLine": false, "targets": [ @@ -942,7 +651,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 8, + "id": 5, "interval": null, "isNew": true, "legend": { @@ -979,7 +688,7 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 1, + "span": 2, "stack": false, "steppedLine": false, "targets": [ @@ -1069,7 +778,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 9, + "id": 6, "interval": null, "isNew": true, "legend": { @@ -1106,36 +815,36 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 1, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(rate({__name__=~\"flyte:propeller:all:plugin:.*_failure_unlabeled\"}[5m]))", + "expr": "sum(rate({__name__=~\"flyte:propeller:all:plugin:.*_success_unlabeled\"}[5m]))", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "failure", + "legendFormat": "success", "metric": "", - "query": "sum(rate({__name__=~\"flyte:propeller:all:plugin:.*_failure_unlabeled\"}[5m]))", + "query": "sum(rate({__name__=~\"flyte:propeller:all:plugin:.*_success_unlabeled\"}[5m]))", "refId": "A", "step": 10, "target": "" }, { "datasource": null, - "expr": "sum(rate({__name__=~\"flyte:propeller:all:plugin:.*_success_unlabeled\"}[5m]))", + "expr": "sum(rate({__name__=~\"flyte:propeller:all:plugin:.*_failure_unlabeled\"}[5m]))", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "success", + "legendFormat": "failure", "metric": "", - "query": "sum(rate({__name__=~\"flyte:propeller:all:plugin:.*_success_unlabeled\"}[5m]))", + "query": "sum(rate({__name__=~\"flyte:propeller:all:plugin:.*_failure_unlabeled\"}[5m]))", "refId": "B", "step": 10, "target": "" @@ -1211,7 +920,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 10, + "id": 7, "interval": null, "isNew": true, "legend": { @@ -1248,7 +957,7 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 1, + "span": 2, "stack": false, "steppedLine": false, "targets": [ @@ -1260,7 +969,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "traverse-P{{quantile}}", "metric": "", "query": "sum(flyte:propeller:all:round:raw_unlabeled_ms) by (quantile)", "refId": "A", @@ -1275,18 +984,48 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "mean", + "legendFormat": "traverse-mean", "metric": "", "query": "avg(flyte:propeller:all:round:raw_unlabeled_ms_sum/flyte:propeller:all:round:raw_unlabeled_ms_count)", "refId": "B", "step": 10, "target": "" + }, + { + "datasource": null, + "expr": "sum(flyte:propeller:all:round:round_time_ms) by (quantile)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "total-P{{quantile}}", + "metric": "", + "query": "sum(flyte:propeller:all:round:round_time_ms) by (quantile)", + "refId": "C", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "avg(flyte:propeller:all:round:round_time_unlabeled_ms_sum/flyte:propeller:all:round:round_time_unlabeled_ms_count)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "total-mean", + "metric": "", + "query": "avg(flyte:propeller:all:round:round_time_unlabeled_ms_sum/flyte:propeller:all:round:round_time_unlabeled_ms_count)", + "refId": "D", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "round Latency by quantile", + "title": "Round Latency", "tooltip": { "msResolution": true, "shared": true, @@ -1353,7 +1092,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 11, + "id": 8, "interval": null, "isNew": true, "legend": { @@ -1390,7 +1129,7 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 1, + "span": 2, "stack": false, "steppedLine": false, "targets": [ @@ -1402,7 +1141,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "{{wf}}", "metric": "", "query": "sum(flyte:propeller:all:round:raw_ms) by (wf)", "refId": "A", @@ -1413,7 +1152,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "round Latency per workflow", + "title": "Round traverse latency per workflow", "tooltip": { "msResolution": true, "shared": true, @@ -1480,7 +1219,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 12, + "id": 9, "interval": null, "isNew": true, "legend": { @@ -1517,7 +1256,7 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 1, + "span": 2, "stack": false, "steppedLine": false, "targets": [ @@ -1607,7 +1346,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 13, + "id": 10, "interval": null, "isNew": true, "legend": { @@ -1644,7 +1383,7 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 1, + "span": 2, "stack": false, "steppedLine": false, "targets": [ @@ -1744,7 +1483,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 14, + "id": 11, "interval": null, "isNew": true, "legend": { @@ -1901,7 +1640,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 15, + "id": 12, "interval": null, "isNew": true, "legend": { @@ -2058,7 +1797,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 16, + "id": 13, "interval": null, "isNew": true, "legend": { @@ -2215,7 +1954,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 17, + "id": 14, "interval": null, "isNew": true, "legend": { @@ -2382,7 +2121,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 18, + "id": 15, "interval": null, "isNew": true, "legend": { @@ -2509,7 +2248,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 19, + "id": 16, "interval": null, "isNew": true, "legend": { @@ -2706,7 +2445,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 20, + "id": 17, "interval": null, "isNew": true, "legend": { @@ -2833,7 +2572,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 21, + "id": 18, "interval": null, "isNew": true, "legend": { @@ -2960,7 +2699,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 22, + "id": 19, "interval": null, "isNew": true, "legend": { @@ -3087,7 +2826,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 23, + "id": 20, "interval": null, "isNew": true, "legend": { @@ -3214,7 +2953,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 24, + "id": 21, "interval": null, "isNew": true, "legend": { @@ -3366,7 +3105,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 25, + "id": 22, "interval": null, "isNew": true, "legend": { @@ -3493,7 +3232,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 26, + "id": 23, "interval": null, "isNew": true, "legend": { @@ -3620,7 +3359,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 27, + "id": 24, "interval": null, "isNew": true, "legend": { @@ -3762,7 +3501,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 28, + "id": 25, "interval": null, "isNew": true, "legend": { @@ -3929,7 +3668,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 29, + "id": 26, "interval": null, "isNew": true, "legend": { @@ -4056,7 +3795,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 30, + "id": 27, "interval": null, "isNew": true, "legend": { @@ -4198,7 +3937,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 31, + "id": 28, "interval": null, "isNew": true, "legend": { @@ -4325,7 +4064,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 32, + "id": 29, "interval": null, "isNew": true, "legend": { @@ -4467,7 +4206,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 33, + "id": 30, "interval": null, "isNew": true, "legend": { @@ -4594,7 +4333,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 34, + "id": 31, "interval": null, "isNew": true, "legend": { @@ -4736,7 +4475,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 35, + "id": 32, "interval": null, "isNew": true, "legend": { @@ -4863,7 +4602,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 36, + "id": 33, "interval": null, "isNew": true, "legend": { @@ -4990,7 +4729,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 37, + "id": 34, "interval": null, "isNew": true, "legend": { @@ -5150,7 +4889,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 38, + "id": 35, "interval": null, "links": [], "maxDataPoints": 100, @@ -5260,7 +4999,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 39, + "id": 36, "interval": null, "isNew": true, "legend": { @@ -5387,7 +5126,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 40, + "id": 37, "interval": null, "isNew": true, "legend": { @@ -5514,7 +5253,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 41, + "id": 38, "interval": null, "isNew": true, "legend": { @@ -5641,7 +5380,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 42, + "id": 39, "interval": null, "isNew": true, "legend": { @@ -5768,7 +5507,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 43, + "id": 40, "interval": null, "isNew": true, "legend": { @@ -5895,7 +5634,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 44, + "id": 41, "interval": null, "isNew": true, "legend": { @@ -6022,7 +5761,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 45, + "id": 42, "interval": null, "isNew": true, "legend": { @@ -6149,7 +5888,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 46, + "id": 43, "interval": null, "isNew": true, "legend": { @@ -6286,7 +6025,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 47, + "id": 44, "interval": null, "isNew": true, "legend": { @@ -6413,7 +6152,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 48, + "id": 45, "interval": null, "isNew": true, "legend": { @@ -6540,7 +6279,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 49, + "id": 46, "interval": null, "isNew": true, "legend": { @@ -6667,7 +6406,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 50, + "id": 47, "interval": null, "isNew": true, "legend": { @@ -6794,7 +6533,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 51, + "id": 48, "interval": null, "isNew": true, "legend": { @@ -6931,7 +6670,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 52, + "id": 49, "interval": null, "isNew": true, "legend": { @@ -7058,7 +6797,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 53, + "id": 50, "interval": null, "isNew": true, "legend": { @@ -7195,7 +6934,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 54, + "id": 51, "interval": null, "isNew": true, "legend": { @@ -7322,7 +7061,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 55, + "id": 52, "interval": null, "isNew": true, "legend": { @@ -7449,7 +7188,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 56, + "id": 53, "interval": null, "isNew": true, "legend": { @@ -7586,7 +7325,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 57, + "id": 54, "interval": null, "isNew": true, "legend": { @@ -7713,7 +7452,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 58, + "id": 55, "interval": null, "isNew": true, "legend": { @@ -7840,7 +7579,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 59, + "id": 56, "interval": null, "isNew": true, "legend": { @@ -7967,7 +7706,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 60, + "id": 57, "interval": null, "isNew": true, "legend": { diff --git a/stats/flytepropeller.dashboard.py b/stats/flytepropeller.dashboard.py index 29b2459ad48..5f17949c4be 100644 --- a/stats/flytepropeller.dashboard.py +++ b/stats/flytepropeller.dashboard.py @@ -36,12 +36,13 @@ def create_free_workers() -> Graph: @staticmethod def round_latency_per_wf() -> Graph: return Graph( - title=f"round Latency per workflow", + title=f"Round traverse latency per workflow", dataSource=DATASOURCE, targets=[ Target( expr=f"sum(flyte:propeller:all:round:raw_ms) by (wf)", refId="A", + legendFormat="{{wf}}", ), ], yAxes=single_y_axis(format=MILLISECONDS_FORMAT), @@ -50,31 +51,58 @@ def round_latency_per_wf() -> Graph: @staticmethod def round_latency() -> Graph: return Graph( - title=f"round Latency by quantile", + title=f"Round Latency", dataSource=DATASOURCE, targets=[ Target( - expr=f"sum(flyte:propeller:all:round:raw_unlabeled_ms) by (quantile)", + expr="sum(flyte:propeller:all:round:raw_unlabeled_ms) by (quantile)", refId="A", + legendFormat="traverse-P{{quantile}}", ), Target( - expr=f"avg(flyte:propeller:all:round:raw_unlabeled_ms_sum/flyte:propeller:all:round:raw_unlabeled_ms_count)", + expr="avg(flyte:propeller:all:round:raw_unlabeled_ms_sum/flyte:propeller:all:round:raw_unlabeled_ms_count)", refId="B", - legendFormat="mean", + legendFormat="traverse-mean", + ), + Target( + expr="sum(flyte:propeller:all:round:round_time_ms) by (quantile)", + refId="C", + legendFormat="total-P{{quantile}}", + ), + Target( + expr="avg(flyte:propeller:all:round:round_time_unlabeled_ms_sum/flyte:propeller:all:round:round_time_unlabeled_ms_count)", + refId="D", + legendFormat="total-mean", ), ], yAxes=single_y_axis(format=MILLISECONDS_FORMAT), ) @staticmethod - def round_panic() -> Graph: + def error_breakdown() -> Graph: return Graph( - title="Round panic", + title="Error rate breakdown", dataSource=DATASOURCE, targets=[ Target( - expr="sum(rate(flyte:propeller:all:round:panic_unlabeled[5m]))", + expr="sum(rate(flyte:propeller:all:round:system_error_unlabeled[5m]))", refId="A", + legendFormat="system", + ), + Target( + expr="sum(rate(flyte:propeller:all:round:abort_error_unlabeled[5m]))", + refId="B", + legendFormat="abort", + ), + Target( + expr="sum(rate(flyte:propeller:all:round:panic_unlabeled[5m]))", + refId="C", + legendFormat="panic", + ), + Target( + expr="sum(rate(flyte:propeller:all:round:not_found[5m]))", + refId="D", + legendFormat="abort", ), ], yAxes=YAxes( @@ -117,83 +145,32 @@ def skipped_rounds() -> Graph: ), ) - @staticmethod - def system_errors() -> Graph: - return Graph( - title="Round system errors", - dataSource=DATASOURCE, - targets=[ - Target( - expr="sum(rate(flyte:propeller:all:round:system_error_unlabeled[5m]))", - refId="A", - ), - ], - yAxes=YAxes( - YAxis(format=OPS_FORMAT), - YAxis(format=SHORT_FORMAT), - ), - ) - - @staticmethod - def abort_errors() -> Graph: - return Graph( - title="Round abort errors", - dataSource=DATASOURCE, - targets=[ - Target( - expr="sum(rate(flyte:propeller:all:round:abort_error_unlabeled[5m]))", - refId="A", - ), - ], - yAxes=YAxes( - YAxis(format=OPS_FORMAT), - YAxis(format=SHORT_FORMAT), - ), - ) @staticmethod - def round_success() -> Graph: + def round_rates() -> Graph: return Graph( - title="Round success rate", + title="Round success/error rate", dataSource=DATASOURCE, targets=[ Target( expr="sum(rate(flyte:propeller:all:round:success_count[5m]))", refId="A", + legendFormat="success", ), - ], - yAxes=YAxes( - YAxis(format=OPS_FORMAT), - YAxis(format=SHORT_FORMAT), - ), - ) - - @staticmethod - def round_total() -> Graph: - return Graph( - title="Total round rate", - dataSource=DATASOURCE, - targets=[ Target( - expr="sum(rate(flyte:propeller:all:round:round_total[5m]))", - refId="A", + expr="sum(rate(flyte:propeller:all:round:error_count[5m]))", + refId="B", + legendFormat="error", ), - ], - yAxes=YAxes( - YAxis(format=OPS_FORMAT), - YAxis(format=SHORT_FORMAT), - ), - ) - - @staticmethod - def round_errors() -> Graph: - return Graph( - title="Round error rate", - dataSource=DATASOURCE, - targets=[ Target( - expr="sum(rate(flyte:propeller:all:round:error_count[5m]))", - refId="A", + expr="sum(rate(flyte:propeller:all:round:round_total_ms_count[5m]))", + refId="C", + legendFormat="total", + ), + Target( + expr="sum(rate(flyte:propeller:all:round:round_time_unlabeled_ms_count[5m]))", + refId="D", + legendFormat="other_total", ), ], yAxes=YAxes( @@ -246,14 +223,14 @@ def plugin_success_vs_failures() -> Graph: dataSource=DATASOURCE, targets=[ Target( - expr='sum(rate({__name__=~"flyte:propeller:all:plugin:.*_failure_unlabeled"}[5m]))', + expr='sum(rate({__name__=~"flyte:propeller:all:plugin:.*_success_unlabeled"}[5m]))', refId="A", - legendFormat="failure", + legendFormat="success", ), Target( - expr='sum(rate({__name__=~"flyte:propeller:all:plugin:.*_success_unlabeled"}[5m]))', + expr='sum(rate({__name__=~"flyte:propeller:all:plugin:.*_failure_unlabeled"}[5m]))', refId="B", - legendFormat="success", + legendFormat="failure", ), ], yAxes=YAxes( @@ -844,11 +821,8 @@ def core_metrics(interval: int, collapse: bool) -> Row: collapse=collapse, panels=[ FlytePropeller.create_free_workers(), - FlytePropeller.round_success(), - FlytePropeller.round_errors(), - FlytePropeller.abort_errors(), - FlytePropeller.system_errors(), - FlytePropeller.round_panic(), + FlytePropeller.round_rates(), + FlytePropeller.error_breakdown(), FlytePropeller.skipped_rounds(), FlytePropeller.streak_length(), FlytePropeller.plugin_success_vs_failures(),