From 1deef7fc7e709d0b0d39b7aafdcc99d59fd5d529 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 29 Nov 2023 18:18:18 +0000 Subject: [PATCH] Fix scale on cache hit rate --- .../prometheus/flytepropeller-dashboard.json | 740 +++++++++--------- stats/flytepropeller.dashboard.py | 17 +- 2 files changed, 380 insertions(+), 377 deletions(-) diff --git a/deployment/stats/prometheus/flytepropeller-dashboard.json b/deployment/stats/prometheus/flytepropeller-dashboard.json index 1aa1d8358ed..07c2ef3d7d3 100644 --- a/deployment/stats/prometheus/flytepropeller-dashboard.json +++ b/deployment/stats/prometheus/flytepropeller-dashboard.json @@ -1445,7 +1445,7 @@ "title": "Core metrics" }, { - "collapse": true, + "collapse": false, "editable": true, "height": "250px", "panels": [ @@ -1512,13 +1512,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 6, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "(sum(rate(flyte:propeller:all:metastore:cache_hit[5m])) * 100) / (sum(rate(flyte:propeller:all:metastore:cache_miss[5m])) + sum(rate(flyte:propeller:all:metastore:cache_hit[5m])))", + "expr": "sum(rate(workqueue_adds_total{name=~\"flyte:propeller.*\"}[5m])) by (name)", "format": "time_series", "hide": false, "instant": false, @@ -1526,7 +1526,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "(sum(rate(flyte:propeller:all:metastore:cache_hit[5m])) * 100) / (sum(rate(flyte:propeller:all:metastore:cache_miss[5m])) + sum(rate(flyte:propeller:all:metastore:cache_hit[5m])))", + "query": "sum(rate(workqueue_adds_total{name=~\"flyte:propeller.*\"}[5m])) by (name)", "refId": "A", "step": 10, "target": "" @@ -1535,7 +1535,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "cache hit percentage", + "title": "Add rate to queue", "tooltip": { "msResolution": true, "shared": true, @@ -1554,7 +1554,7 @@ "yaxes": [ { "decimals": null, - "format": "percent", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -1639,90 +1639,30 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 6, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:metastore:head_failure_unlabeled[5m]))", + "expr": "sum(workqueue_depth{name=~\"flyte:propeller.*\"}) by (name)", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "head-failure", + "legendFormat": "", "metric": "", - "query": "sum(rate(flyte:propeller:all:metastore:head_failure_unlabeled[5m]))", + "query": "sum(workqueue_depth{name=~\"flyte:propeller.*\"}) by (name)", "refId": "A", "step": 10, "target": "" - }, - { - "datasource": null, - "expr": "sum(rate(flyte:propeller:all:metastore:bad_container_unlabeled[5m]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "bad-container", - "metric": "", - "query": "sum(rate(flyte:propeller:all:metastore:bad_container_unlabeled[5m]))", - "refId": "B", - "step": 10, - "target": "" - }, - { - "datasource": null, - "expr": "sum(rate(flyte:propeller:all:metastore:bad_key_unlabeled[5m]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "bad-key", - "metric": "", - "query": "sum(rate(flyte:propeller:all:metastore:bad_key_unlabeled[5m]))", - "refId": "C", - "step": 10, - "target": "" - }, - { - "datasource": null, - "expr": "sum(rate(flyte:propeller:all:metastore:read_failure_unlabeled[5m]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "read-failure", - "metric": "", - "query": "sum(rate(flyte:propeller:all:metastore:read_failure_unlabeled[5m]))", - "refId": "D", - "step": 10, - "target": "" - }, - { - "datasource": null, - "expr": "sum(rate(flyte:propeller:all:metastore:write_failure_unlabeled[5m]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "write-failure", - "metric": "", - "query": "sum(rate(flyte:propeller:all:metastore:write_failure_unlabeled[5m]))", - "refId": "E", - "step": 10, - "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Failures from metastore", + "title": "Unprocessed Queue depth", "tooltip": { "msResolution": true, "shared": true, @@ -1741,7 +1681,7 @@ "yaxes": [ { "decimals": null, - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -1762,17 +1702,7 @@ "align": false, "alignLevel": 0 } - } - ], - "repeat": null, - "showTitle": true, - "title": "Metastore failures and cache" - }, - { - "collapse": true, - "editable": true, - "height": "250px", - "panels": [ + }, { "aliasColors": {}, "bars": false, @@ -1842,7 +1772,7 @@ "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:metastore:copy:overall_unlabeled_ms) by (quantile)", + "expr": "sum(rate(workqueue_retries_total{name=~\"flyte:propeller.*\"}[5m])) by (name)", "format": "time_series", "hide": false, "instant": false, @@ -1850,7 +1780,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:metastore:copy:overall_unlabeled_ms) by (quantile)", + "query": "sum(rate(workqueue_retries_total{name=~\"flyte:propeller.*\"}[5m])) by (name)", "refId": "A", "step": 10, "target": "" @@ -1859,7 +1789,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Metastore copy latency", + "title": "Item retries rate", "tooltip": { "msResolution": true, "shared": true, @@ -1878,7 +1808,7 @@ "yaxes": [ { "decimals": null, - "format": "ms", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -1969,7 +1899,7 @@ "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:metastore:write_ms) by (quantile, wf)", + "expr": "sum(workqueue_unfinished_work_seconds{name=~\"flyte:propeller.*\"}) by (name)", "format": "time_series", "hide": false, "instant": false, @@ -1977,7 +1907,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:metastore:write_ms) by (quantile, wf)", + "query": "sum(workqueue_unfinished_work_seconds{name=~\"flyte:propeller.*\"}) by (name)", "refId": "A", "step": 10, "target": "" @@ -1986,7 +1916,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Metastore write latency by workflow", + "title": "Seconds of unfinished work in progress", "tooltip": { "msResolution": true, "shared": true, @@ -2005,7 +1935,7 @@ "yaxes": [ { "decimals": null, - "format": "ms", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -2026,7 +1956,17 @@ "align": false, "alignLevel": 0 } - }, + } + ], + "repeat": null, + "showTitle": true, + "title": "FlytePropeller Queue metrics" + }, + { + "collapse": true, + "editable": true, + "height": "250px", + "panels": [ { "aliasColors": {}, "bars": false, @@ -2090,13 +2030,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 3, + "span": 6, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:metastore:read_open_ms) by (quantile, wf)", + "expr": "(sum(rate(flyte:propeller:all:metastore:cache_hit[5m])) * 100) / (sum(rate(flyte:propeller:all:metastore:cache_miss[5m])) + sum(rate(flyte:propeller:all:metastore:cache_hit[5m])))", "format": "time_series", "hide": false, "instant": false, @@ -2104,7 +2044,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:metastore:read_open_ms) by (quantile, wf)", + "query": "(sum(rate(flyte:propeller:all:metastore:cache_hit[5m])) * 100) / (sum(rate(flyte:propeller:all:metastore:cache_miss[5m])) + sum(rate(flyte:propeller:all:metastore:cache_hit[5m])))", "refId": "A", "step": 10, "target": "" @@ -2113,7 +2053,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Metastore read open latency by workflow", + "title": "cache hit percentage", "tooltip": { "msResolution": true, "shared": true, @@ -2132,7 +2072,7 @@ "yaxes": [ { "decimals": null, - "format": "ms", + "format": "percent", "label": null, "logBase": 1, "max": null, @@ -2217,30 +2157,90 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 3, + "span": 6, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:metastore:head_ms) by (quantile, wf)", + "expr": "sum(rate(flyte:propeller:all:metastore:head_failure_unlabeled[5m]))", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "head-failure", "metric": "", - "query": "sum(flyte:propeller:all:metastore:head_ms) by (quantile, wf)", + "query": "sum(rate(flyte:propeller:all:metastore:head_failure_unlabeled[5m]))", "refId": "A", "step": 10, "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:metastore:bad_container_unlabeled[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "bad-container", + "metric": "", + "query": "sum(rate(flyte:propeller:all:metastore:bad_container_unlabeled[5m]))", + "refId": "B", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:metastore:bad_key_unlabeled[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "bad-key", + "metric": "", + "query": "sum(rate(flyte:propeller:all:metastore:bad_key_unlabeled[5m]))", + "refId": "C", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:metastore:read_failure_unlabeled[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "read-failure", + "metric": "", + "query": "sum(rate(flyte:propeller:all:metastore:read_failure_unlabeled[5m]))", + "refId": "D", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:metastore:write_failure_unlabeled[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "write-failure", + "metric": "", + "query": "sum(rate(flyte:propeller:all:metastore:write_failure_unlabeled[5m]))", + "refId": "E", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Metastore head latency by workflow", + "title": "Failures from metastore", "tooltip": { "msResolution": true, "shared": true, @@ -2259,7 +2259,7 @@ "yaxes": [ { "decimals": null, - "format": "ms", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -2280,7 +2280,17 @@ "align": false, "alignLevel": 0 } - }, + } + ], + "repeat": null, + "showTitle": true, + "title": "Metastore failures and cache" + }, + { + "collapse": true, + "editable": true, + "height": "250px", + "panels": [ { "aliasColors": {}, "bars": false, @@ -2350,39 +2360,24 @@ "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:metastore:proto_fetch_ms) by (quantile, wf)", + "expr": "sum(flyte:propeller:all:metastore:copy:overall_unlabeled_ms) by (quantile)", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "proto-fetch", + "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:metastore:proto_fetch_ms) by (quantile, wf)", + "query": "sum(flyte:propeller:all:metastore:copy:overall_unlabeled_ms) by (quantile)", "refId": "A", "step": 10, "target": "" - }, - { - "datasource": null, - "expr": "sum(flyte:propeller:all:metastore:remote_fetch_ms) by (quantile, wf)", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "remote-fetch", - "metric": "", - "query": "sum(flyte:propeller:all:metastore:remote_fetch_ms) by (quantile, wf)", - "refId": "B", - "step": 10, - "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Metastore fetch latency by workflow", + "title": "Metastore copy latency", "tooltip": { "msResolution": true, "shared": true, @@ -2422,17 +2417,7 @@ "align": false, "alignLevel": 0 } - } - ], - "repeat": null, - "showTitle": true, - "title": "Metastore latencies" - }, - { - "collapse": true, - "editable": true, - "height": "250px", - "panels": [ + }, { "aliasColors": {}, "bars": false, @@ -2502,7 +2487,7 @@ "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:node:node_exec_latency_us) by (quantile, wf) / 1000", + "expr": "sum(flyte:propeller:all:metastore:write_ms) by (quantile, wf)", "format": "time_series", "hide": false, "instant": false, @@ -2510,7 +2495,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:node:node_exec_latency_us) by (quantile, wf) / 1000", + "query": "sum(flyte:propeller:all:metastore:write_ms) by (quantile, wf)", "refId": "A", "step": 10, "target": "" @@ -2519,7 +2504,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Node Exec latency quantile and workflow", + "title": "Metastore write latency by workflow", "tooltip": { "msResolution": true, "shared": true, @@ -2629,7 +2614,7 @@ "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:node:node_input_latency_ms) by (quantile, wf)", + "expr": "sum(flyte:propeller:all:metastore:read_open_ms) by (quantile, wf)", "format": "time_series", "hide": false, "instant": false, @@ -2637,7 +2622,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:node:node_input_latency_ms) by (quantile, wf)", + "query": "sum(flyte:propeller:all:metastore:read_open_ms) by (quantile, wf)", "refId": "A", "step": 10, "target": "" @@ -2646,7 +2631,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Node Input latency quantile and workflow", + "title": "Metastore read open latency by workflow", "tooltip": { "msResolution": true, "shared": true, @@ -2756,7 +2741,7 @@ "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:node:event_recording:success_duration_ms) by (quantile, wf)", + "expr": "sum(flyte:propeller:all:metastore:head_ms) by (quantile, wf)", "format": "time_series", "hide": false, "instant": false, @@ -2764,31 +2749,16 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:node:event_recording:success_duration_ms) by (quantile, wf)", + "query": "sum(flyte:propeller:all:metastore:head_ms) by (quantile, wf)", "refId": "A", "step": 10, "target": "" - }, - { - "datasource": null, - "expr": "sum(flyte:propeller:all:node:event_recording:failure_duration_ms) by (quantile, wf)", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "sum(flyte:propeller:all:node:event_recording:failure_duration_ms) by (quantile, wf)", - "refId": "B", - "step": 10, - "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Node Event event recording latency quantile and workflow", + "title": "Metastore head latency by workflow", "tooltip": { "msResolution": true, "shared": true, @@ -2898,54 +2868,39 @@ "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:node:perma_system_error_duration_unlabeled_ms_count[5m]))", + "expr": "sum(flyte:propeller:all:metastore:proto_fetch_ms) by (quantile, wf)", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "system error", + "legendFormat": "proto-fetch", "metric": "", - "query": "sum(rate(flyte:propeller:all:node:perma_system_error_duration_unlabeled_ms_count[5m]))", + "query": "sum(flyte:propeller:all:metastore:proto_fetch_ms) by (quantile, wf)", "refId": "A", "step": 10, "target": "" }, { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:node:perma_user_error_duration_unlabeled_ms[5m]))", + "expr": "sum(flyte:propeller:all:metastore:remote_fetch_ms) by (quantile, wf)", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "user error", + "legendFormat": "remote-fetch", "metric": "", - "query": "sum(rate(flyte:propeller:all:node:perma_user_error_duration_unlabeled_ms[5m]))", + "query": "sum(flyte:propeller:all:metastore:remote_fetch_ms) by (quantile, wf)", "refId": "B", "step": 10, "target": "" - }, - { - "datasource": null, - "expr": "sum(rate(flyte:propeller:all:node:perma_unknown_error_duration_unlabeled_ms[5m]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "user error", - "metric": "", - "query": "sum(rate(flyte:propeller:all:node:perma_unknown_error_duration_unlabeled_ms[5m]))", - "refId": "C", - "step": 10, - "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "node event recording count", + "title": "Metastore fetch latency by workflow", "tooltip": { "msResolution": true, "shared": true, @@ -2964,7 +2919,7 @@ "yaxes": [ { "decimals": null, - "format": "none", + "format": "ms", "label": null, "logBase": 1, "max": null, @@ -2989,7 +2944,7 @@ ], "repeat": null, "showTitle": true, - "title": "Node Metrics" + "title": "Metastore latencies" }, { "collapse": true, @@ -3059,13 +3014,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 2, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:workflow:event_recording:success_duration_ms) by (quantile, wf)", + "expr": "sum(flyte:propeller:all:node:node_exec_latency_us) by (quantile, wf) / 1000", "format": "time_series", "hide": false, "instant": false, @@ -3073,7 +3028,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:workflow:event_recording:success_duration_ms) by (quantile, wf)", + "query": "sum(flyte:propeller:all:node:node_exec_latency_us) by (quantile, wf) / 1000", "refId": "A", "step": 10, "target": "" @@ -3082,7 +3037,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "wf event recording latency success", + "title": "Node Exec latency quantile and workflow", "tooltip": { "msResolution": true, "shared": true, @@ -3186,45 +3141,30 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 2, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:workflow:event_recording:success_duration_ms_count[5m])) by (wf)", + "expr": "sum(flyte:propeller:all:node:node_input_latency_ms) by (quantile, wf)", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "success", + "legendFormat": "", "metric": "", - "query": "sum(rate(flyte:propeller:all:workflow:event_recording:success_duration_ms_count[5m])) by (wf)", + "query": "sum(flyte:propeller:all:node:node_input_latency_ms) by (quantile, wf)", "refId": "A", "step": 10, "target": "" - }, - { - "datasource": null, - "expr": "sum(rate(flyte:propeller:all:workflow:event_recording:failure_duration_ms_count[5m])) by (wf)", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "failure", - "metric": "", - "query": "sum(rate(flyte:propeller:all:workflow:event_recording:failure_duration_ms_count[5m])) by (wf)", - "refId": "B", - "step": 10, - "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "wf event recording count", + "title": "Node Input latency quantile and workflow", "tooltip": { "msResolution": true, "shared": true, @@ -3243,7 +3183,7 @@ "yaxes": [ { "decimals": null, - "format": "none", + "format": "ms", "label": null, "logBase": 1, "max": null, @@ -3328,7 +3268,7 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 2, + "span": 3, "stack": false, "steppedLine": false, "targets": [ @@ -3346,12 +3286,27 @@ "refId": "A", "step": 10, "target": "" + }, + { + "datasource": null, + "expr": "sum(flyte:propeller:all:node:event_recording:failure_duration_ms) by (quantile, wf)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "sum(flyte:propeller:all:node:event_recording:failure_duration_ms) by (quantile, wf)", + "refId": "B", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "node event recording latency success", + "title": "Node Event event recording latency quantile and workflow", "tooltip": { "msResolution": true, "shared": true, @@ -3455,39 +3410,54 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 2, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count[5m])) by (wf)", + "expr": "sum(rate(flyte:propeller:all:node:perma_system_error_duration_unlabeled_ms_count[5m]))", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "success", + "legendFormat": "system error", "metric": "", - "query": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count[5m])) by (wf)", + "query": "sum(rate(flyte:propeller:all:node:perma_system_error_duration_unlabeled_ms_count[5m]))", "refId": "A", "step": 10, "target": "" }, { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count[5m])) by (wf)", + "expr": "sum(rate(flyte:propeller:all:node:perma_user_error_duration_unlabeled_ms[5m]))", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "failure", + "legendFormat": "user error", "metric": "", - "query": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count[5m])) by (wf)", + "query": "sum(rate(flyte:propeller:all:node:perma_user_error_duration_unlabeled_ms[5m]))", "refId": "B", "step": 10, "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:node:perma_unknown_error_duration_unlabeled_ms[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "user error", + "metric": "", + "query": "sum(rate(flyte:propeller:all:node:perma_unknown_error_duration_unlabeled_ms[5m]))", + "refId": "C", + "step": 10, + "target": "" } ], "thresholds": [], @@ -3533,7 +3503,17 @@ "align": false, "alignLevel": 0 } - }, + } + ], + "repeat": null, + "showTitle": true, + "title": "Node Metrics" + }, + { + "collapse": true, + "editable": true, + "height": "250px", + "panels": [ { "aliasColors": {}, "bars": false, @@ -3603,7 +3583,7 @@ "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:task:event_recording:success_duration_ms) by (quantile, wf)", + "expr": "sum(flyte:propeller:all:workflow:event_recording:success_duration_ms) by (quantile, wf)", "format": "time_series", "hide": false, "instant": false, @@ -3611,7 +3591,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:task:event_recording:success_duration_ms) by (quantile, wf)", + "query": "sum(flyte:propeller:all:workflow:event_recording:success_duration_ms) by (quantile, wf)", "refId": "A", "step": 10, "target": "" @@ -3620,7 +3600,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "task event recording latency", + "title": "wf event recording latency success", "tooltip": { "msResolution": true, "shared": true, @@ -3730,22 +3710,22 @@ "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count[5m])) by (wf)", + "expr": "sum(rate(flyte:propeller:all:workflow:event_recording:success_duration_ms_count[5m])) by (wf)", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "success wf", + "legendFormat": "success", "metric": "", - "query": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count[5m])) by (wf)", + "query": "sum(rate(flyte:propeller:all:workflow:event_recording:success_duration_ms_count[5m])) by (wf)", "refId": "A", "step": 10, "target": "" }, { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count[5m])) by (wf)", + "expr": "sum(rate(flyte:propeller:all:workflow:event_recording:failure_duration_ms_count[5m])) by (wf)", "format": "time_series", "hide": false, "instant": false, @@ -3753,7 +3733,7 @@ "intervalFactor": 2, "legendFormat": "failure", "metric": "", - "query": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count[5m])) by (wf)", + "query": "sum(rate(flyte:propeller:all:workflow:event_recording:failure_duration_ms_count[5m])) by (wf)", "refId": "B", "step": 10, "target": "" @@ -3762,7 +3742,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "task event recording count", + "title": "wf event recording count", "tooltip": { "msResolution": true, "shared": true, @@ -3872,7 +3852,7 @@ "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:node:build_dynamic_workflow_us) by (quantile, wf) / 1000", + "expr": "sum(flyte:propeller:all:node:event_recording:success_duration_ms) by (quantile, wf)", "format": "time_series", "hide": false, "instant": false, @@ -3880,7 +3860,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:node:build_dynamic_workflow_us) by (quantile, wf) / 1000", + "query": "sum(flyte:propeller:all:node:event_recording:success_duration_ms) by (quantile, wf)", "refId": "A", "step": 10, "target": "" @@ -3889,7 +3869,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Dynamic workflow build latency", + "title": "node event recording latency success", "tooltip": { "msResolution": true, "shared": true, @@ -3999,24 +3979,39 @@ "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:node:build_dynamic_workflow_us_count[5m])) by (wf)", + "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count[5m])) by (wf)", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "success", "metric": "", - "query": "sum(rate(flyte:propeller:all:node:build_dynamic_workflow_us_count[5m])) by (wf)", + "query": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count[5m])) by (wf)", "refId": "A", "step": 10, "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count[5m])) by (wf)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "failure", + "metric": "", + "query": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count[5m])) by (wf)", + "refId": "B", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Dynamic workflow build count", + "title": "node event recording count", "tooltip": { "msResolution": true, "shared": true, @@ -4126,39 +4121,24 @@ "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:admin_launcher:cache_hit[5m]))", + "expr": "sum(flyte:propeller:all:task:event_recording:success_duration_ms) by (quantile, wf)", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "hit", + "legendFormat": "", "metric": "", - "query": "sum(rate(flyte:propeller:all:admin_launcher:cache_hit[5m]))", + "query": "sum(flyte:propeller:all:task:event_recording:success_duration_ms) by (quantile, wf)", "refId": "A", "step": 10, "target": "" - }, - { - "datasource": null, - "expr": "sum(rate(flyte:propeller:all:admin_launcher:cache_miss[5m]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "miss", - "metric": "", - "query": "sum(rate(flyte:propeller:all:admin_launcher:cache_miss[5m]))", - "refId": "B", - "step": 10, - "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Admin Launcher cache", + "title": "task event recording latency", "tooltip": { "msResolution": true, "shared": true, @@ -4198,17 +4178,7 @@ "align": false, "alignLevel": 0 } - } - ], - "repeat": null, - "showTitle": true, - "title": "Perf metrics" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ + }, { "aliasColors": {}, "bars": false, @@ -4272,30 +4242,45 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 3, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:wf_update_latency_ms) by (quantile)", + "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count[5m])) by (wf)", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "success wf", "metric": "", - "query": "sum(flyte:propeller:all:wf_update_latency_ms) by (quantile)", + "query": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count[5m])) by (wf)", "refId": "A", "step": 10, "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count[5m])) by (wf)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "failure", + "metric": "", + "query": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count[5m])) by (wf)", + "refId": "B", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "wf update etcD latency", + "title": "task event recording count", "tooltip": { "msResolution": true, "shared": true, @@ -4314,7 +4299,7 @@ "yaxes": [ { "decimals": null, - "format": "ms", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -4399,13 +4384,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 3, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:wf_update_latency_ms_count[5m]))", + "expr": "sum(flyte:propeller:all:node:build_dynamic_workflow_us) by (quantile, wf) / 1000", "format": "time_series", "hide": false, "instant": false, @@ -4413,7 +4398,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(rate(flyte:propeller:all:wf_update_latency_ms_count[5m]))", + "query": "sum(flyte:propeller:all:node:build_dynamic_workflow_us) by (quantile, wf) / 1000", "refId": "A", "step": 10, "target": "" @@ -4422,7 +4407,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "etcD writes", + "title": "Dynamic workflow build latency", "tooltip": { "msResolution": true, "shared": true, @@ -4441,7 +4426,7 @@ "yaxes": [ { "decimals": null, - "format": "none", + "format": "ms", "label": null, "logBase": 1, "max": null, @@ -4526,13 +4511,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 3, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:wf_update_conflict[5m]))", + "expr": "sum(rate(flyte:propeller:all:node:build_dynamic_workflow_us_count[5m])) by (wf)", "format": "time_series", "hide": false, "instant": false, @@ -4540,7 +4525,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(rate(flyte:propeller:all:wf_update_conflict[5m]))", + "query": "sum(rate(flyte:propeller:all:node:build_dynamic_workflow_us_count[5m])) by (wf)", "refId": "A", "step": 10, "target": "" @@ -4549,7 +4534,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "etcD write conflicts", + "title": "Dynamic workflow build count", "tooltip": { "msResolution": true, "shared": true, @@ -4653,30 +4638,45 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 3, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:wf_update_failed[5m]))", + "expr": "sum(rate(flyte:propeller:all:admin_launcher:cache_hit[5m]))", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "hit", "metric": "", - "query": "sum(rate(flyte:propeller:all:wf_update_failed[5m]))", + "query": "sum(rate(flyte:propeller:all:admin_launcher:cache_hit[5m]))", "refId": "A", "step": 10, "target": "" + }, + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:admin_launcher:cache_miss[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "miss", + "metric": "", + "query": "sum(rate(flyte:propeller:all:admin_launcher:cache_miss[5m]))", + "refId": "B", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "etcD write fail", + "title": "Admin Launcher cache hit/miss rate", "tooltip": { "msResolution": true, "shared": true, @@ -4695,7 +4695,7 @@ "yaxes": [ { "decimals": null, - "format": "none", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -4716,7 +4716,17 @@ "align": false, "alignLevel": 0 } - }, + } + ], + "repeat": null, + "showTitle": true, + "title": "Perf metrics" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ { "aliasColors": {}, "bars": false, @@ -4780,13 +4790,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 3, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:wf_too_large[5m]))", + "expr": "sum(flyte:propeller:all:workflow:acceptance_latency_ms) by (wf)", "format": "time_series", "hide": false, "instant": false, @@ -4794,7 +4804,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(rate(flyte:propeller:all:wf_too_large[5m]))", + "query": "sum(flyte:propeller:all:workflow:acceptance_latency_ms) by (wf)", "refId": "A", "step": 10, "target": "" @@ -4803,7 +4813,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "etcD write too large", + "title": "Workflow acceptance latency per workflow", "tooltip": { "msResolution": true, "shared": true, @@ -4822,7 +4832,7 @@ "yaxes": [ { "decimals": null, - "format": "none", + "format": "ms", "label": null, "logBase": 1, "max": null, @@ -4843,17 +4853,7 @@ "align": false, "alignLevel": 0 } - } - ], - "repeat": null, - "showTitle": true, - "title": "etcD write metrics" - }, - { - "collapse": true, - "editable": true, - "height": "250px", - "panels": [ + }, { "aliasColors": {}, "bars": false, @@ -4917,13 +4917,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 3, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(rate(workqueue_adds_total{name=~\"flyte:propeller.*\"}[5m])) by (name)", + "expr": "sum(flyte:propeller:all:workflow:acceptance_latency_unlabeled_ms) by (quantile)", "format": "time_series", "hide": false, "instant": false, @@ -4931,7 +4931,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(rate(workqueue_adds_total{name=~\"flyte:propeller.*\"}[5m])) by (name)", + "query": "sum(flyte:propeller:all:workflow:acceptance_latency_unlabeled_ms) by (quantile)", "refId": "A", "step": 10, "target": "" @@ -4940,7 +4940,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Add rate to queue", + "title": "Workflow acceptance latency by quantile", "tooltip": { "msResolution": true, "shared": true, @@ -4959,7 +4959,7 @@ "yaxes": [ { "decimals": null, - "format": "short", + "format": "ms", "label": null, "logBase": 1, "max": null, @@ -5044,13 +5044,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 3, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(workqueue_depth{name=~\"flyte:propeller.*\"}) by (name)", + "expr": "sum(flyte:propeller:all:node:transition_latency_ms) by (wf)", "format": "time_series", "hide": false, "instant": false, @@ -5058,7 +5058,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(workqueue_depth{name=~\"flyte:propeller.*\"}) by (name)", + "query": "sum(flyte:propeller:all:node:transition_latency_ms) by (wf)", "refId": "A", "step": 10, "target": "" @@ -5067,7 +5067,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Unprocessed Queue depth", + "title": "Node transition latency per workflow", "tooltip": { "msResolution": true, "shared": true, @@ -5086,7 +5086,7 @@ "yaxes": [ { "decimals": null, - "format": "short", + "format": "ms", "label": null, "logBase": 1, "max": null, @@ -5171,13 +5171,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 3, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(rate(workqueue_retries_total{name=~\"flyte:propeller.*\"}[5m])) by (name)", + "expr": "sum(flyte:propeller:all:node:transition_latency_unlabeled_ms) by (quantile)", "format": "time_series", "hide": false, "instant": false, @@ -5185,7 +5185,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(rate(workqueue_retries_total{name=~\"flyte:propeller.*\"}[5m])) by (name)", + "query": "sum(flyte:propeller:all:node:transition_latency_unlabeled_ms) by (quantile)", "refId": "A", "step": 10, "target": "" @@ -5194,7 +5194,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Item retries rate", + "title": "Node transition latency by quantile", "tooltip": { "msResolution": true, "shared": true, @@ -5213,7 +5213,7 @@ "yaxes": [ { "decimals": null, - "format": "short", + "format": "ms", "label": null, "logBase": 1, "max": null, @@ -5298,13 +5298,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 3, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(workqueue_unfinished_work_seconds{name=~\"flyte:propeller.*\"}) by (name)", + "expr": "sum(flyte:propeller:all:node:queueing_latency_ms) by (wf)", "format": "time_series", "hide": false, "instant": false, @@ -5312,7 +5312,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(workqueue_unfinished_work_seconds{name=~\"flyte:propeller.*\"}) by (name)", + "query": "sum(flyte:propeller:all:node:queueing_latency_ms) by (wf)", "refId": "A", "step": 10, "target": "" @@ -5321,7 +5321,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Seconds of unfinished work in progress", + "title": "Node queueing latency per workflow", "tooltip": { "msResolution": true, "shared": true, @@ -5340,7 +5340,7 @@ "yaxes": [ { "decimals": null, - "format": "s", + "format": "ms", "label": null, "logBase": 1, "max": null, @@ -5361,17 +5361,7 @@ "align": false, "alignLevel": 0 } - } - ], - "repeat": null, - "showTitle": true, - "title": "FlytePropeller Queue metrics" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ + }, { "aliasColors": {}, "bars": false, @@ -5441,7 +5431,7 @@ "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:workflow:acceptance_latency_ms) by (wf)", + "expr": "sum(flyte:propeller:all:node:queueing_latency_unlabeled_ms) by (quantile)", "format": "time_series", "hide": false, "instant": false, @@ -5449,7 +5439,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:workflow:acceptance_latency_ms) by (wf)", + "query": "sum(flyte:propeller:all:node:queueing_latency_unlabeled_ms) by (quantile)", "refId": "A", "step": 10, "target": "" @@ -5458,7 +5448,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Workflow acceptance latency per workflow", + "title": "Node queueing latency by quantile", "tooltip": { "msResolution": true, "shared": true, @@ -5568,7 +5558,7 @@ "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:workflow:acceptance_latency_unlabeled_ms) by (quantile)", + "expr": "sum(flyte:propeller:all:workflow:completion_latency_ms) by (wf)", "format": "time_series", "hide": false, "instant": false, @@ -5576,7 +5566,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:workflow:acceptance_latency_unlabeled_ms) by (quantile)", + "query": "sum(flyte:propeller:all:workflow:completion_latency_ms) by (wf)", "refId": "A", "step": 10, "target": "" @@ -5585,7 +5575,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Workflow acceptance latency by quantile", + "title": "Workflow completion latency per workflow", "tooltip": { "msResolution": true, "shared": true, @@ -5695,7 +5685,7 @@ "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:node:transition_latency_ms) by (wf)", + "expr": "sum(flyte:propeller:all:workflow:completion_latency_unlabeled_ms) by (quantile)", "format": "time_series", "hide": false, "instant": false, @@ -5703,7 +5693,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:node:transition_latency_ms) by (wf)", + "query": "sum(flyte:propeller:all:workflow:completion_latency_unlabeled_ms) by (quantile)", "refId": "A", "step": 10, "target": "" @@ -5712,7 +5702,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Node transition latency per workflow", + "title": "Workflow completion latency by quantile", "tooltip": { "msResolution": true, "shared": true, @@ -5752,7 +5742,17 @@ "align": false, "alignLevel": 0 } - }, + } + ], + "repeat": null, + "showTitle": true, + "title": "Workflow latencies" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ { "aliasColors": {}, "bars": false, @@ -5816,13 +5816,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 2, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:node:transition_latency_unlabeled_ms) by (quantile)", + "expr": "sum(flyte:propeller:all:wf_update_latency_ms) by (quantile)", "format": "time_series", "hide": false, "instant": false, @@ -5830,7 +5830,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:node:transition_latency_unlabeled_ms) by (quantile)", + "query": "sum(flyte:propeller:all:wf_update_latency_ms) by (quantile)", "refId": "A", "step": 10, "target": "" @@ -5839,7 +5839,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Node transition latency by quantile", + "title": "wf update etcD latency", "tooltip": { "msResolution": true, "shared": true, @@ -5943,13 +5943,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 2, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:node:queueing_latency_ms) by (wf)", + "expr": "sum(rate(flyte:propeller:all:wf_update_latency_ms_count[5m]))", "format": "time_series", "hide": false, "instant": false, @@ -5957,7 +5957,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:node:queueing_latency_ms) by (wf)", + "query": "sum(rate(flyte:propeller:all:wf_update_latency_ms_count[5m]))", "refId": "A", "step": 10, "target": "" @@ -5966,7 +5966,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Node queueing latency per workflow", + "title": "etcD writes", "tooltip": { "msResolution": true, "shared": true, @@ -5985,7 +5985,7 @@ "yaxes": [ { "decimals": null, - "format": "ms", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -6070,13 +6070,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 2, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:node:queueing_latency_unlabeled_ms) by (quantile)", + "expr": "sum(rate(flyte:propeller:all:wf_update_conflict[5m]))", "format": "time_series", "hide": false, "instant": false, @@ -6084,7 +6084,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:node:queueing_latency_unlabeled_ms) by (quantile)", + "query": "sum(rate(flyte:propeller:all:wf_update_conflict[5m]))", "refId": "A", "step": 10, "target": "" @@ -6093,7 +6093,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Node queueing latency by quantile", + "title": "etcD write conflicts", "tooltip": { "msResolution": true, "shared": true, @@ -6112,7 +6112,7 @@ "yaxes": [ { "decimals": null, - "format": "ms", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -6197,13 +6197,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 2, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:workflow:completion_latency_ms) by (wf)", + "expr": "sum(rate(flyte:propeller:all:wf_update_failed[5m]))", "format": "time_series", "hide": false, "instant": false, @@ -6211,7 +6211,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:workflow:completion_latency_ms) by (wf)", + "query": "sum(rate(flyte:propeller:all:wf_update_failed[5m]))", "refId": "A", "step": 10, "target": "" @@ -6220,7 +6220,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Workflow completion latency per workflow", + "title": "etcD write fail", "tooltip": { "msResolution": true, "shared": true, @@ -6239,7 +6239,7 @@ "yaxes": [ { "decimals": null, - "format": "ms", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -6324,13 +6324,13 @@ "repeat": null, "repeatDirection": null, "seriesOverrides": [], - "span": 2, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:workflow:completion_latency_unlabeled_ms) by (quantile)", + "expr": "sum(rate(flyte:propeller:all:wf_too_large[5m]))", "format": "time_series", "hide": false, "instant": false, @@ -6338,7 +6338,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:workflow:completion_latency_unlabeled_ms) by (quantile)", + "query": "sum(rate(flyte:propeller:all:wf_too_large[5m]))", "refId": "A", "step": 10, "target": "" @@ -6347,7 +6347,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Workflow completion latency by quantile", + "title": "etcD write too large", "tooltip": { "msResolution": true, "shared": true, @@ -6366,7 +6366,7 @@ "yaxes": [ { "decimals": null, - "format": "ms", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -6391,10 +6391,10 @@ ], "repeat": null, "showTitle": true, - "title": "Workflow latencies" + "title": "etcD write metrics" }, { - "collapse": false, + "collapse": true, "editable": true, "height": "250px", "panels": [ @@ -6658,7 +6658,7 @@ "title": "K8s Pod Informer stats" }, { - "collapse": false, + "collapse": true, "editable": true, "height": "250px", "panels": [ @@ -7049,7 +7049,7 @@ "title": "Workflow store" }, { - "collapse": false, + "collapse": true, "editable": true, "height": "250px", "panels": [ diff --git a/stats/flytepropeller.dashboard.py b/stats/flytepropeller.dashboard.py index 884529f3728..a3762db3a71 100644 --- a/stats/flytepropeller.dashboard.py +++ b/stats/flytepropeller.dashboard.py @@ -379,7 +379,7 @@ def metastore_latencies(collapse: bool) -> Row: @staticmethod def admin_launcher_cache() -> Graph: return Graph( - title="Admin Launcher cache", + title="Admin Launcher cache hit/miss rate", dataSource=DATASOURCE, targets=[ Target( @@ -393,7 +393,10 @@ def admin_launcher_cache() -> Graph: refId="B", ), ], - yAxes=single_y_axis(format=MILLISECONDS_FORMAT), + yAxes=YAxes( + YAxis(format=OPS_FORMAT), + YAxis(format=SHORT_FORMAT), + ), ) @staticmethod @@ -900,16 +903,16 @@ def workflowstore(collapse: bool) -> Row: def create_all_rows(interval: int = 5) -> typing.List[Row]: return [ FlytePropeller.core_metrics(interval, False), + FlytePropeller.queue_metrics(False), FlytePropeller.metastore_metrics(interval, True), FlytePropeller.metastore_latencies(True), FlytePropeller.node_metrics(True), FlytePropeller.perf_metrics(True), - FlytePropeller.wf_store_latency(False), - FlytePropeller.queue_metrics(True), FlytePropeller.workflow_latencies(False), - FlytePropeller.k8s_pod_informers(False), - FlytePropeller.workflowstore(False), - FlytePropeller.workflow_garbage_collection(False), + FlytePropeller.wf_store_latency(False), + FlytePropeller.k8s_pod_informers(True), + FlytePropeller.workflowstore(True), + FlytePropeller.workflow_garbage_collection(True), ]