diff --git a/deployment/stats/prometheus/flyteadmin-dashboard.json b/deployment/stats/prometheus/flyteadmin-dashboard.json index b8669fc78bf..5e88094e4c5 100644 --- a/deployment/stats/prometheus/flyteadmin-dashboard.json +++ b/deployment/stats/prometheus/flyteadmin-dashboard.json @@ -23,7 +23,7 @@ "refresh": "10s", "rows": [ { - "collapse": false, + "collapse": true, "editable": true, "height": "250px", "panels": [ diff --git a/deployment/stats/prometheus/flytepropeller-dashboard.json b/deployment/stats/prometheus/flytepropeller-dashboard.json index fe429d14e4f..12ffbd26c5c 100644 --- a/deployment/stats/prometheus/flytepropeller-dashboard.json +++ b/deployment/stats/prometheus/flytepropeller-dashboard.json @@ -132,7 +132,7 @@ "yaxes": [ { "decimals": null, - "format": "ops", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -446,7 +446,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "abort", + "legendFormat": "not-found", "metric": "", "query": "sum(rate(flyte:propeller:all:round:not_found[5m]))", "refId": "D", @@ -618,7 +618,7 @@ "yaxes": [ { "decimals": null, - "format": "none", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -2256,7 +2256,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Failures from metastore", + "title": "Metastore failure rate", "tooltip": { "msResolution": true, "shared": true, @@ -2890,7 +2890,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "proto-fetch", + "legendFormat": "proto-fetch-P{{quantile}}", "metric": "", "query": "sum(flyte:propeller:all:metastore:proto_fetch_ms) by (quantile, wf)", "refId": "A", @@ -2905,7 +2905,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "remote-fetch", + "legendFormat": "remote-fetch-P{{quantile}}", "metric": "", "query": "sum(flyte:propeller:all:metastore:remote_fetch_ms) by (quantile, wf)", "refId": "B", @@ -3468,7 +3468,7 @@ "instant": false, "interval": "", 
"intervalFactor": 2, - "legendFormat": "user error", + "legendFormat": "unknown error", "metric": "", "query": "sum(rate(flyte:propeller:all:node:perma_unknown_error_duration_unlabeled_ms[5m]))", "refId": "C", @@ -3479,7 +3479,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "node event recording rate", + "title": "node event recording error rate breakdown", "tooltip": { "msResolution": true, "shared": true, @@ -4001,7 +4001,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "success", + "legendFormat": "success-{{wf}}", "metric": "", "query": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count[5m])) by (wf)", "refId": "A", @@ -4016,7 +4016,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "failure", + "legendFormat": "failure-{{wf}}", "metric": "", "query": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count[5m])) by (wf)", "refId": "B", @@ -4270,7 +4270,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "success wf", + "legendFormat": "success-{{wf}}", "metric": "", "query": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count[5m])) by (wf)", "refId": "A", @@ -4285,7 +4285,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "failure", + "legendFormat": "failure-{{wf}}", "metric": "", "query": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count[5m])) by (wf)", "refId": "B", @@ -6618,7 +6618,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Update events from informer", + "title": "Update event rate from informer", "tooltip": { "msResolution": true, "shared": true, @@ -6637,7 +6637,7 @@ "yaxes": [ { "decimals": null, - "format": "short", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -6745,7 +6745,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Update events dropped becacuse 
they have the same resource version", + "title": "Update events drop rate because they have the same resource version", "tooltip": { "msResolution": true, "shared": true, @@ -6764,7 +6764,7 @@ "yaxes": [ { "decimals": null, - "format": "short", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -6901,7 +6901,7 @@ "yaxes": [ { "decimals": null, - "format": "short", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -7028,7 +7028,7 @@ "yaxes": [ { "decimals": null, - "format": "short", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -7155,7 +7155,7 @@ "yaxes": [ { "decimals": null, - "format": "short", + "format": "ops", "label": null, "logBase": 1, "max": null, diff --git a/stats/flyteadmin.dashboard.py b/stats/flyteadmin.dashboard.py index 7220749737e..80df160daf7 100644 --- a/stats/flyteadmin.dashboard.py +++ b/stats/flyteadmin.dashboard.py @@ -210,6 +210,7 @@ def create_all_apis(interval: int = 5) -> typing.List[Row]: def grpc_latency_row() -> Graph: return Row( title="GRPC latency metrics", + collapse=True, panels=[ BarGauge( title="All GRPC calls latency", diff --git a/stats/flytepropeller.dashboard.py b/stats/flytepropeller.dashboard.py index 2ac12825789..513404cd5e8 100644 --- a/stats/flytepropeller.dashboard.py +++ b/stats/flytepropeller.dashboard.py @@ -28,7 +28,7 @@ def create_free_workers() -> Graph: ), ], yAxes=YAxes( - YAxis(format=OPS_FORMAT), + YAxis(format=NO_FORMAT), YAxis(format=SHORT_FORMAT), ), ) @@ -105,7 +105,7 @@ def error_breakdown() -> Graph: Target( expr="sum(rate(flyte:propeller:all:round:not_found[5m]))", refId="D", - legendFormat="abort", + legendFormat="not-found", ), Target( expr="sum(rate(flyte:propeller:all:round:skipped[5m]))", @@ -132,7 +132,7 @@ def streak_rate() -> Graph: ), ], yAxes=YAxes( - YAxis(format=NO_FORMAT), + YAxis(format=OPS_FORMAT), YAxis(format=SHORT_FORMAT), ), ) @@ -284,7 +284,7 @@ def node_input_latency() -> Graph: def metastore_failures(): # Copy counts 
sum(rate(flyte:propeller:all:metastore:copy:overall_unlabeled_ms_count[5m])) return Graph( - title=f"Failures from metastore", + title=f"Metastore failure rate", dataSource=DATASOURCE, targets=[ Target( @@ -392,12 +392,12 @@ def metastore_latencies(collapse: bool) -> Row: targets=[ Target( expr="sum(flyte:propeller:all:metastore:proto_fetch_ms) by (quantile, wf)", - legendFormat="proto-fetch", + legendFormat="proto-fetch-P{{quantile}}", refId="A", ), Target( expr="sum(flyte:propeller:all:metastore:remote_fetch_ms) by (quantile, wf)", - legendFormat="remote-fetch", + legendFormat="remote-fetch-P{{quantile}}", refId="B", ), ], @@ -477,12 +477,12 @@ def task_event_recording() -> typing.List[Graph]: targets=[ Target( expr=f"sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count[5m])) by (wf)", - legendFormat="success wf", + legendFormat="success-{{wf}}", refId="A", ), Target( expr=f"sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count[5m])) by (wf)", - legendFormat="failure", + legendFormat="failure-{{wf}}", refId="B", ), ], @@ -510,12 +510,12 @@ def node_event_recording() -> typing.List[Graph]: targets=[ Target( expr=f"sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count[5m])) by (wf)", - legendFormat="success", + legendFormat="success-{{wf}}", refId="A", ), Target( expr=f"sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count[5m])) by (wf)", - legendFormat="failure", + legendFormat="failure-{{wf}}", refId="B", ), ], @@ -678,7 +678,7 @@ def metastore_metrics(interval: int, collapse: bool) -> Row: @staticmethod def node_errors() -> Graph: return Graph( - title="node event recording rate", + title="node event recording error rate breakdown", dataSource=DATASOURCE, targets=[ Target( @@ -693,7 +693,7 @@ def node_errors() -> Graph: ), Target( expr=f"sum(rate(flyte:propeller:all:node:perma_unknown_error_duration_unlabeled_ms[5m]))", - legendFormat="user error", + legendFormat="unknown 
error", refId="C", ), ], @@ -887,7 +887,7 @@ def k8s_pod_informers(collapse: bool) -> Row: collapse=collapse, panels=[ Graph( - title=f"Update events from informer", + title=f"Update event rate from informer", dataSource=DATASOURCE, targets=[ Target( @@ -895,10 +895,10 @@ def k8s_pod_informers(collapse: bool) -> Row: refId="A", ), ], - yAxes=single_y_axis(format=SHORT_FORMAT), + yAxes=single_y_axis(format=OPS_FORMAT), ), Graph( - title=f"Update events dropped becacuse they have the same resource version", + title=f"Update events drop rate because they have the same resource version", dataSource=DATASOURCE, targets=[ Target( @@ -906,7 +906,7 @@ def k8s_pod_informers(collapse: bool) -> Row: refId="A", ), ], - yAxes=single_y_axis(format=SHORT_FORMAT), + yAxes=single_y_axis(format=OPS_FORMAT), ), ], ) @@ -979,7 +979,7 @@ def workflowstore(collapse: bool) -> Row: refId="A", ), ], - yAxes=single_y_axis(format=SHORT_FORMAT), + yAxes=single_y_axis(format=OPS_FORMAT), ), Graph( title="Evict workflows rate", @@ -990,7 +990,7 @@ def workflowstore(collapse: bool) -> Row: refId="A", ), ], - yAxes=single_y_axis(format=SHORT_FORMAT), + yAxes=single_y_axis(format=OPS_FORMAT), ), Graph( title="Workflow redundant updates rate", @@ -1001,7 +1001,7 @@ def workflowstore(collapse: bool) -> Row: refId="A", ), ], - yAxes=single_y_axis(format=SHORT_FORMAT), + yAxes=single_y_axis(format=OPS_FORMAT), ), ], )