diff --git a/deployment/stats/prometheus/flytepropeller-dashboard.json b/deployment/stats/prometheus/flytepropeller-dashboard.json index 4ffe3f100c0..713a282429a 100644 --- a/deployment/stats/prometheus/flytepropeller-dashboard.json +++ b/deployment/stats/prometheus/flytepropeller-dashboard.json @@ -32,7 +32,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "The number of golang goroutines available to accept new work from the main workqueue. Each worker can process one item from the workqueue at a time.", "editable": true, "error": false, "fieldConfig": { @@ -159,7 +159,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "Round success, error and total rates. Also includes total rate including streaks within a single round. The streak rate graph should match the difference between the totals with and without streaks.", "editable": true, "error": false, "fieldConfig": { @@ -274,7 +274,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "total-including-streak-rounds", + "legendFormat": "total-including-streaks", "metric": "", "query": "sum(rate(flyte:propeller:all:round:round_time_unlabeled_ms_count[5m]))", "refId": "D", @@ -331,7 +331,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "Error rates for each type of failure that may occur as propeller is traversing the workflow DAG.", "editable": true, "error": false, "fieldConfig": { @@ -452,119 +452,7 @@ "refId": "D", "step": 10, "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Error rate breakdown", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - 
"decimals": null, - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - } - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 4, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 2, - "stack": false, - "steppedLine": false, - "targets": [ { "datasource": null, "expr": "sum(rate(flyte:propeller:all:round:skipped[5m]))", @@ -573,10 +461,10 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "skipped", "metric": "", "query": "sum(rate(flyte:propeller:all:round:skipped[5m]))", - "refId": "A", + "refId": "E", "step": 10, "target": "" } @@ -584,7 +472,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - 
"title": "Round skip rate", + "title": "Error rate breakdown", "tooltip": { "msResolution": true, "shared": true, @@ -630,7 +518,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "Streaks are when propeller iterates over the same workflow multiple times in a single round. This is an optimisation technique.", "editable": true, "error": false, "fieldConfig": { @@ -651,7 +539,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 5, + "id": 4, "interval": null, "isNew": true, "legend": { @@ -757,7 +645,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "Success vs failure rate for the various plugins used for each node e.g. k8s plugin or spark plugin.", "editable": true, "error": false, "fieldConfig": { @@ -778,7 +666,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 6, + "id": 5, "interval": null, "isNew": true, "legend": { @@ -899,7 +787,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "Round latency breakdown. When there are streaks within one round each iteration is measured separately.", "editable": true, "error": false, "fieldConfig": { @@ -920,7 +808,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 7, + "id": 6, "interval": null, "isNew": true, "legend": { @@ -1071,7 +959,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "Round latency by workflow name. 
When there are streaks within one round each iteration is measured separately.", "editable": true, "error": false, "fieldConfig": { @@ -1092,7 +980,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 8, + "id": 7, "interval": null, "isNew": true, "legend": { @@ -1198,7 +1086,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "Count of currently running workflows running per project", "editable": true, "error": false, "fieldConfig": { @@ -1219,7 +1107,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 9, + "id": 8, "interval": null, "isNew": true, "legend": { @@ -1325,7 +1213,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "Rate at which workflows are being enqueued by flytepropeller. These enqueues all pass through the sub-queue before going back into the main queue.", "editable": true, "error": false, "fieldConfig": { @@ -1346,7 +1234,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 10, + "id": 9, "interval": null, "isNew": true, "legend": { @@ -1462,7 +1350,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "Rate at which items are actually added to the queue. If an item is already on the queue attempting to add it will be a no-op. 
Usually there is also rate limiting that will delay items from being added to the queue.", "editable": true, "error": false, "fieldConfig": { @@ -1483,7 +1371,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 11, + "id": 10, "interval": null, "isNew": true, "legend": { @@ -1526,7 +1414,7 @@ "targets": [ { "datasource": null, - "expr": "rate(flyte:propeller:all:main_adds[5m])", + "expr": "sum(rate(flyte:propeller:all:main_adds[5m]))", "format": "time_series", "hide": false, "instant": false, @@ -1534,14 +1422,14 @@ "intervalFactor": 2, "legendFormat": "main", "metric": "", - "query": "rate(flyte:propeller:all:main_adds[5m])", + "query": "sum(rate(flyte:propeller:all:main_adds[5m]))", "refId": "A", "step": 10, "target": "" }, { "datasource": null, - "expr": "rate(flyte:propeller:all:sub_adds[5m])", + "expr": "sum(rate(flyte:propeller:all:sub_adds[5m]))", "format": "time_series", "hide": false, "instant": false, @@ -1549,14 +1437,14 @@ "intervalFactor": 2, "legendFormat": "sub", "metric": "", - "query": "rate(flyte:propeller:all:sub_adds[5m])", + "query": "sum(rate(flyte:propeller:all:sub_adds[5m]))", "refId": "B", "step": 10, "target": "" }, { "datasource": null, - "expr": "rate(flyte:propeller:all:admin_launcher:_adds[5m])", + "expr": "sum(rate(flyte:propeller:all:admin_launcher:_adds[5m]))", "format": "time_series", "hide": false, "instant": false, @@ -1564,7 +1452,7 @@ "intervalFactor": 2, "legendFormat": "admin_launcher", "metric": "", - "query": "rate(flyte:propeller:all:admin_launcher:_adds[5m])", + "query": "sum(rate(flyte:propeller:all:admin_launcher:_adds[5m]))", "refId": "C", "step": 10, "target": "" @@ -1592,7 +1480,7 @@ "yaxes": [ { "decimals": null, - "format": "short", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -1619,7 +1507,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "Tracks every rate limited add synchronously before rate limiting delays or 
deduplication", "editable": true, "error": false, "fieldConfig": { @@ -1640,7 +1528,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 12, + "id": 11, "interval": null, "isNew": true, "legend": { @@ -1683,7 +1571,7 @@ "targets": [ { "datasource": null, - "expr": "flyte:propeller:all:main_depth", + "expr": "sum(rate(flyte:propeller:all:main_retries[5m]))", "format": "time_series", "hide": false, "instant": false, @@ -1691,14 +1579,14 @@ "intervalFactor": 2, "legendFormat": "main", "metric": "", - "query": "flyte:propeller:all:main_depth", + "query": "sum(rate(flyte:propeller:all:main_retries[5m]))", "refId": "A", "step": 10, "target": "" }, { "datasource": null, - "expr": "flyte:propeller:all:sub_depth", + "expr": "sum(rate(flyte:propeller:all:sub_retries[5m]))", "format": "time_series", "hide": false, "instant": false, @@ -1706,14 +1594,14 @@ "intervalFactor": 2, "legendFormat": "sub", "metric": "", - "query": "flyte:propeller:all:sub_depth", + "query": "sum(rate(flyte:propeller:all:sub_retries[5m]))", "refId": "B", "step": 10, "target": "" }, { "datasource": null, - "expr": "flyte:propeller:all:admin_launcher:_depth", + "expr": "sum(rate(flyte:propeller:all:admin_launcher:_retries[5m]))", "format": "time_series", "hide": false, "instant": false, @@ -1721,7 +1609,7 @@ "intervalFactor": 2, "legendFormat": "admin_launcher", "metric": "", - "query": "flyte:propeller:all:admin_launcher:_depth", + "query": "sum(rate(flyte:propeller:all:admin_launcher:_retries[5m]))", "refId": "C", "step": 10, "target": "" @@ -1730,7 +1618,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Unprocessed Queue depth", + "title": "Add rate before rate limiting and deduplication", "tooltip": { "msResolution": true, "shared": true, @@ -1749,7 +1637,7 @@ "yaxes": [ { "decimals": null, - "format": "short", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -1776,7 +1664,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - 
"description": null, + "description": "The number of items that are currently in the queue but have not been processed yet.", "editable": true, "error": false, "fieldConfig": { @@ -1797,7 +1685,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 13, + "id": 12, "interval": null, "isNew": true, "legend": { @@ -1840,7 +1728,7 @@ "targets": [ { "datasource": null, - "expr": "rate(flyte:propeller:all:main_retries[5m])", + "expr": "sum(flyte:propeller:all:main_depth)", "format": "time_series", "hide": false, "instant": false, @@ -1848,14 +1736,14 @@ "intervalFactor": 2, "legendFormat": "main", "metric": "", - "query": "rate(flyte:propeller:all:main_retries[5m])", + "query": "sum(flyte:propeller:all:main_depth)", "refId": "A", "step": 10, "target": "" }, { "datasource": null, - "expr": "rate(flyte:propeller:all:sub_retries[5m])", + "expr": "sum(flyte:propeller:all:sub_depth)", "format": "time_series", "hide": false, "instant": false, @@ -1863,14 +1751,14 @@ "intervalFactor": 2, "legendFormat": "sub", "metric": "", - "query": "rate(flyte:propeller:all:sub_retries[5m])", + "query": "sum(flyte:propeller:all:sub_depth)", "refId": "B", "step": 10, "target": "" }, { "datasource": null, - "expr": "rate(flyte:propeller:all:admin_launcher:_retries[5m])", + "expr": "sum(flyte:propeller:all:admin_launcher:_depth)", "format": "time_series", "hide": false, "instant": false, @@ -1878,7 +1766,7 @@ "intervalFactor": 2, "legendFormat": "admin_launcher", "metric": "", - "query": "rate(flyte:propeller:all:admin_launcher:_retries[5m])", + "query": "sum(flyte:propeller:all:admin_launcher:_depth)", "refId": "C", "step": 10, "target": "" @@ -1887,7 +1775,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Item retries rate", + "title": "Unprocessed Queue depth", "tooltip": { "msResolution": true, "shared": true, @@ -1933,7 +1821,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "Sum of the 
current in progress time of every in progress item in the queue.", "editable": true, "error": false, "fieldConfig": { @@ -1954,7 +1842,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 14, + "id": 13, "interval": null, "isNew": true, "legend": { @@ -1997,7 +1885,7 @@ "targets": [ { "datasource": null, - "expr": "flyte:propeller:all:main_unfinished_work_s", + "expr": "sum(flyte:propeller:all:main_unfinished_work_s)", "format": "time_series", "hide": false, "instant": false, @@ -2005,14 +1893,14 @@ "intervalFactor": 2, "legendFormat": "main", "metric": "", - "query": "flyte:propeller:all:main_unfinished_work_s", + "query": "sum(flyte:propeller:all:main_unfinished_work_s)", "refId": "A", "step": 10, "target": "" }, { "datasource": null, - "expr": "flyte:propeller:all:sub_unfinished_work_s", + "expr": "sum(flyte:propeller:all:sub_unfinished_work_s)", "format": "time_series", "hide": false, "instant": false, @@ -2020,14 +1908,14 @@ "intervalFactor": 2, "legendFormat": "sub", "metric": "", - "query": "flyte:propeller:all:sub_unfinished_work_s", + "query": "sum(flyte:propeller:all:sub_unfinished_work_s)", "refId": "B", "step": 10, "target": "" }, { "datasource": null, - "expr": "flyte:propeller:all:admin_launcher:_unfinished_work_s", + "expr": "sum(flyte:propeller:all:admin_launcher:_unfinished_work_s)", "format": "time_series", "hide": false, "instant": false, @@ -2035,7 +1923,7 @@ "intervalFactor": 2, "legendFormat": "admin_launcher", "metric": "", - "query": "flyte:propeller:all:admin_launcher:_unfinished_work_s", + "query": "sum(flyte:propeller:all:admin_launcher:_unfinished_work_s)", "refId": "C", "step": 10, "target": "" @@ -2121,7 +2009,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 15, + "id": 14, "interval": null, "isNew": true, "legend": { @@ -2248,7 +2136,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 16, + "id": 15, "interval": null, "isNew": true, "legend": { @@ -2445,7 +2333,7 @@ 
"gridPos": null, "height": null, "hideTimeOverride": false, - "id": 17, + "id": 16, "interval": null, "isNew": true, "legend": { @@ -2572,7 +2460,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 18, + "id": 17, "interval": null, "isNew": true, "legend": { @@ -2699,7 +2587,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 19, + "id": 18, "interval": null, "isNew": true, "legend": { @@ -2826,7 +2714,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 20, + "id": 19, "interval": null, "isNew": true, "legend": { @@ -2953,7 +2841,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 21, + "id": 20, "interval": null, "isNew": true, "legend": { @@ -3105,7 +2993,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 22, + "id": 21, "interval": null, "isNew": true, "legend": { @@ -3232,7 +3120,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 23, + "id": 22, "interval": null, "isNew": true, "legend": { @@ -3359,7 +3247,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 24, + "id": 23, "interval": null, "isNew": true, "legend": { @@ -3501,7 +3389,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 25, + "id": 24, "interval": null, "isNew": true, "legend": { @@ -3668,7 +3556,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 26, + "id": 25, "interval": null, "isNew": true, "legend": { @@ -3795,7 +3683,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 27, + "id": 26, "interval": null, "isNew": true, "legend": { @@ -3937,7 +3825,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 28, + "id": 27, "interval": null, "isNew": true, "legend": { @@ -4064,7 +3952,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 29, + "id": 28, "interval": null, "isNew": true, "legend": { @@ -4206,7 +4094,7 @@ "gridPos": null, "height": null, 
"hideTimeOverride": false, - "id": 30, + "id": 29, "interval": null, "isNew": true, "legend": { @@ -4333,7 +4221,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 31, + "id": 30, "interval": null, "isNew": true, "legend": { @@ -4475,7 +4363,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 32, + "id": 31, "interval": null, "isNew": true, "legend": { @@ -4602,7 +4490,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 33, + "id": 32, "interval": null, "isNew": true, "legend": { @@ -4708,7 +4596,7 @@ "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", - "description": null, + "description": "Cache hit rate when admin launcher is queried for the status of a workflow. Admin launcher will asynchnously update status of workflows in the cache.", "editable": true, "error": false, "fieldConfig": { @@ -4729,7 +4617,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 34, + "id": 33, "interval": null, "isNew": true, "legend": { @@ -4889,7 +4777,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 35, + "id": 34, "interval": null, "links": [], "maxDataPoints": 100, @@ -4999,7 +4887,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 36, + "id": 35, "interval": null, "isNew": true, "legend": { @@ -5126,7 +5014,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 37, + "id": 36, "interval": null, "isNew": true, "legend": { @@ -5253,7 +5141,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 38, + "id": 37, "interval": null, "isNew": true, "legend": { @@ -5380,7 +5268,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 39, + "id": 38, "interval": null, "isNew": true, "legend": { @@ -5507,7 +5395,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 40, + "id": 39, "interval": null, "isNew": true, "legend": { @@ -5634,7 +5522,7 @@ "gridPos": null, "height": null, 
"hideTimeOverride": false, - "id": 41, + "id": 40, "interval": null, "isNew": true, "legend": { @@ -5761,7 +5649,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 42, + "id": 41, "interval": null, "isNew": true, "legend": { @@ -5888,7 +5776,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 43, + "id": 42, "interval": null, "isNew": true, "legend": { @@ -6025,7 +5913,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 44, + "id": 43, "interval": null, "isNew": true, "legend": { @@ -6152,7 +6040,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 45, + "id": 44, "interval": null, "isNew": true, "legend": { @@ -6279,7 +6167,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 46, + "id": 45, "interval": null, "isNew": true, "legend": { @@ -6406,7 +6294,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 47, + "id": 46, "interval": null, "isNew": true, "legend": { @@ -6533,7 +6421,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 48, + "id": 47, "interval": null, "isNew": true, "legend": { @@ -6670,7 +6558,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 49, + "id": 48, "interval": null, "isNew": true, "legend": { @@ -6797,7 +6685,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 50, + "id": 49, "interval": null, "isNew": true, "legend": { @@ -6934,7 +6822,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 51, + "id": 50, "interval": null, "isNew": true, "legend": { @@ -7061,7 +6949,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 52, + "id": 51, "interval": null, "isNew": true, "legend": { @@ -7188,7 +7076,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 53, + "id": 52, "interval": null, "isNew": true, "legend": { @@ -7325,7 +7213,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 
54, + "id": 53, "interval": null, "isNew": true, "legend": { @@ -7452,7 +7340,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 55, + "id": 54, "interval": null, "isNew": true, "legend": { @@ -7579,7 +7467,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 56, + "id": 55, "interval": null, "isNew": true, "legend": { @@ -7706,7 +7594,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 57, + "id": 56, "interval": null, "isNew": true, "legend": { diff --git a/stats/flytepropeller.dashboard.py b/stats/flytepropeller.dashboard.py index 3fd87ba7429..7ef00926d67 100644 --- a/stats/flytepropeller.dashboard.py +++ b/stats/flytepropeller.dashboard.py @@ -20,6 +20,7 @@ class FlytePropeller(object): def create_free_workers() -> Graph: return Graph( title="Free workers count", + description="The number of golang goroutines available to accept new work from the main workqueue. Each worker can process one item from the workqueue at a time.", dataSource=DATASOURCE, targets=[ Target( @@ -37,6 +38,7 @@ def create_free_workers() -> Graph: def round_latency_per_wf() -> Graph: return Graph( title=f"Round traverse latency per workflow", + description="Round latency by workflow name. When there are streaks within one round each iteration is measured separately.", dataSource=DATASOURCE, targets=[ Target( @@ -52,6 +54,7 @@ def round_latency_per_wf() -> Graph: def round_latency() -> Graph: return Graph( title=f"Round Latency (includes streak rounds)", + description="Round latency breakdown. 
When there are streaks within one round each iteration is measured separately.", dataSource=DATASOURCE, targets=[ Target( @@ -82,6 +85,7 @@ def round_latency() -> Graph: def error_breakdown() -> Graph: return Graph( title="Error rate breakdown", + description="Error rates for each type of failure that may occur as propeller is traversing the workflow DAG.", dataSource=DATASOURCE, targets=[ Target( @@ -104,6 +108,11 @@ def error_breakdown() -> Graph: refId="D", legendFormat="abort", ), + Target( + expr="sum(rate(flyte:propeller:all:round:skipped[5m]))", + refId="E", + legendFormat="skipped", + ), ], yAxes=YAxes( YAxis(format=OPS_FORMAT), @@ -115,6 +124,7 @@ def error_breakdown() -> Graph: def streak_rate() -> Graph: return Graph( title="Streak rate", + description="Streaks are when propeller iterates over the same workflow multiple times in a single round. This is an optimisation technique.", dataSource=DATASOURCE, targets=[ Target( @@ -127,29 +137,12 @@ def streak_rate() -> Graph: YAxis(format=SHORT_FORMAT), ), ) - - @staticmethod - def skipped_rounds() -> Graph: - return Graph( - title="Round skip rate", - dataSource=DATASOURCE, - targets=[ - Target( - expr="sum(rate(flyte:propeller:all:round:skipped[5m]))", - refId="A", - ), - ], - yAxes=YAxes( - YAxis(format=OPS_FORMAT), - YAxis(format=SHORT_FORMAT), - ), - ) - @staticmethod def round_rates() -> Graph: return Graph( title="Round success/error rate", + description="Round success, error and total rates. Also includes total rate including streaks within a single round. 
The streak rate graph should match the difference between the totals with and without streaks.", dataSource=DATASOURCE, targets=[ Target( @@ -170,7 +163,7 @@ def round_rates() -> Graph: Target( expr="sum(rate(flyte:propeller:all:round:round_time_unlabeled_ms_count[5m]))", refId="D", - legendFormat="total-including-streak-rounds", + legendFormat="total-including-streaks", ), ], yAxes=YAxes( @@ -183,6 +176,7 @@ def round_rates() -> Graph: def workflows_per_project() -> Graph: return Graph( title=f"Running Workflows per project", + description="Count of currently running workflows running per project", dataSource=DATASOURCE, targets=[ Target( @@ -200,6 +194,7 @@ def workflows_per_project() -> Graph: def enqueued_workflows() -> Graph: return Graph( title="Workflow enqueue rate", + description="Rate at which workflows are being enqueued by flytepropeller. These enqueues all pass through the sub-queue before going back into the main queue.", dataSource=DATASOURCE, targets=[ Target( @@ -220,6 +215,7 @@ def plugin_success_vs_failures() -> Graph: """ return Graph( title=f"Plugin Success/Failure rate", + description="Success vs failure rate for the various plugins used for each node e.g. k8s plugin or spark plugin.", dataSource=DATASOURCE, targets=[ Target( @@ -415,6 +411,7 @@ def metastore_latencies(collapse: bool) -> Row: def admin_launcher_cache() -> Graph: return Graph( title="Admin Launcher cache hit/miss rate", + description="Cache hit rate when admin launcher is queried for the status of a workflow. Admin launcher will asynchronously update status of workflows in the cache.", dataSource=DATASOURCE, targets=[ Target( @@ -712,64 +709,67 @@ def queue_metrics(collapse: bool) -> Row: panels=[ Graph( title="Add rate to queue", + description="Rate at which items are actually added to the queue. If an item is already on the queue attempting to add it will be a no-op. 
Usually there is also rate limiting that will delay items from being added to the queue.", dataSource=DATASOURCE, targets=[ Target( - expr="rate(flyte:propeller:all:main_adds[5m])", + expr="sum(rate(flyte:propeller:all:main_adds[5m]))", legendFormat="main", refId="A", ), Target( - expr="rate(flyte:propeller:all:sub_adds[5m])", + expr="sum(rate(flyte:propeller:all:sub_adds[5m]))", legendFormat="sub", refId="B", ), Target( - expr="rate(flyte:propeller:all:admin_launcher:_adds[5m])", + expr="sum(rate(flyte:propeller:all:admin_launcher:_adds[5m]))", legendFormat="admin_launcher", refId="C", ), ], - yAxes=single_y_axis(format=SHORT_FORMAT), + yAxes=single_y_axis(format=OPS_FORMAT), ), Graph( - title="Unprocessed Queue depth", + title="Add rate before rate limiting and deduplication", dataSource=DATASOURCE, + description="Tracks every rate limited add synchronously before rate limiting delays or deduplication", targets=[ Target( - expr="flyte:propeller:all:main_depth", + expr="sum(rate(flyte:propeller:all:main_retries[5m]))", legendFormat="main", refId="A", ), Target( - expr="flyte:propeller:all:sub_depth", + expr="sum(rate(flyte:propeller:all:sub_retries[5m]))", legendFormat="sub", refId="B", ), Target( - expr="flyte:propeller:all:admin_launcher:_depth", + expr="sum(rate(flyte:propeller:all:admin_launcher:_retries[5m]))", legendFormat="admin_launcher", refId="C", ), ], - yAxes=single_y_axis(format=SHORT_FORMAT), + yAxes=single_y_axis(format=OPS_FORMAT), ), Graph( - title="Item retries rate", + title="Unprocessed Queue depth", + description="The number of items that are currently in the queue but have not been processed yet.", dataSource=DATASOURCE, targets=[ Target( - expr="rate(flyte:propeller:all:main_retries[5m])", + expr="sum(flyte:propeller:all:main_depth)", legendFormat="main", refId="A", ), Target( - expr="rate(flyte:propeller:all:sub_retries[5m])", + expr="sum(flyte:propeller:all:sub_depth)", legendFormat="sub", refId="B", ), Target( - 
expr="rate(flyte:propeller:all:admin_launcher:_retries[5m])", + expr="sum(flyte:propeller:all:admin_launcher:_depth)", legendFormat="admin_launcher", refId="C", ), @@ -778,20 +778,21 @@ def queue_metrics(collapse: bool) -> Row: ), Graph( title="Seconds of unfinished work in progress", + description="Sum of the current in progress time of every in progress item in the queue.", dataSource=DATASOURCE, targets=[ Target( - expr="flyte:propeller:all:main_unfinished_work_s", + expr="sum(flyte:propeller:all:main_unfinished_work_s)", legendFormat="main", refId="A", ), Target( - expr="flyte:propeller:all:sub_unfinished_work_s", + expr="sum(flyte:propeller:all:sub_unfinished_work_s)", legendFormat="sub", refId="B", ), Target( - expr="flyte:propeller:all:admin_launcher:_unfinished_work_s", + expr="sum(flyte:propeller:all:admin_launcher:_unfinished_work_s)", legendFormat="admin_launcher", refId="C", ), @@ -823,7 +824,6 @@ def core_metrics(interval: int, collapse: bool) -> Row: FlytePropeller.create_free_workers(), FlytePropeller.round_rates(), FlytePropeller.error_breakdown(), - FlytePropeller.skipped_rounds(), FlytePropeller.streak_rate(), FlytePropeller.plugin_success_vs_failures(), FlytePropeller.round_latency(),