From 0e891ccbb5fb37c470a326fa2437c592f418d98b Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Wed, 11 Sep 2024 07:26:11 +0200 Subject: [PATCH] Fix units with stuck reconciliation (#367) * add timeout command * Check-in panel with average actual reconciliation interval * introduce constant * update dashboard * rename panel using term "Application" * update docs * remove Optional --- docs/reference/cos.md | 2 + src-docs/charm.md | 11 +- src-docs/event_timer.md | 8 +- src/charm.py | 7 +- src/event_timer.py | 11 +- src/grafana_dashboards/metrics.json | 201 ++++++++++++++++++++++------ templates/dispatch-event.service.j2 | 2 +- 7 files changed, 181 insertions(+), 61 deletions(-) diff --git a/docs/reference/cos.md b/docs/reference/cos.md index 898845c3e..799bc8a95 100644 --- a/docs/reference/cos.md +++ b/docs/reference/cos.md @@ -19,6 +19,8 @@ The "GitHub Self-Hosted Runner Metrics" metrics dashboard presents the following - Runner idle duration - Charm reconciliation duration - Job queue duration - how long a job waits in the queue before a runner picks it up + - Max job queue duration by application: Similar to "Job queue duration" panel, but shows maximum durations by charm application. + - Average reconciliation interval: Shows the average time between reconciliation events, broken down by charm application. - Jobs: Displays certain metrics about the jobs executed by the runners. These metrics can be displayed per repository by specifying a regular expression on the `Repository` variable. The following metrics are displayed: - Proportion charts: Share of jobs by completion status, job conclusion, application, repo policy check failure http codes and github events over time. diff --git a/src-docs/charm.md b/src-docs/charm.md index fa38de542..5b1540fee 100644 --- a/src-docs/charm.md +++ b/src-docs/charm.md @@ -15,12 +15,13 @@ Charm for creating and managing GitHub self-hosted runner instances. - **RECONCILE_INTERVAL_CONFIG_NAME** - **TEST_MODE_CONFIG_NAME** - **TOKEN_CONFIG_NAME** +- **RECONCILIATION_INTERVAL_TIMEOUT_FACTOR** - **RECONCILE_RUNNERS_EVENT** - **REACTIVE_MQ_DB_NAME** --- - + ## function `catch_charm_errors` @@ -46,7 +47,7 @@ Catch common errors in charm. --- - + ## function `catch_action_errors` @@ -72,7 +73,7 @@ Catch common errors in actions. --- - + ## class `ReconcileRunnersEvent` Event representing a periodic check to ensure runners are ok. @@ -83,7 +84,7 @@ Event representing a periodic check to ensure runners are ok. --- - + ## class `GithubRunnerCharm` Charm for managing GitHub self-hosted runners. @@ -100,7 +101,7 @@ Charm for managing GitHub self-hosted runners. - `ram_pool_path`: The path to memdisk storage. - `kernel_module_path`: The path to kernel modules. - + ### method `__init__` diff --git a/src-docs/event_timer.md b/src-docs/event_timer.md index 57055e575..7d10d807c 100644 --- a/src-docs/event_timer.md +++ b/src-docs/event_timer.md @@ -109,7 +109,7 @@ Construct the timer manager. --- - + ### method `disable_event_timer` @@ -138,11 +138,7 @@ Disable the systemd timer for the given event. ### method `ensure_event_timer` ```python -ensure_event_timer( - event_name: str, - interval: int, - timeout: Optional[int] = None -) → None +ensure_event_timer(event_name: str, interval: int, timeout: int) → None ``` Ensure that a systemd service and timer are registered to dispatch the given event. diff --git a/src/charm.py b/src/charm.py index 0b17dbf52..9dae00059 100755 --- a/src/charm.py +++ b/src/charm.py @@ -94,6 +94,10 @@ from runner_manager import LXDRunnerManager, LXDRunnerManagerConfig from runner_manager_type import LXDFlushMode +# We assume a stuck reconcile event when it takes longer +# than 10 times a normal interval. Currently, we are only aware of +# https://bugs.launchpad.net/juju/+bug/2055184 causing a stuck reconcile event. +RECONCILIATION_INTERVAL_TIMEOUT_FACTOR = 10 RECONCILE_RUNNERS_EVENT = "reconcile-runners" # This is currently hardcoded and may be moved to a config option in the future. @@ -555,7 +559,8 @@ def _set_reconcile_timer(self) -> None: self._event_timer.ensure_event_timer( event_name="reconcile-runners", interval=int(self.config[RECONCILE_INTERVAL_CONFIG_NAME]), - timeout=int(self.config[RECONCILE_INTERVAL_CONFIG_NAME]) - 1, + timeout=RECONCILIATION_INTERVAL_TIMEOUT_FACTOR + * int(self.config[RECONCILE_INTERVAL_CONFIG_NAME]), ) def _ensure_reconcile_timer_is_active(self) -> None: diff --git a/src/event_timer.py b/src/event_timer.py index da03150ff..ab94c5736 100644 --- a/src/event_timer.py +++ b/src/event_timer.py @@ -5,7 +5,7 @@ import logging import subprocess # nosec B404 from pathlib import Path -from typing import Optional, TypedDict +from typing import TypedDict import jinja2 @@ -107,9 +107,7 @@ def is_active(self, event_name: str) -> bool: return ret_code == 0 - def ensure_event_timer( - self, event_name: str, interval: int, timeout: Optional[int] = None - ) -> None: + def ensure_event_timer(self, event_name: str, interval: int, timeout: int) -> None: """Ensure that a systemd service and timer are registered to dispatch the given event. The interval is how frequently, in minutes, the event should be dispatched. @@ -125,10 +123,7 @@ def ensure_event_timer( Raises: TimerEnableError: Timer cannot be started. Events will be not emitted. """ - if timeout is not None: - timeout_in_secs = timeout * 60 - else: - timeout_in_secs = interval * 30 + timeout_in_secs = timeout * 60 context: EventConfig = { "event": event_name, diff --git a/src/grafana_dashboards/metrics.json b/src/grafana_dashboards/metrics.json index 8582b77ef..0e8728765 100644 --- a/src/grafana_dashboards/metrics.json +++ b/src/grafana_dashboards/metrics.json @@ -24,10 +24,23 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 537, + "id": 567, "links": [], "liveNow": false, "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 10, + "panels": [], + "title": "General", + "type": "row" + }, { "datasource": { "type": "loki", @@ -91,7 +104,7 @@ "h": 8, "w": 12, "x": 0, - "y": 0 + "y": 1 }, "id": 3, "options": { @@ -212,7 +225,7 @@ "h": 8, "w": 12, "x": 12, - "y": 0 + "y": 1 }, "id": 22, "options": { @@ -253,19 +266,6 @@ "title": "Available Runners", "type": "barchart" }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 8 - }, - "id": 10, - "panels": [], - "title": "General", - "type": "row" - }, { "datasource": { "type": "loki", @@ -485,7 +485,7 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\", duration=\"queue_duration\", flavor=\"flavor\" | __error__=`` | event = `runner_start` | flavor =~ `$flavor` | unwrap duration [1h]) by (flavor)", + "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"queue_duration\",flavor=\"flavor\" | __error__=\"\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(flavor)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "{{flavor}}", @@ -493,7 +493,7 @@ "refId": "E" } ], - "title": "Max Job Queue Duration By Flavour", + "title": "Max Job Queue Duration By Application", "type": "timeseries" }, { @@ -789,10 +789,10 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, + "x": 0, "y": 25 }, - "id": 4, + "id": 24, "options": { "legend": { "calcs": [], @@ -812,7 +812,7 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.5,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", + "expr": "quantile_over_time(0.5,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "50%", "queryType": "range", @@ -824,12 +824,12 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.95,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", + "expr": "quantile_over_time(0.95,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "95%", "queryType": "range", - "refId": "C" + "refId": "D" }, { "datasource": { @@ -837,30 +837,151 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.99,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", + "expr": "quantile_over_time(0.99,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "99%", "queryType": "range", - "refId": "D" + "refId": "E" }, { "datasource": { "type": "loki", "uid": "${lokids}" }, - "editorMode": "code", - "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", + "editorMode": "builder", + "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "Max", "queryType": "range", - "refId": "E" + "refId": "C" } ], - "title": "Runner Idle Duration (Percentile)", + "title": "Reconciliation Duration (Percentile)", "type": "timeseries" }, + { + "datasource": { + "type": "loki", + "uid": "${lokids}" + }, + "description": "The average actual reconciliation interval.\n\nUnlike the Reconciliation Duration panel, this panel shows the time between the actual start of reconciliations. This is useful for identifying problems where the reconciliation itself is behaving normally, but the event itself is not being scheduled as expected.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "displayName", + "value": "Duration" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 6, + "options": { + "barRadius": 0, + "barWidth": 0.97, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "orientation": "horizontal", + "showValue": "never", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${lokids}" + }, + "editorMode": "code", + "expr": "1 / sum by (flavor) (rate({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\", flavor=\"flavor\" | event=\"reconciliation\" [$__range]))", + "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", + "legendFormat": "", + "queryType": "instant", + "refId": "A" + } + ], + "title": "Average Reconciliation Interval", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "desc": true, + "field": "Value #A" + } + ] + } + } + ], + "type": "barchart" + }, { "datasource": { "type": "loki", @@ -926,7 +1047,7 @@ "x": 12, "y": 33 }, - "id": 6, + "id": 4, "options": { "legend": { "calcs": [], @@ -946,7 +1067,7 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.5,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", + "expr": "quantile_over_time(0.5,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "50%", "queryType": "range", @@ -958,12 +1079,12 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.95,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", + "expr": "quantile_over_time(0.95,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "95%", "queryType": "range", - "refId": "D" + "refId": "C" }, { "datasource": { @@ -971,28 +1092,28 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.99,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", + "expr": "quantile_over_time(0.99,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "99%", "queryType": "range", - "refId": "E" + "refId": "D" }, { "datasource": { "type": "loki", "uid": "${lokids}" }, - "editorMode": "builder", - "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", + "editorMode": "code", + "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "Max", "queryType": "range", - "refId": "C" + "refId": "E" } ], - "title": "Reconciliation Duration (Percentile)", + "title": "Runner Idle Duration (Percentile)", "type": "timeseries" }, { @@ -1924,6 +2045,6 @@ "timepicker": {}, "timezone": "", "title": "GitHub Self-Hosted Runner Metrics", - "version": 14, + "version": 15, "weekStart": "" } diff --git a/templates/dispatch-event.service.j2 b/templates/dispatch-event.service.j2 index 7bef08f96..b4482795f 100644 --- a/templates/dispatch-event.service.j2 +++ b/templates/dispatch-event.service.j2 @@ -4,7 +4,7 @@ Description=Dispatch the {{event}} event on {{unit}} [Service] Type=oneshot # For juju 3 and juju 2 compatibility. The juju-run binary was renamed to juju-exec for juju 3. -ExecStart=/usr/bin/run-one /usr/bin/bash -c '/usr/bin/juju-exec "{{unit}}" "JUJU_DISPATCH_PATH={{event}} ./dispatch" || /usr/bin/juju-run "{{unit}}" "JUJU_DISPATCH_PATH={{event}} ./dispatch"' +ExecStart=/usr/bin/timeout "{{timeout}}" /usr/bin/run-one /usr/bin/bash -c '/usr/bin/juju-exec "{{unit}}" "JUJU_DISPATCH_PATH={{event}} ./dispatch" || /usr/bin/juju-run "{{unit}}" "JUJU_DISPATCH_PATH={{event}} ./dispatch"' [Install] WantedBy=multi-user.target