Skip to content

Commit

Permalink
Merge b555aa8 into b9130e6
Browse files Browse the repository at this point in the history
  • Loading branch information
cbartz authored Oct 24, 2024
2 parents b9130e6 + b555aa8 commit cdc3693
Show file tree
Hide file tree
Showing 9 changed files with 66 additions and 13 deletions.
4 changes: 4 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

### 2024-10-24

- Add "expected_runners" to reconciliation metric.

### 2024-10-23

- Fixed the wrong dateformat usage in the server uniqueness check.
Expand Down
2 changes: 1 addition & 1 deletion docs/reference/cos.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ The "GitHub Self-Hosted Runner Metrics" metrics dashboard presents the following
- General: Displays general metrics about the charm and runners, such as:
- Lifecycle counters: Tracks the frequency of Runner initialisation, start, stop, and crash events.
- Available runners: A horizontal bar graph showing the number of runners available during the last reconciliation event. Note: This data is updated after each reconciliation event and is not real-time.
- Runners after reconciliation: A time series graph showing the number of runners marked as active/idle during the last reconciliation event over time. Note: This data is updated after each reconciliation event and is not real-time.
- Runners after reconciliation: A time series graph showing the number of runners marked as active/idle and the number of expected runners during the last reconciliation event over time. Note: This data is updated after each reconciliation event and is not real-time.
- Duration observations: Each data point aggregates the last hour and shows the 50th, 90th, 95th percentile and maximum durations for:
- Runner installation
- Runner idle duration
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ cosl ==0.0.15
# juju 3.1.2.0 depends on pyyaml<=6.0 and >=5.1.2
PyYAML ==6.0.*
pyOpenSSL==24.2.1
github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@398b4ff4740476f815cd7582e28b8b3a953cf845
github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@feat/expected-runner-metrics
47 changes: 39 additions & 8 deletions src/grafana_dashboards/metrics.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 313,
"id": 317,
"links": [],
"liveNow": false,
"panels": [
Expand Down Expand Up @@ -202,7 +202,6 @@
"mode": "off"
}
},
"decimals": 0,
"mappings": [],
"thresholds": {
"mode": "absolute",
Expand All @@ -216,10 +215,29 @@
"value": 80
}
]
},
"unit": "short"
}
},
"overrides": []
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Value #A"
},
"properties": [
{
"id": "displayName",
"value": "Available"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Value #B"
},
"properties": []
}
]
},
"gridPos": {
"h": 8,
Expand Down Expand Up @@ -635,7 +653,7 @@
"type": "loki",
"uid": "${lokids}"
},
"description": "This panel totals the number of active/idle runners reported by charm units after reconciliation. Note that this is not a real-time metric and should only be used to identify trends or to investigate problems. A given point in time shows the aggregation of the last reported values from the reconciliation. This means, for example, that if the graph shows 10 idle runners at a particular time, it doesn't mean that there are 10 idle runners at that time, just that this is the most recently reported value. Also note that runners who are reported as idle at the time of reconciliation may become active immediately afterwards. And active runners may become offline/inactive.",
"description": "This panel totals the number of active/idle/total expected runners reported by charm units after reconciliation. Note that this is not a real-time metric and should only be used to identify trends or to investigate problems. A given point in time shows the aggregation of the last reported values from the reconciliation. This means, for example, that if the graph shows 10 idle runners at a particular time, it doesn't mean that there are 10 idle runners at that time, just that this is the most recently reported value. Also note that runners who are reported as idle at the time of reconciliation may become active immediately afterwards. And active runners may become offline/inactive.\n\nThere is no expected number of runners for runners in reactive mode.",
"fieldConfig": {
"defaults": {
"color": {
Expand Down Expand Up @@ -733,6 +751,18 @@
"legendFormat": "Active",
"queryType": "range",
"refId": "A"
},
{
"datasource": {
"type": "loki",
"uid": "${lokids}"
},
"editorMode": "code",
"expr": "sum by(filename)(last_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",expected_runners=\"expected_runners\",flavor=\"flavor\" | __error__=\"\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap expected_runners[60m]))",
"hide": false,
"legendFormat": "Expected",
"queryType": "range",
"refId": "B"
}
],
"title": "Runners after Reconciliation",
Expand Down Expand Up @@ -1040,7 +1070,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -2056,6 +2087,6 @@
"timepicker": {},
"timezone": "",
"title": "GitHub Self-Hosted Runner Metrics",
"version": 16,
"version": 17,
"weekStart": ""
}
4 changes: 4 additions & 0 deletions src/runner_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,13 +362,15 @@ def _issue_reconciliation_metric(
metric_stats: IssuedMetricEventsStats,
reconciliation_start_ts: float,
reconciliation_end_ts: float,
expected_quantity: int,
) -> None:
"""Issue reconciliation metric.
Args:
metric_stats: The stats of issued metric events.
reconciliation_start_ts: The timestamp of when reconciliation started.
reconciliation_end_ts: The timestamp of when reconciliation ended.
expected_quantity: The expected quantity of runners.
"""
runners = self._get_runners()
runner_states = self._get_runner_health_states()
Expand Down Expand Up @@ -398,6 +400,7 @@ def _issue_reconciliation_metric(
- metric_stats.get(metric_events.RunnerStop, 0),
idle_runners=idle_online_count + idle_offline_count,
active_runners=active_count,
expected_runners=expected_quantity,
duration=reconciliation_end_ts - reconciliation_start_ts,
)
)
Expand Down Expand Up @@ -582,6 +585,7 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
metric_stats=metric_stats,
reconciliation_start_ts=start_ts,
reconciliation_end_ts=end_ts,
expected_quantity=quantity,
)
return delta

Expand Down
11 changes: 10 additions & 1 deletion tests/integration/helpers/charm_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,19 @@ async def cancel_workflow_run(


async def assert_events_after_reconciliation(
app: Application, github_repository: Repository, post_job_status: PostJobStatus
app: Application,
github_repository: Repository,
post_job_status: PostJobStatus,
reactive_mode: bool = False,
):
"""Assert that the RunnerStart, RunnerStop and Reconciliation metric is logged.
Args:
app: The charm to assert the events for.
github_repository: The github repository to assert the events for.
post_job_status: The expected post job status of the reconciliation event.
reactive_mode: Whether the charm manages reactive runners,
this changes the expected events.
"""
unit = app.units[0]

Expand Down Expand Up @@ -221,6 +226,10 @@ async def assert_events_after_reconciliation(
assert metric_log.get("crashed_runners") == 0
assert metric_log.get("idle_runners") >= 0
assert metric_log.get("active_runners") >= 0
if not reactive_mode:
assert metric_log.get("expected_runners") >= 0
else:
assert "expected_runners" not in metric_log


async def wait_for_runner_to_be_marked_offline(
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/test_charm_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ async def test_charm_upgrade(
assert: the charm is upgraded successfully.
"""
latest_stable_path = tmp_path / "github-runner.charm"
latest_stable_revision = 256 # update this value every release to stable.
latest_stable_revision = 282 # update this value every release to stable.
# download the charm and inject lxd profile for testing
retcode, stdout, stderr = await ops_test.juju(
"download",
Expand Down
3 changes: 3 additions & 0 deletions tests/integration/test_reactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from charm_state import VIRTUAL_MACHINES_CONFIG_NAME
from tests.integration.helpers.charm_metrics import (
assert_events_after_reconciliation,
clear_metrics_log,
get_metrics_log,
)
from tests.integration.helpers.common import (
Expand All @@ -47,6 +48,7 @@ async def app_fixture(

await app_for_reactive.set_config({VIRTUAL_MACHINES_CONFIG_NAME: "1"})
await reconcile(app_for_reactive, app_for_reactive.model)
await clear_metrics_log(app_for_reactive.units[0])

yield app_for_reactive

Expand Down Expand Up @@ -400,4 +402,5 @@ async def _assert_metrics_are_logged(app: Application, github_repository: Reposi
app=app,
github_repository=github_repository,
post_job_status=PostJobStatus.NORMAL,
reactive_mode=True,
)
4 changes: 3 additions & 1 deletion tests/unit/test_lxd_runner_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,8 +438,9 @@ def mock_get_runners():
unhealthy=(),
)

quantity = random.randint(0, 5)
runner_manager.reconcile(
quantity=random.randint(0, 5), resources=VirtualMachineResources(2, "7GiB", "10Gib")
quantity=quantity, resources=VirtualMachineResources(2, "7GiB", "10Gib")
)

issue_event_mock.assert_any_call(
Expand All @@ -449,6 +450,7 @@ def mock_get_runners():
crashed_runners=1,
idle_runners=2,
active_runners=1,
expected_runners=quantity,
duration=0,
)
)
Expand Down

0 comments on commit cdc3693

Please sign in to comment.