From 4c865c9b9bc8f4c612d53189d9b7e5cc59b1d0c8 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Tue, 10 Sep 2024 10:16:58 +0200 Subject: [PATCH] issue reconciliation metric on error --- src-docs/manager.runner_scaler.md | 12 +- .../manager/runner_scaler.py | 113 +++++++++++++----- tests/unit/test_runner_scaler.py | 30 ++++- 3 files changed, 118 insertions(+), 37 deletions(-) diff --git a/src-docs/manager.runner_scaler.md b/src-docs/manager.runner_scaler.md index ade13dd..e5abac0 100644 --- a/src-docs/manager.runner_scaler.md +++ b/src-docs/manager.runner_scaler.md @@ -9,7 +9,7 @@ Module for scaling the runners amount. --- - + ## class `RunnerInfo` Information on the runners. @@ -50,12 +50,12 @@ __init__( --- - + ## class `RunnerScaler` Manage the reconcile of runners. - + ### method `__init__` @@ -77,7 +77,7 @@ Construct the object. --- - + ### method `flush` @@ -100,7 +100,7 @@ Flush the runners. --- - + ### method `get_runner_info` @@ -117,7 +117,7 @@ Get information on the runners. --- - + ### method `reconcile` diff --git a/src/github_runner_manager/manager/runner_scaler.py b/src/github_runner_manager/manager/runner_scaler.py index 3ddcf03..9b9234b 100644 --- a/src/github_runner_manager/manager/runner_scaler.py +++ b/src/github_runner_manager/manager/runner_scaler.py @@ -13,7 +13,12 @@ from github_runner_manager.errors import IssueMetricEventError, MissingServerConfigError from github_runner_manager.manager.cloud_runner_manager import HealthState from github_runner_manager.manager.github_runner_manager import GitHubRunnerState -from github_runner_manager.manager.runner_manager import FlushMode, RunnerManager +from github_runner_manager.manager.runner_manager import ( + FlushMode, + IssuedMetricEventsStats, + RunnerInstance, + RunnerManager, +) from github_runner_manager.metrics import events as metric_events from github_runner_manager.types_ import ReactiveConfig @@ -123,34 +128,59 @@ def reconcile(self, quantity: int) -> int: logger.info("Reactive configuration detected, going into experimental reactive mode.") return self._reconcile_reactive(quantity, self._reactive_config.mq_uri) + metric_stats = {} start_timestamp = time.time() - delete_metric_stats = None - metric_stats = self._manager.cleanup() - runners = self._manager.get_runners() - logger.info("Reconcile runners from %s to %s", len(runners), quantity) - runner_diff = quantity - len(runners) - if runner_diff > 0: - try: - self._manager.create_runners(runner_diff) - except MissingServerConfigError: - logging.exception( - "Unable to spawn runner due to missing server configuration, such as, image." - ) - elif runner_diff < 0: - delete_metric_stats = self._manager.delete_runners(-runner_diff) - else: - logger.info("No changes to the number of runners.") - end_timestamp = time.time() - - # Merge the two metric stats. - if delete_metric_stats is not None: - metric_stats = { - event_name: delete_metric_stats.get(event_name, 0) - + metric_stats.get(event_name, 0) - for event_name in set(delete_metric_stats) | set(metric_stats) - } - runner_list = self._manager.get_runners() + try: + delete_metric_stats = None + metric_stats = self._manager.cleanup() + runners = self._manager.get_runners() + logger.info("Reconcile runners from %s to %s", len(runners), quantity) + runner_diff = quantity - len(runners) + if runner_diff > 0: + try: + self._manager.create_runners(runner_diff) + except MissingServerConfigError: + logging.exception( + "Unable to spawn runner due to missing server configuration, " + "such as, image." + ) + elif runner_diff < 0: + delete_metric_stats = self._manager.delete_runners(-runner_diff) + else: + logger.info("No changes to the number of runners.") + + # Merge the two metric stats. + if delete_metric_stats is not None: + metric_stats = { + event_name: delete_metric_stats.get(event_name, 0) + + metric_stats.get(event_name, 0) + for event_name in set(delete_metric_stats) | set(metric_stats) + } + finally: + runner_list = self._manager.get_runners() + self._log_runners(runner_list) + end_timestamp = time.time() + self._issue_reconciliation_metric( + start_timestamp, end_timestamp, metric_stats, runner_list + ) + + return runner_diff + + @staticmethod + def _log_runners(runner_list: tuple[RunnerInstance]) -> None: + """Log information about the runners found. + + Args: + runner_list: The list of runners. + """ + for runner in runner_list: + logger.info( + "Runner %s: state=%s, health=%s", + runner.name, + runner.github_state, + runner.health, + ) busy_runners = [ runner for runner in runner_list if runner.github_state == GitHubRunnerState.BUSY ] @@ -163,7 +193,7 @@ def reconcile(self, quantity: int) -> int: if runner.github_state == GitHubRunnerState.OFFLINE and runner.health == HealthState.HEALTHY ] - unhealthy_states = set((HealthState.UNHEALTHY, HealthState.UNKNOWN)) + unhealthy_states = {HealthState.UNHEALTHY, HealthState.UNKNOWN} unhealthy_runners = [runner for runner in runner_list if runner.health in unhealthy_states] logger.info("Found %s busy runners: %s", len(busy_runners), busy_runners) logger.info("Found %s idle runners: %s", len(idle_runners), idle_runners) @@ -174,6 +204,31 @@ def reconcile(self, quantity: int) -> int: ) logger.info("Found %s unhealthy runners: %s", len(unhealthy_runners), unhealthy_runners) + def _issue_reconciliation_metric( + self, + start_timestamp: float, + end_timestamp: float, + metric_stats: IssuedMetricEventsStats, + runner_list: tuple[RunnerInstance], + ) -> None: + """Issue the reconciliation metric. + + Args: + start_timestamp: The start timestamp of the reconciliation. + end_timestamp: The end timestamp of the reconciliation. + metric_stats: The metric stats. + runner_list: The list of runners + """ + idle_runners = [ + runner for runner in runner_list if runner.github_state == GitHubRunnerState.IDLE + ] + offline_healthy_runners = [ + runner + for runner in runner_list + if runner.github_state == GitHubRunnerState.OFFLINE + and runner.health == HealthState.HEALTHY + ] + try: available_runners = set(runner.name for runner in idle_runners) | set( runner.name for runner in offline_healthy_runners @@ -194,8 +249,6 @@ def reconcile(self, quantity: int) -> int: except IssueMetricEventError: logger.exception("Failed to issue Reconciliation metric") - return runner_diff - def _reconcile_reactive(self, quantity: int, mq_uri: MongoDsn) -> int: """Reconcile runners reactively. diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py index 9806a50..1ebaffc 100644 --- a/tests/unit/test_runner_scaler.py +++ b/tests/unit/test_runner_scaler.py @@ -15,6 +15,7 @@ RunnerManagerConfig, ) from github_runner_manager.manager.runner_scaler import RunnerScaler +from github_runner_manager.metrics.events import Reconciliation from github_runner_manager.types_.github import GitHubPath, GitHubRepo from tests.unit.mock_runner_managers import ( MockCloudRunnerManager, @@ -56,9 +57,18 @@ def mock_runner_managers_fixture( return (mock_cloud, mock_github) +@pytest.fixture(scope="function", name="issue_events_mock") +def issue_events_mock_fixture(monkeypatch: pytest.MonkeyPatch): + issue_events_mock = MagicMock() + monkeypatch.setattr( + "github_runner_manager.manager.runner_scaler.metric_events.issue_event", issue_events_mock + ) + return issue_events_mock + + @pytest.fixture(scope="function", name="runner_manager") def runner_manager_fixture( - monkeypatch, mock_runner_managers, github_path: GitHubPath + monkeypatch, mock_runner_managers, github_path: GitHubPath, issue_events_mock ) -> RunnerManager: mock_cloud, mock_github = mock_runner_managers monkeypatch.setattr( @@ -176,6 +186,24 @@ def test_reconcile_runner_create_one(runner_scaler: RunnerScaler): assert_runner_info(runner_scaler, online=0) +def test_reconcile_error_still_issue_metrics( + runner_scaler: RunnerScaler, monkeypatch: pytest.MonkeyPatch, issue_events_mock: MagicMock +): + """ + Arrange: A RunnerScaler with no runners which raises an error on reconcile. + Act: Reconcile to one runner. + Assert: ReconciliationEvent should be issued. + """ + monkeypatch.setattr( + runner_scaler._manager, "cleanup", MagicMock(side_effect=Exception("Mock error")) + ) + with pytest.raises(Exception): + runner_scaler.reconcile(1) + issue_events_mock.assert_called_once() + issued_event = issue_events_mock.call_args[0][0] + assert isinstance(issued_event, Reconciliation) + + def test_one_runner(runner_scaler: RunnerScaler): """ Arrange: A RunnerScaler with no runners.