Skip to content

Commit

Permalink
Issue reconciliation metric on error (#5)
Browse files Browse the repository at this point in the history
* issue reconciliation metric on error

* bump patch version

* remove edge runners due to shortage

* add .
  • Loading branch information
cbartz committed Sep 10, 2024
1 parent 1f310b2 commit d9013e0
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 40 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,4 @@ jobs:
uses: canonical/operator-workflows/.github/workflows/test.yaml@main
secrets: inherit
with:
self-hosted-runner: true
self-hosted-runner-label: "edge"
self-hosted-runner: false
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

[project]
name = "github-runner-manager"
version = "0.1.0"
version = "0.1.1"
authors = [
{ name = "Canonical IS DevOps", email = "[email protected]" },
]
Expand Down
12 changes: 6 additions & 6 deletions src-docs/manager.runner_scaler.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Module for scaling the runners amount.

---

<a href="../src/github_runner_manager/manager/runner_scaler.py#L23"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../src/github_runner_manager/manager/runner_scaler.py#L28"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

## <kbd>class</kbd> `RunnerInfo`
Information on the runners.
Expand Down Expand Up @@ -50,12 +50,12 @@ __init__(

---

<a href="../src/github_runner_manager/manager/runner_scaler.py#L44"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../src/github_runner_manager/manager/runner_scaler.py#L49"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

## <kbd>class</kbd> `RunnerScaler`
Manage the reconcile of runners.

<a href="../src/github_runner_manager/manager/runner_scaler.py#L47"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../src/github_runner_manager/manager/runner_scaler.py#L52"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

### <kbd>method</kbd> `__init__`

Expand All @@ -77,7 +77,7 @@ Construct the object.

---

<a href="../src/github_runner_manager/manager/runner_scaler.py#L93"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../src/github_runner_manager/manager/runner_scaler.py#L98"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

### <kbd>method</kbd> `flush`

Expand All @@ -100,7 +100,7 @@ Flush the runners.

---

<a href="../src/github_runner_manager/manager/runner_scaler.py#L57"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../src/github_runner_manager/manager/runner_scaler.py#L62"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

### <kbd>method</kbd> `get_runner_info`

Expand All @@ -117,7 +117,7 @@ Get information on the runners.

---

<a href="../src/github_runner_manager/manager/runner_scaler.py#L111"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../src/github_runner_manager/manager/runner_scaler.py#L116"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

### <kbd>method</kbd> `reconcile`

Expand Down
113 changes: 83 additions & 30 deletions src/github_runner_manager/manager/runner_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,12 @@
from github_runner_manager.errors import IssueMetricEventError, MissingServerConfigError
from github_runner_manager.manager.cloud_runner_manager import HealthState
from github_runner_manager.manager.github_runner_manager import GitHubRunnerState
from github_runner_manager.manager.runner_manager import FlushMode, RunnerManager
from github_runner_manager.manager.runner_manager import (
FlushMode,
IssuedMetricEventsStats,
RunnerInstance,
RunnerManager,
)
from github_runner_manager.metrics import events as metric_events
from github_runner_manager.types_ import ReactiveConfig

Expand Down Expand Up @@ -123,34 +128,59 @@ def reconcile(self, quantity: int) -> int:
logger.info("Reactive configuration detected, going into experimental reactive mode.")
return self._reconcile_reactive(quantity, self._reactive_config.mq_uri)

metric_stats = {}
start_timestamp = time.time()
delete_metric_stats = None
metric_stats = self._manager.cleanup()
runners = self._manager.get_runners()
logger.info("Reconcile runners from %s to %s", len(runners), quantity)
runner_diff = quantity - len(runners)
if runner_diff > 0:
try:
self._manager.create_runners(runner_diff)
except MissingServerConfigError:
logging.exception(
"Unable to spawn runner due to missing server configuration, such as, image."
)
elif runner_diff < 0:
delete_metric_stats = self._manager.delete_runners(-runner_diff)
else:
logger.info("No changes to the number of runners.")
end_timestamp = time.time()

# Merge the two metric stats.
if delete_metric_stats is not None:
metric_stats = {
event_name: delete_metric_stats.get(event_name, 0)
+ metric_stats.get(event_name, 0)
for event_name in set(delete_metric_stats) | set(metric_stats)
}

runner_list = self._manager.get_runners()
try:
delete_metric_stats = None
metric_stats = self._manager.cleanup()
runners = self._manager.get_runners()
logger.info("Reconcile runners from %s to %s", len(runners), quantity)
runner_diff = quantity - len(runners)
if runner_diff > 0:
try:
self._manager.create_runners(runner_diff)
except MissingServerConfigError:
logging.exception(
"Unable to spawn runner due to missing server configuration, "
"such as, image."
)
elif runner_diff < 0:
delete_metric_stats = self._manager.delete_runners(-runner_diff)
else:
logger.info("No changes to the number of runners.")

# Merge the two metric stats.
if delete_metric_stats is not None:
metric_stats = {
event_name: delete_metric_stats.get(event_name, 0)
+ metric_stats.get(event_name, 0)
for event_name in set(delete_metric_stats) | set(metric_stats)
}
finally:
runner_list = self._manager.get_runners()
self._log_runners(runner_list)
end_timestamp = time.time()
self._issue_reconciliation_metric(
start_timestamp, end_timestamp, metric_stats, runner_list
)

return runner_diff

@staticmethod
def _log_runners(runner_list: tuple[RunnerInstance]) -> None:
"""Log information about the runners found.
Args:
runner_list: The list of runners.
"""
for runner in runner_list:
logger.info(
"Runner %s: state=%s, health=%s",
runner.name,
runner.github_state,
runner.health,
)
busy_runners = [
runner for runner in runner_list if runner.github_state == GitHubRunnerState.BUSY
]
Expand All @@ -163,7 +193,7 @@ def reconcile(self, quantity: int) -> int:
if runner.github_state == GitHubRunnerState.OFFLINE
and runner.health == HealthState.HEALTHY
]
unhealthy_states = set((HealthState.UNHEALTHY, HealthState.UNKNOWN))
unhealthy_states = {HealthState.UNHEALTHY, HealthState.UNKNOWN}
unhealthy_runners = [runner for runner in runner_list if runner.health in unhealthy_states]
logger.info("Found %s busy runners: %s", len(busy_runners), busy_runners)
logger.info("Found %s idle runners: %s", len(idle_runners), idle_runners)
Expand All @@ -174,6 +204,31 @@ def reconcile(self, quantity: int) -> int:
)
logger.info("Found %s unhealthy runners: %s", len(unhealthy_runners), unhealthy_runners)

def _issue_reconciliation_metric(
self,
start_timestamp: float,
end_timestamp: float,
metric_stats: IssuedMetricEventsStats,
runner_list: tuple[RunnerInstance],
) -> None:
"""Issue the reconciliation metric.
Args:
start_timestamp: The start timestamp of the reconciliation.
end_timestamp: The end timestamp of the reconciliation.
metric_stats: The metric stats.
runner_list: The list of runners.
"""
idle_runners = [
runner for runner in runner_list if runner.github_state == GitHubRunnerState.IDLE
]
offline_healthy_runners = [
runner
for runner in runner_list
if runner.github_state == GitHubRunnerState.OFFLINE
and runner.health == HealthState.HEALTHY
]

try:
available_runners = set(runner.name for runner in idle_runners) | set(
runner.name for runner in offline_healthy_runners
Expand All @@ -194,8 +249,6 @@ def reconcile(self, quantity: int) -> int:
except IssueMetricEventError:
logger.exception("Failed to issue Reconciliation metric")

return runner_diff

def _reconcile_reactive(self, quantity: int, mq_uri: MongoDsn) -> int:
"""Reconcile runners reactively.
Expand Down
30 changes: 29 additions & 1 deletion tests/unit/test_runner_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
RunnerManagerConfig,
)
from github_runner_manager.manager.runner_scaler import RunnerScaler
from github_runner_manager.metrics.events import Reconciliation
from github_runner_manager.types_.github import GitHubPath, GitHubRepo
from tests.unit.mock_runner_managers import (
MockCloudRunnerManager,
Expand Down Expand Up @@ -56,9 +57,18 @@ def mock_runner_managers_fixture(
return (mock_cloud, mock_github)


@pytest.fixture(scope="function", name="issue_events_mock")
def issue_events_mock_fixture(monkeypatch: pytest.MonkeyPatch):
issue_events_mock = MagicMock()
monkeypatch.setattr(
"github_runner_manager.manager.runner_scaler.metric_events.issue_event", issue_events_mock
)
return issue_events_mock


@pytest.fixture(scope="function", name="runner_manager")
def runner_manager_fixture(
monkeypatch, mock_runner_managers, github_path: GitHubPath
monkeypatch, mock_runner_managers, github_path: GitHubPath, issue_events_mock
) -> RunnerManager:
mock_cloud, mock_github = mock_runner_managers
monkeypatch.setattr(
Expand Down Expand Up @@ -176,6 +186,24 @@ def test_reconcile_runner_create_one(runner_scaler: RunnerScaler):
assert_runner_info(runner_scaler, online=0)


def test_reconcile_error_still_issue_metrics(
runner_scaler: RunnerScaler, monkeypatch: pytest.MonkeyPatch, issue_events_mock: MagicMock
):
"""
Arrange: A RunnerScaler with no runners which raises an error on reconcile.
Act: Reconcile to one runner.
Assert: ReconciliationEvent should be issued.
"""
monkeypatch.setattr(
runner_scaler._manager, "cleanup", MagicMock(side_effect=Exception("Mock error"))
)
with pytest.raises(Exception):
runner_scaler.reconcile(1)
issue_events_mock.assert_called_once()
issued_event = issue_events_mock.call_args[0][0]
assert isinstance(issued_event, Reconciliation)


def test_one_runner(runner_scaler: RunnerScaler):
"""
Arrange: A RunnerScaler with no runners.
Expand Down

0 comments on commit d9013e0

Please sign in to comment.