diff --git a/docs/explanation/charm-architecture.md b/docs/explanation/charm-architecture.md
index 286d004ac..98280cd96 100644
--- a/docs/explanation/charm-architecture.md
+++ b/docs/explanation/charm-architecture.md
@@ -69,6 +69,9 @@ The charm requires a GitHub personal access token for the [`token` configuration
The token is also passed to [repo-policy-compliance](https://github.com/canonical/repo-policy-compliance) to access GitHub API for the service.
+Note that the GitHub API uses a [rate-limiting mechanism](https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28). When this is reached, the charm may not be able to perform the necessary operations and may go into
+BlockedStatus. The charm will automatically recover from this state once the rate limit is reset, but using a different token with a higher rate limit may be a better solution depending on your deployment requirements.
+
## GitHub repository setting check
The [repo-policy-compliance](https://github.com/canonical/repo-policy-compliance) is a [Flask application](https://flask.palletsprojects.com/) hosted on [Gunicorn](https://gunicorn.org/) that provides a RESTful HTTP API to check the settings of GitHub repositories. This ensures the GitHub repository settings do not allow the execution of code not reviewed by maintainers on the self-hosted runners.
diff --git a/src-docs/charm.py.md b/src-docs/charm.py.md
index 0b6f16406..50078b5c6 100644
--- a/src-docs/charm.py.md
+++ b/src-docs/charm.py.md
@@ -12,7 +12,7 @@ Charm for creating and managing GitHub self-hosted runner instances.
---
-
+
## function `catch_charm_errors`
@@ -38,7 +38,7 @@ Catch common errors in charm.
---
-
+
## function `catch_action_errors`
@@ -67,7 +67,7 @@ Catch common errors in actions.
## class `GithubRunnerCharm`
Charm for managing GitHub self-hosted runners.
-
+
### function `__init__`
diff --git a/src-docs/errors.py.md b/src-docs/errors.py.md
index b6bdc93d7..18ed39353 100644
--- a/src-docs/errors.py.md
+++ b/src-docs/errors.py.md
@@ -52,6 +52,15 @@ Represents an error when the shared filesystem could not be retrieved.
+---
+
+## class `GithubApiError`
+Represents an error when the GitHub API returns an error.
+
+
+
+
+
---
## class `GithubClientError`
@@ -305,3 +314,12 @@ Construct the subprocess error.
+---
+
+## class `TokenError`
+Represents an error when the token is invalid or has not enough permissions.
+
+
+
+
+
diff --git a/src-docs/github_client.py.md b/src-docs/github_client.py.md
index f2b51c425..56bd7c01b 100644
--- a/src-docs/github_client.py.md
+++ b/src-docs/github_client.py.md
@@ -8,13 +8,25 @@ GitHub API client.
Migrate to PyGithub in the future. PyGithub is still lacking some API such as remove token for runner.
+---
+
+
+
+## function `catch_http_errors`
+
+```python
+catch_http_errors(func)
+```
+
+Catch HTTP errors and raise custom exceptions.
+
---
## class `GithubClient`
GitHub API client.
-
+
### function `__init__`
@@ -36,7 +48,7 @@ Instantiate the GiHub API client.
---
-
+
### function `delete_runner`
@@ -55,7 +67,7 @@ Delete the self-hosted runner from GitHub.
---
-
+
### function `get_job_info`
@@ -84,7 +96,7 @@ Get information about a job for a specific workflow run.
---
-
+
### function `get_runner_applications`
@@ -107,7 +119,7 @@ Get list of runner applications available for download.
---
-
+
### function `get_runner_github_info`
@@ -132,7 +144,7 @@ Get runner information on GitHub under a repo or org.
---
-
+
### function `get_runner_registration_token`
@@ -155,7 +167,7 @@ Get token from GitHub used for registering runners.
---
-
+
### function `get_runner_remove_token`
diff --git a/src-docs/runner.py.md b/src-docs/runner.py.md
index f9801909a..586cf0537 100644
--- a/src-docs/runner.py.md
+++ b/src-docs/runner.py.md
@@ -96,7 +96,7 @@ Create the runner instance on LXD and register it on GitHub.
### function `remove`
```python
-remove(remove_token: str) → None
+remove(remove_token: Optional[str]) → None
```
Remove this runner instance from LXD and GitHub.
diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md
index f634974ec..c22e5f40b 100644
--- a/src-docs/runner_manager.py.md
+++ b/src-docs/runner_manager.py.md
@@ -46,7 +46,7 @@ Construct RunnerManager object for creating and managing runners.
---
-
+
### function `build_runner_image`
@@ -172,7 +172,7 @@ Bring runners in line with target.
---
-
+
### function `schedule_build_runner_image`
diff --git a/src-docs/runner_manager_type.py.md b/src-docs/runner_manager_type.py.md
index 43f020476..3c6487496 100644
--- a/src-docs/runner_manager_type.py.md
+++ b/src-docs/runner_manager_type.py.md
@@ -18,8 +18,9 @@ Strategy for flushing runners.
- `FLUSH_IDLE`: Flush only idle runners.
- `FLUSH_IDLE_WAIT_REPO_CHECK`: Flush only idle runners, then wait until repo-policy-check is completed for the busy runners.
- - `FORCE_FLUSH_BUSY`: Force flush busy runners.
- - `FORCE_FLUSH_BUSY_WAIT_REPO_CHECK`: Wait until the repo-policy-check is completed before force flush of busy runners.
+ - `FLUSH_BUSY`: Flush busy runners.
+ - `FLUSH_BUSY_WAIT_REPO_CHECK`: Wait until the repo-policy-check is completed before flush of busy runners.
+ - `FORCE_FLUSH_WAIT_REPO_CHECK`: Force flush the runners (remove lxd instances even on gh api issues, like invalid token). Wait until repo-policy-check is completed before force flush of busy runners.
diff --git a/src/charm.py b/src/charm.py
index 293aaf215..1de163443 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -42,6 +42,7 @@
RunnerBinaryError,
RunnerError,
SubprocessError,
+ TokenError,
)
from event_timer import EventTimer, TimerStatusError
from firewall import Firewall, FirewallEntry
@@ -82,6 +83,9 @@ def func_with_catch_errors(self, event: EventT) -> None:
except ConfigurationError as err:
logger.exception("Issue with charm configuration")
self.unit.status = BlockedStatus(str(err))
+ except TokenError as err:
+ logger.exception("Issue with GitHub token")
+ self.unit.status = BlockedStatus(str(err))
except MissingConfigurationError as err:
logger.exception("Missing required charm configuration")
self.unit.status = BlockedStatus(
@@ -492,7 +496,7 @@ def _on_upgrade_charm(self, _event: UpgradeCharmEvent) -> None:
if not runner_manager:
return
- runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK)
+ runner_manager.flush(FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK)
self._reconcile_runners(runner_manager)
@catch_charm_errors
@@ -502,31 +506,39 @@ def _on_config_changed(self, _event: ConfigChangedEvent) -> None:
Args:
event: Event of configuration change.
"""
+ self._set_reconcile_timer()
+
+ prev_config_for_flush: dict[str, str] = {}
+
if self.config["token"] != self._stored.token:
+ prev_config_for_flush["token"] = str(self._stored.token)
self._start_services()
self._stored.token = None
- self._refresh_firewall()
- self._set_reconcile_timer()
-
if self.config["path"] != self._stored.path:
- prev_runner_manager = self._get_runner_manager(
- path=str(self._stored.path)
- ) # Casting for mypy checks.
- if prev_runner_manager:
- self.unit.status = MaintenanceStatus("Removing runners from old org/repo")
- prev_runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK)
+ prev_config_for_flush["path"] = str(self._stored.path)
self._stored.path = self.config["path"]
- runner_manager = self._get_runner_manager()
- if runner_manager:
+ if prev_config_for_flush:
+ prev_runner_manager = self._get_runner_manager(**prev_config_for_flush)
+ if prev_runner_manager:
+ self.unit.status = MaintenanceStatus("Removing runners due to config change")
+ # it may be the case that the prev token has expired, so we need to use force flush
+ prev_runner_manager.flush(FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK)
+
+ self._refresh_firewall()
+
+ try:
+ runner_manager = self._get_runner_manager()
+ except MissingConfigurationError as err:
+ self.unit.status = BlockedStatus(
+ f"Missing required charm configuration: {err.configs}"
+ )
+ else:
self._reconcile_runners(runner_manager)
self.unit.status = ActiveStatus()
- else:
- self.unit.status = BlockedStatus("Missing token or org/repo path config")
if self.config["token"] != self._stored.token:
- runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK)
self._stored.token = self.config["token"]
def _check_and_update_dependencies(self) -> bool:
@@ -669,7 +681,7 @@ def _on_flush_runners_action(self, event: ActionEvent) -> None:
"""
runner_manager = self._get_runner_manager()
- runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK)
+ runner_manager.flush(FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK)
delta = self._reconcile_runners(runner_manager)
self._on_check_runners_action(event)
@@ -704,7 +716,7 @@ def _on_stop(self, _: StopEvent) -> None:
self._event_timer.disable_event_timer("reconcile-runners")
runner_manager = self._get_runner_manager()
- runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY)
+ runner_manager.flush(FlushMode.FLUSH_BUSY)
def _reconcile_runners(self, runner_manager: RunnerManager) -> Dict[str, Any]:
"""Reconcile the current runners state and intended runner state.
diff --git a/src/errors.py b/src/errors.py
index b34c279b1..478e1cf25 100644
--- a/src/errors.py
+++ b/src/errors.py
@@ -150,6 +150,14 @@ class GithubClientError(Exception):
"""Base class for all github client errors."""
+class GithubApiError(GithubClientError):
+ """Represents an error when the GitHub API returns an error."""
+
+
+class TokenError(GithubClientError):
+ """Represents an error when the token is invalid or has not enough permissions."""
+
+
class JobNotFoundError(GithubClientError):
"""Represents an error when the job could not be found on GitHub."""
diff --git a/src/github_client.py b/src/github_client.py
index 002231334..4ae8c5b66 100644
--- a/src/github_client.py
+++ b/src/github_client.py
@@ -6,7 +6,7 @@
Migrate to PyGithub in the future. PyGithub is still lacking some API such as
remove token for runner.
"""
-
+import functools
import logging
from datetime import datetime
from urllib.error import HTTPError
@@ -28,6 +28,25 @@
logger = logging.getLogger(__name__)
+def catch_http_errors(func):
+ """Catch HTTP errors and raise custom exceptions."""
+
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ try:
+ return func(*args, **kwargs)
+ except HTTPError as exc:
+ if exc.code in (401, 403):
+ if exc.code == 401:
+ msg = "Invalid token."
+ else:
+ msg = "Provided token has not enough permissions or has reached rate-limit."
+ raise errors.TokenError(msg) from exc
+ raise errors.GithubApiError from exc
+
+ return wrapper
+
+
class GithubClient:
"""GitHub API client."""
@@ -41,6 +60,7 @@ def __init__(self, token: str):
self._token = token
self._client = GhApi(token=self._token)
+ @catch_http_errors
def get_runner_applications(self, path: GithubPath) -> RunnerApplicationList:
"""Get list of runner applications available for download.
@@ -60,6 +80,7 @@ def get_runner_applications(self, path: GithubPath) -> RunnerApplicationList:
return runner_bins
+ @catch_http_errors
def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]:
"""Get runner information on GitHub under a repo or org.
@@ -107,6 +128,7 @@ def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]:
]
return remote_runners_list
+ @catch_http_errors
def get_runner_remove_token(self, path: GithubPath) -> str:
"""Get token from GitHub used for removing runners.
@@ -125,6 +147,7 @@ def get_runner_remove_token(self, path: GithubPath) -> str:
return token["token"]
+ @catch_http_errors
def get_runner_registration_token(self, path: GithubPath) -> str:
"""Get token from GitHub used for registering runners.
@@ -147,6 +170,7 @@ def get_runner_registration_token(self, path: GithubPath) -> str:
return token["token"]
+ @catch_http_errors
def delete_runner(self, path: GithubPath, runner_id: int) -> None:
"""Delete the self-hosted runner from GitHub.
@@ -208,6 +232,8 @@ def get_job_info(self, path: GithubRepo, workflow_run_id: str, runner_name: str)
)
except HTTPError as exc:
+ if exc.code in (401, 403):
+ raise errors.TokenError from exc
raise errors.JobNotFoundError(
f"Could not find job for runner {runner_name}. "
f"Could not list jobs for workflow run {workflow_run_id}"
diff --git a/src/runner.py b/src/runner.py
index 4dbfb82ec..523149653 100644
--- a/src/runner.py
+++ b/src/runner.py
@@ -19,7 +19,6 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, NamedTuple, Optional, Sequence
-from urllib.error import HTTPError
import yaml
@@ -27,6 +26,7 @@
from charm_state import ARCH, SSHDebugConnection
from errors import (
CreateSharedFilesystemError,
+ GithubClientError,
LxdError,
RunnerAproxyError,
RunnerCreateError,
@@ -166,7 +166,7 @@ def create(self, config: CreateRunnerConfig):
except (RunnerError, LxdError) as err:
raise RunnerCreateError(f"Unable to create runner {self.config.name}") from err
- def remove(self, remove_token: str) -> None:
+ def remove(self, remove_token: Optional[str]) -> None:
"""Remove this runner instance from LXD and GitHub.
Args:
@@ -179,18 +179,20 @@ def remove(self, remove_token: str) -> None:
if self.instance:
logger.info("Executing command to removal of runner and clean up...")
- self.instance.execute(
- [
- "/usr/bin/sudo",
- "-u",
- "ubuntu",
- str(self.config_script),
- "remove",
- "--token",
- remove_token,
- ],
- hide_cmd=True,
- )
+
+ if remove_token:
+ self.instance.execute(
+ [
+ "/usr/bin/sudo",
+ "-u",
+ "ubuntu",
+ str(self.config_script),
+ "remove",
+ "--token",
+ remove_token,
+ ],
+ hide_cmd=True,
+ )
if self.instance.status == "Running":
logger.info("Removing LXD instance of runner: %s", self.config.name)
@@ -232,7 +234,7 @@ def remove(self, remove_token: str) -> None:
)
try:
self._clients.github.delete_runner(self.config.path, self.status.runner_id)
- except HTTPError:
+ except GithubClientError:
logger.exception("Unable the remove runner on GitHub: %s", self.config.name)
# This can occur when attempting to remove a busy runner.
# The caller should retry later, after GitHub mark the runner as offline.
diff --git a/src/runner_manager.py b/src/runner_manager.py
index 725a7b566..f2dc48e69 100644
--- a/src/runner_manager.py
+++ b/src/runner_manager.py
@@ -561,7 +561,14 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int:
Returns:
Number of runners removed.
"""
- remove_token = self._clients.github.get_runner_remove_token(self.config.path)
+ try:
+ remove_token = self._clients.github.get_runner_remove_token(self.config.path)
+ except errors.GithubClientError:
+ logger.exception("Failed to get remove-token to unregister runners from GitHub.")
+ if mode != FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK:
+ raise
+ logger.info("Proceeding with flush without remove-token.")
+ remove_token = None
# Removing non-busy runners
runners = [
@@ -579,7 +586,8 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int:
if mode in (
FlushMode.FLUSH_IDLE_WAIT_REPO_CHECK,
- FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK,
+ FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK,
+ FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK,
):
for _ in range(5):
if not self._runners_in_pre_job():
@@ -593,7 +601,11 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int:
)
)
- if mode in {FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK, FlushMode.FORCE_FLUSH_BUSY}:
+ if mode in {
+ FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK,
+ FlushMode.FLUSH_BUSY,
+ FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK,
+ }:
busy_runners = [runner for runner in self._get_runners() if runner.status.exist]
logger.info("Removing existing %i busy local runners", len(runners))
diff --git a/src/runner_manager_type.py b/src/runner_manager_type.py
index 124c564b8..9df2c4ed3 100644
--- a/src/runner_manager_type.py
+++ b/src/runner_manager_type.py
@@ -24,15 +24,19 @@ class FlushMode(Enum):
FLUSH_IDLE: Flush only idle runners.
FLUSH_IDLE_WAIT_REPO_CHECK: Flush only idle runners, then wait until repo-policy-check is
completed for the busy runners.
- FORCE_FLUSH_BUSY: Force flush busy runners.
- FORCE_FLUSH_BUSY_WAIT_REPO_CHECK: Wait until the repo-policy-check is completed before
- force flush of busy runners.
+ FLUSH_BUSY: Flush busy runners.
+ FLUSH_BUSY_WAIT_REPO_CHECK: Wait until the repo-policy-check is completed before
+ flush of busy runners.
+ FORCE_FLUSH_WAIT_REPO_CHECK: Force flush the runners (remove lxd instances even on
+ gh api issues, like invalid token).
+ Wait until repo-policy-check is completed before force flush of busy runners.
"""
FLUSH_IDLE = auto()
FLUSH_IDLE_WAIT_REPO_CHECK = auto()
- FORCE_FLUSH_BUSY = auto()
- FORCE_FLUSH_BUSY_WAIT_REPO_CHECK = auto()
+ FLUSH_BUSY = auto()
+ FLUSH_BUSY_WAIT_REPO_CHECK = auto()
+ FORCE_FLUSH_WAIT_REPO_CHECK = auto()
@dataclass
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 18dccdd0a..27e5398f4 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -185,7 +185,7 @@ async def app_no_runner(
@pytest_asyncio.fixture(scope="module")
-async def app(model: Model, app_no_runner: Application) -> AsyncIterator[Application]:
+async def app_one_runner(model: Model, app_no_runner: Application) -> AsyncIterator[Application]:
"""Application with a single runner.
Test should ensure it returns with the application in a good state and has
diff --git a/tests/integration/test_charm_one_runner.py b/tests/integration/test_charm_one_runner.py
index 6fcb8c786..20dfc2e76 100644
--- a/tests/integration/test_charm_one_runner.py
+++ b/tests/integration/test_charm_one_runner.py
@@ -1,15 +1,18 @@
# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.
-"""Integration tests for github-runner charm."""
+"""Integration tests for github-runner charm containing one runner."""
+from typing import AsyncIterator
import pytest
+import pytest_asyncio
from juju.application import Application
from juju.model import Model
from charm import GithubRunnerCharm
from tests.integration.helpers import (
assert_resource_lxd_profile,
+ ensure_charm_has_runner,
get_runner_names,
reconcile,
run_in_lxd_instance,
@@ -20,6 +23,19 @@
from tests.status_name import ACTIVE, BLOCKED
+@pytest_asyncio.fixture(scope="function", name="app")
+async def app_fixture(
+ model: Model,
+ app_one_runner: Application,
+) -> AsyncIterator[Application]:
+ """Setup and teardown the charm after each test.
+
+ Ensure the charm has one runner before starting a test.
+ """
+ await ensure_charm_has_runner(app_one_runner, model)
+ yield app_one_runner
+
+
@pytest.mark.asyncio
@pytest.mark.abort_on_fail
async def test_network_access(app: Application) -> None:
@@ -211,3 +227,19 @@ async def test_change_runner_storage(model: Model, app: Application) -> None:
# 2.
await app.set_config({"runner-storage": "memory"})
await model.wait_for_idle(status=ACTIVE, timeout=1 * 60)
+
+
+async def test_token_config_changed_insufficient_perms(
+ model: Model, app: Application, token_alt: str
+) -> None:
+ """
+ arrange: A working application with one runner.
+ act: Change the token to be invalid and set the number of runners to zero.
+ assert: The active runner should be removed, regardless of the invalid new token.
+ """
+ unit = app.units[0]
+
+ await app.set_config({"token": "", "virtual-machines": "0"})
+ await model.wait_for_idle()
+
+ await wait_till_num_of_runners(unit, num=0)
diff --git a/tests/unit/test_runner.py b/tests/unit/test_runner.py
index e298005bc..f53197fa2 100644
--- a/tests/unit/test_runner.py
+++ b/tests/unit/test_runner.py
@@ -457,6 +457,7 @@ def test_random_ssh_connection_choice(
runner._configure_runner()
second_call_args = runner._clients.jinja.get_template("env.j2").render.call_args.kwargs
- assert (
- first_call_args["ssh_debug_info"] != second_call_args["ssh_debug_info"]
- ), "Same ssh debug info found, this may have occurred with a very low priority."
+ assert first_call_args["ssh_debug_info"] != second_call_args["ssh_debug_info"], (
+ "Same ssh debug info found, this may have occurred with a very low priority. "
+ "Just try again."
+ )