From c9f6bfa1b8c92e3cb9026aacdd9c2cfe95818e29 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Wed, 7 Feb 2024 15:35:36 +0100 Subject: [PATCH] Use previous token to flush runners (#218) * Flush runners using prev token * Catch correct error * Force flush runners * Improve BlockedStatus message * Use force flush correctly * Add rate-limit info --- docs/explanation/charm-architecture.md | 3 ++ src-docs/charm.py.md | 6 +-- src-docs/errors.py.md | 18 +++++++++ src-docs/github_client.py.md | 26 ++++++++---- src-docs/runner.py.md | 2 +- src-docs/runner_manager.py.md | 4 +- src-docs/runner_manager_type.py.md | 5 ++- src/charm.py | 46 ++++++++++++++-------- src/errors.py | 8 ++++ src/github_client.py | 28 ++++++++++++- src/runner.py | 32 ++++++++------- src/runner_manager.py | 18 +++++++-- src/runner_manager_type.py | 14 ++++--- tests/integration/conftest.py | 2 +- tests/integration/test_charm_one_runner.py | 34 +++++++++++++++- tests/unit/test_runner.py | 7 ++-- 16 files changed, 192 insertions(+), 61 deletions(-) diff --git a/docs/explanation/charm-architecture.md b/docs/explanation/charm-architecture.md index 286d004ac..98280cd96 100644 --- a/docs/explanation/charm-architecture.md +++ b/docs/explanation/charm-architecture.md @@ -69,6 +69,9 @@ The charm requires a GitHub personal access token for the [`token` configuration The token is also passed to [repo-policy-compliance](https://github.com/canonical/repo-policy-compliance) to access GitHub API for the service. +Note that the GitHub API uses a [rate-limiting mechanism](https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28). When this is reached, the charm may not be able to perform the necessary operations and may go into +BlockedStatus. The charm will automatically recover from this state once the rate limit is reset, but using a different token with a higher rate limit may be a better solution depending on your deployment requirements. + ## GitHub repository setting check The [repo-policy-compliance](https://github.com/canonical/repo-policy-compliance) is a [Flask application](https://flask.palletsprojects.com/) hosted on [Gunicorn](https://gunicorn.org/) that provides a RESTful HTTP API to check the settings of GitHub repositories. This ensures the GitHub repository settings do not allow the execution of code not reviewed by maintainers on the self-hosted runners. diff --git a/src-docs/charm.py.md b/src-docs/charm.py.md index 0b6f16406..50078b5c6 100644 --- a/src-docs/charm.py.md +++ b/src-docs/charm.py.md @@ -12,7 +12,7 @@ Charm for creating and managing GitHub self-hosted runner instances. --- - + ## function `catch_charm_errors` @@ -38,7 +38,7 @@ Catch common errors in charm. --- - + ## function `catch_action_errors` @@ -67,7 +67,7 @@ Catch common errors in actions. ## class `GithubRunnerCharm` Charm for managing GitHub self-hosted runners. - + ### function `__init__` diff --git a/src-docs/errors.py.md b/src-docs/errors.py.md index b6bdc93d7..18ed39353 100644 --- a/src-docs/errors.py.md +++ b/src-docs/errors.py.md @@ -52,6 +52,15 @@ Represents an error when the shared filesystem could not be retrieved. +--- + +## class `GithubApiError` +Represents an error when the GitHub API returns an error. + + + + + --- ## class `GithubClientError` @@ -305,3 +314,12 @@ Construct the subprocess error. +--- + +## class `TokenError` +Represents an error when the token is invalid or has not enough permissions. + + + + + diff --git a/src-docs/github_client.py.md b/src-docs/github_client.py.md index f2b51c425..56bd7c01b 100644 --- a/src-docs/github_client.py.md +++ b/src-docs/github_client.py.md @@ -8,13 +8,25 @@ GitHub API client. Migrate to PyGithub in the future. PyGithub is still lacking some API such as remove token for runner. +--- + + + +## function `catch_http_errors` + +```python +catch_http_errors(func) +``` + +Catch HTTP errors and raise custom exceptions. + --- ## class `GithubClient` GitHub API client. - + ### function `__init__` @@ -36,7 +48,7 @@ Instantiate the GiHub API client. --- - + ### function `delete_runner` @@ -55,7 +67,7 @@ Delete the self-hosted runner from GitHub. --- - + ### function `get_job_info` @@ -84,7 +96,7 @@ Get information about a job for a specific workflow run. --- - + ### function `get_runner_applications` @@ -107,7 +119,7 @@ Get list of runner applications available for download. --- - + ### function `get_runner_github_info` @@ -132,7 +144,7 @@ Get runner information on GitHub under a repo or org. --- - + ### function `get_runner_registration_token` @@ -155,7 +167,7 @@ Get token from GitHub used for registering runners. --- - + ### function `get_runner_remove_token` diff --git a/src-docs/runner.py.md b/src-docs/runner.py.md index f9801909a..586cf0537 100644 --- a/src-docs/runner.py.md +++ b/src-docs/runner.py.md @@ -96,7 +96,7 @@ Create the runner instance on LXD and register it on GitHub. ### function `remove` ```python -remove(remove_token: str) → None +remove(remove_token: Optional[str]) → None ``` Remove this runner instance from LXD and GitHub. diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index f634974ec..c22e5f40b 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -46,7 +46,7 @@ Construct RunnerManager object for creating and managing runners. --- - + ### function `build_runner_image` @@ -172,7 +172,7 @@ Bring runners in line with target. --- - + ### function `schedule_build_runner_image` diff --git a/src-docs/runner_manager_type.py.md b/src-docs/runner_manager_type.py.md index 43f020476..3c6487496 100644 --- a/src-docs/runner_manager_type.py.md +++ b/src-docs/runner_manager_type.py.md @@ -18,8 +18,9 @@ Strategy for flushing runners. - `FLUSH_IDLE`: Flush only idle runners. - `FLUSH_IDLE_WAIT_REPO_CHECK`: Flush only idle runners, then wait until repo-policy-check is completed for the busy runners. - - `FORCE_FLUSH_BUSY`: Force flush busy runners. - - `FORCE_FLUSH_BUSY_WAIT_REPO_CHECK`: Wait until the repo-policy-check is completed before force flush of busy runners. + - `FLUSH_BUSY`: Flush busy runners. + - `FLUSH_BUSY_WAIT_REPO_CHECK`: Wait until the repo-policy-check is completed before flush of busy runners. + - `FORCE_FLUSH_WAIT_REPO_CHECK`: Force flush the runners (remove lxd instances even on gh api issues, like invalid token). Wait until repo-policy-check is completed before force flush of busy runners. diff --git a/src/charm.py b/src/charm.py index 293aaf215..1de163443 100755 --- a/src/charm.py +++ b/src/charm.py @@ -42,6 +42,7 @@ RunnerBinaryError, RunnerError, SubprocessError, + TokenError, ) from event_timer import EventTimer, TimerStatusError from firewall import Firewall, FirewallEntry @@ -82,6 +83,9 @@ def func_with_catch_errors(self, event: EventT) -> None: except ConfigurationError as err: logger.exception("Issue with charm configuration") self.unit.status = BlockedStatus(str(err)) + except TokenError as err: + logger.exception("Issue with GitHub token") + self.unit.status = BlockedStatus(str(err)) except MissingConfigurationError as err: logger.exception("Missing required charm configuration") self.unit.status = BlockedStatus( @@ -492,7 +496,7 @@ def _on_upgrade_charm(self, _event: UpgradeCharmEvent) -> None: if not runner_manager: return - runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK) + runner_manager.flush(FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) self._reconcile_runners(runner_manager) @catch_charm_errors @@ -502,31 +506,39 @@ def _on_config_changed(self, _event: ConfigChangedEvent) -> None: Args: event: Event of configuration change. """ + self._set_reconcile_timer() + + prev_config_for_flush: dict[str, str] = {} + if self.config["token"] != self._stored.token: + prev_config_for_flush["token"] = str(self._stored.token) self._start_services() self._stored.token = None - self._refresh_firewall() - self._set_reconcile_timer() - if self.config["path"] != self._stored.path: - prev_runner_manager = self._get_runner_manager( - path=str(self._stored.path) - ) # Casting for mypy checks. - if prev_runner_manager: - self.unit.status = MaintenanceStatus("Removing runners from old org/repo") - prev_runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK) + prev_config_for_flush["path"] = str(self._stored.path) self._stored.path = self.config["path"] - runner_manager = self._get_runner_manager() - if runner_manager: + if prev_config_for_flush: + prev_runner_manager = self._get_runner_manager(**prev_config_for_flush) + if prev_runner_manager: + self.unit.status = MaintenanceStatus("Removing runners due to config change") + # it may be the case that the prev token has expired, so we need to use force flush + prev_runner_manager.flush(FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) + + self._refresh_firewall() + + try: + runner_manager = self._get_runner_manager() + except MissingConfigurationError as err: + self.unit.status = BlockedStatus( + f"Missing required charm configuration: {err.configs}" + ) + else: self._reconcile_runners(runner_manager) self.unit.status = ActiveStatus() - else: - self.unit.status = BlockedStatus("Missing token or org/repo path config") if self.config["token"] != self._stored.token: - runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK) self._stored.token = self.config["token"] def _check_and_update_dependencies(self) -> bool: @@ -669,7 +681,7 @@ def _on_flush_runners_action(self, event: ActionEvent) -> None: """ runner_manager = self._get_runner_manager() - runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK) + runner_manager.flush(FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) delta = self._reconcile_runners(runner_manager) self._on_check_runners_action(event) @@ -704,7 +716,7 @@ def _on_stop(self, _: StopEvent) -> None: self._event_timer.disable_event_timer("reconcile-runners") runner_manager = self._get_runner_manager() - runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY) + runner_manager.flush(FlushMode.FLUSH_BUSY) def _reconcile_runners(self, runner_manager: RunnerManager) -> Dict[str, Any]: """Reconcile the current runners state and intended runner state. diff --git a/src/errors.py b/src/errors.py index b34c279b1..478e1cf25 100644 --- a/src/errors.py +++ b/src/errors.py @@ -150,6 +150,14 @@ class GithubClientError(Exception): """Base class for all github client errors.""" +class GithubApiError(GithubClientError): + """Represents an error when the GitHub API returns an error.""" + + +class TokenError(GithubClientError): + """Represents an error when the token is invalid or has not enough permissions.""" + + class JobNotFoundError(GithubClientError): """Represents an error when the job could not be found on GitHub.""" diff --git a/src/github_client.py b/src/github_client.py index 002231334..4ae8c5b66 100644 --- a/src/github_client.py +++ b/src/github_client.py @@ -6,7 +6,7 @@ Migrate to PyGithub in the future. PyGithub is still lacking some API such as remove token for runner. """ - +import functools import logging from datetime import datetime from urllib.error import HTTPError @@ -28,6 +28,25 @@ logger = logging.getLogger(__name__) +def catch_http_errors(func): + """Catch HTTP errors and raise custom exceptions.""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except HTTPError as exc: + if exc.code in (401, 403): + if exc.code == 401: + msg = "Invalid token." + else: + msg = "Provided token has not enough permissions or has reached rate-limit." + raise errors.TokenError(msg) from exc + raise errors.GithubApiError from exc + + return wrapper + + class GithubClient: """GitHub API client.""" @@ -41,6 +60,7 @@ def __init__(self, token: str): self._token = token self._client = GhApi(token=self._token) + @catch_http_errors def get_runner_applications(self, path: GithubPath) -> RunnerApplicationList: """Get list of runner applications available for download. @@ -60,6 +80,7 @@ def get_runner_applications(self, path: GithubPath) -> RunnerApplicationList: return runner_bins + @catch_http_errors def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]: """Get runner information on GitHub under a repo or org. @@ -107,6 +128,7 @@ def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]: ] return remote_runners_list + @catch_http_errors def get_runner_remove_token(self, path: GithubPath) -> str: """Get token from GitHub used for removing runners. @@ -125,6 +147,7 @@ def get_runner_remove_token(self, path: GithubPath) -> str: return token["token"] + @catch_http_errors def get_runner_registration_token(self, path: GithubPath) -> str: """Get token from GitHub used for registering runners. @@ -147,6 +170,7 @@ def get_runner_registration_token(self, path: GithubPath) -> str: return token["token"] + @catch_http_errors def delete_runner(self, path: GithubPath, runner_id: int) -> None: """Delete the self-hosted runner from GitHub. @@ -208,6 +232,8 @@ def get_job_info(self, path: GithubRepo, workflow_run_id: str, runner_name: str) ) except HTTPError as exc: + if exc.code in (401, 403): + raise errors.TokenError from exc raise errors.JobNotFoundError( f"Could not find job for runner {runner_name}. " f"Could not list jobs for workflow run {workflow_run_id}" diff --git a/src/runner.py b/src/runner.py index 4dbfb82ec..523149653 100644 --- a/src/runner.py +++ b/src/runner.py @@ -19,7 +19,6 @@ from dataclasses import dataclass from pathlib import Path from typing import Iterable, NamedTuple, Optional, Sequence -from urllib.error import HTTPError import yaml @@ -27,6 +26,7 @@ from charm_state import ARCH, SSHDebugConnection from errors import ( CreateSharedFilesystemError, + GithubClientError, LxdError, RunnerAproxyError, RunnerCreateError, @@ -166,7 +166,7 @@ def create(self, config: CreateRunnerConfig): except (RunnerError, LxdError) as err: raise RunnerCreateError(f"Unable to create runner {self.config.name}") from err - def remove(self, remove_token: str) -> None: + def remove(self, remove_token: Optional[str]) -> None: """Remove this runner instance from LXD and GitHub. Args: @@ -179,18 +179,20 @@ def remove(self, remove_token: str) -> None: if self.instance: logger.info("Executing command to removal of runner and clean up...") - self.instance.execute( - [ - "/usr/bin/sudo", - "-u", - "ubuntu", - str(self.config_script), - "remove", - "--token", - remove_token, - ], - hide_cmd=True, - ) + + if remove_token: + self.instance.execute( + [ + "/usr/bin/sudo", + "-u", + "ubuntu", + str(self.config_script), + "remove", + "--token", + remove_token, + ], + hide_cmd=True, + ) if self.instance.status == "Running": logger.info("Removing LXD instance of runner: %s", self.config.name) @@ -232,7 +234,7 @@ def remove(self, remove_token: str) -> None: ) try: self._clients.github.delete_runner(self.config.path, self.status.runner_id) - except HTTPError: + except GithubClientError: logger.exception("Unable the remove runner on GitHub: %s", self.config.name) # This can occur when attempting to remove a busy runner. # The caller should retry later, after GitHub mark the runner as offline. diff --git a/src/runner_manager.py b/src/runner_manager.py index 725a7b566..f2dc48e69 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -561,7 +561,14 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: Returns: Number of runners removed. """ - remove_token = self._clients.github.get_runner_remove_token(self.config.path) + try: + remove_token = self._clients.github.get_runner_remove_token(self.config.path) + except errors.GithubClientError: + logger.exception("Failed to get remove-token to unregister runners from GitHub.") + if mode != FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK: + raise + logger.info("Proceeding with flush without remove-token.") + remove_token = None # Removing non-busy runners runners = [ @@ -579,7 +586,8 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: if mode in ( FlushMode.FLUSH_IDLE_WAIT_REPO_CHECK, - FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK, + FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, + FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, ): for _ in range(5): if not self._runners_in_pre_job(): @@ -593,7 +601,11 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: ) ) - if mode in {FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK, FlushMode.FORCE_FLUSH_BUSY}: + if mode in { + FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, + FlushMode.FLUSH_BUSY, + FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, + }: busy_runners = [runner for runner in self._get_runners() if runner.status.exist] logger.info("Removing existing %i busy local runners", len(runners)) diff --git a/src/runner_manager_type.py b/src/runner_manager_type.py index 124c564b8..9df2c4ed3 100644 --- a/src/runner_manager_type.py +++ b/src/runner_manager_type.py @@ -24,15 +24,19 @@ class FlushMode(Enum): FLUSH_IDLE: Flush only idle runners. FLUSH_IDLE_WAIT_REPO_CHECK: Flush only idle runners, then wait until repo-policy-check is completed for the busy runners. - FORCE_FLUSH_BUSY: Force flush busy runners. - FORCE_FLUSH_BUSY_WAIT_REPO_CHECK: Wait until the repo-policy-check is completed before - force flush of busy runners. + FLUSH_BUSY: Flush busy runners. + FLUSH_BUSY_WAIT_REPO_CHECK: Wait until the repo-policy-check is completed before + flush of busy runners. + FORCE_FLUSH_WAIT_REPO_CHECK: Force flush the runners (remove lxd instances even on + gh api issues, like invalid token). + Wait until repo-policy-check is completed before force flush of busy runners. """ FLUSH_IDLE = auto() FLUSH_IDLE_WAIT_REPO_CHECK = auto() - FORCE_FLUSH_BUSY = auto() - FORCE_FLUSH_BUSY_WAIT_REPO_CHECK = auto() + FLUSH_BUSY = auto() + FLUSH_BUSY_WAIT_REPO_CHECK = auto() + FORCE_FLUSH_WAIT_REPO_CHECK = auto() @dataclass diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 18dccdd0a..27e5398f4 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -185,7 +185,7 @@ async def app_no_runner( @pytest_asyncio.fixture(scope="module") -async def app(model: Model, app_no_runner: Application) -> AsyncIterator[Application]: +async def app_one_runner(model: Model, app_no_runner: Application) -> AsyncIterator[Application]: """Application with a single runner. Test should ensure it returns with the application in a good state and has diff --git a/tests/integration/test_charm_one_runner.py b/tests/integration/test_charm_one_runner.py index 6fcb8c786..20dfc2e76 100644 --- a/tests/integration/test_charm_one_runner.py +++ b/tests/integration/test_charm_one_runner.py @@ -1,15 +1,18 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. -"""Integration tests for github-runner charm.""" +"""Integration tests for github-runner charm containing one runner.""" +from typing import AsyncIterator import pytest +import pytest_asyncio from juju.application import Application from juju.model import Model from charm import GithubRunnerCharm from tests.integration.helpers import ( assert_resource_lxd_profile, + ensure_charm_has_runner, get_runner_names, reconcile, run_in_lxd_instance, @@ -20,6 +23,19 @@ from tests.status_name import ACTIVE, BLOCKED +@pytest_asyncio.fixture(scope="function", name="app") +async def app_fixture( + model: Model, + app_one_runner: Application, +) -> AsyncIterator[Application]: + """Setup and teardown the charm after each test. + + Ensure the charm has one runner before starting a test. + """ + await ensure_charm_has_runner(app_one_runner, model) + yield app_one_runner + + @pytest.mark.asyncio @pytest.mark.abort_on_fail async def test_network_access(app: Application) -> None: @@ -211,3 +227,19 @@ async def test_change_runner_storage(model: Model, app: Application) -> None: # 2. await app.set_config({"runner-storage": "memory"}) await model.wait_for_idle(status=ACTIVE, timeout=1 * 60) + + +async def test_token_config_changed_insufficient_perms( + model: Model, app: Application, token_alt: str +) -> None: + """ + arrange: A working application with one runner. + act: Change the token to be invalid and set the number of runners to zero. + assert: The active runner should be removed, regardless of the invalid new token. + """ + unit = app.units[0] + + await app.set_config({"token": "", "virtual-machines": "0"}) + await model.wait_for_idle() + + await wait_till_num_of_runners(unit, num=0) diff --git a/tests/unit/test_runner.py b/tests/unit/test_runner.py index e298005bc..f53197fa2 100644 --- a/tests/unit/test_runner.py +++ b/tests/unit/test_runner.py @@ -457,6 +457,7 @@ def test_random_ssh_connection_choice( runner._configure_runner() second_call_args = runner._clients.jinja.get_template("env.j2").render.call_args.kwargs - assert ( - first_call_args["ssh_debug_info"] != second_call_args["ssh_debug_info"] - ), "Same ssh debug info found, this may have occurred with a very low priority." + assert first_call_args["ssh_debug_info"] != second_call_args["ssh_debug_info"], ( + "Same ssh debug info found, this may have occurred with a very low priority. " + "Just try again." + )