diff --git a/script/deploy_runner.sh b/script/deploy_runner.sh deleted file mode 100644 index 2a6f4215c..000000000 --- a/script/deploy_runner.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. - -set -e - -rm -f github_runner.zip - -# Request a download URL for the artifact. -echo "Requesting github runner charm download link..." -DOWNLOAD_LOCATION=$(curl \ - --head \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${GITHUB_TOKEN}"\ - -H "X-GitHub-Api-Version: 2022-11-28" \ - "https://api.github.com/repos/canonical/github-runner-operator/actions/artifacts/{$GITHUB_RUNNER_ARTIFACT_ID}/zip" \ - | grep location) -# Parse out the URL from the format "Location: URL\r". -read -ra LOCATION_ARRAY <<< "$DOWNLOAD_LOCATION" -URL=$(echo "${LOCATION_ARRAY[1]}" | tr -d '\r') - -# Download the github runner charm. -echo "Downloading github runner charm..." -curl -o github_runner.zip "$URL" - -# Decompress the zip. -echo "Decompressing github runner charm..." -unzip -p github_runner.zip > github-runner.charm -rm github_runner.zip - -# Deploy the charm. -juju deploy ./github-runner.charm --series=jammy --constraints="cores=4 mem=32G" e2e-runner -juju config e2e-runner token="$GITHUB_TOKEN" path=canonical/github-runner-operator virtual-machines=1 diff --git a/script/remove_offline_runners.py b/script/remove_offline_runners.py deleted file mode 100644 index 9df78d365..000000000 --- a/script/remove_offline_runners.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. - -import logging -import sys - -import requests - -ORG = "" -TOKEN = "" - - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def get_runners(): - try: - response = requests.get( - f"https://api.github.com/orgs/{ORG}/actions/runners?per_page=100", - # "https://api.github.com/repos/canonical/github-runner-operator/actions/runners", - headers={ - "X-GitHub-Api-Version": "2022-11-28", - "Authorization": "Bearer " + TOKEN, - "Accept": "application/vnd.github+json", - }, - timeout=60, - ) - - response.raise_for_status() - runners = response.json() - logger.info("Runners found: %s", runners) - return runners - except requests.HTTPError as http_err: - sys.exit(f"HTTP error occurred: {http_err}") - except Exception as err: - sys.exit(f"Other error occurred: {err}") - - -def filter_offline_runners(runners): - offline_runners = [] - - runner_list = runners["runners"] - for runner in runner_list: - if runner["status"] == "offline": - offline_runners.append(runner) - - return offline_runners - - -def delete_runner(runner): - logger.info("Deleting runner with id %s", runner["id"]) - - try: - response = requests.delete( - f"https://api.github.com/orgs/{ORG}/actions/runners/{runner['id']}", - # f"https://api.github.com/repos/canonical/github-runner-operator/actions/runners/{runner['id']}", - headers={ - "X-GitHub-Api-Version": "2022-11-28", - "Authorization": "Bearer " + TOKEN, - "Accept": "application/vnd.github+json", - }, - timeout=60, - ) - - response.raise_for_status() - except requests.HTTPError as http_err: - sys.exit(f"HTTP error occurred: {http_err}") - except Exception as err: - sys.exit(f"Other error occurred: {err}") - - -if __name__ == "__main__": - while offline_runners := filter_offline_runners(get_runners()) > 0: - for runner in offline_runners: - delete_runner(runner) diff --git a/script/remove_runner.sh b/script/remove_runner.sh deleted file mode 100644 index 635d6beca..000000000 --- a/script/remove_runner.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. - -juju remove-application e2e-runner --force --destroy-storage --no-wait diff --git a/src/lxd.py b/src/lxd.py index a9c1da2a3..2da02e3cb 100644 --- a/src/lxd.py +++ b/src/lxd.py @@ -222,7 +222,9 @@ def delete(self, wait: bool = False) -> None: logger.exception("Failed to delete LXD instance") raise LxdError(f"Unable to delete LXD instance {self.name}") from err - def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, IO]: + def execute( + self, cmd: list[str], cwd: Optional[str] = None, hide_cmd: bool = False + ) -> Tuple[int, IO, IO]: """Execute a command within the LXD instance. Exceptions are not raise if command execution failed. Caller should check the exit code and @@ -231,6 +233,7 @@ def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, I Args: cmd: Commands to be executed. cwd: Working directory to execute the commands. + hide_cmd: Hide logging of cmd. Returns: Tuple containing the exit code, stdout, stderr. @@ -241,7 +244,7 @@ def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, I lxc_cmd += ["--"] + cmd - result = secure_run_subprocess(lxc_cmd) + result = secure_run_subprocess(lxc_cmd, hide_cmd) return (result.returncode, io.BytesIO(result.stdout), io.BytesIO(result.stderr)) diff --git a/src/runner.py b/src/runner.py index 0f5932add..b686b809c 100644 --- a/src/runner.py +++ b/src/runner.py @@ -135,10 +135,10 @@ def remove(self, remove_token: str) -> None: Raises: RunnerRemoveError: Failure in removing runner. """ - logger.info("Removing LXD instance of runner: %s", self.config.name) + logger.info("Removing runner: %s", self.config.name) if self.instance: - # Run script to remove the the runner and cleanup. + logger.info("Executing command to removal of runner and clean up...") self.instance.execute( [ "/usr/bin/sudo", @@ -149,9 +149,11 @@ def remove(self, remove_token: str) -> None: "--token", remove_token, ], + hide_cmd=True, ) if self.instance.status == "Running": + logger.info("Removing LXD instance of runner: %s", self.config.name) try: self.instance.stop(wait=True, timeout=60) except LxdError: @@ -179,6 +181,8 @@ def remove(self, remove_token: str) -> None: if self.status.runner_id is None: return + logger.info("Removing runner on GitHub: %s", self.config.name) + # The runner should cleanup itself. Cleanup on GitHub in case of runner cleanup error. if isinstance(self.config.path, GitHubRepo): logger.debug( @@ -549,9 +553,11 @@ def _register_runner(self, registration_token: str, labels: Sequence[str]) -> No if isinstance(self.config.path, GitHubOrg): register_cmd += ["--runnergroup", self.config.path.group] + logger.info("Executing registration command...") self.instance.execute( register_cmd, cwd=str(self.runner_application), + hide_cmd=True, ) @retry(tries=5, delay=30, local_logger=logger) diff --git a/src/runner_manager.py b/src/runner_manager.py index 460f2369c..07b3a04a9 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -20,6 +20,7 @@ import requests.adapters import urllib3 from ghapi.all import GhApi +from ghapi.page import pages from typing_extensions import assert_never from errors import RunnerBinaryError, RunnerCreateError @@ -34,7 +35,14 @@ from lxd import LxdClient, LxdInstance from repo_policy_compliance_client import RepoPolicyComplianceClient from runner import Runner, RunnerClients, RunnerConfig, RunnerStatus -from runner_type import GitHubOrg, GitHubPath, GitHubRepo, ProxySetting, VirtualMachineResources +from runner_type import ( + GitHubOrg, + GitHubPath, + GitHubRepo, + ProxySetting, + RunnerByHealth, + VirtualMachineResources, +) from utilities import retry, set_env_var logger = logging.getLogger(__name__) @@ -252,6 +260,26 @@ def get_github_info(self) -> Iterator[RunnerInfo]: remote_runners = self._get_runner_github_info() return iter(RunnerInfo(runner.name, runner.status) for runner in remote_runners.values()) + def _get_runner_health_states(self) -> RunnerByHealth: + local_runners = [ + instance + # Pylint cannot find the `all` method. + for instance in self._clients.lxd.instances.all() # pylint: disable=no-member + if instance.name.startswith(f"{self.instance_name}-") + ] + + healthy = [] + unhealthy = [] + + for runner in local_runners: + _, stdout, _ = runner.execute(["ps", "aux"]) + if f"/bin/bash {Runner.runner_script}" in stdout.read().decode("utf-8"): + healthy.append(runner.name) + else: + unhealthy.append(runner.name) + + return RunnerByHealth(healthy, unhealthy) + def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int: """Bring runners in line with target. @@ -269,37 +297,35 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int: runner for runner in runners if runner.status.exist and runner.status.online ] - offline_runners = [runner for runner in runners if not runner.status.online] - - local_runners = { - instance.name: instance - # Pylint cannot find the `all` method. - for instance in self._clients.lxd.instances.all() # pylint: disable=no-member - if instance.name.startswith(f"{self.instance_name}-") - } + runner_states = self._get_runner_health_states() logger.info( ( - "Expected runner count: %i, Online runner count: %i, Offline runner count: %i, " - "LXD instance count: %i" + "Expected runner count: %i, Online count: %i, Offline count: %i, " + "healthy count: %i, unhealthy count: %i" ), quantity, len(online_runners), - len(offline_runners), - len(local_runners), + len(runners) - len(online_runners), + len(runner_states.healthy), + len(runner_states.unhealthy), ) # Clean up offline runners - if offline_runners: - logger.info("Cleaning up offline runners.") + if runner_states.unhealthy: + logger.info("Cleaning up unhealthy runners.") remove_token = self._get_github_remove_token() - for runner in offline_runners: + unhealthy_runners = [ + runner for runner in runners if runner.config.name in set(runner_states.unhealthy) + ] + + for runner in unhealthy_runners: runner.remove(remove_token) logger.info("Removed runner: %s", runner.config.name) - delta = quantity - len(online_runners) + delta = quantity - len(runner_states.healthy) # Spawn new runners if delta > 0: if RunnerManager.runner_bin_path is None: @@ -310,7 +336,7 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int: registration_token = self._get_github_registration_token() remove_token = self._get_github_remove_token() - logger.info("Adding %i additional runner(s).", delta) + logger.info("Attempting to add %i runner(s).", delta) for _ in range(delta): config = RunnerConfig( self.app_name, @@ -335,6 +361,7 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int: raise elif delta < 0: + logger.info("Attempting to remove %i runner(s).", -delta) # Idle runners are online runners that has not taken a job. idle_runners = [runner for runner in online_runners if not runner.status.busy] offset = min(-delta, len(idle_runners)) @@ -349,7 +376,6 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int: for runner in remove_runners: runner.remove(remove_token) logger.info("Removed runner: %s", runner.config.name) - else: logger.info("There are no idle runner to remove.") else: @@ -397,13 +423,40 @@ def _generate_runner_name(self) -> str: def _get_runner_github_info(self) -> Dict[str, SelfHostedRunner]: remote_runners_list: list[SelfHostedRunner] = [] if isinstance(self.config.path, GitHubRepo): - remote_runners_list = self._clients.github.actions.list_self_hosted_runners_for_repo( - owner=self.config.path.owner, repo=self.config.path.repo - )["runners"] + # The documentation of ghapi for pagination is incorrect and examples will give errors. + # This workaround is a temp solution. Will be moving to PyGitHub in the future. + self._clients.github.actions.list_self_hosted_runners_for_repo( + owner=self.config.path.owner, repo=self.config.path.repo, per_page=100 + ) + num_of_pages = self._clients.github.last_page() + remote_runners_list = [ + item + for page in pages( + self._clients.github.actions.list_self_hosted_runners_for_repo, + num_of_pages + 1, + owner=self.config.path.owner, + repo=self.config.path.repo, + per_page=100, + ) + for item in page["runners"] + ] if isinstance(self.config.path, GitHubOrg): - remote_runners_list = self._clients.github.actions.list_self_hosted_runners_for_org( - org=self.config.path.org - )["runners"] + # The documentation of ghapi for pagination is incorrect and examples will give errors. + # This workaround is a temp solution. Will be moving to PyGitHub in the future. + self._clients.github.actions.list_self_hosted_runners_for_org( + org=self.config.path.org, per_page=100 + ) + num_of_pages = self._clients.github.last_page() + remote_runners_list = [ + item + for page in pages( + self._clients.github.actions.list_self_hosted_runners_for_org, + num_of_pages + 1, + org=self.config.path.org, + per_page=100, + ) + for item in page["runners"] + ] logger.debug("List of runners found on GitHub:%s", remote_runners_list) diff --git a/src/runner_type.py b/src/runner_type.py index 2d11bb32c..62ce0b18b 100644 --- a/src/runner_type.py +++ b/src/runner_type.py @@ -15,6 +15,14 @@ from repo_policy_compliance_client import RepoPolicyComplianceClient +@dataclass +class RunnerByHealth: + """Set of runners LXD instance by health state.""" + + healthy: tuple[str] + unhealthy: tuple[str] + + class ProxySetting(TypedDict, total=False): """Represent HTTP-related proxy settings.""" diff --git a/src/utilities.py b/src/utilities.py index 9a1d3ef1e..56316bd78 100644 --- a/src/utilities.py +++ b/src/utilities.py @@ -95,7 +95,9 @@ def fn_with_retry(*args, **kwargs) -> ReturnT: return retry_decorator -def secure_run_subprocess(cmd: Sequence[str], **kwargs) -> subprocess.CompletedProcess[bytes]: +def secure_run_subprocess( + cmd: Sequence[str], hide_cmd: bool = False, **kwargs +) -> subprocess.CompletedProcess[bytes]: """Run command in subprocess according to security recommendations. The argument `shell` is set to `False` for security reasons. @@ -105,12 +107,17 @@ def secure_run_subprocess(cmd: Sequence[str], **kwargs) -> subprocess.CompletedP Args: cmd: Command in a list. + hide_cmd: Hide logging of cmd. kwargs: Additional keyword arguments for the `subprocess.run` call. Returns: Object representing the completed process. The outputs subprocess can accessed. """ - logger.info("Executing command %s", cmd) + if not hide_cmd: + logger.info("Executing command %s", cmd) + else: + logger.info("Executing sensitive command") + result = subprocess.run( # nosec B603 cmd, capture_output=True, diff --git a/templates/repo-policy-compliance.service.j2 b/templates/repo-policy-compliance.service.j2 index afde32b2e..20049e621 100644 --- a/templates/repo-policy-compliance.service.j2 +++ b/templates/repo-policy-compliance.service.j2 @@ -14,4 +14,4 @@ Environment="NO_PROXY={{proxies['no_proxy']}}" Environment="http_proxy={{proxies['http']}}" Environment="https_proxy={{proxies['https']}}" Environment="no_proxy={{proxies['no_proxy']}}" -ExecStart=/usr/bin/gunicorn --bind 0.0.0.0:8080 app:app +ExecStart=/usr/bin/gunicorn --bind 0.0.0.0:8080 --timeout 60 app:app diff --git a/templates/start.j2 b/templates/start.j2 index 7e6979ce5..e0d2bbef6 100644 --- a/templates/start.j2 +++ b/templates/start.j2 @@ -1,3 +1,3 @@ #!/bin/bash -(/home/ubuntu/github-runner/run.sh; sudo systemctl halt -i) &>/dev/null & +(/home/ubuntu/github-runner/run.sh; sudo systemctl poweroff -i) &>/dev/null & diff --git a/tests/unit/mock.py b/tests/unit/mock.py index 7c3bfdac0..cd94cdc6f 100644 --- a/tests/unit/mock.py +++ b/tests/unit/mock.py @@ -6,9 +6,10 @@ from __future__ import annotations import hashlib +import io import logging import secrets -from typing import Optional, Sequence, Union +from typing import IO, Optional, Sequence, Union from errors import LxdError, RunnerError from github_type import RegistrationToken, RemoveToken, RunnerApplication @@ -102,8 +103,10 @@ def stop(self, wait: bool = True, timeout: int = 60): def delete(self, wait: bool = True): self.deleted = True - def execute(self, cmd: Sequence[str], cwd: Optional[str] = None) -> tuple[int, str, str]: - return 0, "", "" + def execute( + self, cmd: Sequence[str], cwd: Optional[str] = None, hide_cmd: bool = False + ) -> tuple[int, IO, IO]: + return 0, io.BytesIO(b""), io.BytesIO(b"") class MockLxdInstanceFileManager: @@ -186,6 +189,9 @@ def __init__(self, token: str): self.token = token self.actions = MockGhapiActions() + def last_page(self) -> int: + return 0 + class MockGhapiActions: """Mock for actions in Ghapi client.""" @@ -238,10 +244,12 @@ def create_remove_token_for_org(self, org: str): {"token": self.remove_token_org, "expires_at": "2020-01-22T12:13:35.123-08:00"} ) - def list_self_hosted_runners_for_repo(self, owner: str, repo: str): + def list_self_hosted_runners_for_repo( + self, owner: str, repo: str, per_page: int, page: int = 0 + ): return {"runners": []} - def list_self_hosted_runners_for_org(self, org: str): + def list_self_hosted_runners_for_org(self, org: str, per_page: int, page: int = 0): return {"runners": []} def delete_self_hosted_runner_from_repo(self, owner: str, repo: str, runner_id: str): diff --git a/tests/unit/test_runner_manager.py b/tests/unit/test_runner_manager.py index 02358867e..a7bc42ef9 100644 --- a/tests/unit/test_runner_manager.py +++ b/tests/unit/test_runner_manager.py @@ -12,7 +12,7 @@ from errors import RunnerBinaryError from runner import Runner, RunnerStatus from runner_manager import RunnerManager, RunnerManagerConfig -from runner_type import GitHubOrg, GitHubRepo, VirtualMachineResources +from runner_type import GitHubOrg, GitHubRepo, RunnerByHealth, VirtualMachineResources from tests.unit.mock import TEST_BINARY @@ -134,6 +134,14 @@ def mock_get_runners(): # Create online runners. runner_manager._get_runners = mock_get_runners + runner_manager._get_runner_health_states = lambda: RunnerByHealth( + ( + f"{runner_manager.instance_name}-0", + f"{runner_manager.instance_name}-1", + f"{runner_manager.instance_name}-2", + ), + (), + ) delta = runner_manager.reconcile(2, VirtualMachineResources(2, "7GiB", "10Gib"))