diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 115b62a33..f930b9dba 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -16,7 +16,7 @@ jobs: provider: lxd test-tox-env: integration-juju2.9 modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]' - integration-tests-juju3: + integration-tests: name: Integration test with juju 3.1 uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main secrets: inherit diff --git a/.github/workflows/workflow_dispatch_wait_test.yaml b/.github/workflows/workflow_dispatch_wait_test.yaml new file mode 100644 index 000000000..c50a2c18e --- /dev/null +++ b/.github/workflows/workflow_dispatch_wait_test.yaml @@ -0,0 +1,28 @@ +name: Workflow Dispatch Wait Tests + +on: + # Manually dispatched workflow action + workflow_dispatch: + inputs: + runner: + description: 'Self hosted gh runner' + required: true + minutes: + description: 'Number of minutes to wait' + # Number type not supported in workflow dispatch: https://github.com/orgs/community/discussions/67182 + # Seems to be by design: https://github.blog/changelog/2021-11-10-github-actions-input-types-for-manual-workflows/ + default: '2' + +jobs: + workflow-dispatch-tests: + runs-on: [self-hosted, linux, x64, "${{ inputs.runner }}"] + steps: + - name: Echo input variable and message + run: | + echo "Hello, runner: ${{ inputs.runner }}" + - name: Wait + run: | + sleep ${{ inputs.minutes }}m + - name: Always echo a message + if: always() + run: echo "Should not echo if pre-job script failed" diff --git a/src-docs/charm.py.md b/src-docs/charm.py.md index ec19b575c..732d35c7d 100644 --- a/src-docs/charm.py.md +++ b/src-docs/charm.py.md @@ -11,7 +11,7 @@ Charm for creating and managing GitHub self-hosted runner instances. --- - + ## function `catch_charm_errors` @@ -37,7 +37,7 @@ Catch common errors in charm. --- - + ## function `catch_action_errors` @@ -66,7 +66,7 @@ Catch common errors in actions. ## class `GithubRunnerCharm` Charm for managing GitHub self-hosted runners. - + ### function `__init__` diff --git a/src-docs/runner.py.md b/src-docs/runner.py.md index 9916319b7..f9801909a 100644 --- a/src-docs/runner.py.md +++ b/src-docs/runner.py.md @@ -39,7 +39,7 @@ The configuration values for creating a single runner instance. ## class `Runner` Single instance of GitHub self-hosted runner. - + ### function `__init__` @@ -67,7 +67,7 @@ Construct the runner instance. --- - + ### function `create` @@ -91,7 +91,7 @@ Create the runner instance on LXD and register it on GitHub. --- - + ### function `remove` diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index 90661bb2c..f634974ec 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -17,7 +17,7 @@ Runner Manager manages the runners on LXD and GitHub. ## class `RunnerManager` Manage a group of runners according to configuration. - + ### function `__init__` @@ -46,7 +46,7 @@ Construct RunnerManager object for creating and managing runners. --- - + ### function `build_runner_image` @@ -66,7 +66,7 @@ Build container image in test mode, else virtual machine image. --- - + ### function `check_runner_bin` @@ -83,12 +83,12 @@ Check if runner binary exists. --- - + ### function `flush` ```python -flush(flush_busy: bool = True) → int +flush(mode: FlushMode = ) → int ``` Remove existing runners. @@ -97,7 +97,7 @@ Remove existing runners. **Args:** - - `flush_busy`: Whether to flush busy runners as well. + - `mode`: Strategy for flushing runners. @@ -106,7 +106,7 @@ Remove existing runners. --- - + ### function `get_github_info` @@ -123,7 +123,7 @@ Get information on the runners from GitHub. --- - + ### function `get_latest_runner_bin_url` @@ -148,7 +148,7 @@ The runner binary URL changes when a new version is available. --- - + ### function `reconcile` @@ -172,7 +172,7 @@ Bring runners in line with target. --- - + ### function `schedule_build_runner_image` @@ -184,7 +184,7 @@ Install cron job for building runner image. --- - + ### function `update_runner_bin` diff --git a/src-docs/runner_manager_type.py.md b/src-docs/runner_manager_type.py.md index 9fe689d18..43f020476 100644 --- a/src-docs/runner_manager_type.py.md +++ b/src-docs/runner_manager_type.py.md @@ -7,6 +7,24 @@ Types used by RunnerManager class. +--- + +## class `FlushMode` +Strategy for flushing runners. + + + +**Attributes:** + + - `FLUSH_IDLE`: Flush only idle runners. + - `FLUSH_IDLE_WAIT_REPO_CHECK`: Flush only idle runners, then wait until repo-policy-check is completed for the busy runners. + - `FORCE_FLUSH_BUSY`: Force flush busy runners. + - `FORCE_FLUSH_BUSY_WAIT_REPO_CHECK`: Wait until the repo-policy-check is completed before force flush of busy runners. + + + + + --- ## class `RunnerInfo` diff --git a/src/charm.py b/src/charm.py index 0ca153f9c..10a3505ac 100755 --- a/src/charm.py +++ b/src/charm.py @@ -47,6 +47,7 @@ from github_type import GitHubRunnerStatus from runner import LXD_PROFILE_YAML from runner_manager import RunnerManager, RunnerManagerConfig +from runner_manager_type import FlushMode from runner_type import GithubOrg, GithubRepo, ProxySetting, VirtualMachineResources from utilities import bytes_with_unit_to_kib, execute_command, retry @@ -411,7 +412,7 @@ def _on_start(self, _event: StartEvent) -> None: self.unit.status = MaintenanceStatus("Starting runners") try: - runner_manager.flush(flush_busy=False) + runner_manager.flush(FlushMode.FLUSH_IDLE) self._reconcile_runners(runner_manager) except RunnerError as err: logger.exception("Failed to start runners") @@ -467,7 +468,7 @@ def _on_upgrade_charm(self, _event: UpgradeCharmEvent) -> None: if not runner_manager: return - runner_manager.flush() + runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK) self._reconcile_runners(runner_manager) @catch_charm_errors @@ -501,7 +502,7 @@ def _on_config_changed(self, _event: ConfigChangedEvent) -> None: ) # Casting for mypy checks. if prev_runner_manager: self.unit.status = MaintenanceStatus("Removing runners from old org/repo") - prev_runner_manager.flush(flush_busy=False) + prev_runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK) self._stored.path = self.config["path"] runner_manager = self._get_runner_manager() @@ -512,7 +513,7 @@ def _on_config_changed(self, _event: ConfigChangedEvent) -> None: self.unit.status = BlockedStatus("Missing token or org/repo path config") if self.config["token"] != self._stored.token: - runner_manager.flush(flush_busy=False) + runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK) self._stored.token = self.config["token"] def _check_and_update_dependencies(self) -> bool: @@ -565,8 +566,8 @@ def _check_and_update_dependencies(self) -> bool: self.unit.status = MaintenanceStatus("Flushing runners due to updated deps") + runner_manager.flush(FlushMode.FLUSH_IDLE_WAIT_REPO_CHECK) self._start_services() - runner_manager.flush(flush_busy=False) self.unit.status = ActiveStatus() return service_updated or runner_bin_updated @@ -655,7 +656,7 @@ def _on_flush_runners_action(self, event: ActionEvent) -> None: """ runner_manager = self._get_runner_manager() - runner_manager.flush() + runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK) delta = self._reconcile_runners(runner_manager) self._on_check_runners_action(event) @@ -686,7 +687,7 @@ def _on_stop(self, _: StopEvent) -> None: self.unit.status = BlockedStatus(f"Failed to stop charm event timer: {ex}") runner_manager = self._get_runner_manager() - runner_manager.flush() + runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY) def _reconcile_runners(self, runner_manager: RunnerManager) -> Dict[str, Any]: """Reconcile the current runners state and intended runner state. @@ -917,7 +918,7 @@ def _on_debug_ssh_relation_changed(self, _: ops.RelationChangedEvent) -> None: """Handle debug ssh relation changed event.""" self._refresh_firewall() runner_manager = self._get_runner_manager() - runner_manager.flush(flush_busy=False) + runner_manager.flush(FlushMode.FLUSH_IDLE) self._reconcile_runners(runner_manager) diff --git a/src/runner.py b/src/runner.py index f5343d363..4dbfb82ec 100644 --- a/src/runner.py +++ b/src/runner.py @@ -19,6 +19,7 @@ from dataclasses import dataclass from pathlib import Path from typing import Iterable, NamedTuple, Optional, Sequence +from urllib.error import HTTPError import yaml @@ -229,7 +230,12 @@ def remove(self, remove_token: str) -> None: self.status.runner_id, self.config.path.path(), ) - self._clients.github.delete_runner(self.config.path, self.status.runner_id) + try: + self._clients.github.delete_runner(self.config.path, self.status.runner_id) + except HTTPError: + logger.exception("Unable the remove runner on GitHub: %s", self.config.name) + # This can occur when attempting to remove a busy runner. + # The caller should retry later, after GitHub mark the runner as offline. def _add_shared_filesystem(self, path: Path) -> None: """Add the shared filesystem to the runner instance. diff --git a/src/runner_manager.py b/src/runner_manager.py index 6ea32b0ed..725a7b566 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -10,6 +10,7 @@ import tarfile import time import urllib.request +from datetime import datetime, timedelta, timezone from pathlib import Path from typing import Dict, Iterator, Optional, Type @@ -31,7 +32,7 @@ from lxd import LxdClient, LxdInstance from repo_policy_compliance_client import RepoPolicyComplianceClient from runner import LXD_PROFILE_YAML, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus -from runner_manager_type import RunnerInfo, RunnerManagerClients, RunnerManagerConfig +from runner_manager_type import FlushMode, RunnerInfo, RunnerManagerClients, RunnerManagerConfig from runner_metrics import RUNNER_INSTALLED_TS_FILE_NAME from runner_type import ProxySetting, RunnerByHealth, VirtualMachineResources from utilities import execute_command, retry, set_env_var @@ -523,33 +524,86 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int: ) return delta - def flush(self, flush_busy: bool = True) -> int: + def _runners_in_pre_job(self) -> bool: + """Check there exist runners in the pre-job script stage. + + If a runner has taken a job for 1 minute or more, it is assumed to exit the pre-job script. + + Returns: + Whether there are runners that has taken a job and run for less than 1 minute. + """ + now = datetime.now(timezone.utc) + busy_runners = [ + runner for runner in self._get_runners() if runner.status.exist and runner.status.busy + ] + for runner in busy_runners: + # Check if `_work` directory exists, if it exists the runner has started a job. + exit_code, stdout, _ = runner.instance.execute( + ["/usr/bin/stat", "-c", "'%w'", "/home/ubuntu/github-runner/_work"] + ) + if exit_code != 0: + return False + # The date is between two single quotes('). + _, output, _ = stdout.read().decode("utf-8").strip().split("'") + date_str, time_str, timezone_str = output.split(" ") + timezone_str = f"{timezone_str[:3]}:{timezone_str[3:]}" + job_start_time = datetime.fromisoformat(f"{date_str}T{time_str[:12]}{timezone_str}") + if job_start_time + timedelta(minutes=1) > now: + return False + return True + + def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: """Remove existing runners. Args: - flush_busy: Whether to flush busy runners as well. + mode: Strategy for flushing runners. Returns: Number of runners removed. """ - if flush_busy: - runners = [runner for runner in self._get_runners() if runner.status.exist] - else: - runners = [ - runner - for runner in self._get_runners() - if runner.status.exist and not runner.status.busy - ] + remove_token = self._clients.github.get_runner_remove_token(self.config.path) - logger.info("Removing existing %i local runners", len(runners)) + # Removing non-busy runners + runners = [ + runner + for runner in self._get_runners() + if runner.status.exist and not runner.status.busy + ] - remove_token = self._clients.github.get_runner_remove_token(self.config.path) + logger.info("Removing existing %i non-busy local runners", len(runners)) + remove_count = len(runners) for runner in runners: runner.remove(remove_token) logger.info(REMOVED_RUNNER_LOG_STR, runner.config.name) - return len(runners) + if mode in ( + FlushMode.FLUSH_IDLE_WAIT_REPO_CHECK, + FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK, + ): + for _ in range(5): + if not self._runners_in_pre_job(): + break + time.sleep(30) + else: + logger.warning( + ( + "Proceed with flush runner after timeout waiting on runner in setup " + "stage, pre-job script might fail in currently running jobs" + ) + ) + + if mode in {FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK, FlushMode.FORCE_FLUSH_BUSY}: + busy_runners = [runner for runner in self._get_runners() if runner.status.exist] + + logger.info("Removing existing %i busy local runners", len(runners)) + + remove_count += len(busy_runners) + for runner in busy_runners: + runner.remove(remove_token) + logger.info(REMOVED_RUNNER_LOG_STR, runner.config.name) + + return remove_count def _generate_runner_name(self) -> str: """Generate a runner name based on charm name. diff --git a/src/runner_manager_type.py b/src/runner_manager_type.py index 3bfe6126a..124c564b8 100644 --- a/src/runner_manager_type.py +++ b/src/runner_manager_type.py @@ -4,6 +4,7 @@ """Types used by RunnerManager class.""" from dataclasses import dataclass +from enum import Enum, auto from pathlib import Path import jinja2 @@ -16,6 +17,24 @@ from runner_type import GithubPath +class FlushMode(Enum): + """Strategy for flushing runners. + + Attributes: + FLUSH_IDLE: Flush only idle runners. + FLUSH_IDLE_WAIT_REPO_CHECK: Flush only idle runners, then wait until repo-policy-check is + completed for the busy runners. + FORCE_FLUSH_BUSY: Force flush busy runners. + FORCE_FLUSH_BUSY_WAIT_REPO_CHECK: Wait until the repo-policy-check is completed before + force flush of busy runners. + """ + + FLUSH_IDLE = auto() + FLUSH_IDLE_WAIT_REPO_CHECK = auto() + FORCE_FLUSH_BUSY = auto() + FORCE_FLUSH_BUSY_WAIT_REPO_CHECK = auto() + + @dataclass class RunnerManagerClients: """Clients for accessing various services. diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index a49a6da4a..18dccdd0a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -22,6 +22,7 @@ from juju.model import Model from pytest_operator.plugin import OpsTest +from github_client import GithubClient from tests.integration.helpers import ( deploy_github_runner_charm, ensure_charm_has_runner, @@ -150,6 +151,11 @@ def model(ops_test: OpsTest) -> Model: return ops_test.model +@pytest.fixture(scope="module") +def runner_manager_github_client(token: str) -> GithubClient: + return GithubClient(token=token) + + @pytest_asyncio.fixture(scope="module") async def app_no_runner( model: Model, diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index e7ef4d139..38e0a5254 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -12,6 +12,7 @@ from datetime import datetime, timezone from typing import Any, Awaitable, Callable, Union +import github import juju.version import requests import yaml @@ -32,6 +33,7 @@ DISPATCH_TEST_WORKFLOW_FILENAME = "workflow_dispatch_test.yaml" DISPATCH_CRASH_TEST_WORKFLOW_FILENAME = "workflow_dispatch_crash_test.yaml" DISPATCH_FAILURE_TEST_WORKFLOW_FILENAME = "workflow_dispatch_failure_test.yaml" +DISPATCH_WAIT_TEST_WORKFLOW_FILENAME = "workflow_dispatch_wait_test.yaml" JOB_LOG_START_MSG_TEMPLATE = "Job is about to start running on the runner: {runner_name}" @@ -396,6 +398,9 @@ def get_workflow_runs( runner_name: The runner name the workflow job is assigned to. branch: The branch the workflow is run on. """ + if branch is None: + branch = github.GithubObject.NotSet + for run in workflow.get_runs(created=f">={start_time.isoformat()}", branch=branch): latest_job: WorkflowJob = run.jobs()[0] logs = get_job_logs(job=latest_job) diff --git a/tests/integration/test_self_hosted_runner.py b/tests/integration/test_self_hosted_runner.py index ecbf7c28a..906ecac6d 100644 --- a/tests/integration/test_self_hosted_runner.py +++ b/tests/integration/test_self_hosted_runner.py @@ -8,15 +8,21 @@ import github import pytest -import requests from github.Repository import Repository from juju.application import Application from juju.model import Model +from github_client import GithubClient +from runner_type import GithubRepo from tests.integration.helpers import ( DISPATCH_TEST_WORKFLOW_FILENAME, + DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, + get_job_logs, get_runner_names, + get_workflow_runs, + reconcile, run_in_lxd_instance, + wait_till_num_of_runners, ) from tests.status_name import ACTIVE @@ -76,13 +82,10 @@ async def test_dispatch_workflow_with_dockerhub_mirror( # Unable to find the run id of the workflow that was dispatched. # Therefore, all runs after this test start should pass the conditions. - for run in workflow.get_runs(created=f">={start_time.isoformat()}"): - if start_time > run.created_at: - continue - + for run in get_workflow_runs(start_time, workflow, runner_to_be_used): + jobs = run.jobs() try: - logs_url = run.jobs()[0].logs_url() - logs = requests.get(logs_url).content.decode("utf-8") + logs = get_job_logs(jobs[0]) except github.GithubException.GithubException: continue @@ -92,3 +95,87 @@ async def test_dispatch_workflow_with_dockerhub_mirror( "A private docker registry is setup as a dockerhub mirror for this self-hosted" " runner." ) in logs + + +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_flush_busy_runner( + model: Model, + app_runner: Application, + forked_github_repository: Repository, + runner_manager_github_client: GithubClient, +) -> None: + """ + arrange: A working application with one runner. + act: + 1. Dispatch a workflow that waits for 30 mins. + 2. Run flush-runners action. + assert: + 1. The runner is in busy status. + 2. a. The flush-runners action should take less than the timeout. + b. The runner should be flushed. + """ + unit = app_runner.units[0] + + config = await app_runner.get_config() + + await app_runner.set_config( + {"path": forked_github_repository.full_name, "virtual-machines": "1"} + ) + await reconcile(app=app_runner, model=model) + await wait_till_num_of_runners(unit, 1) + + names = await get_runner_names(unit) + assert len(names) == 1 + + runner_to_be_used = names[0] + + # 1. + main_branch = forked_github_repository.get_branch(forked_github_repository.default_branch) + workflow = forked_github_repository.get_workflow( + id_or_file_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME + ) + + assert workflow.create_dispatch(main_branch, {"runner": app_runner.name, "minutes": "30"}) + + # Wait until runner online and then busy. + for _ in range(30): + all_runners = runner_manager_github_client.get_runner_github_info( + GithubRepo( + owner=forked_github_repository.owner.login, repo=forked_github_repository.name + ) + ) + runners = [runner for runner in all_runners if runner.name == runner_to_be_used] + + if not runners: + # if runner is not online yet. + sleep(30) + continue + + assert len(runners) == 1, "Should not occur as GitHub enforce unique naming of runner" + runner = runners[0] + if runner["busy"]: + start_time = datetime.now(timezone.utc) + break + + sleep(30) + else: + assert False, "Timeout while waiting for runner to take up the workflow" + + # 2. + action = await unit.run_action("flush-runners") + await action.wait() + + end_time = datetime.now(timezone.utc) + + # The flushing of runner should take less than the 30 minutes timeout of the workflow. + diff = end_time - start_time + assert diff.total_seconds() < 30 * 60 + + names = await get_runner_names(unit) + assert runner_to_be_used not in names, "Found a runner that should be flushed" + + # Ensure the app_runner is back to 0 runners. + await app_runner.set_config({"virtual-machines": "0", "path": config["path"]}) + await reconcile(app=app_runner, model=model) + await wait_till_num_of_runners(unit, 0)