diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml
index 115b62a33..f930b9dba 100644
--- a/.github/workflows/integration_test.yaml
+++ b/.github/workflows/integration_test.yaml
@@ -16,7 +16,7 @@ jobs:
provider: lxd
test-tox-env: integration-juju2.9
modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]'
- integration-tests-juju3:
+ integration-tests:
name: Integration test with juju 3.1
uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
secrets: inherit
diff --git a/.github/workflows/workflow_dispatch_wait_test.yaml b/.github/workflows/workflow_dispatch_wait_test.yaml
new file mode 100644
index 000000000..c50a2c18e
--- /dev/null
+++ b/.github/workflows/workflow_dispatch_wait_test.yaml
@@ -0,0 +1,28 @@
+name: Workflow Dispatch Wait Tests
+
+on:
+ # Manually dispatched workflow action
+ workflow_dispatch:
+ inputs:
+ runner:
+ description: 'Self hosted gh runner'
+ required: true
+ minutes:
+ description: 'Number of minutes to wait'
+ # Number type not supported in workflow dispatch: https://github.com/orgs/community/discussions/67182
+ # Seems to be by design: https://github.blog/changelog/2021-11-10-github-actions-input-types-for-manual-workflows/
+ default: '2'
+
+jobs:
+ workflow-dispatch-tests:
+ runs-on: [self-hosted, linux, x64, "${{ inputs.runner }}"]
+ steps:
+ - name: Echo input variable and message
+ run: |
+ echo "Hello, runner: ${{ inputs.runner }}"
+ - name: Wait
+ run: |
+ sleep ${{ inputs.minutes }}m
+ - name: Always echo a message
+ if: always()
+ run: echo "Should not echo if pre-job script failed"
diff --git a/src-docs/charm.py.md b/src-docs/charm.py.md
index ec19b575c..732d35c7d 100644
--- a/src-docs/charm.py.md
+++ b/src-docs/charm.py.md
@@ -11,7 +11,7 @@ Charm for creating and managing GitHub self-hosted runner instances.
---
-
+
## function `catch_charm_errors`
@@ -37,7 +37,7 @@ Catch common errors in charm.
---
-
+
## function `catch_action_errors`
@@ -66,7 +66,7 @@ Catch common errors in actions.
## class `GithubRunnerCharm`
Charm for managing GitHub self-hosted runners.
-
+
### function `__init__`
diff --git a/src-docs/runner.py.md b/src-docs/runner.py.md
index 9916319b7..f9801909a 100644
--- a/src-docs/runner.py.md
+++ b/src-docs/runner.py.md
@@ -39,7 +39,7 @@ The configuration values for creating a single runner instance.
## class `Runner`
Single instance of GitHub self-hosted runner.
-
+
### function `__init__`
@@ -67,7 +67,7 @@ Construct the runner instance.
---
-
+
### function `create`
@@ -91,7 +91,7 @@ Create the runner instance on LXD and register it on GitHub.
---
-
+
### function `remove`
diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md
index 90661bb2c..f634974ec 100644
--- a/src-docs/runner_manager.py.md
+++ b/src-docs/runner_manager.py.md
@@ -17,7 +17,7 @@ Runner Manager manages the runners on LXD and GitHub.
## class `RunnerManager`
Manage a group of runners according to configuration.
-
+
### function `__init__`
@@ -46,7 +46,7 @@ Construct RunnerManager object for creating and managing runners.
---
-
+
### function `build_runner_image`
@@ -66,7 +66,7 @@ Build container image in test mode, else virtual machine image.
---
-
+
### function `check_runner_bin`
@@ -83,12 +83,12 @@ Check if runner binary exists.
---
-
+
### function `flush`
```python
-flush(flush_busy: bool = True) → int
+flush(mode: FlushMode = ) → int
```
Remove existing runners.
@@ -97,7 +97,7 @@ Remove existing runners.
**Args:**
- - `flush_busy`: Whether to flush busy runners as well.
+ - `mode`: Strategy for flushing runners.
@@ -106,7 +106,7 @@ Remove existing runners.
---
-
+
### function `get_github_info`
@@ -123,7 +123,7 @@ Get information on the runners from GitHub.
---
-
+
### function `get_latest_runner_bin_url`
@@ -148,7 +148,7 @@ The runner binary URL changes when a new version is available.
---
-
+
### function `reconcile`
@@ -172,7 +172,7 @@ Bring runners in line with target.
---
-
+
### function `schedule_build_runner_image`
@@ -184,7 +184,7 @@ Install cron job for building runner image.
---
-
+
### function `update_runner_bin`
diff --git a/src-docs/runner_manager_type.py.md b/src-docs/runner_manager_type.py.md
index 9fe689d18..43f020476 100644
--- a/src-docs/runner_manager_type.py.md
+++ b/src-docs/runner_manager_type.py.md
@@ -7,6 +7,24 @@ Types used by RunnerManager class.
+---
+
+## class `FlushMode`
+Strategy for flushing runners.
+
+
+
+**Attributes:**
+
+ - `FLUSH_IDLE`: Flush only idle runners.
+ - `FLUSH_IDLE_WAIT_REPO_CHECK`: Flush only idle runners, then wait until repo-policy-check is completed for the busy runners.
+ - `FORCE_FLUSH_BUSY`: Force flush busy runners.
+ - `FORCE_FLUSH_BUSY_WAIT_REPO_CHECK`: Wait until the repo-policy-check is completed before force flush of busy runners.
+
+
+
+
+
---
## class `RunnerInfo`
diff --git a/src/charm.py b/src/charm.py
index 0ca153f9c..10a3505ac 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -47,6 +47,7 @@
from github_type import GitHubRunnerStatus
from runner import LXD_PROFILE_YAML
from runner_manager import RunnerManager, RunnerManagerConfig
+from runner_manager_type import FlushMode
from runner_type import GithubOrg, GithubRepo, ProxySetting, VirtualMachineResources
from utilities import bytes_with_unit_to_kib, execute_command, retry
@@ -411,7 +412,7 @@ def _on_start(self, _event: StartEvent) -> None:
self.unit.status = MaintenanceStatus("Starting runners")
try:
- runner_manager.flush(flush_busy=False)
+ runner_manager.flush(FlushMode.FLUSH_IDLE)
self._reconcile_runners(runner_manager)
except RunnerError as err:
logger.exception("Failed to start runners")
@@ -467,7 +468,7 @@ def _on_upgrade_charm(self, _event: UpgradeCharmEvent) -> None:
if not runner_manager:
return
- runner_manager.flush()
+ runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK)
self._reconcile_runners(runner_manager)
@catch_charm_errors
@@ -501,7 +502,7 @@ def _on_config_changed(self, _event: ConfigChangedEvent) -> None:
) # Casting for mypy checks.
if prev_runner_manager:
self.unit.status = MaintenanceStatus("Removing runners from old org/repo")
- prev_runner_manager.flush(flush_busy=False)
+ prev_runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK)
self._stored.path = self.config["path"]
runner_manager = self._get_runner_manager()
@@ -512,7 +513,7 @@ def _on_config_changed(self, _event: ConfigChangedEvent) -> None:
self.unit.status = BlockedStatus("Missing token or org/repo path config")
if self.config["token"] != self._stored.token:
- runner_manager.flush(flush_busy=False)
+ runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK)
self._stored.token = self.config["token"]
def _check_and_update_dependencies(self) -> bool:
@@ -565,8 +566,8 @@ def _check_and_update_dependencies(self) -> bool:
self.unit.status = MaintenanceStatus("Flushing runners due to updated deps")
+ runner_manager.flush(FlushMode.FLUSH_IDLE_WAIT_REPO_CHECK)
self._start_services()
- runner_manager.flush(flush_busy=False)
self.unit.status = ActiveStatus()
return service_updated or runner_bin_updated
@@ -655,7 +656,7 @@ def _on_flush_runners_action(self, event: ActionEvent) -> None:
"""
runner_manager = self._get_runner_manager()
- runner_manager.flush()
+ runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK)
delta = self._reconcile_runners(runner_manager)
self._on_check_runners_action(event)
@@ -686,7 +687,7 @@ def _on_stop(self, _: StopEvent) -> None:
self.unit.status = BlockedStatus(f"Failed to stop charm event timer: {ex}")
runner_manager = self._get_runner_manager()
- runner_manager.flush()
+ runner_manager.flush(FlushMode.FORCE_FLUSH_BUSY)
def _reconcile_runners(self, runner_manager: RunnerManager) -> Dict[str, Any]:
"""Reconcile the current runners state and intended runner state.
@@ -917,7 +918,7 @@ def _on_debug_ssh_relation_changed(self, _: ops.RelationChangedEvent) -> None:
"""Handle debug ssh relation changed event."""
self._refresh_firewall()
runner_manager = self._get_runner_manager()
- runner_manager.flush(flush_busy=False)
+ runner_manager.flush(FlushMode.FLUSH_IDLE)
self._reconcile_runners(runner_manager)
diff --git a/src/runner.py b/src/runner.py
index f5343d363..4dbfb82ec 100644
--- a/src/runner.py
+++ b/src/runner.py
@@ -19,6 +19,7 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, NamedTuple, Optional, Sequence
+from urllib.error import HTTPError
import yaml
@@ -229,7 +230,12 @@ def remove(self, remove_token: str) -> None:
self.status.runner_id,
self.config.path.path(),
)
- self._clients.github.delete_runner(self.config.path, self.status.runner_id)
+ try:
+ self._clients.github.delete_runner(self.config.path, self.status.runner_id)
+ except HTTPError:
+ logger.exception("Unable the remove runner on GitHub: %s", self.config.name)
+ # This can occur when attempting to remove a busy runner.
+ # The caller should retry later, after GitHub mark the runner as offline.
def _add_shared_filesystem(self, path: Path) -> None:
"""Add the shared filesystem to the runner instance.
diff --git a/src/runner_manager.py b/src/runner_manager.py
index 6ea32b0ed..725a7b566 100644
--- a/src/runner_manager.py
+++ b/src/runner_manager.py
@@ -10,6 +10,7 @@
import tarfile
import time
import urllib.request
+from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, Iterator, Optional, Type
@@ -31,7 +32,7 @@
from lxd import LxdClient, LxdInstance
from repo_policy_compliance_client import RepoPolicyComplianceClient
from runner import LXD_PROFILE_YAML, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus
-from runner_manager_type import RunnerInfo, RunnerManagerClients, RunnerManagerConfig
+from runner_manager_type import FlushMode, RunnerInfo, RunnerManagerClients, RunnerManagerConfig
from runner_metrics import RUNNER_INSTALLED_TS_FILE_NAME
from runner_type import ProxySetting, RunnerByHealth, VirtualMachineResources
from utilities import execute_command, retry, set_env_var
@@ -523,33 +524,86 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
)
return delta
- def flush(self, flush_busy: bool = True) -> int:
+ def _runners_in_pre_job(self) -> bool:
+ """Check there exist runners in the pre-job script stage.
+
+ If a runner has taken a job for 1 minute or more, it is assumed to exit the pre-job script.
+
+ Returns:
+ Whether there are runners that has taken a job and run for less than 1 minute.
+ """
+ now = datetime.now(timezone.utc)
+ busy_runners = [
+ runner for runner in self._get_runners() if runner.status.exist and runner.status.busy
+ ]
+ for runner in busy_runners:
+ # Check if `_work` directory exists, if it exists the runner has started a job.
+ exit_code, stdout, _ = runner.instance.execute(
+ ["/usr/bin/stat", "-c", "'%w'", "/home/ubuntu/github-runner/_work"]
+ )
+ if exit_code != 0:
+ return False
+ # The date is between two single quotes(').
+ _, output, _ = stdout.read().decode("utf-8").strip().split("'")
+ date_str, time_str, timezone_str = output.split(" ")
+ timezone_str = f"{timezone_str[:3]}:{timezone_str[3:]}"
+ job_start_time = datetime.fromisoformat(f"{date_str}T{time_str[:12]}{timezone_str}")
+ if job_start_time + timedelta(minutes=1) > now:
+ return False
+ return True
+
+ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int:
"""Remove existing runners.
Args:
- flush_busy: Whether to flush busy runners as well.
+ mode: Strategy for flushing runners.
Returns:
Number of runners removed.
"""
- if flush_busy:
- runners = [runner for runner in self._get_runners() if runner.status.exist]
- else:
- runners = [
- runner
- for runner in self._get_runners()
- if runner.status.exist and not runner.status.busy
- ]
+ remove_token = self._clients.github.get_runner_remove_token(self.config.path)
- logger.info("Removing existing %i local runners", len(runners))
+ # Removing non-busy runners
+ runners = [
+ runner
+ for runner in self._get_runners()
+ if runner.status.exist and not runner.status.busy
+ ]
- remove_token = self._clients.github.get_runner_remove_token(self.config.path)
+ logger.info("Removing existing %i non-busy local runners", len(runners))
+ remove_count = len(runners)
for runner in runners:
runner.remove(remove_token)
logger.info(REMOVED_RUNNER_LOG_STR, runner.config.name)
- return len(runners)
+ if mode in (
+ FlushMode.FLUSH_IDLE_WAIT_REPO_CHECK,
+ FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK,
+ ):
+ for _ in range(5):
+ if not self._runners_in_pre_job():
+ break
+ time.sleep(30)
+ else:
+ logger.warning(
+ (
+ "Proceed with flush runner after timeout waiting on runner in setup "
+ "stage, pre-job script might fail in currently running jobs"
+ )
+ )
+
+ if mode in {FlushMode.FORCE_FLUSH_BUSY_WAIT_REPO_CHECK, FlushMode.FORCE_FLUSH_BUSY}:
+ busy_runners = [runner for runner in self._get_runners() if runner.status.exist]
+
+ logger.info("Removing existing %i busy local runners", len(runners))
+
+ remove_count += len(busy_runners)
+ for runner in busy_runners:
+ runner.remove(remove_token)
+ logger.info(REMOVED_RUNNER_LOG_STR, runner.config.name)
+
+ return remove_count
def _generate_runner_name(self) -> str:
"""Generate a runner name based on charm name.
diff --git a/src/runner_manager_type.py b/src/runner_manager_type.py
index 3bfe6126a..124c564b8 100644
--- a/src/runner_manager_type.py
+++ b/src/runner_manager_type.py
@@ -4,6 +4,7 @@
"""Types used by RunnerManager class."""
from dataclasses import dataclass
+from enum import Enum, auto
from pathlib import Path
import jinja2
@@ -16,6 +17,24 @@
from runner_type import GithubPath
+class FlushMode(Enum):
+ """Strategy for flushing runners.
+
+ Attributes:
+ FLUSH_IDLE: Flush only idle runners.
+ FLUSH_IDLE_WAIT_REPO_CHECK: Flush only idle runners, then wait until repo-policy-check is
+ completed for the busy runners.
+ FORCE_FLUSH_BUSY: Force flush busy runners.
+ FORCE_FLUSH_BUSY_WAIT_REPO_CHECK: Wait until the repo-policy-check is completed before
+ force flush of busy runners.
+ """
+
+ FLUSH_IDLE = auto()
+ FLUSH_IDLE_WAIT_REPO_CHECK = auto()
+ FORCE_FLUSH_BUSY = auto()
+ FORCE_FLUSH_BUSY_WAIT_REPO_CHECK = auto()
+
+
@dataclass
class RunnerManagerClients:
"""Clients for accessing various services.
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index a49a6da4a..18dccdd0a 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -22,6 +22,7 @@
from juju.model import Model
from pytest_operator.plugin import OpsTest
+from github_client import GithubClient
from tests.integration.helpers import (
deploy_github_runner_charm,
ensure_charm_has_runner,
@@ -150,6 +151,11 @@ def model(ops_test: OpsTest) -> Model:
return ops_test.model
+@pytest.fixture(scope="module")
+def runner_manager_github_client(token: str) -> GithubClient:
+ return GithubClient(token=token)
+
+
@pytest_asyncio.fixture(scope="module")
async def app_no_runner(
model: Model,
diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py
index e7ef4d139..38e0a5254 100644
--- a/tests/integration/helpers.py
+++ b/tests/integration/helpers.py
@@ -12,6 +12,7 @@
from datetime import datetime, timezone
from typing import Any, Awaitable, Callable, Union
+import github
import juju.version
import requests
import yaml
@@ -32,6 +33,7 @@
DISPATCH_TEST_WORKFLOW_FILENAME = "workflow_dispatch_test.yaml"
DISPATCH_CRASH_TEST_WORKFLOW_FILENAME = "workflow_dispatch_crash_test.yaml"
DISPATCH_FAILURE_TEST_WORKFLOW_FILENAME = "workflow_dispatch_failure_test.yaml"
+DISPATCH_WAIT_TEST_WORKFLOW_FILENAME = "workflow_dispatch_wait_test.yaml"
JOB_LOG_START_MSG_TEMPLATE = "Job is about to start running on the runner: {runner_name}"
@@ -396,6 +398,9 @@ def get_workflow_runs(
runner_name: The runner name the workflow job is assigned to.
branch: The branch the workflow is run on.
"""
+ if branch is None:
+ branch = github.GithubObject.NotSet
+
for run in workflow.get_runs(created=f">={start_time.isoformat()}", branch=branch):
latest_job: WorkflowJob = run.jobs()[0]
logs = get_job_logs(job=latest_job)
diff --git a/tests/integration/test_self_hosted_runner.py b/tests/integration/test_self_hosted_runner.py
index ecbf7c28a..906ecac6d 100644
--- a/tests/integration/test_self_hosted_runner.py
+++ b/tests/integration/test_self_hosted_runner.py
@@ -8,15 +8,21 @@
import github
import pytest
-import requests
from github.Repository import Repository
from juju.application import Application
from juju.model import Model
+from github_client import GithubClient
+from runner_type import GithubRepo
from tests.integration.helpers import (
DISPATCH_TEST_WORKFLOW_FILENAME,
+ DISPATCH_WAIT_TEST_WORKFLOW_FILENAME,
+ get_job_logs,
get_runner_names,
+ get_workflow_runs,
+ reconcile,
run_in_lxd_instance,
+ wait_till_num_of_runners,
)
from tests.status_name import ACTIVE
@@ -76,13 +82,10 @@ async def test_dispatch_workflow_with_dockerhub_mirror(
# Unable to find the run id of the workflow that was dispatched.
# Therefore, all runs after this test start should pass the conditions.
- for run in workflow.get_runs(created=f">={start_time.isoformat()}"):
- if start_time > run.created_at:
- continue
-
+ for run in get_workflow_runs(start_time, workflow, runner_to_be_used):
+ jobs = run.jobs()
try:
- logs_url = run.jobs()[0].logs_url()
- logs = requests.get(logs_url).content.decode("utf-8")
+ logs = get_job_logs(jobs[0])
except github.GithubException.GithubException:
continue
@@ -92,3 +95,87 @@ async def test_dispatch_workflow_with_dockerhub_mirror(
"A private docker registry is setup as a dockerhub mirror for this self-hosted"
" runner."
) in logs
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_flush_busy_runner(
+ model: Model,
+ app_runner: Application,
+ forked_github_repository: Repository,
+ runner_manager_github_client: GithubClient,
+) -> None:
+ """
+ arrange: A working application with one runner.
+ act:
+ 1. Dispatch a workflow that waits for 30 mins.
+ 2. Run flush-runners action.
+ assert:
+ 1. The runner is in busy status.
+ 2. a. The flush-runners action should take less than the timeout.
+ b. The runner should be flushed.
+ """
+ unit = app_runner.units[0]
+
+ config = await app_runner.get_config()
+
+ await app_runner.set_config(
+ {"path": forked_github_repository.full_name, "virtual-machines": "1"}
+ )
+ await reconcile(app=app_runner, model=model)
+ await wait_till_num_of_runners(unit, 1)
+
+ names = await get_runner_names(unit)
+ assert len(names) == 1
+
+ runner_to_be_used = names[0]
+
+ # 1.
+ main_branch = forked_github_repository.get_branch(forked_github_repository.default_branch)
+ workflow = forked_github_repository.get_workflow(
+ id_or_file_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME
+ )
+
+ assert workflow.create_dispatch(main_branch, {"runner": app_runner.name, "minutes": "30"})
+
+ # Wait until runner online and then busy.
+ for _ in range(30):
+ all_runners = runner_manager_github_client.get_runner_github_info(
+ GithubRepo(
+ owner=forked_github_repository.owner.login, repo=forked_github_repository.name
+ )
+ )
+ runners = [runner for runner in all_runners if runner.name == runner_to_be_used]
+
+ if not runners:
+ # if runner is not online yet.
+ sleep(30)
+ continue
+
+ assert len(runners) == 1, "Should not occur as GitHub enforce unique naming of runner"
+ runner = runners[0]
+ if runner["busy"]:
+ start_time = datetime.now(timezone.utc)
+ break
+
+ sleep(30)
+ else:
+ assert False, "Timeout while waiting for runner to take up the workflow"
+
+ # 2.
+ action = await unit.run_action("flush-runners")
+ await action.wait()
+
+ end_time = datetime.now(timezone.utc)
+
+ # The flushing of runner should take less than the 30 minutes timeout of the workflow.
+ diff = end_time - start_time
+ assert diff.total_seconds() < 30 * 60
+
+ names = await get_runner_names(unit)
+ assert runner_to_be_used not in names, "Found a runner that should be flushed"
+
+ # Ensure the app_runner is back to 0 runners.
+ await app_runner.set_config({"virtual-machines": "0", "path": config["path"]})
+ await reconcile(app=app_runner, model=model)
+ await wait_till_num_of_runners(unit, 0)