Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve detection of runner health #88

Merged
merged 16 commits into from
Jul 18, 2023
Merged
34 changes: 0 additions & 34 deletions script/deploy_runner.sh

This file was deleted.

76 changes: 0 additions & 76 deletions script/remove_offline_runners.py

This file was deleted.

6 changes: 0 additions & 6 deletions script/remove_runner.sh

This file was deleted.

7 changes: 5 additions & 2 deletions src/lxd.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,9 @@ def delete(self, wait: bool = False) -> None:
logger.exception("Failed to delete LXD instance")
raise LxdError(f"Unable to delete LXD instance {self.name}") from err

def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, IO]:
def execute(
self, cmd: list[str], cwd: Optional[str] = None, hide_cmd: bool = False
) -> Tuple[int, IO, IO]:
"""Execute a command within the LXD instance.

Exceptions are not raise if command execution failed. Caller should check the exit code and
Expand All @@ -231,6 +233,7 @@ def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, I
Args:
cmd: Commands to be executed.
cwd: Working directory to execute the commands.
hide_cmd: Hide logging of cmd.

Returns:
Tuple containing the exit code, stdout, stderr.
Expand All @@ -241,7 +244,7 @@ def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, I

lxc_cmd += ["--"] + cmd

result = secure_run_subprocess(lxc_cmd)
result = secure_run_subprocess(lxc_cmd, hide_cmd)
return (result.returncode, io.BytesIO(result.stdout), io.BytesIO(result.stderr))


Expand Down
10 changes: 8 additions & 2 deletions src/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ def remove(self, remove_token: str) -> None:
Raises:
RunnerRemoveError: Failure in removing runner.
"""
logger.info("Removing LXD instance of runner: %s", self.config.name)
logger.info("Removing runner: %s", self.config.name)

if self.instance:
# Run script to remove the the runner and cleanup.
logger.info("Executing command to removal of runner and clean up...")
self.instance.execute(
[
"/usr/bin/sudo",
Expand All @@ -149,9 +149,11 @@ def remove(self, remove_token: str) -> None:
"--token",
remove_token,
],
hide_cmd=True,
)

if self.instance.status == "Running":
logger.info("Removing LXD instance of runner: %s", self.config.name)
try:
self.instance.stop(wait=True, timeout=60)
except LxdError:
Expand Down Expand Up @@ -179,6 +181,8 @@ def remove(self, remove_token: str) -> None:
if self.status.runner_id is None:
return

logger.info("Removing runner on GitHub: %s", self.config.name)

# The runner should cleanup itself. Cleanup on GitHub in case of runner cleanup error.
if isinstance(self.config.path, GitHubRepo):
logger.debug(
Expand Down Expand Up @@ -549,9 +553,11 @@ def _register_runner(self, registration_token: str, labels: Sequence[str]) -> No
if isinstance(self.config.path, GitHubOrg):
register_cmd += ["--runnergroup", self.config.path.group]

logger.info("Executing registration command...")
self.instance.execute(
register_cmd,
cwd=str(self.runner_application),
hide_cmd=True,
)

@retry(tries=5, delay=30, local_logger=logger)
Expand Down
103 changes: 78 additions & 25 deletions src/runner_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import requests.adapters
import urllib3
from ghapi.all import GhApi
from ghapi.page import pages
from typing_extensions import assert_never

from errors import RunnerBinaryError, RunnerCreateError
Expand All @@ -34,7 +35,14 @@
from lxd import LxdClient, LxdInstance
from repo_policy_compliance_client import RepoPolicyComplianceClient
from runner import Runner, RunnerClients, RunnerConfig, RunnerStatus
from runner_type import GitHubOrg, GitHubPath, GitHubRepo, ProxySetting, VirtualMachineResources
from runner_type import (
GitHubOrg,
GitHubPath,
GitHubRepo,
ProxySetting,
RunnerByHealth,
VirtualMachineResources,
)
from utilities import retry, set_env_var

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -252,6 +260,26 @@ def get_github_info(self) -> Iterator[RunnerInfo]:
remote_runners = self._get_runner_github_info()
return iter(RunnerInfo(runner.name, runner.status) for runner in remote_runners.values())

def _get_runner_health_states(self) -> RunnerByHealth:
local_runners = [
instance
# Pylint cannot find the `all` method.
for instance in self._clients.lxd.instances.all() # pylint: disable=no-member
if instance.name.startswith(f"{self.instance_name}-")
]

healthy = []
unhealthy = []

for runner in local_runners:
_, stdout, _ = runner.execute(["ps", "aux"])
if f"/bin/bash {Runner.runner_script}" in stdout.read().decode("utf-8"):
healthy.append(runner.name)
else:
unhealthy.append(runner.name)

return RunnerByHealth(healthy, unhealthy)

def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
"""Bring runners in line with target.

Expand All @@ -269,37 +297,35 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
runner for runner in runners if runner.status.exist and runner.status.online
]

offline_runners = [runner for runner in runners if not runner.status.online]

local_runners = {
instance.name: instance
# Pylint cannot find the `all` method.
for instance in self._clients.lxd.instances.all() # pylint: disable=no-member
if instance.name.startswith(f"{self.instance_name}-")
}
runner_states = self._get_runner_health_states()

logger.info(
(
"Expected runner count: %i, Online runner count: %i, Offline runner count: %i, "
"LXD instance count: %i"
"Expected runner count: %i, Online count: %i, Offline count: %i, "
"healthy count: %i, unhealthy count: %i"
),
quantity,
len(online_runners),
len(offline_runners),
len(local_runners),
len(runners) - len(online_runners),
len(runner_states.healthy),
len(runner_states.unhealthy),
)

# Clean up offline runners
if offline_runners:
logger.info("Cleaning up offline runners.")
if runner_states.unhealthy:
logger.info("Cleaning up unhealthy runners.")

remove_token = self._get_github_remove_token()

for runner in offline_runners:
unhealthy_runners = [
runner for runner in runners if runner.config.name in set(runner_states.unhealthy)
]

for runner in unhealthy_runners:
runner.remove(remove_token)
logger.info("Removed runner: %s", runner.config.name)

delta = quantity - len(online_runners)
delta = quantity - len(runner_states.healthy)
# Spawn new runners
if delta > 0:
if RunnerManager.runner_bin_path is None:
Expand All @@ -310,7 +336,7 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
registration_token = self._get_github_registration_token()
remove_token = self._get_github_remove_token()

logger.info("Adding %i additional runner(s).", delta)
logger.info("Attempting to add %i runner(s).", delta)
for _ in range(delta):
config = RunnerConfig(
self.app_name,
Expand All @@ -335,6 +361,7 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
raise

elif delta < 0:
logger.info("Attempting to remove %i runner(s).", -delta)
# Idle runners are online runners that has not taken a job.
idle_runners = [runner for runner in online_runners if not runner.status.busy]
offset = min(-delta, len(idle_runners))
Expand All @@ -349,7 +376,6 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
for runner in remove_runners:
runner.remove(remove_token)
logger.info("Removed runner: %s", runner.config.name)

else:
logger.info("There are no idle runner to remove.")
else:
Expand Down Expand Up @@ -397,13 +423,40 @@ def _generate_runner_name(self) -> str:
def _get_runner_github_info(self) -> Dict[str, SelfHostedRunner]:
remote_runners_list: list[SelfHostedRunner] = []
if isinstance(self.config.path, GitHubRepo):
remote_runners_list = self._clients.github.actions.list_self_hosted_runners_for_repo(
owner=self.config.path.owner, repo=self.config.path.repo
)["runners"]
# The documentation of ghapi for pagination is incorrect and examples will give errors.
# This workaround is a temp solution. Will be moving to PyGitHub in the future.
self._clients.github.actions.list_self_hosted_runners_for_repo(
owner=self.config.path.owner, repo=self.config.path.repo, per_page=100
)
num_of_pages = self._clients.github.last_page()
remote_runners_list = [
item
for page in pages(
self._clients.github.actions.list_self_hosted_runners_for_repo,
num_of_pages + 1,
owner=self.config.path.owner,
repo=self.config.path.repo,
per_page=100,
)
for item in page["runners"]
]
if isinstance(self.config.path, GitHubOrg):
remote_runners_list = self._clients.github.actions.list_self_hosted_runners_for_org(
org=self.config.path.org
)["runners"]
# The documentation of ghapi for pagination is incorrect and examples will give errors.
# This workaround is a temp solution. Will be moving to PyGitHub in the future.
self._clients.github.actions.list_self_hosted_runners_for_org(
org=self.config.path.org, per_page=100
)
num_of_pages = self._clients.github.last_page()
remote_runners_list = [
item
for page in pages(
self._clients.github.actions.list_self_hosted_runners_for_org,
num_of_pages + 1,
org=self.config.path.org,
per_page=100,
)
for item in page["runners"]
]

logger.debug("List of runners found on GitHub:%s", remote_runners_list)

Expand Down
8 changes: 8 additions & 0 deletions src/runner_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,14 @@
from repo_policy_compliance_client import RepoPolicyComplianceClient


@dataclass
class RunnerByHealth:
"""Set of runners LXD instance by health state."""

healthy: tuple[str]
unhealthy: tuple[str]


class ProxySetting(TypedDict, total=False):
"""Represent HTTP-related proxy settings."""

Expand Down
Loading