Skip to content

Commit

Permalink
Improve detection of runner health (#88)
Browse files Browse the repository at this point in the history
* Remove unused scripts

* Use runner application process as health check

* Rename type and add docstring

* Hide logging of sensitive commands

* Fix pagination of GitHub REST API calls

* Use poweroff over halt to stop the LXD instances due to LXD issues

* Increase gunicorn request timeout to 60 seconds
  • Loading branch information
yhaliaw authored Jul 18, 2023
1 parent a9143c3 commit 85e8ef9
Show file tree
Hide file tree
Showing 12 changed files with 132 additions and 155 deletions.
34 changes: 0 additions & 34 deletions script/deploy_runner.sh

This file was deleted.

76 changes: 0 additions & 76 deletions script/remove_offline_runners.py

This file was deleted.

6 changes: 0 additions & 6 deletions script/remove_runner.sh

This file was deleted.

7 changes: 5 additions & 2 deletions src/lxd.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,9 @@ def delete(self, wait: bool = False) -> None:
logger.exception("Failed to delete LXD instance")
raise LxdError(f"Unable to delete LXD instance {self.name}") from err

def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, IO]:
def execute(
self, cmd: list[str], cwd: Optional[str] = None, hide_cmd: bool = False
) -> Tuple[int, IO, IO]:
"""Execute a command within the LXD instance.
Exceptions are not raise if command execution failed. Caller should check the exit code and
Expand All @@ -231,6 +233,7 @@ def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, I
Args:
cmd: Commands to be executed.
cwd: Working directory to execute the commands.
hide_cmd: Hide logging of cmd.
Returns:
Tuple containing the exit code, stdout, stderr.
Expand All @@ -241,7 +244,7 @@ def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, I

lxc_cmd += ["--"] + cmd

result = secure_run_subprocess(lxc_cmd)
result = secure_run_subprocess(lxc_cmd, hide_cmd)
return (result.returncode, io.BytesIO(result.stdout), io.BytesIO(result.stderr))


Expand Down
10 changes: 8 additions & 2 deletions src/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ def remove(self, remove_token: str) -> None:
Raises:
RunnerRemoveError: Failure in removing runner.
"""
logger.info("Removing LXD instance of runner: %s", self.config.name)
logger.info("Removing runner: %s", self.config.name)

if self.instance:
# Run script to remove the the runner and cleanup.
logger.info("Executing command to removal of runner and clean up...")
self.instance.execute(
[
"/usr/bin/sudo",
Expand All @@ -149,9 +149,11 @@ def remove(self, remove_token: str) -> None:
"--token",
remove_token,
],
hide_cmd=True,
)

if self.instance.status == "Running":
logger.info("Removing LXD instance of runner: %s", self.config.name)
try:
self.instance.stop(wait=True, timeout=60)
except LxdError:
Expand Down Expand Up @@ -179,6 +181,8 @@ def remove(self, remove_token: str) -> None:
if self.status.runner_id is None:
return

logger.info("Removing runner on GitHub: %s", self.config.name)

# The runner should cleanup itself. Cleanup on GitHub in case of runner cleanup error.
if isinstance(self.config.path, GitHubRepo):
logger.debug(
Expand Down Expand Up @@ -549,9 +553,11 @@ def _register_runner(self, registration_token: str, labels: Sequence[str]) -> No
if isinstance(self.config.path, GitHubOrg):
register_cmd += ["--runnergroup", self.config.path.group]

logger.info("Executing registration command...")
self.instance.execute(
register_cmd,
cwd=str(self.runner_application),
hide_cmd=True,
)

@retry(tries=5, delay=30, local_logger=logger)
Expand Down
103 changes: 78 additions & 25 deletions src/runner_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import requests.adapters
import urllib3
from ghapi.all import GhApi
from ghapi.page import pages
from typing_extensions import assert_never

from errors import RunnerBinaryError, RunnerCreateError
Expand All @@ -34,7 +35,14 @@
from lxd import LxdClient, LxdInstance
from repo_policy_compliance_client import RepoPolicyComplianceClient
from runner import Runner, RunnerClients, RunnerConfig, RunnerStatus
from runner_type import GitHubOrg, GitHubPath, GitHubRepo, ProxySetting, VirtualMachineResources
from runner_type import (
GitHubOrg,
GitHubPath,
GitHubRepo,
ProxySetting,
RunnerByHealth,
VirtualMachineResources,
)
from utilities import retry, set_env_var

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -252,6 +260,26 @@ def get_github_info(self) -> Iterator[RunnerInfo]:
remote_runners = self._get_runner_github_info()
return iter(RunnerInfo(runner.name, runner.status) for runner in remote_runners.values())

def _get_runner_health_states(self) -> RunnerByHealth:
local_runners = [
instance
# Pylint cannot find the `all` method.
for instance in self._clients.lxd.instances.all() # pylint: disable=no-member
if instance.name.startswith(f"{self.instance_name}-")
]

healthy = []
unhealthy = []

for runner in local_runners:
_, stdout, _ = runner.execute(["ps", "aux"])
if f"/bin/bash {Runner.runner_script}" in stdout.read().decode("utf-8"):
healthy.append(runner.name)
else:
unhealthy.append(runner.name)

return RunnerByHealth(healthy, unhealthy)

def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
"""Bring runners in line with target.
Expand All @@ -269,37 +297,35 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
runner for runner in runners if runner.status.exist and runner.status.online
]

offline_runners = [runner for runner in runners if not runner.status.online]

local_runners = {
instance.name: instance
# Pylint cannot find the `all` method.
for instance in self._clients.lxd.instances.all() # pylint: disable=no-member
if instance.name.startswith(f"{self.instance_name}-")
}
runner_states = self._get_runner_health_states()

logger.info(
(
"Expected runner count: %i, Online runner count: %i, Offline runner count: %i, "
"LXD instance count: %i"
"Expected runner count: %i, Online count: %i, Offline count: %i, "
"healthy count: %i, unhealthy count: %i"
),
quantity,
len(online_runners),
len(offline_runners),
len(local_runners),
len(runners) - len(online_runners),
len(runner_states.healthy),
len(runner_states.unhealthy),
)

# Clean up offline runners
if offline_runners:
logger.info("Cleaning up offline runners.")
if runner_states.unhealthy:
logger.info("Cleaning up unhealthy runners.")

remove_token = self._get_github_remove_token()

for runner in offline_runners:
unhealthy_runners = [
runner for runner in runners if runner.config.name in set(runner_states.unhealthy)
]

for runner in unhealthy_runners:
runner.remove(remove_token)
logger.info("Removed runner: %s", runner.config.name)

delta = quantity - len(online_runners)
delta = quantity - len(runner_states.healthy)
# Spawn new runners
if delta > 0:
if RunnerManager.runner_bin_path is None:
Expand All @@ -310,7 +336,7 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
registration_token = self._get_github_registration_token()
remove_token = self._get_github_remove_token()

logger.info("Adding %i additional runner(s).", delta)
logger.info("Attempting to add %i runner(s).", delta)
for _ in range(delta):
config = RunnerConfig(
self.app_name,
Expand All @@ -335,6 +361,7 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
raise

elif delta < 0:
logger.info("Attempting to remove %i runner(s).", -delta)
# Idle runners are online runners that has not taken a job.
idle_runners = [runner for runner in online_runners if not runner.status.busy]
offset = min(-delta, len(idle_runners))
Expand All @@ -349,7 +376,6 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
for runner in remove_runners:
runner.remove(remove_token)
logger.info("Removed runner: %s", runner.config.name)

else:
logger.info("There are no idle runner to remove.")
else:
Expand Down Expand Up @@ -397,13 +423,40 @@ def _generate_runner_name(self) -> str:
def _get_runner_github_info(self) -> Dict[str, SelfHostedRunner]:
remote_runners_list: list[SelfHostedRunner] = []
if isinstance(self.config.path, GitHubRepo):
remote_runners_list = self._clients.github.actions.list_self_hosted_runners_for_repo(
owner=self.config.path.owner, repo=self.config.path.repo
)["runners"]
# The documentation of ghapi for pagination is incorrect and examples will give errors.
# This workaround is a temp solution. Will be moving to PyGitHub in the future.
self._clients.github.actions.list_self_hosted_runners_for_repo(
owner=self.config.path.owner, repo=self.config.path.repo, per_page=100
)
num_of_pages = self._clients.github.last_page()
remote_runners_list = [
item
for page in pages(
self._clients.github.actions.list_self_hosted_runners_for_repo,
num_of_pages + 1,
owner=self.config.path.owner,
repo=self.config.path.repo,
per_page=100,
)
for item in page["runners"]
]
if isinstance(self.config.path, GitHubOrg):
remote_runners_list = self._clients.github.actions.list_self_hosted_runners_for_org(
org=self.config.path.org
)["runners"]
# The documentation of ghapi for pagination is incorrect and examples will give errors.
# This workaround is a temp solution. Will be moving to PyGitHub in the future.
self._clients.github.actions.list_self_hosted_runners_for_org(
org=self.config.path.org, per_page=100
)
num_of_pages = self._clients.github.last_page()
remote_runners_list = [
item
for page in pages(
self._clients.github.actions.list_self_hosted_runners_for_org,
num_of_pages + 1,
org=self.config.path.org,
per_page=100,
)
for item in page["runners"]
]

logger.debug("List of runners found on GitHub:%s", remote_runners_list)

Expand Down
8 changes: 8 additions & 0 deletions src/runner_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,14 @@
from repo_policy_compliance_client import RepoPolicyComplianceClient


@dataclass
class RunnerByHealth:
"""Set of runners LXD instance by health state."""

healthy: tuple[str]
unhealthy: tuple[str]


class ProxySetting(TypedDict, total=False):
"""Represent HTTP-related proxy settings."""

Expand Down
Loading

0 comments on commit 85e8ef9

Please sign in to comment.