Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve detection of runner health #88

Merged
merged 16 commits into from
Jul 18, 2023
Merged
34 changes: 0 additions & 34 deletions script/deploy_runner.sh

This file was deleted.

76 changes: 0 additions & 76 deletions script/remove_offline_runners.py

This file was deleted.

6 changes: 0 additions & 6 deletions script/remove_runner.sh

This file was deleted.

7 changes: 5 additions & 2 deletions src/lxd.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,9 @@ def delete(self, wait: bool = False) -> None:
logger.exception("Failed to delete LXD instance")
raise LxdError(f"Unable to delete LXD instance {self.name}") from err

def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, IO]:
def execute(
self, cmd: list[str], cwd: Optional[str] = None, hide_cmd: bool = False
) -> Tuple[int, IO, IO]:
"""Execute a command within the LXD instance.

Exceptions are not raise if command execution failed. Caller should check the exit code and
Expand All @@ -231,6 +233,7 @@ def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, I
Args:
cmd: Commands to be executed.
cwd: Working directory to execute the commands.
hide_cmd: Hide logging of cmd.

Returns:
Tuple containing the exit code, stdout, stderr.
Expand All @@ -241,7 +244,7 @@ def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, I

lxc_cmd += ["--"] + cmd

result = secure_run_subprocess(lxc_cmd)
result = secure_run_subprocess(lxc_cmd, hide_cmd)
return (result.returncode, io.BytesIO(result.stdout), io.BytesIO(result.stderr))


Expand Down
10 changes: 8 additions & 2 deletions src/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ def remove(self, remove_token: str) -> None:
Raises:
RunnerRemoveError: Failure in removing runner.
"""
logger.info("Removing LXD instance of runner: %s", self.config.name)
logger.info("Removing runner: %s", self.config.name)

if self.instance:
# Run script to remove the the runner and cleanup.
logger.info("Executing command to removal of runner and clean up...")
self.instance.execute(
[
"/usr/bin/sudo",
Expand All @@ -149,9 +149,11 @@ def remove(self, remove_token: str) -> None:
"--token",
remove_token,
],
hide_cmd=True,
)

if self.instance.status == "Running":
logger.info("Removing LXD instance of runner: %s", self.config.name)
try:
self.instance.stop(wait=True, timeout=60)
except LxdError:
Expand Down Expand Up @@ -179,6 +181,8 @@ def remove(self, remove_token: str) -> None:
if self.status.runner_id is None:
return

logger.info("Removing runner on GitHub: %s", self.config.name)

# The runner should cleanup itself. Cleanup on GitHub in case of runner cleanup error.
if isinstance(self.config.path, GitHubRepo):
logger.debug(
Expand Down Expand Up @@ -549,9 +553,11 @@ def _register_runner(self, registration_token: str, labels: Sequence[str]) -> No
if isinstance(self.config.path, GitHubOrg):
register_cmd += ["--runnergroup", self.config.path.group]

logger.info("Executing registration command...")
self.instance.execute(
register_cmd,
cwd=str(self.runner_application),
hide_cmd=True,
)

@retry(tries=5, delay=30, local_logger=logger)
Expand Down
106 changes: 56 additions & 50 deletions src/runner_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,14 @@
from lxd import LxdClient, LxdInstance
from repo_policy_compliance_client import RepoPolicyComplianceClient
from runner import Runner, RunnerClients, RunnerConfig, RunnerStatus
from runner_type import GitHubOrg, GitHubPath, GitHubRepo, ProxySetting, VirtualMachineResources
from runner_type import (
GitHubOrg,
GitHubPath,
GitHubRepo,
ProxySetting,
RunnerLxdInstance,
VirtualMachineResources,
)
from utilities import retry, set_env_var

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -252,6 +259,26 @@ def get_github_info(self) -> Iterator[RunnerInfo]:
remote_runners = self._get_runner_github_info()
return iter(RunnerInfo(runner.name, runner.status) for runner in remote_runners.values())

def _get_lxd_instance_by_runner_health(self) -> RunnerLxdInstance:
local_runners = [
instance
# Pylint cannot find the `all` method.
for instance in self._clients.lxd.instances.all() # pylint: disable=no-member
if instance.name.startswith(f"{self.instance_name}-")
]

healthy = []
unhealthy = []

for runner in local_runners:
_, stdout, _ = runner.execute(["ps", "aux"])
if f"/bin/bash {Runner.runner_script}" in stdout.read().decode("utf-8"):
healthy.append(runner)
else:
unhealthy.append(runner)

return RunnerLxdInstance(healthy, unhealthy)

def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
"""Bring runners in line with target.

Expand All @@ -262,44 +289,28 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
Returns:
Difference between intended runners and actual runners.
"""
runners = self._get_runners()

# Add/Remove runners to match the target quantity
online_runners = [
runner for runner in runners if runner.status.exist and runner.status.online
]

offline_runners = [runner for runner in runners if not runner.status.online]

local_runners = {
instance.name: instance
# Pylint cannot find the `all` method.
for instance in self._clients.lxd.instances.all() # pylint: disable=no-member
if instance.name.startswith(f"{self.instance_name}-")
}
runner_lxd_instances = self._get_lxd_instance_by_runner_health()

logger.info(
(
"Expected runner count: %i, Online runner count: %i, Offline runner count: %i, "
"LXD instance count: %i"
),
("Expected runner count: %i, healthy count: %i, unhealthy count: %i"),
quantity,
len(online_runners),
len(offline_runners),
len(local_runners),
len(runner_lxd_instances.healthy),
len(runner_lxd_instances.unhealthy),
)

# Clean up offline runners
if offline_runners:
logger.info("Cleaning up offline runners.")
# Clean up unhealthy runners
if runner_lxd_instances.unhealthy:
logger.info("Cleaning up unhealthy runners.")

remove_token = self._get_github_remove_token()
runners = {runner.config.name: runner for runner in self._get_runners()}

for runner in offline_runners:
for instance in runner_lxd_instances.unhealthy:
runner = runners[instance.name]
runner.remove(remove_token)
logger.info("Removed runner: %s", runner.config.name)

delta = quantity - len(online_runners)
delta = quantity - len(runner_lxd_instances.healthy)
# Spawn new runners
if delta > 0:
if RunnerManager.runner_bin_path is None:
Expand All @@ -310,7 +321,7 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
registration_token = self._get_github_registration_token()
remove_token = self._get_github_remove_token()

logger.info("Adding %i additional runner(s).", delta)
logger.info("Attempting to add %i runner(s).", delta)
for _ in range(delta):
config = RunnerConfig(
self.app_name,
Expand All @@ -333,25 +344,17 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
runner.remove(remove_token)
logger.info("Cleaned up runner: %s", runner.config.name)
raise

elif delta < 0:
# Idle runners are online runners that has not taken a job.
idle_runners = [runner for runner in online_runners if not runner.status.busy]
offset = min(-delta, len(idle_runners))
if offset != 0:
logger.info("Removing %i runner(s).", offset)
remove_runners = idle_runners[:offset]

logger.info("Cleaning up idle runners.")

remove_token = self._get_github_remove_token()

for runner in remove_runners:
runner.remove(remove_token)
logger.info("Removed runner: %s", runner.config.name)

else:
logger.info("There are no idle runner to remove.")
if delta < 0:
logger.info("Attempting to remove %i runner(s).", -delta)
runners_to_remove = [
runner
for runner in self._get_runners()
if runner.status.exist and not runner.status.busy
][:-delta]
logger.info("Found %i non-busy and online runner to remove", len(runners_to_remove))
for runner in runners_to_remove:
runner.remove(remove_token)
logger.info("Removed runner: %s", runner.config.name)
else:
logger.info("No changes to number of runner needed.")

Expand Down Expand Up @@ -398,11 +401,14 @@ def _get_runner_github_info(self) -> Dict[str, SelfHostedRunner]:
remote_runners_list: list[SelfHostedRunner] = []
if isinstance(self.config.path, GitHubRepo):
remote_runners_list = self._clients.github.actions.list_self_hosted_runners_for_repo(
owner=self.config.path.owner, repo=self.config.path.repo
owner=self.config.path.owner,
repo=self.config.path.repo,
per_page=100,
yhaliaw marked this conversation as resolved.
Show resolved Hide resolved
)["runners"]
if isinstance(self.config.path, GitHubOrg):
remote_runners_list = self._clients.github.actions.list_self_hosted_runners_for_org(
org=self.config.path.org
org=self.config.path.org,
per_page=100,
)["runners"]

logger.debug("List of runners found on GitHub:%s", remote_runners_list)
Expand Down
10 changes: 9 additions & 1 deletion src/runner_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,18 @@
import jinja2
from ghapi.all import GhApi

from lxd import LxdClient
from lxd import LxdClient, LxdInstance
from repo_policy_compliance_client import RepoPolicyComplianceClient


@dataclass
class RunnerLxdInstance:
"""Set of runners LXD instance by health state."""

healthy: tuple[LxdInstance]
unhealthy: tuple[LxdInstance]


class ProxySetting(TypedDict, total=False):
"""Represent HTTP-related proxy settings."""

Expand Down
Loading