Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve detection of runner health #88

Merged
merged 16 commits into from
Jul 18, 2023
34 changes: 0 additions & 34 deletions script/deploy_runner.sh

This file was deleted.

76 changes: 0 additions & 76 deletions script/remove_offline_runners.py

This file was deleted.

6 changes: 0 additions & 6 deletions script/remove_runner.sh

This file was deleted.

7 changes: 5 additions & 2 deletions src/lxd.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,9 @@ def delete(self, wait: bool = False) -> None:
logger.exception("Failed to delete LXD instance")
raise LxdError(f"Unable to delete LXD instance {self.name}") from err

def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, IO]:
def execute(
self, cmd: list[str], cwd: Optional[str] = None, hide_cmd: bool = False
) -> Tuple[int, IO, IO]:
"""Execute a command within the LXD instance.

Exceptions are not raise if command execution failed. Caller should check the exit code and
Expand All @@ -231,6 +233,7 @@ def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, I
Args:
cmd: Commands to be executed.
cwd: Working directory to execute the commands.
hide_cmd: Hide logging of cmd.

Returns:
Tuple containing the exit code, stdout, stderr.
Expand All @@ -241,7 +244,7 @@ def execute(self, cmd: list[str], cwd: Optional[str] = None) -> Tuple[int, IO, I

lxc_cmd += ["--"] + cmd

result = secure_run_subprocess(lxc_cmd)
result = secure_run_subprocess(lxc_cmd, hide_cmd)
return (result.returncode, io.BytesIO(result.stdout), io.BytesIO(result.stderr))


Expand Down
5 changes: 4 additions & 1 deletion src/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def remove(self, remove_token: str) -> None:
logger.info("Removing LXD instance of runner: %s", self.config.name)

if self.instance:
# Run script to remove the the runner and cleanup.
logger.info("Executing command to removal of runner and clean up...")
self.instance.execute(
[
"/usr/bin/sudo",
Expand All @@ -149,6 +149,7 @@ def remove(self, remove_token: str) -> None:
"--token",
remove_token,
],
hide_cmd=True,
)

if self.instance.status == "Running":
Expand Down Expand Up @@ -549,9 +550,11 @@ def _register_runner(self, registration_token: str, labels: Sequence[str]) -> No
if isinstance(self.config.path, GitHubOrg):
register_cmd += ["--runnergroup", self.config.path.group]

logger.info("Executing registration command...")
self.instance.execute(
register_cmd,
cwd=str(self.runner_application),
hide_cmd=True,
)

@retry(tries=5, delay=30, local_logger=logger)
Expand Down
112 changes: 62 additions & 50 deletions src/runner_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,14 @@
from lxd import LxdClient, LxdInstance
from repo_policy_compliance_client import RepoPolicyComplianceClient
from runner import Runner, RunnerClients, RunnerConfig, RunnerStatus
from runner_type import GitHubOrg, GitHubPath, GitHubRepo, ProxySetting, VirtualMachineResources
from runner_type import (
GitHubOrg,
GitHubPath,
GitHubRepo,
ProxySetting,
RunnerLxdInstance,
VirtualMachineResources,
)
from utilities import retry, set_env_var

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -252,6 +259,26 @@ def get_github_info(self) -> Iterator[RunnerInfo]:
remote_runners = self._get_runner_github_info()
return iter(RunnerInfo(runner.name, runner.status) for runner in remote_runners.values())

def _get_lxd_instance_by_runner_health(self) -> RunnerLxdInstance:
local_runners = [
instance
# Pylint cannot find the `all` method.
for instance in self._clients.lxd.instances.all() # pylint: disable=no-member
if instance.name.startswith(f"{self.instance_name}-")
]

healthy = []
unhealthy = []

for runner in local_runners:
_, stdout, _ = runner.execute(["ps", "aux"])
if f"/bin/bash {Runner.runner_script}" in stdout.read().decode("utf-8"):
healthy.append(runner)
else:
unhealthy.append(runner)

return RunnerLxdInstance(healthy, unhealthy)

def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
"""Bring runners in line with target.

Expand All @@ -262,44 +289,34 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
Returns:
Difference between intended runners and actual runners.
"""
runners = self._get_runners()

# Add/Remove runners to match the target quantity
online_runners = [
runner for runner in runners if runner.status.exist and runner.status.online
]

offline_runners = [runner for runner in runners if not runner.status.online]

local_runners = {
instance.name: instance
# Pylint cannot find the `all` method.
for instance in self._clients.lxd.instances.all() # pylint: disable=no-member
if instance.name.startswith(f"{self.instance_name}-")
}
runner_lxd_instances = self._get_lxd_instance_by_runner_health()

logger.info(
(
"Expected runner count: %i, Online runner count: %i, Offline runner count: %i, "
"LXD instance count: %i"
),
("Expected runner count: %i, healthy count: %i, unhealthy count: %i"),
quantity,
len(online_runners),
len(offline_runners),
len(local_runners),
len(runner_lxd_instances.healthy),
len(runner_lxd_instances.unhealthy),
)

# Clean up offline runners
if offline_runners:
logger.info("Cleaning up offline runners.")
# Clean up unhealthy runners
if runner_lxd_instances.unhealthy:
logger.info("Cleaning up unhealthy runners.")

remove_token = self._get_github_remove_token()

for runner in offline_runners:
for instance in runner_lxd_instances.unhealthy:
config = RunnerConfig(
self.app_name,
self.config.path,
self.proxies,
self.config.lxd_storage_path,
instance.name,
)
runner = Runner(self._clients, config, RunnerStatus())
runner.remove(remove_token)
logger.info("Removed runner: %s", runner.config.name)

delta = quantity - len(online_runners)
delta = quantity - len(runner_lxd_instances.healthy)
# Spawn new runners
if delta > 0:
if RunnerManager.runner_bin_path is None:
Expand All @@ -310,7 +327,7 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
registration_token = self._get_github_registration_token()
remove_token = self._get_github_remove_token()

logger.info("Adding %i additional runner(s).", delta)
logger.info("Attempting to add %i runner(s).", delta)
for _ in range(delta):
config = RunnerConfig(
self.app_name,
Expand All @@ -333,25 +350,17 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
runner.remove(remove_token)
logger.info("Cleaned up runner: %s", runner.config.name)
raise

elif delta < 0:
# Idle runners are online runners that has not taken a job.
idle_runners = [runner for runner in online_runners if not runner.status.busy]
offset = min(-delta, len(idle_runners))
if offset != 0:
logger.info("Removing %i runner(s).", offset)
remove_runners = idle_runners[:offset]

logger.info("Cleaning up idle runners.")

remove_token = self._get_github_remove_token()

for runner in remove_runners:
runner.remove(remove_token)
logger.info("Removed runner: %s", runner.config.name)

else:
logger.info("There are no idle runner to remove.")
if delta < 0:
logger.info("Attempting to remove %i runner(s).", -delta)
runners_to_remove = [
runner
for runner in self._get_runners()
if runner.status.exist and not runner.status.busy
][:-delta]
logger.info("Found %i non-busy and online runner to remove", len(runners_to_remove))
for runner in runners_to_remove:
runner.remove(remove_token)
logger.info("Removed runner: %s", runner.config.name)
else:
logger.info("No changes to number of runner needed.")

Expand Down Expand Up @@ -398,11 +407,14 @@ def _get_runner_github_info(self) -> Dict[str, SelfHostedRunner]:
remote_runners_list: list[SelfHostedRunner] = []
if isinstance(self.config.path, GitHubRepo):
remote_runners_list = self._clients.github.actions.list_self_hosted_runners_for_repo(
owner=self.config.path.owner, repo=self.config.path.repo
owner=self.config.path.owner,
repo=self.config.path.repo,
per_page=100,
yhaliaw marked this conversation as resolved.
Show resolved Hide resolved
)["runners"]
if isinstance(self.config.path, GitHubOrg):
remote_runners_list = self._clients.github.actions.list_self_hosted_runners_for_org(
org=self.config.path.org
org=self.config.path.org,
per_page=100,
)["runners"]

logger.debug("List of runners found on GitHub:%s", remote_runners_list)
Expand Down
10 changes: 9 additions & 1 deletion src/runner_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,18 @@
import jinja2
from ghapi.all import GhApi

from lxd import LxdClient
from lxd import LxdClient, LxdInstance
from repo_policy_compliance_client import RepoPolicyComplianceClient


@dataclass
class RunnerLxdInstance:
"""Set of runners LXD instance by health state."""

healthy: tuple[LxdInstance]
unhealthy: tuple[LxdInstance]


class ProxySetting(TypedDict, total=False):
"""Represent HTTP-related proxy settings."""

Expand Down
11 changes: 9 additions & 2 deletions src/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,9 @@ def fn_with_retry(*args, **kwargs) -> ReturnT:
return retry_decorator


def secure_run_subprocess(cmd: Sequence[str], **kwargs) -> subprocess.CompletedProcess[bytes]:
def secure_run_subprocess(
cmd: Sequence[str], hide_cmd: bool = False, **kwargs
) -> subprocess.CompletedProcess[bytes]:
"""Run command in subprocess according to security recommendations.

The argument `shell` is set to `False` for security reasons.
Expand All @@ -105,12 +107,17 @@ def secure_run_subprocess(cmd: Sequence[str], **kwargs) -> subprocess.CompletedP

Args:
cmd: Command in a list.
hide_cmd: Hide logging of cmd.
kwargs: Additional keyword arguments for the `subprocess.run` call.

Returns:
Object representing the completed process. The outputs subprocess can accessed.
"""
logger.info("Executing command %s", cmd)
if not hide_cmd:
yhaliaw marked this conversation as resolved.
Show resolved Hide resolved
logger.info("Executing command %s", cmd)
else:
logger.info("Executing sensitive command")

result = subprocess.run( # nosec B603
cmd,
capture_output=True,
Expand Down
2 changes: 1 addition & 1 deletion templates/repo-policy-compliance.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ Environment="NO_PROXY={{proxies['no_proxy']}}"
Environment="http_proxy={{proxies['http']}}"
Environment="https_proxy={{proxies['https']}}"
Environment="no_proxy={{proxies['no_proxy']}}"
ExecStart=/usr/bin/gunicorn --bind 0.0.0.0:8080 app:app
ExecStart=/usr/bin/gunicorn --bind 0.0.0.0:8080 --timeout 60 app:app
Loading