Skip to content

Commit

Permalink
fix: instance initialization cloud-init status check (#11)
Browse files Browse the repository at this point in the history
* fix: instance initialization cloud-init status check

* chore: increment version

* chore: add helper comments
  • Loading branch information
yanksyoon committed Sep 13, 2024
1 parent 7715306 commit 6ee40e2
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 5 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

[project]
name = "github-runner-manager"
version = "0.1.3"
version = "0.1.4"
authors = [
{ name = "Canonical IS DevOps", email = "[email protected]" },
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -590,12 +590,15 @@ def _health_check(self, instance: OpenstackInstance) -> bool:
return OpenStackRunnerManager._run_health_check(ssh_conn, instance.server_name)

@staticmethod
def _run_health_check(ssh_conn: SSHConnection, name: str) -> bool:
def _run_health_check(
ssh_conn: SSHConnection, name: str, accept_finished_job: bool = False
) -> bool:
"""Run a health check for runner process.
Args:
ssh_conn: The SSH connection to the runner.
name: The name of the runner.
accept_finished_job: Whether a job that has finished should be marked healthy.
Returns:
Whether the health check succeeded.
Expand All @@ -605,7 +608,7 @@ def _run_health_check(ssh_conn: SSHConnection, name: str) -> bool:
logger.warning("cloud-init status command failed on %s: %s.", name, result.stderr)
return False
if CloudInitStatus.DONE in result.stdout:
return False
return accept_finished_job
result = ssh_conn.run("ps aux", warn=True)
if not result.ok:
logger.warning("SSH run of `ps aux` failed on %s: %s", name, result.stderr)
Expand Down Expand Up @@ -636,10 +639,22 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None:
"not completed"
) from err

result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True)
result: invoke.runners.Result = ssh_conn.run("cloud-init status", warn=True)
if not result.ok:
logger.warning(
"cloud-init status command failed on %s: %s.", instance.server_name, result.stderr
)
raise RunnerStartError(f"Runner startup process not found on {instance.server_name}")
# A short-running job may have already completed and exited the runner, so check for that
# condition via the cloud-init status first.
if CloudInitStatus.DONE in result.stdout:
return
result = ssh_conn.run("ps aux", warn=True)
if not result.ok:
logger.warning("SSH run of `ps aux` failed on %s", instance.server_name)
raise RunnerStartError(f"Unable to SSH run `ps aux` on {instance.server_name}")
# The runner startup process is the parent process of runner.Listener and runner.Worker, and
# it starts up much faster.
if RUNNER_STARTUP_PROCESS not in result.stdout:
logger.warning("Runner startup process not found on %s", instance.server_name)
raise RunnerStartError(f"Runner startup process not found on {instance.server_name}")
Expand All @@ -662,7 +677,9 @@ def _wait_runner_running(self, instance: OpenstackInstance) -> None:
f"Failed to SSH connect to {instance.server_name} openstack runner"
) from err

if not self._run_health_check(ssh_conn=ssh_conn, name=instance.server_name):
if not self._run_health_check(
ssh_conn=ssh_conn, name=instance.server_name, accept_finished_job=True
):
logger.info("Runner process not found on %s", instance.server_name)
raise RunnerStartError(
f"Runner process on {instance.server_name} failed to initialize on after starting"
Expand Down

0 comments on commit 6ee40e2

Please sign in to comment.