diff --git a/pyproject.toml b/pyproject.toml index bd3f383..da93d53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ [project] name = "github-runner-manager" -version = "0.1.2" +version = "0.1.3" authors = [ { name = "Canonical IS DevOps", email = "is-devops-team@canonical.com" }, ] diff --git a/src-docs/manager.cloud_runner_manager.md b/src-docs/manager.cloud_runner_manager.md index b86ff18..06a9783 100644 --- a/src-docs/manager.cloud_runner_manager.md +++ b/src-docs/manager.cloud_runner_manager.md @@ -53,6 +53,32 @@ Represent state of the instance hosting the runner. +## class `CloudInitStatus` +Represents the state of cloud-init script. + +The cloud init script is used to launch ephemeral GitHub runners. If the script is being initialized, GitHub runner is listening for jobs or GitHub runner is running the job, the cloud-init script should report "running" status. + +Refer to the official documentation on cloud-init status: https://cloudinit.readthedocs.io/en/latest/howto/status.html. + + + +**Attributes:** + + - `NOT_STARTED`: The cloud-init script has not yet been started. + - `RUNNING`: The cloud-init script is running. + - `DONE`: The cloud-init script has completed successfully. + - `ERROR`: There was an error while running the cloud-init script. + - `DEGRADED`: There was a non-critical issue while running the cloud-inits script. + - `DISABLED`: Cloud init was disabled by other system configurations. + + + + + +--- + + + ## class `GitHubRunnerConfig` Configuration for GitHub runner spawned. @@ -81,7 +107,7 @@ __init__(github_path: GitHubOrg | GitHubRepo, labels: list[str]) → None --- - + ## class `SupportServiceConfig` Configuration for supporting services for runners. @@ -118,7 +144,7 @@ __init__( --- - + ## class `CloudRunnerInstance` Information on the runner on the cloud. @@ -155,7 +181,7 @@ __init__( --- - + ## class `CloudRunnerManager` Manage runner instance on cloud. @@ -177,7 +203,7 @@ Get the name prefix of the self-hosted runners. --- - + ### method `cleanup` @@ -197,7 +223,7 @@ Perform health check on runner and delete the runner if it fails. --- - + ### method `create_runner` @@ -215,7 +241,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -234,7 +260,7 @@ Delete self-hosted runner. --- - + ### method `flush_runners` @@ -253,7 +279,7 @@ Stop all runners. --- - + ### method `get_runner` @@ -271,7 +297,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index f1b3af9..ffee0cf 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -18,13 +18,14 @@ Represents an OpenStack instance. **Attributes:** + - `addresses`: IP addresses assigned to the server. + - `created_at`: The timestamp in which the instance was created at. + - `instance_id`: ID used by OpenstackCloud class to manage the instances. See docs on the OpenstackCloud. - `server_id`: ID of server assigned by OpenStack. - `server_name`: Name of the server on OpenStack. - - `instance_id`: ID used by OpenstackCloud class to manage the instances. See docs on the OpenstackCloud. - - `addresses`: IP addresses assigned to the server. - `status`: Status of the server. - + ### method `__init__` @@ -53,14 +54,14 @@ Construct the object. --- - + ## class `OpenstackCloud` Client to interact with OpenStack cloud. The OpenStack server name is managed by this cloud. Caller refers to the instances via instance_id. If the caller needs the server name, e.g., for logging, it can be queried with get_server_name. - + ### method `__init__` @@ -83,7 +84,7 @@ Create the object. --- - + ### method `cleanup` @@ -95,7 +96,7 @@ Cleanup unused key files and openstack keypairs. --- - + ### method `delete_instance` @@ -113,7 +114,7 @@ Delete a openstack instance. --- - + ### method `get_instance` @@ -136,7 +137,7 @@ Get OpenStack instance by instance ID. --- - + ### method `get_instances` @@ -153,7 +154,7 @@ Get all OpenStack instances. --- - + ### method `get_server_name` @@ -176,7 +177,7 @@ Get server name on OpenStack. --- - + ### method `get_ssh_connection` @@ -206,7 +207,7 @@ Get SSH connection to an OpenStack instance. --- - + ### method `launch_instance` diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 407b8fa..be23251 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -17,7 +17,7 @@ Manager for self-hosted runner on OpenStack. --- - + ## class `OpenStackCloudConfig` Configuration for OpenStack cloud authorisation information. @@ -47,7 +47,7 @@ __init__(clouds_config: dict[str, dict], cloud: str) → None --- - + ## class `OpenStackServerConfig` Configuration for OpenStack server. @@ -78,7 +78,7 @@ __init__(image: str, flavor: str, network: str) → None --- - + ## class `OpenStackRunnerManager` Manage self-hosted runner on OpenStack cloud. @@ -89,7 +89,7 @@ Manage self-hosted runner on OpenStack cloud. - `name_prefix`: The name prefix of the runners created. - + ### method `__init__` @@ -133,7 +133,7 @@ The prefix of runner names. --- - + ### method `cleanup` @@ -156,7 +156,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -186,7 +186,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -210,7 +210,7 @@ Delete self-hosted runners. --- - + ### method `flush_runners` @@ -233,7 +233,7 @@ Remove idle and/or busy runners. --- - + ### method `get_runner` @@ -256,7 +256,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src/github_runner_manager/manager/cloud_runner_manager.py b/src/github_runner_manager/manager/cloud_runner_manager.py index 0d62735..e9d25cd 100644 --- a/src/github_runner_manager/manager/cloud_runner_manager.py +++ b/src/github_runner_manager/manager/cloud_runner_manager.py @@ -94,6 +94,33 @@ def from_openstack_server_status( # pragma: no cover return state +class CloudInitStatus(str, Enum): + """Represents the state of cloud-init script. + + The cloud init script is used to launch ephemeral GitHub runners. If the script is being + initialized, GitHub runner is listening for jobs or GitHub runner is running the job, the + cloud-init script should report "running" status. + + Refer to the official documentation on cloud-init status: + https://cloudinit.readthedocs.io/en/latest/howto/status.html. + + Attributes: + NOT_STARTED: The cloud-init script has not yet been started. + RUNNING: The cloud-init script is running. + DONE: The cloud-init script has completed successfully. + ERROR: There was an error while running the cloud-init script. + DEGRADED: There was a non-critical issue while running the cloud-inits script. + DISABLED: Cloud init was disabled by other system configurations. + """ + + NOT_STARTED = "not started" + RUNNING = "running" + DONE = "done" + ERROR = "error" + DEGRADED = "degraded" + DISABLED = "disabled" + + @dataclass class GitHubRunnerConfig: """Configuration for GitHub runner spawned. diff --git a/src/github_runner_manager/openstack_cloud/openstack_cloud.py b/src/github_runner_manager/openstack_cloud/openstack_cloud.py index 9841e07..b9daa0e 100644 --- a/src/github_runner_manager/openstack_cloud/openstack_cloud.py +++ b/src/github_runner_manager/openstack_cloud/openstack_cloud.py @@ -43,18 +43,20 @@ class OpenstackInstance: """Represents an OpenStack instance. Attributes: - server_id: ID of server assigned by OpenStack. - server_name: Name of the server on OpenStack. + addresses: IP addresses assigned to the server. + created_at: The timestamp in which the instance was created at. instance_id: ID used by OpenstackCloud class to manage the instances. See docs on the OpenstackCloud. - addresses: IP addresses assigned to the server. + server_id: ID of server assigned by OpenStack. + server_name: Name of the server on OpenStack. status: Status of the server. """ + addresses: list[str] + created_at: datetime + instance_id: str server_id: str server_name: str - instance_id: str - addresses: list[str] status: str def __init__(self, server: OpenstackServer, prefix: str): @@ -67,21 +69,21 @@ def __init__(self, server: OpenstackServer, prefix: str): Raises: ValueError: Provided server should not be managed under this prefix. """ - self.server_id = server.id - self.server_name = server.name - self.status = server.status + if not server.name.startswith(f"{prefix}-"): + # Should never happen. + raise ValueError( + f"Found openstack server {server.name} managed under prefix {prefix}, contact devs" + ) self.addresses = [ address["addr"] for network_addresses in server.addresses.values() for address in network_addresses ] - - if not self.server_name.startswith(f"{prefix}-"): - # Should never happen. - raise ValueError( - f"Found openstack server {server.name} managed under prefix {prefix}, contact devs" - ) - self.instance_id = self.server_name[len(prefix) + 1 :] + self.created_at = datetime.strptime(server.created_at, "%Y-%m-%dT%H:%M:%SZ") + self.instance_id = server.name[len(prefix) + 1 :] + self.server_id = server.id + self.server_name = server.name + self.status = server.status @contextmanager diff --git a/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py b/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py index 42b3179..23fdeb1 100644 --- a/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py +++ b/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py @@ -7,6 +7,7 @@ import secrets import time from dataclasses import dataclass +from datetime import datetime, timedelta from pathlib import Path from typing import Iterator, Sequence @@ -28,6 +29,7 @@ SSHError, ) from github_runner_manager.manager.cloud_runner_manager import ( + CloudInitStatus, CloudRunnerInstance, CloudRunnerManager, CloudRunnerState, @@ -151,11 +153,11 @@ def __init__( # pylint: disable=R0913 # Setting the env var to this process and any child process spawned. proxies = service_config.proxy_config - if no_proxy := proxies.no_proxy: + if proxies and (no_proxy := proxies.no_proxy): set_env_var("NO_PROXY", no_proxy) - if http_proxy := proxies.http: + if proxies and (http_proxy := proxies.http): set_env_var("HTTP_PROXY", http_proxy) - if https_proxy := proxies.https: + if proxies and (https_proxy := proxies.https): set_env_var("HTTPS_PROXY", https_proxy) @property @@ -399,16 +401,32 @@ def _runner_health_check(self, instance: OpenstackInstance) -> bool: True if runner is healthy. """ cloud_state = CloudRunnerState.from_openstack_server_status(instance.status) - is_expected_cloud_state = cloud_state not in set( - ( - CloudRunnerState.DELETED, - CloudRunnerState.ERROR, - CloudRunnerState.STOPPED, + if cloud_state in ( + CloudRunnerState.DELETED, + CloudRunnerState.STOPPED, + ): + return False + + if cloud_state in (CloudRunnerState.ERROR, CloudRunnerState.UNKNOWN): + logger.error( + "Instance in unexpected status, failing health check. %s: %s (%s)", + cloud_state, + instance.server_name, + instance.server_id, ) - ) + return False + if cloud_state in (CloudRunnerState.CREATED,): + if datetime.now() - instance.created_at >= timedelta(hours=1): + logger.error( + "Instance in created status for too long, failing health check. %s: %s (%s)", + cloud_state, + instance.server_name, + instance.server_id, + ) + return False + return True try: - is_healthy_runner = self._health_check(instance) - return is_expected_cloud_state and is_healthy_runner + return self._health_check(instance) except SSHError: logger.exception("SSH Failed on %s, marking as unhealthy.") return False @@ -582,7 +600,13 @@ def _run_health_check(ssh_conn: SSHConnection, name: str) -> bool: Returns: Whether the health succeed. """ - result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) + result: invoke.runners.Result = ssh_conn.run("cloud-init status", warn=True) + if not result.ok: + logger.warning("cloud-init status command failed on %s: %s.", name, result.stderr) + return False + if CloudInitStatus.DONE in result.stdout: + return False + result = ssh_conn.run("ps aux", warn=True) if not result.ok: logger.warning("SSH run of `ps aux` failed on %s: %s", name, result.stderr) return False diff --git a/tests/unit/factories/openstack_factory.py b/tests/unit/factories/openstack_factory.py new file mode 100644 index 0000000..ab63910 --- /dev/null +++ b/tests/unit/factories/openstack_factory.py @@ -0,0 +1,41 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Factories for OpenStack objects.""" + +import factory +import openstack.compute.v2.server + + +class ServerFactory(factory.Factory): + """Factory class for OpenStack server. + + Attributes: + addresses: The OpenStack server addresses mapping. + created_at: The datetime string in which the server was created. + id: The OpenStack server UUID. + name: The OpenStack server name. + status: The server status. + """ + + class Meta: + """Meta class for OpenStack server. + + Attributes: + model: The metadata reference model. + """ + + model = openstack.compute.v2.server.Server + + name = "test-server" + addresses = { + "test-network-name": [ + { + "version": 4, + "addr": "10.145.236.69", + } + ] + } + created_at = "2024-09-12T02:48:03Z" + id = "e6117e39-fbb4-47bc-9461-3933f5ab6f56" + status = "ACTIVE" diff --git a/tests/unit/requirements.txt b/tests/unit/requirements.txt new file mode 100644 index 0000000..20ee3d3 --- /dev/null +++ b/tests/unit/requirements.txt @@ -0,0 +1 @@ +factory-boy==3.3.1 diff --git a/tests/unit/test_openstack_runner_manager.py b/tests/unit/test_openstack_runner_manager.py new file mode 100644 index 0000000..9550dc1 --- /dev/null +++ b/tests/unit/test_openstack_runner_manager.py @@ -0,0 +1,144 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Module for unit-testing OpenStack runner manager.""" + +import secrets +from datetime import datetime, timedelta +from unittest.mock import MagicMock + +import pytest + +from github_runner_manager.openstack_cloud import openstack_runner_manager +from tests.unit.factories import openstack_factory + + +@pytest.fixture(scope="function", name="mock_openstack_runner_manager") +def mock_openstack_runner_manager_fixture(): + """The mocked OpenStackRunnerManager instance.""" + return openstack_runner_manager.OpenStackRunnerManager( + manager_name="mock-manager", + prefix="mock-manager", + cloud_config=openstack_runner_manager.OpenStackCloudConfig( + clouds_config={ + "clouds": { + "testcloud": { + "auth": { + "auth_url": "http://test-keystone-url.com", + "password": secrets.token_hex(16), + "project_domain_name": "test-project-domain-name", + "project_name": "test-project-name", + "user_domain_name": "test-user-domain-name", + "username": "test-user-name", + } + } + } + }, + cloud="testcloud", + ), + server_config=openstack_runner_manager.OpenStackServerConfig( + image="test-image", flavor="test-flavor", network="test-network" + ), + runner_config=openstack_runner_manager.GitHubRunnerConfig( + github_path="test-org/test-repo", labels=["test-label1", "test-label2"] + ), + service_config=openstack_runner_manager.SupportServiceConfig( + proxy_config=None, + dockerhub_mirror=None, + ssh_debug_connections=None, + repo_policy_compliance=None, + ), + ) + + +@pytest.mark.parametrize( + "instance, expected_result", + [ + pytest.param( + openstack_runner_manager.OpenstackInstance( + server=openstack_factory.ServerFactory( + status="DELETED", + ), + prefix="test", + ), + False, + id="instance deleted", + ), + pytest.param( + openstack_runner_manager.OpenstackInstance( + server=openstack_factory.ServerFactory( + status="STOPPED", + ), + prefix="test", + ), + False, + id="instance stopped", + ), + pytest.param( + openstack_runner_manager.OpenstackInstance( + server=openstack_factory.ServerFactory( + status="ERROR", + ), + prefix="test", + ), + False, + id="instance in ERROR", + ), + pytest.param( + openstack_runner_manager.OpenstackInstance( + server=openstack_factory.ServerFactory( + status="UNKNOWN", + ), + prefix="test", + ), + False, + id="instance in UNKNOWN", + ), + pytest.param( + openstack_runner_manager.OpenstackInstance( + server=openstack_factory.ServerFactory( + created_at=(datetime.now() - timedelta(hours=2)).strftime( + "%Y-%m-%dT%H:%M:%SZ" + ), + status="BUILD", + ), + prefix="test", + ), + False, + id="stuck in BUILD status", + ), + pytest.param( + openstack_runner_manager.OpenstackInstance( + server=openstack_factory.ServerFactory( + created_at=datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ"), + status="BUILD", + ), + prefix="test", + ), + True, + id="just spawned", + ), + pytest.param( + openstack_runner_manager.OpenstackInstance( + server=openstack_factory.ServerFactory( + status="ACTIVE", + ), + prefix="test", + ), + True, + id="active", + ), + ], +) +def test__runner_health_check( + mock_openstack_runner_manager: openstack_runner_manager.OpenStackRunnerManager, + instance: openstack_runner_manager.OpenstackInstance, + expected_result: bool, +): + """ + arrange: given OpenStack instance in different states. + act: when _runner_health_check is called. + assert: expected health check result is returned. + """ + mock_openstack_runner_manager._health_check = MagicMock(return_value=True) + assert mock_openstack_runner_manager._runner_health_check(instance=instance) == expected_result diff --git a/tox.ini b/tox.ini index bcb8001..f34fbd6 100644 --- a/tox.ini +++ b/tox.ini @@ -76,6 +76,7 @@ deps = coverage[toml] pytest -r{toxinidir}/requirements.txt + -r{toxinidir}/tests/unit/requirements.txt commands = coverage run --source={[vars]src_path} \ -m pytest --ignore={[vars]tst_path}integration -v --tb native -s {posargs}