From ba970fd84624dc4394da2bce56139431b1706396 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Sat, 27 Jul 2024 11:09:42 +0800 Subject: [PATCH 001/278] Add openstack interface --- requirements.txt | 2 + src-docs/managed_requests.md | 32 ++ src-docs/openstack_cloud.md | 3 + src-docs/openstack_cloud.openstack_cloud.md | 125 ++++++ src-docs/openstack_cloud.openstack_manager.md | 16 +- src/openstack_cloud/openstack_cloud.py | 388 ++++++++++++++++++ src/openstack_cloud/openstack_manager.py | 35 +- tests/conftest.py | 13 + tests/integration/conftest.py | 13 + tests/integration/test_e2e.py | 2 +- tests/integration/test_openstack_cloud.py | 102 +++++ 11 files changed, 688 insertions(+), 43 deletions(-) create mode 100644 src-docs/managed_requests.md create mode 100644 src-docs/openstack_cloud.openstack_cloud.md create mode 100644 src/openstack_cloud/openstack_cloud.py create mode 100644 tests/integration/test_openstack_cloud.py diff --git a/requirements.txt b/requirements.txt index 1046b854a..927fa70c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +# TODO 2024-07-12: PyGithub-based inteface will be replacing the ghapi in the future +PyGithub ghapi jinja2 fabric >=3,<4 diff --git a/src-docs/managed_requests.md b/src-docs/managed_requests.md new file mode 100644 index 000000000..23939bcc2 --- /dev/null +++ b/src-docs/managed_requests.md @@ -0,0 +1,32 @@ + + + + +# module `managed_requests` +Get configured requests session instance + + +--- + + + +## function `get_requests_session` + +```python +get_requests_session(proxy: ProxyConfig) → Session +``` + +Get managed requests session instance. + + + +**Args:** + + - `proxy`: HTTP proxy configurations. + + + +**Returns:** + Requests session with proxy and retry setup. 
+ + diff --git a/src-docs/openstack_cloud.md b/src-docs/openstack_cloud.md index 4d82f5359..2bd698583 100644 --- a/src-docs/openstack_cloud.md +++ b/src-docs/openstack_cloud.md @@ -7,6 +7,9 @@ Module for managing Openstack cloud. **Global Variables** --------------- +- **openstack_cloud**: # Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + - **openstack_manager**: # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md new file mode 100644 index 000000000..6cb15ee29 --- /dev/null +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -0,0 +1,125 @@ + + + + +# module `openstack_cloud.openstack_cloud` + + + + + + +--- + + + +## class `OpenstackInstance` +OpenstackInstance(server: openstack.compute.v2.server.Server) + + + +### method `__init__` + +```python +__init__(server: Server) +``` + + + + + + + + + +--- + + + +## class `OpenstackCloud` + + + + + + +### method `__init__` + +```python +__init__(cloud_config: dict[str, dict], prefix: str) +``` + +Create a OpenstackCloud instance. + + + +**Args:** + + - `cloud_config`: The openstack clouds.yaml in dict format. The first cloud in the yaml is used. 
prefix: + + + + +--- + + + +### method `delete_instance` + +```python +delete_instance(name: str) +``` + + + + + +--- + + + +### method `get_instances` + +```python +get_instances(name: str) → list[OpenstackInstance] +``` + + + + + +--- + + + +### method `get_ssh_connection` + +```python +get_ssh_connection(instance: OpenstackInstance) → Connection +``` + + + + + +--- + + + +### method `launch_instance` + +```python +launch_instance( + name: str, + image: str, + flavor: str, + network: str, + userdata: str +) → OpenstackInstance +``` + + + + + + diff --git a/src-docs/openstack_cloud.openstack_manager.md b/src-docs/openstack_cloud.openstack_manager.md index 697d3d96a..e810e05d0 100644 --- a/src-docs/openstack_cloud.openstack_manager.md +++ b/src-docs/openstack_cloud.openstack_manager.md @@ -18,7 +18,7 @@ Module for handling interactions with OpenStack. --- - + ## function `create_instance_config` @@ -54,7 +54,7 @@ Create an instance config from charm data. --- - + ## class `InstanceConfig` The configuration values for creating a single runner instance. @@ -93,7 +93,7 @@ __init__( --- - + ## class `GithubRunnerRemoveError` Represents an error removing registered runner from Github. @@ -104,7 +104,7 @@ Represents an error removing registered runner from Github. --- - + ## class `OpenstackRunnerManager` Runner manager for OpenStack-based instances. @@ -117,7 +117,7 @@ Runner manager for OpenStack-based instances. - `unit_num`: The juju unit number. - `instance_name`: Prefix of the name for the set of runners. - + ### method `__init__` @@ -146,7 +146,7 @@ Construct OpenstackRunnerManager object. --- - + ### method `flush` @@ -163,7 +163,7 @@ Flush Openstack servers. --- - + ### method `get_github_runner_info` @@ -180,7 +180,7 @@ Get information on GitHub for the runners. 
--- - + ### method `reconcile` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py new file mode 100644 index 000000000..f8c601d6f --- /dev/null +++ b/src/openstack_cloud/openstack_cloud.py @@ -0,0 +1,388 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +from contextlib import contextmanager +from dataclasses import dataclass +import datetime +from functools import reduce +import logging +from pathlib import Path +import shutil +from typing import Iterable, Iterator, cast +import openstack +from openstack.compute.v2.server import Server as OpenstackServer +from openstack.compute.v2.keypair import Keypair as OpenstackKeypair +from openstack.network.v2.security_group import SecurityGroup as OpenstackSecurityGroup +from openstack.connection import Connection as OpenstackConnection +import openstack.exceptions +from fabric import Connection as SshConnection +import paramiko +from paramiko.ssh_exception import NoValidConnectionsError + +from errors import OpenStackError + +logger = logging.getLogger(__name__) + +# Update the version when the security group rules are not backward compatible. +_SECURITY_GROUP_NAME = "github-runner-v1" + +_CREATE_SERVER_TIMEOUT = 5 * 60 +_SSH_TIMEOUT= 30 +_SSH_KEY_PATH = "/home/ubuntu/.ssh" +_TEST_STRING = "test_string" + +class _SshError(Exception): + """Represents an error while interacting with SSH.""" + +@dataclass +class OpenstackInstance: + id: str + name: str + addresses: list[str] + + def __init__(self, server: OpenstackServer): + self.id = server.id + self.name = server.name + self.addresses = [ + address["addr"] + for network_addresses in server.addresses.values() + for address in network_addresses + ] + + +@contextmanager +def _create_connection(cloud_config: dict[str, dict]) -> Iterator[OpenstackConnection]: + """Create a connection context managed object, to be used within with statements. + + This method should be called with a valid cloud_config. 
See _validate_cloud_config. + Also, this method assumes that the clouds.yaml exists on ~/.config/openstack/clouds.yaml. + See charm_state.py _write_openstack_config_to_disk. + + Args: + cloud_config: The configuration in clouds.yaml format to apply. + + Raises: + OpenStackError: if the credentials provided is not authorized. + + Yields: + An openstack.connection.Connection object. + """ + clouds = list(cloud_config["clouds"].keys()) + if len(clouds) > 1: + logger.warning("Multiple clouds defined in clouds.yaml. Using the first one to connect.") + cloud_name = clouds[0] + + # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but + # I could not reproduce it. Therefore, no catch here for such exception. + try: + with openstack.connect(cloud=cloud_name) as conn: + conn.authorize() + yield conn + # pylint thinks this isn't an exception, but does inherit from Exception class. + except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause + logger.exception("OpenStack API call failure") + raise OpenStackError("Failed OpenStack API call") from exc + +class OpenstackCloud: + + def __init__(self, cloud_config: dict[str, dict], prefix: str): + """Create a OpenstackCloud instance. + + Args: + cloud_config: The openstack clouds.yaml in dict format. The first cloud in the yaml is + used. 
+ prefix: + """ + self.cloud_config = cloud_config + self.prefix = prefix + + def launch_instance(self, name: str, image: str, flavor: str, network: str, userdata: str) -> OpenstackInstance: + full_name = self._get_instance_name(name) + logger.info("Creating openstack server with %s", full_name) + + with _create_connection(cloud_config=self.cloud_config) as conn: + security_group = OpenstackCloud._ensure_security_group(conn) + keypair = OpenstackCloud._setup_key_pair(conn, full_name) + + server = conn.create_server( + name = full_name, + image = image, + key_name=keypair.name, + flavor= flavor, + network= network, + security_groups=[security_group.id], + userdata=userdata, + auto_ip=False, + timeout=_CREATE_SERVER_TIMEOUT, + wait=True, + ) + return OpenstackInstance(server) + + def delete_instance(self, name: str): + full_name = self._get_instance_name(full_name) + logger.info("Deleting openstack server with %s", full_name) + + with _create_connection(cloud_config=self.cloud_config) as conn: + server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) + server.delete() + OpenstackCloud._delete_key_pair(conn, full_name) + + def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: + key_path = OpenstackCloud._get_key_path(instance.name) + + if not key_path.exists(): + raise _SshError(f"Missing keyfile for server: {instance.name}, key path: {key_path}") + if not instance.addresses: + raise _SshError(f"No addresses found for OpenStack server {instance.name}") + + for ip in instance.addresses: + try: + connection = SshConnection( + host=ip, + user="ubuntu", + connect_kwargs={"key_filename": str(key_path)}, + connect_timeout=_SSH_TIMEOUT, + ) + result = connection.run("echo {_TEST_STRING}", warn=True, timeout=_SSH_TIMEOUT) + if not result.ok: + logger.warning( + "SSH test connection failed, server: %s, address: %s", instance.name, ip + ) + continue + if _TEST_STRING in result.stdout: + return connection + except (NoValidConnectionsError, 
TimeoutError, paramiko.ssh_exception.SSHException): + logger.warning( + "Unable to SSH into %s with address %s", + instance.name, + connection.host, + exc_info=True, + ) + continue + raise _SshError( + f"No connectable SSH addresses found, server: {instance.name}, " + f"addresses: {instance.addresses}" + ) + + def get_instances(self, name: str) -> list[OpenstackInstance]: + logger.info("Getting all openstack servers managed by the charm") + + with _create_connection(cloud_config=self.cloud_config) as conn: + servers = self._get_openstack_instances(conn) + server_names = set(server.name for server in servers) + return [ + OpenstackInstance(OpenstackCloud._get_and_ensure_unique_server(conn, name)) + for name in server_names + ] + + def _cleanup_key_files( + self, conn: OpenstackConnection, exclude_instances: Iterable[str] + ) -> None: + """Delete all SSH key files except the specified instances. + + Args: + conn: The Openstack connection instance. + exclude_instances: The keys of these instance will not be deleted. + """ + logger.info("Cleaning up SSH key files") + exclude_filename = set( + OpenstackCloud._get_key_path(instance) for instance in exclude_instances + ) + + total = 0 + deleted = 0 + for path in _SSH_KEY_PATH.iterdir(): + # Find key file from this application. + if ( + path.is_file() + and path.name.startswith(self.instance_name) + and path.name.endswith(".key") + ): + total += 1 + if path.name in exclude_filename: + continue + + keypair_name = path.name.split(".")[0] + try: + conn.delete_keypair(keypair_name) + except openstack.exceptions.SDKException: + logger.warning( + "Unable to delete OpenStack keypair associated with deleted key file %s ", + path.name, + ) + + path.unlink() + deleted += 1 + logger.info("Found %s key files, clean up %s key files", total, deleted) + + def _clean_up_openstack_keypairs( + self, conn: OpenstackConnection, exclude_instances: Iterable[str] + ) -> None: + """Delete all OpenStack keypairs except the specified instances. 
+ + Args: + conn: The Openstack connection instance. + exclude_instances: The keys of these instance will not be deleted. + """ + logger.info("Cleaning up openstack keypairs") + keypairs = conn.list_keypairs() + for key in keypairs: + # The `name` attribute is of resource.Body type. + if key.name and str(key.name).startswith(self.instance_name): + if str(key.name) in exclude_instances: + continue + + try: + conn.delete_keypair(key.name) + except openstack.exceptions.SDKException: + logger.warning( + "Unable to delete OpenStack keypair associated with deleted key file %s ", + key.name, + ) + + def _get_instance_name(self, name: str) -> str: + return f"{self.prefix}-{name}" + + def _get_openstack_instances(self, conn: OpenstackConnection) -> list[OpenstackServer]: + """Get the OpenStack servers managed by this unit. + + Args: + conn: The connection object to access OpenStack cloud. + + Returns: + List of OpenStack instances. + """ + return [ + server + for server in cast(list[OpenstackServer], conn.list_servers()) + if server.name.startswith(f"{self.prefix}-") + ] + + @staticmethod + def _get_and_ensure_unique_server(conn: OpenstackConnection, name: str) -> OpenstackServer | None: + """Get the latest server of the name and ensure it is unique. + + If multiple servers with the same name is found, the latest server in creation time is + returned. Other servers is deleted. + """ + servers: list[OpenstackServer] = conn.search_servers(name) + + latest_server = reduce(lambda a, b: a if datetime.strptime(a.created_at) < datetime.strptime(b.create_at) else b, servers) + outdated_servers = filter(lambda x: x != latest_server, servers) + for server in outdated_servers: + server.delete() + + return latest_server + + @staticmethod + def _get_key_path(name: str) -> Path: + """Get the filepath for storing private SSH of a runner. + + Args: + name: The name of the runner. + + Returns: + Path to reserved for the key file of the runner. 
+ """ + return _SSH_KEY_PATH / f"{name}.key" + + @staticmethod + def _setup_key_pair(conn: OpenstackConnection, name: str) -> OpenstackKeypair: + key_path = OpenstackCloud._get_key_path(name) + + if key_path.exists: + logger.warning("Existing private key file for %s found, removing it.", name) + key_path.unlink(missing_ok=True) + + keypair = conn.create_keypair(name=name) + key_path.write_text(keypair.private_key) + shutil.chown(key_path, user="ubuntu", group="ubuntu") + key_path.chmod(0o400) + return keypair + + @staticmethod + def _delete_key_pair(conn: OpenstackConnection, name: str) -> None: + try: + # Keypair have unique names, access by ID is not needed. + if not conn.delete_keypair(name): + logger.warning("Unable to delete keypair for %s", name) + except openstack.exceptions.SDKException: + logger.warning("Unable to delete keypair for %s", name, stack_info=True) + + key_path = OpenstackCloud._get_key_path(name) + key_path.unlink(missing_ok=True) + + @staticmethod + def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: + """Ensure runner security group exists. + + Args: + conn: The connection object to access OpenStack cloud. + + Returns: + The security group with the rules for runners. + """ + rule_exists_icmp = False + rule_exists_ssh = False + rule_exists_tmate_ssh = False + + security_group_list = conn.list_security_groups(filters={"name": _SECURITY_GROUP_NAME}) + # Pick the first security_group returned. 
+ security_group = next(iter(security_group_list), None) + if security_group is None: + logger.info("Security group %s not found, creating it", _SECURITY_GROUP_NAME) + security_group = conn.create_security_group( + name=_SECURITY_GROUP_NAME, + description="For servers managed by the github-runner charm.", + ) + else: + existing_rules = security_group.security_group_rules + for rule in existing_rules: + if rule.protocol == "icmp": + logger.debug( + "Found ICMP rule in existing security group %s of ID %s", _SECURITY_GROUP_NAME, security_group.id + ) + rule_exists_icmp = True + if ( + rule.protocol == "tcp" + and rule["port_range_min"] == rule["port_range_max"] == 22 + ): + logger.debug( + "Found SSH rule in existing security group %s of ID %s", _SECURITY_GROUP_NAME, security_group.id + ) + rule_exists_ssh = True + if ( + rule.protocol == "tcp" + and rule["port_range_min"] == rule["port_range_max"] == 10022 + ): + logger.debug( + "Found tmate SSH rule in existing security group %s of ID %s", _SECURITY_GROUP_NAME, security_group.id + ) + rule_exists_tmate_ssh = True + + if not rule_exists_icmp: + conn.create_security_group_rule( + secgroup_name_or_id=_SECURITY_GROUP_NAME, + protocol="icmp", + direction="ingress", + ethertype="IPv4", + ) + if not rule_exists_ssh: + conn.create_security_group_rule( + secgroup_name_or_id=_SECURITY_GROUP_NAME, + port_range_min="22", + port_range_max="22", + protocol="tcp", + direction="ingress", + ethertype="IPv4", + ) + if not rule_exists_tmate_ssh: + conn.create_security_group_rule( + secgroup_name_or_id=_SECURITY_GROUP_NAME, + port_range_min="10022", + port_range_max="10022", + protocol="tcp", + direction="egress", + ethertype="IPv4", + ) + return security_group diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index 7fcaa2f6f..77e42c3ef 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -59,6 +59,7 @@ from metrics import runner as 
runner_metrics from metrics import storage as metrics_storage from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME +from openstack_cloud.openstack_cloud import _create_connection from repo_policy_compliance_client import RepoPolicyComplianceClient from runner_manager import IssuedMetricEventsStats from runner_manager_type import OpenstackRunnerManagerConfig @@ -149,40 +150,6 @@ class _CloudInitUserData: proxies: Optional[ProxyConfig] = None -@contextmanager -def _create_connection(cloud_config: dict[str, dict]) -> Iterator[openstack.connection.Connection]: - """Create a connection context managed object, to be used within with statements. - - This method should be called with a valid cloud_config. See _validate_cloud_config. - Also, this method assumes that the clouds.yaml exists on ~/.config/openstack/clouds.yaml. - See charm_state.py _write_openstack_config_to_disk. - - Args: - cloud_config: The configuration in clouds.yaml format to apply. - - Raises: - OpenStackError: if the credentials provided is not authorized. - - Yields: - An openstack.connection.Connection object. - """ - clouds = list(cloud_config["clouds"].keys()) - if len(clouds) > 1: - logger.warning("Multiple clouds defined in clouds.yaml. Using the first one to connect.") - cloud_name = clouds[0] - - # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but - # I could not reproduce it. Therefore, no catch here for such exception. - try: - with openstack.connect(cloud=cloud_name) as conn: - conn.authorize() - yield conn - # pylint thinks this isn't an exception, but does inherit from Exception class. - except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause - logger.exception("OpenStack API call failure") - raise OpenStackError("Failed OpenStack API call") from exc - - # Disable too many arguments, as they are needed to create the dataclass. 
def create_instance_config( # pylint: disable=too-many-arguments app_name: str, diff --git a/tests/conftest.py b/tests/conftest.py index 7bb35c4f3..7ae97d4a6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -139,3 +139,16 @@ def pytest_addoption(parser: Parser): help="The Openstack region to authenticate to.", default=None, ) + # OpenStack integration tests + parser.addoption( + "--openstack-test-image", + action="store", + help="The image for testing openstack interfaces. Any ubuntu image should work.", + default=None, + ) + parser.addoption( + "--openstack-test-flavor", + action="store", + help="The flavor for testing openstack interfaces. The resource should be enough to boot the test image.", + default=None, + ) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 4d54c8f89..46f7e61c6 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -277,6 +277,19 @@ def flavor_name_fixture(pytestconfig: pytest.Config) -> str: assert flavor_name, "Please specify the --openstack-flavor-name command line option" return flavor_name +@pytest.fixture(scope="module", name="openstack_test_image") +def openstack_test_image_fixture(pytestconfig: pytest.Config) -> str: + """Image for testing openstack interfaces.""" + test_image = pytestconfig.getoption("--openstack-test-image") + assert test_image, "Please specify the --openstack-test-image command line option" + return test_image + +@pytest.fixture(scope="module", name="openstack_test_flavor") +def openstack_test_image_fixture(pytestconfig: pytest.Config) -> str: + """Flavor for testing openstack interfaces.""" + test_flavor = pytestconfig.getoption("--openstack-test-flavor") + assert test_flavor, "Please specify the --openstack-test-flavor command line option" + return test_flavor @pytest.fixture(scope="module", name="openstack_connection") def openstack_connection_fixture( diff --git a/tests/integration/test_e2e.py b/tests/integration/test_e2e.py index 
b3fb311ed..bed193216 100644 --- a/tests/integration/test_e2e.py +++ b/tests/integration/test_e2e.py @@ -44,7 +44,7 @@ async def test_e2e_workflow( """ arrange: An app connected to an OpenStack cloud with no runners. act: Run e2e test workflow. - assert: + assert: No exception thrown. """ virt_type: str if instance_type == InstanceType.OPENSTACK: diff --git a/tests/integration/test_openstack_cloud.py b/tests/integration/test_openstack_cloud.py new file mode 100644 index 000000000..10ae98d09 --- /dev/null +++ b/tests/integration/test_openstack_cloud.py @@ -0,0 +1,102 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Test for OpenstackCloud class integration with OpenStack.""" + +from secrets import token_hex +from typing import AsyncIterator + +import pytest +import pytest_asyncio +from openstack.connection import Connection as OpenstackConnection + +from openstack_cloud.openstack_cloud import OpenstackCloud + + +@pytest_asyncio.fixture(scope="function", name="base_openstack_cloud") +async def base_openstack_cloud_fixture(private_endpoint_clouds_config: dict[str, dict]) -> OpenstackCloud: + """Setup a OpenstackCloud object with connection to openstack.""" + return OpenstackCloud(private_endpoint_clouds_config, f"test-{token_hex(4)}") + +@pytest_asyncio.fixture(scope="function", name="openstack_cloud") +async def openstack_cloud_fixture(base_openstack_cloud: OpenstackCloud) -> OpenstackCloud: + """Ensures the OpenstackCloud object has no openstack servers.""" + instances = base_openstack_cloud.get_instances() + for instance in instances: + base_openstack_cloud.delete_instance(name=instance.name) + + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_get_no_instances(base_openstack_cloud: OpenstackCloud) -> None: + """ + arrange: No instance on OpenStack. + act: Get instances on OpenStack. + assert: An empty list returned. 
+ + Uses base_openstack_cloud as openstack_cloud_fixture relies on this test. + """ + instances = base_openstack_cloud.get_instances() + assert not instances + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_launch_instance_and_delete(base_openstack_cloud: OpenstackCloud, openstack_connection: OpenstackConnection, openstack_test_image: str, openstack_test_flavor: str) -> None: + """ + arrange: No instance on OpenStack. + act: + 1. Create an openstack instance. + 2. Delete openstack instance. + assert: + 1. Instance returned. + 2. No instance exists. + + Uses base_openstack_cloud as openstack_cloud_fixture relies on this test. + """ + instances = base_openstack_cloud.get_instances() + assert not instances, "Test arrange failure: found existing openstack instance." + + instance_name = f"{token_hex(2)}" + + # 1. + instance = base_openstack_cloud.launch_instance(name=instance_name, image=openstack_test_image, flavor=openstack_test_flavor, userdata="") + + assert instance is not None + assert instance.name is not None + assert instance.id is not None + + servers = openstack_connection.list_servers() + for server in servers: + if instance_name in server.name: + break + else: + assert False, f"OpenStack server with {instance_name} in the name not found" + + # 2. + base_openstack_cloud.delete_instance(name=instance_name) + instances = base_openstack_cloud.get_instances() + assert not instances, "Test failure: openstack instance should be deleted." + + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_instance_ssh_connection(openstack_cloud: OpenstackCloud, openstack_connection: OpenstackConnection, openstack_test_image: str, openstack_test_flavor: str) -> None: + """ + arrange: One instance on OpenStack. + act: Get SSH connection of instance and execute command. + assert: Test SSH command executed successfully. + + This tests whether the network rules (security group) are in place. 
+ """ + rand_chars = f"{token_hex(10)}" + instance_name = f"{token_hex(2)}" + instance = openstack_cloud.launch_instance(name=instance_name, image=openstack_test_image, flavor=openstack_test_flavor, userdata="") + + ssh_conn = openstack_cloud.get_ssh_connection(instance) + result = ssh_conn.run(f"echo {rand_chars}") + + assert result.ok + assert rand_chars in result.stdout From 3e6b570990c34236b898bed0fd38cc42ed693694 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Sat, 27 Jul 2024 11:12:46 +0800 Subject: [PATCH 002/278] Add openstack cloud test --- .github/workflows/integration_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 6d1499b27..349bc302c 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -33,7 +33,7 @@ jobs: pre-run-script: scripts/setup-lxd.sh provider: lxd test-tox-env: integration-juju3.2 - modules: '["test_charm_metrics_failure", "test_charm_metrics_success", "test_charm_fork_repo", "test_charm_runner", "test_reactive"]' + modules: '["test_charm_metrics_failure", "test_charm_metrics_success", "test_charm_fork_repo", "test_charm_runner", "test_reactive", "test_openstack_cloud"]' extra-arguments: "-m openstack" self-hosted-runner: true self-hosted-runner-label: stg-private-endpoint From dc136f13596993b523a43fc508fda518d1424814 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Sat, 27 Jul 2024 12:30:56 +0800 Subject: [PATCH 003/278] Fix wrong name --- tests/integration/test_openstack_cloud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_openstack_cloud.py b/tests/integration/test_openstack_cloud.py index 10ae98d09..db8a314b7 100644 --- a/tests/integration/test_openstack_cloud.py +++ b/tests/integration/test_openstack_cloud.py @@ -14,9 +14,9 @@ 
@pytest_asyncio.fixture(scope="function", name="base_openstack_cloud") -async def base_openstack_cloud_fixture(private_endpoint_clouds_config: dict[str, dict]) -> OpenstackCloud: +async def base_openstack_cloud_fixture(private_endpoint_config: dict[str, dict]) -> OpenstackCloud: """Setup a OpenstackCloud object with connection to openstack.""" - return OpenstackCloud(private_endpoint_clouds_config, f"test-{token_hex(4)}") + return OpenstackCloud(private_endpoint_config, f"test-{token_hex(4)}") @pytest_asyncio.fixture(scope="function", name="openstack_cloud") async def openstack_cloud_fixture(base_openstack_cloud: OpenstackCloud) -> OpenstackCloud: From 0a9bb4a06a50a08c321ddbbf1adc04da4f3624fd Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Sat, 27 Jul 2024 16:13:26 +0800 Subject: [PATCH 004/278] Fix typo and formatting --- src-docs/openstack_cloud.openstack_cloud.md | 18 ++-- src/openstack_cloud/openstack_cloud.py | 95 ++++++++++++--------- tests/integration/conftest.py | 7 +- tests/integration/test_openstack_cloud.py | 40 ++++++--- 4 files changed, 99 insertions(+), 61 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 6cb15ee29..d930869bc 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -11,12 +11,12 @@ --- - + ## class `OpenstackInstance` OpenstackInstance(server: openstack.compute.v2.server.Server) - + ### method `__init__` @@ -34,14 +34,14 @@ __init__(server: Server) --- - + ## class `OpenstackCloud` - + ### method `__init__` @@ -55,14 +55,14 @@ Create a OpenstackCloud instance. **Args:** - - `cloud_config`: The openstack clouds.yaml in dict format. The first cloud in the yaml is used. prefix: + - `cloud_config`: The openstack clouds.yaml in dict format. The first cloud in the yaml is used. 
prefix: --- - + ### method `delete_instance` @@ -76,7 +76,7 @@ delete_instance(name: str) --- - + ### method `get_instances` @@ -90,7 +90,7 @@ get_instances(name: str) → list[OpenstackInstance] --- - + ### method `get_ssh_connection` @@ -104,7 +104,7 @@ get_ssh_connection(instance: OpenstackInstance) → Connection --- - + ### method `launch_instance` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index f8c601d6f..226af3f9d 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -1,25 +1,26 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. +import datetime +import logging +import shutil from contextlib import contextmanager from dataclasses import dataclass -import datetime from functools import reduce -import logging from pathlib import Path -import shutil from typing import Iterable, Iterator, cast + import openstack -from openstack.compute.v2.server import Server as OpenstackServer -from openstack.compute.v2.keypair import Keypair as OpenstackKeypair -from openstack.network.v2.security_group import SecurityGroup as OpenstackSecurityGroup -from openstack.connection import Connection as OpenstackConnection import openstack.exceptions -from fabric import Connection as SshConnection import paramiko +from fabric import Connection as SshConnection +from openstack.compute.v2.keypair import Keypair as OpenstackKeypair +from openstack.compute.v2.server import Server as OpenstackServer +from openstack.connection import Connection as OpenstackConnection +from openstack.network.v2.security_group import SecurityGroup as OpenstackSecurityGroup from paramiko.ssh_exception import NoValidConnectionsError -from errors import OpenStackError +from errors import OpenStackError logger = logging.getLogger(__name__) @@ -27,19 +28,21 @@ _SECURITY_GROUP_NAME = "github-runner-v1" _CREATE_SERVER_TIMEOUT = 5 * 60 -_SSH_TIMEOUT= 30 +_SSH_TIMEOUT = 30 _SSH_KEY_PATH = 
"/home/ubuntu/.ssh" _TEST_STRING = "test_string" + class _SshError(Exception): """Represents an error while interacting with SSH.""" + @dataclass class OpenstackInstance: id: str name: str addresses: list[str] - + def __init__(self, server: OpenstackServer): self.id = server.id self.name = server.name @@ -83,33 +86,36 @@ def _create_connection(cloud_config: dict[str, dict]) -> Iterator[OpenstackConne logger.exception("OpenStack API call failure") raise OpenStackError("Failed OpenStack API call") from exc + class OpenstackCloud: - + def __init__(self, cloud_config: dict[str, dict], prefix: str): """Create a OpenstackCloud instance. - + Args: - cloud_config: The openstack clouds.yaml in dict format. The first cloud in the yaml is + cloud_config: The openstack clouds.yaml in dict format. The first cloud in the yaml is used. prefix: """ self.cloud_config = cloud_config self.prefix = prefix - - def launch_instance(self, name: str, image: str, flavor: str, network: str, userdata: str) -> OpenstackInstance: + + def launch_instance( + self, name: str, image: str, flavor: str, network: str, userdata: str + ) -> OpenstackInstance: full_name = self._get_instance_name(name) logger.info("Creating openstack server with %s", full_name) with _create_connection(cloud_config=self.cloud_config) as conn: security_group = OpenstackCloud._ensure_security_group(conn) keypair = OpenstackCloud._setup_key_pair(conn, full_name) - + server = conn.create_server( - name = full_name, - image = image, + name=full_name, + image=image, key_name=keypair.name, - flavor= flavor, - network= network, + flavor=flavor, + network=network, security_groups=[security_group.id], userdata=userdata, auto_ip=False, @@ -126,7 +132,7 @@ def delete_instance(self, name: str): server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) server.delete() OpenstackCloud._delete_key_pair(conn, full_name) - + def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: key_path = 
OpenstackCloud._get_key_path(instance.name) @@ -166,7 +172,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: def get_instances(self, name: str) -> list[OpenstackInstance]: logger.info("Getting all openstack servers managed by the charm") - + with _create_connection(cloud_config=self.cloud_config) as conn: servers = self._get_openstack_instances(conn) server_names = set(server.name for server in servers) @@ -257,21 +263,28 @@ def _get_openstack_instances(self, conn: OpenstackConnection) -> list[OpenstackS for server in cast(list[OpenstackServer], conn.list_servers()) if server.name.startswith(f"{self.prefix}-") ] - + @staticmethod - def _get_and_ensure_unique_server(conn: OpenstackConnection, name: str) -> OpenstackServer | None: + def _get_and_ensure_unique_server( + conn: OpenstackConnection, name: str + ) -> OpenstackServer | None: """Get the latest server of the name and ensure it is unique. - If multiple servers with the same name is found, the latest server in creation time is + If multiple servers with the same name is found, the latest server in creation time is returned. Other servers is deleted. 
""" servers: list[OpenstackServer] = conn.search_servers(name) - latest_server = reduce(lambda a, b: a if datetime.strptime(a.created_at) < datetime.strptime(b.create_at) else b, servers) + latest_server = reduce( + lambda a, b: ( + a if datetime.strptime(a.created_at) < datetime.strptime(b.create_at) else b + ), + servers, + ) outdated_servers = filter(lambda x: x != latest_server, servers) for server in outdated_servers: server.delete() - + return latest_server @staticmethod @@ -289,17 +302,17 @@ def _get_key_path(name: str) -> Path: @staticmethod def _setup_key_pair(conn: OpenstackConnection, name: str) -> OpenstackKeypair: key_path = OpenstackCloud._get_key_path(name) - + if key_path.exists: logger.warning("Existing private key file for %s found, removing it.", name) key_path.unlink(missing_ok=True) - + keypair = conn.create_keypair(name=name) key_path.write_text(keypair.private_key) shutil.chown(key_path, user="ubuntu", group="ubuntu") key_path.chmod(0o400) return keypair - + @staticmethod def _delete_key_pair(conn: OpenstackConnection, name: str) -> None: try: @@ -307,8 +320,8 @@ def _delete_key_pair(conn: OpenstackConnection, name: str) -> None: if not conn.delete_keypair(name): logger.warning("Unable to delete keypair for %s", name) except openstack.exceptions.SDKException: - logger.warning("Unable to delete keypair for %s", name, stack_info=True) - + logger.warning("Unable to delete keypair for %s", name, stack_info=True) + key_path = OpenstackCloud._get_key_path(name) key_path.unlink(missing_ok=True) @@ -318,14 +331,14 @@ def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: Args: conn: The connection object to access OpenStack cloud. - + Returns: The security group with the rules for runners. """ rule_exists_icmp = False rule_exists_ssh = False rule_exists_tmate_ssh = False - + security_group_list = conn.list_security_groups(filters={"name": _SECURITY_GROUP_NAME}) # Pick the first security_group returned. 
security_group = next(iter(security_group_list), None) @@ -340,7 +353,9 @@ def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: for rule in existing_rules: if rule.protocol == "icmp": logger.debug( - "Found ICMP rule in existing security group %s of ID %s", _SECURITY_GROUP_NAME, security_group.id + "Found ICMP rule in existing security group %s of ID %s", + _SECURITY_GROUP_NAME, + security_group.id, ) rule_exists_icmp = True if ( @@ -348,7 +363,9 @@ def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: and rule["port_range_min"] == rule["port_range_max"] == 22 ): logger.debug( - "Found SSH rule in existing security group %s of ID %s", _SECURITY_GROUP_NAME, security_group.id + "Found SSH rule in existing security group %s of ID %s", + _SECURITY_GROUP_NAME, + security_group.id, ) rule_exists_ssh = True if ( @@ -356,7 +373,9 @@ def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: and rule["port_range_min"] == rule["port_range_max"] == 10022 ): logger.debug( - "Found tmate SSH rule in existing security group %s of ID %s", _SECURITY_GROUP_NAME, security_group.id + "Found tmate SSH rule in existing security group %s of ID %s", + _SECURITY_GROUP_NAME, + security_group.id, ) rule_exists_tmate_ssh = True diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 46f7e61c6..25f4f1ee3 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -277,20 +277,23 @@ def flavor_name_fixture(pytestconfig: pytest.Config) -> str: assert flavor_name, "Please specify the --openstack-flavor-name command line option" return flavor_name + @pytest.fixture(scope="module", name="openstack_test_image") def openstack_test_image_fixture(pytestconfig: pytest.Config) -> str: """Image for testing openstack interfaces.""" test_image = pytestconfig.getoption("--openstack-test-image") assert test_image, "Please specify the --openstack-test-image command line option" return 
test_image - + + @pytest.fixture(scope="module", name="openstack_test_flavor") -def openstack_test_image_fixture(pytestconfig: pytest.Config) -> str: +def openstack_test_flavor_fixture(pytestconfig: pytest.Config) -> str: """Flavor for testing openstack interfaces.""" test_flavor = pytestconfig.getoption("--openstack-test-flavor") assert test_flavor, "Please specify the --openstack-test-flavor command line option" return test_flavor + @pytest.fixture(scope="module", name="openstack_connection") def openstack_connection_fixture( clouds_yaml_contents: str, app_name: str diff --git a/tests/integration/test_openstack_cloud.py b/tests/integration/test_openstack_cloud.py index db8a314b7..72d760020 100644 --- a/tests/integration/test_openstack_cloud.py +++ b/tests/integration/test_openstack_cloud.py @@ -18,6 +18,7 @@ async def base_openstack_cloud_fixture(private_endpoint_config: dict[str, dict]) """Setup a OpenstackCloud object with connection to openstack.""" return OpenstackCloud(private_endpoint_config, f"test-{token_hex(4)}") + @pytest_asyncio.fixture(scope="function", name="openstack_cloud") async def openstack_cloud_fixture(base_openstack_cloud: OpenstackCloud) -> OpenstackCloud: """Ensures the OpenstackCloud object has no openstack servers.""" @@ -34,22 +35,28 @@ async def test_get_no_instances(base_openstack_cloud: OpenstackCloud) -> None: arrange: No instance on OpenStack. act: Get instances on OpenStack. assert: An empty list returned. - + Uses base_openstack_cloud as openstack_cloud_fixture relies on this test. 
""" instances = base_openstack_cloud.get_instances() assert not instances + @pytest.mark.openstack @pytest.mark.asyncio @pytest.mark.abort_on_fail -async def test_launch_instance_and_delete(base_openstack_cloud: OpenstackCloud, openstack_connection: OpenstackConnection, openstack_test_image: str, openstack_test_flavor: str) -> None: +async def test_launch_instance_and_delete( + base_openstack_cloud: OpenstackCloud, + openstack_connection: OpenstackConnection, + openstack_test_image: str, + openstack_test_flavor: str, +) -> None: """ arrange: No instance on OpenStack. - act: + act: 1. Create an openstack instance. 2. Delete openstack instance. - assert: + assert: 1. Instance returned. 2. No instance exists. @@ -57,16 +64,18 @@ async def test_launch_instance_and_delete(base_openstack_cloud: OpenstackCloud, """ instances = base_openstack_cloud.get_instances() assert not instances, "Test arrange failure: found existing openstack instance." - + instance_name = f"{token_hex(2)}" # 1. - instance = base_openstack_cloud.launch_instance(name=instance_name, image=openstack_test_image, flavor=openstack_test_flavor, userdata="") + instance = base_openstack_cloud.launch_instance( + name=instance_name, image=openstack_test_image, flavor=openstack_test_flavor, userdata="" + ) assert instance is not None assert instance.name is not None assert instance.id is not None - + servers = openstack_connection.list_servers() for server in servers: if instance_name in server.name: @@ -78,25 +87,32 @@ async def test_launch_instance_and_delete(base_openstack_cloud: OpenstackCloud, base_openstack_cloud.delete_instance(name=instance_name) instances = base_openstack_cloud.get_instances() assert not instances, "Test failure: openstack instance should be deleted." 
- + @pytest.mark.openstack @pytest.mark.asyncio @pytest.mark.abort_on_fail -async def test_instance_ssh_connection(openstack_cloud: OpenstackCloud, openstack_connection: OpenstackConnection, openstack_test_image: str, openstack_test_flavor: str) -> None: +async def test_instance_ssh_connection( + openstack_cloud: OpenstackCloud, + openstack_connection: OpenstackConnection, + openstack_test_image: str, + openstack_test_flavor: str, +) -> None: """ arrange: One instance on OpenStack. act: Get SSH connection of instance and execute command. assert: Test SSH command executed successfully. - + This tests whether the network rules (security group) are in place. """ rand_chars = f"{token_hex(10)}" instance_name = f"{token_hex(2)}" - instance = openstack_cloud.launch_instance(name=instance_name, image=openstack_test_image, flavor=openstack_test_flavor, userdata="") + instance = openstack_cloud.launch_instance( + name=instance_name, image=openstack_test_image, flavor=openstack_test_flavor, userdata="" + ) ssh_conn = openstack_cloud.get_ssh_connection(instance) result = ssh_conn.run(f"echo {rand_chars}") - assert result.ok + assert result.ok assert rand_chars in result.stdout From 01c6f525c6167e2e9297c74c6a2b6de45386d897 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Sun, 28 Jul 2024 13:54:52 +0800 Subject: [PATCH 005/278] Fix wrong func sig --- .github/workflows/integration_test.yaml | 4 +++- src-docs/openstack_cloud.openstack_cloud.md | 2 +- src/openstack_cloud/openstack_cloud.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 349bc302c..a85614f3e 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -33,7 +33,9 @@ jobs: pre-run-script: scripts/setup-lxd.sh provider: lxd test-tox-env: integration-juju3.2 - modules: '["test_charm_metrics_failure", "test_charm_metrics_success", 
"test_charm_fork_repo", "test_charm_runner", "test_reactive", "test_openstack_cloud"]' + # TODO: debug only remove + # modules: '["test_charm_metrics_failure", "test_charm_metrics_success", "test_charm_fork_repo", "test_charm_runner", "test_reactive", "test_openstack_cloud"]' + modules: '["test_openstack_cloud"]' extra-arguments: "-m openstack" self-hosted-runner: true self-hosted-runner-label: stg-private-endpoint diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index d930869bc..7e3e41cee 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -81,7 +81,7 @@ delete_instance(name: str) ### method `get_instances` ```python -get_instances(name: str) → list[OpenstackInstance] +get_instances() → list[OpenstackInstance] ``` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 226af3f9d..e20ccf7fb 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -170,7 +170,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: f"addresses: {instance.addresses}" ) - def get_instances(self, name: str) -> list[OpenstackInstance]: + def get_instances(self) -> list[OpenstackInstance]: logger.info("Getting all openstack servers managed by the charm") with _create_connection(cloud_config=self.cloud_config) as conn: From 3969600b90ecc5f8b935e8aec39f5435b1f7b8bb Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Sun, 28 Jul 2024 15:20:00 +0800 Subject: [PATCH 006/278] Fix yaml format --- tests/integration/test_openstack_cloud.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_openstack_cloud.py b/tests/integration/test_openstack_cloud.py index 72d760020..63f2df4c8 100644 --- a/tests/integration/test_openstack_cloud.py +++ b/tests/integration/test_openstack_cloud.py @@ -9,14 +9,16 @@ 
import pytest import pytest_asyncio from openstack.connection import Connection as OpenstackConnection +import yaml from openstack_cloud.openstack_cloud import OpenstackCloud @pytest_asyncio.fixture(scope="function", name="base_openstack_cloud") -async def base_openstack_cloud_fixture(private_endpoint_config: dict[str, dict]) -> OpenstackCloud: +async def base_openstack_cloud_fixture(private_endpoint_clouds_yaml: str) -> OpenstackCloud: """Setup a OpenstackCloud object with connection to openstack.""" - return OpenstackCloud(private_endpoint_config, f"test-{token_hex(4)}") + clouds_yaml = yaml.load(private_endpoint_clouds_yaml) + return OpenstackCloud(clouds_yaml, f"test-{token_hex(4)}") @pytest_asyncio.fixture(scope="function", name="openstack_cloud") From 9bf7443a24391e22ebdcb63be6f1c164b410c4bf Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Sun, 28 Jul 2024 17:48:11 +0800 Subject: [PATCH 007/278] Fix yaml loading --- tests/integration/test_openstack_cloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_openstack_cloud.py b/tests/integration/test_openstack_cloud.py index 63f2df4c8..45c19e65c 100644 --- a/tests/integration/test_openstack_cloud.py +++ b/tests/integration/test_openstack_cloud.py @@ -17,7 +17,7 @@ @pytest_asyncio.fixture(scope="function", name="base_openstack_cloud") async def base_openstack_cloud_fixture(private_endpoint_clouds_yaml: str) -> OpenstackCloud: """Setup a OpenstackCloud object with connection to openstack.""" - clouds_yaml = yaml.load(private_endpoint_clouds_yaml) + clouds_yaml = yaml.safe_load(private_endpoint_clouds_yaml) return OpenstackCloud(clouds_yaml, f"test-{token_hex(4)}") From 1bc3b2b2e4147839830f5eb71673a85fc10a855a Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 29 Jul 2024 10:30:06 +0800 Subject: [PATCH 008/278] Fix openstack connection function --- src-docs/openstack_cloud.md | 3 -- 
src-docs/openstack_cloud.openstack_cloud.md | 22 ++++++----- src-docs/openstack_cloud.openstack_manager.md | 16 ++++---- src/openstack_cloud/openstack_cloud.py | 39 ++++++++++--------- src/openstack_cloud/openstack_manager.py | 35 ++++++++++++++++- tests/integration/test_openstack_cloud.py | 4 +- 6 files changed, 77 insertions(+), 42 deletions(-) diff --git a/src-docs/openstack_cloud.md b/src-docs/openstack_cloud.md index 2bd698583..4d82f5359 100644 --- a/src-docs/openstack_cloud.md +++ b/src-docs/openstack_cloud.md @@ -7,9 +7,6 @@ Module for managing Openstack cloud. **Global Variables** --------------- -- **openstack_cloud**: # Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - - **openstack_manager**: # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 7e3e41cee..6feb5ad2f 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -11,12 +11,12 @@ --- - + ## class `OpenstackInstance` OpenstackInstance(server: openstack.compute.v2.server.Server) - + ### method `__init__` @@ -34,19 +34,19 @@ __init__(server: Server) --- - + ## class `OpenstackCloud` - + ### method `__init__` ```python -__init__(cloud_config: dict[str, dict], prefix: str) +__init__(clouds_config: dict[str, dict], cloud: str, prefix: str) ``` Create a OpenstackCloud instance. @@ -55,14 +55,16 @@ Create a OpenstackCloud instance. **Args:** - - `cloud_config`: The openstack clouds.yaml in dict format. The first cloud in the yaml is used. prefix: + - `clouds_config`: The openstack clouds.yaml in dict format. + - `cloud`: The name of cloud to use in the clouds.yaml. + - `prefix`: Prefix attached to names of resource managed by this instance. Used for identifying which resource belongs to this instance. 
--- - + ### method `delete_instance` @@ -76,7 +78,7 @@ delete_instance(name: str) --- - + ### method `get_instances` @@ -90,7 +92,7 @@ get_instances() → list[OpenstackInstance] --- - + ### method `get_ssh_connection` @@ -104,7 +106,7 @@ get_ssh_connection(instance: OpenstackInstance) → Connection --- - + ### method `launch_instance` diff --git a/src-docs/openstack_cloud.openstack_manager.md b/src-docs/openstack_cloud.openstack_manager.md index e810e05d0..93cff6908 100644 --- a/src-docs/openstack_cloud.openstack_manager.md +++ b/src-docs/openstack_cloud.openstack_manager.md @@ -18,7 +18,7 @@ Module for handling interactions with OpenStack. --- - + ## function `create_instance_config` @@ -54,7 +54,7 @@ Create an instance config from charm data. --- - + ## class `InstanceConfig` The configuration values for creating a single runner instance. @@ -93,7 +93,7 @@ __init__( --- - + ## class `GithubRunnerRemoveError` Represents an error removing registered runner from Github. @@ -104,7 +104,7 @@ Represents an error removing registered runner from Github. --- - + ## class `OpenstackRunnerManager` Runner manager for OpenStack-based instances. @@ -117,7 +117,7 @@ Runner manager for OpenStack-based instances. - `unit_num`: The juju unit number. - `instance_name`: Prefix of the name for the set of runners. - + ### method `__init__` @@ -146,7 +146,7 @@ Construct OpenstackRunnerManager object. --- - + ### method `flush` @@ -163,7 +163,7 @@ Flush Openstack servers. --- - + ### method `get_github_runner_info` @@ -180,7 +180,7 @@ Get information on GitHub for the runners. 
--- - + ### method `reconcile` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index e20ccf7fb..90e1d3073 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -19,11 +19,14 @@ from openstack.connection import Connection as OpenstackConnection from openstack.network.v2.security_group import SecurityGroup as OpenstackSecurityGroup from paramiko.ssh_exception import NoValidConnectionsError +import yaml from errors import OpenStackError logger = logging.getLogger(__name__) +_CLOUDS_YAML_PATH = Path(Path.home() / ".config/openstack/clouds.yaml") + # Update the version when the security group rules are not backward compatible. _SECURITY_GROUP_NAME = "github-runner-v1" @@ -54,15 +57,14 @@ def __init__(self, server: OpenstackServer): @contextmanager -def _create_connection(cloud_config: dict[str, dict]) -> Iterator[OpenstackConnection]: +def _get_openstack_connection(clouds_config: dict[str, dict], cloud: str) -> Iterator[OpenstackConnection]: """Create a connection context managed object, to be used within with statements. - - This method should be called with a valid cloud_config. See _validate_cloud_config. - Also, this method assumes that the clouds.yaml exists on ~/.config/openstack/clouds.yaml. - See charm_state.py _write_openstack_config_to_disk. + + The file of _CLOUDS_YAML_PATH should only be modified by this function. Args: cloud_config: The configuration in clouds.yaml format to apply. + cloud: The name of cloud to use in the clouds.yaml. Raises: OpenStackError: if the credentials provided is not authorized. @@ -70,15 +72,14 @@ def _create_connection(cloud_config: dict[str, dict]) -> Iterator[OpenstackConne Yields: An openstack.connection.Connection object. """ - clouds = list(cloud_config["clouds"].keys()) - if len(clouds) > 1: - logger.warning("Multiple clouds defined in clouds.yaml. 
Using the first one to connect.") - cloud_name = clouds[0] + if not _CLOUDS_YAML_PATH.exists(): + _CLOUDS_YAML_PATH.parent.mkdir(parents=True, exist_ok=True) + _CLOUDS_YAML_PATH.write_text(data=yaml.dump(clouds_config), encoding="utf-8") # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but # I could not reproduce it. Therefore, no catch here for such exception. try: - with openstack.connect(cloud=cloud_name) as conn: + with openstack.connect(cloud=cloud) as conn: conn.authorize() yield conn # pylint thinks this isn't an exception, but does inherit from Exception class. @@ -89,15 +90,17 @@ def _create_connection(cloud_config: dict[str, dict]) -> Iterator[OpenstackConne class OpenstackCloud: - def __init__(self, cloud_config: dict[str, dict], prefix: str): + def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): """Create a OpenstackCloud instance. Args: - cloud_config: The openstack clouds.yaml in dict format. The first cloud in the yaml is - used. - prefix: + clouds_config: The openstack clouds.yaml in dict format. + cloud: The name of cloud to use in the clouds.yaml. + prefix: Prefix attached to names of resource managed by this instance. Used for + identifying which resource belongs to this instance. 
""" - self.cloud_config = cloud_config + self.clouds_config = clouds_config + self.cloud = cloud self.prefix = prefix def launch_instance( @@ -106,7 +109,7 @@ def launch_instance( full_name = self._get_instance_name(name) logger.info("Creating openstack server with %s", full_name) - with _create_connection(cloud_config=self.cloud_config) as conn: + with _get_openstack_connection(cloud_config=self.clouds_config, cloud=self.cloud) as conn: security_group = OpenstackCloud._ensure_security_group(conn) keypair = OpenstackCloud._setup_key_pair(conn, full_name) @@ -128,7 +131,7 @@ def delete_instance(self, name: str): full_name = self._get_instance_name(full_name) logger.info("Deleting openstack server with %s", full_name) - with _create_connection(cloud_config=self.cloud_config) as conn: + with _get_openstack_connection(cloud_config=self.clouds_config) as conn: server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) server.delete() OpenstackCloud._delete_key_pair(conn, full_name) @@ -173,7 +176,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: def get_instances(self) -> list[OpenstackInstance]: logger.info("Getting all openstack servers managed by the charm") - with _create_connection(cloud_config=self.cloud_config) as conn: + with _get_openstack_connection(cloud_config=self.clouds_config) as conn: servers = self._get_openstack_instances(conn) server_names = set(server.name for server in servers) return [ diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index 77e42c3ef..de17d412f 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -59,7 +59,6 @@ from metrics import runner as runner_metrics from metrics import storage as metrics_storage from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME -from openstack_cloud.openstack_cloud import _create_connection from repo_policy_compliance_client import RepoPolicyComplianceClient from 
runner_manager import IssuedMetricEventsStats from runner_manager_type import OpenstackRunnerManagerConfig @@ -1493,3 +1492,37 @@ def flush(self) -> int: remove_token=remove_token, ) return len(runners_to_delete) + + +@contextmanager +def _create_connection(cloud_config: dict[str, dict]) -> Iterator[OpenstackConnection]: + """Create a connection context managed object, to be used within with statements. + + This method should be called with a valid cloud_config. See _validate_cloud_config. + Also, this method assumes that the clouds.yaml exists on ~/.config/openstack/clouds.yaml. + See charm_state.py _write_openstack_config_to_disk. + + Args: + cloud_config: The configuration in clouds.yaml format to apply. + + Raises: + OpenStackError: if the credentials provided is not authorized. + + Yields: + An openstack.connection.Connection object. + """ + clouds = list(cloud_config["clouds"].keys()) + if len(clouds) > 1: + logger.warning("Multiple clouds defined in clouds.yaml. Using the first one to connect.") + cloud_name = clouds[0] + + # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but + # I could not reproduce it. Therefore, no catch here for such exception. + try: + with openstack.connect(cloud=cloud_name) as conn: + conn.authorize() + yield conn + # pylint thinks this isn't an exception, but does inherit from Exception class. 
+ except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause + logger.exception("OpenStack API call failure") + raise OpenStackError("Failed OpenStack API call") from exc \ No newline at end of file diff --git a/tests/integration/test_openstack_cloud.py b/tests/integration/test_openstack_cloud.py index 45c19e65c..8f908a331 100644 --- a/tests/integration/test_openstack_cloud.py +++ b/tests/integration/test_openstack_cloud.py @@ -8,8 +8,8 @@ import pytest import pytest_asyncio -from openstack.connection import Connection as OpenstackConnection import yaml +from openstack.connection import Connection as OpenstackConnection from openstack_cloud.openstack_cloud import OpenstackCloud @@ -18,7 +18,7 @@ async def base_openstack_cloud_fixture(private_endpoint_clouds_yaml: str) -> OpenstackCloud: """Setup a OpenstackCloud object with connection to openstack.""" clouds_yaml = yaml.safe_load(private_endpoint_clouds_yaml) - return OpenstackCloud(clouds_yaml, f"test-{token_hex(4)}") + return OpenstackCloud(clouds_yaml, "testcloud", f"test-{token_hex(4)}") @pytest_asyncio.fixture(scope="function", name="openstack_cloud") From c89725a5401df3c88607541b141749a075c18f80 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 30 Jul 2024 09:41:04 +0800 Subject: [PATCH 009/278] Fix _get_openstack_connection call --- src/openstack_cloud/openstack_cloud.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 90e1d3073..9bfb9d6c6 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -99,8 +99,8 @@ def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): prefix: Prefix attached to names of resource managed by this instance. Used for identifying which resource belongs to this instance. 
""" - self.clouds_config = clouds_config - self.cloud = cloud + self._clouds_config = clouds_config + self._cloud = cloud self.prefix = prefix def launch_instance( @@ -109,7 +109,7 @@ def launch_instance( full_name = self._get_instance_name(name) logger.info("Creating openstack server with %s", full_name) - with _get_openstack_connection(cloud_config=self.clouds_config, cloud=self.cloud) as conn: + with _get_openstack_connection(clouds_config=self._clouds_config, cloud=self._cloud) as conn: security_group = OpenstackCloud._ensure_security_group(conn) keypair = OpenstackCloud._setup_key_pair(conn, full_name) @@ -131,7 +131,7 @@ def delete_instance(self, name: str): full_name = self._get_instance_name(full_name) logger.info("Deleting openstack server with %s", full_name) - with _get_openstack_connection(cloud_config=self.clouds_config) as conn: + with _get_openstack_connection(clouds_config=self._clouds_config, cloud=self._cloud) as conn: server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) server.delete() OpenstackCloud._delete_key_pair(conn, full_name) @@ -176,7 +176,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: def get_instances(self) -> list[OpenstackInstance]: logger.info("Getting all openstack servers managed by the charm") - with _get_openstack_connection(cloud_config=self.clouds_config) as conn: + with _get_openstack_connection(clouds_config=self._clouds_config, cloud=self._cloud) as conn: servers = self._get_openstack_instances(conn) server_names = set(server.name for server in servers) return [ From 14f621c3e79a103531e2f70d256d347b54b01008 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:36:04 +0800 Subject: [PATCH 010/278] Fix integration tests --- tests/integration/test_openstack_cloud.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_openstack_cloud.py b/tests/integration/test_openstack_cloud.py 
index 8f908a331..91a45b52a 100644 --- a/tests/integration/test_openstack_cloud.py +++ b/tests/integration/test_openstack_cloud.py @@ -27,6 +27,7 @@ async def openstack_cloud_fixture(base_openstack_cloud: OpenstackCloud) -> Opens instances = base_openstack_cloud.get_instances() for instance in instances: base_openstack_cloud.delete_instance(name=instance.name) + return base_openstack_cloud @pytest.mark.openstack @@ -52,6 +53,7 @@ async def test_launch_instance_and_delete( openstack_connection: OpenstackConnection, openstack_test_image: str, openstack_test_flavor: str, + network_name: str, ) -> None: """ arrange: No instance on OpenStack. @@ -71,7 +73,7 @@ async def test_launch_instance_and_delete( # 1. instance = base_openstack_cloud.launch_instance( - name=instance_name, image=openstack_test_image, flavor=openstack_test_flavor, userdata="" + name=instance_name, image=openstack_test_image, flavor=openstack_test_flavor, network=network_name,userdata="" ) assert instance is not None @@ -96,7 +98,6 @@ async def test_launch_instance_and_delete( @pytest.mark.abort_on_fail async def test_instance_ssh_connection( openstack_cloud: OpenstackCloud, - openstack_connection: OpenstackConnection, openstack_test_image: str, openstack_test_flavor: str, ) -> None: From 74e415fc19ed4c705ac9e5f163bbbccf56f70d54 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 30 Jul 2024 13:19:56 +0800 Subject: [PATCH 011/278] Fix dict access --- src-docs/openstack_cloud.openstack_cloud.md | 14 +++++------ src/openstack_cloud/openstack_cloud.py | 28 +++++++++++++-------- src/openstack_cloud/openstack_manager.py | 2 +- tests/integration/test_openstack_cloud.py | 13 ++++++++-- 4 files changed, 37 insertions(+), 20 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 6feb5ad2f..0831662e7 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md 
@@ -34,14 +34,14 @@ __init__(server: Server) --- - + ## class `OpenstackCloud` - + ### method `__init__` @@ -57,14 +57,14 @@ Create a OpenstackCloud instance. - `clouds_config`: The openstack clouds.yaml in dict format. - `cloud`: The name of cloud to use in the clouds.yaml. - - `prefix`: Prefix attached to names of resource managed by this instance. Used for identifying which resource belongs to this instance. + - `prefix`: Prefix attached to names of resource managed by this instance. Used for identifying which resource belongs to this instance. --- - + ### method `delete_instance` @@ -78,7 +78,7 @@ delete_instance(name: str) --- - + ### method `get_instances` @@ -92,7 +92,7 @@ get_instances() → list[OpenstackInstance] --- - + ### method `get_ssh_connection` @@ -106,7 +106,7 @@ get_ssh_connection(instance: OpenstackInstance) → Connection --- - + ### method `launch_instance` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 9bfb9d6c6..14d205b7b 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -13,13 +13,13 @@ import openstack import openstack.exceptions import paramiko +import yaml from fabric import Connection as SshConnection from openstack.compute.v2.keypair import Keypair as OpenstackKeypair from openstack.compute.v2.server import Server as OpenstackServer from openstack.connection import Connection as OpenstackConnection from openstack.network.v2.security_group import SecurityGroup as OpenstackSecurityGroup from paramiko.ssh_exception import NoValidConnectionsError -import yaml from errors import OpenStackError @@ -57,9 +57,11 @@ def __init__(self, server: OpenstackServer): @contextmanager -def _get_openstack_connection(clouds_config: dict[str, dict], cloud: str) -> Iterator[OpenstackConnection]: +def _get_openstack_connection( + clouds_config: dict[str, dict], cloud: str +) -> Iterator[OpenstackConnection]: """Create a connection context managed object, to be 
used within with statements. - + The file of _CLOUDS_YAML_PATH should only be modified by this function. Args: @@ -96,7 +98,7 @@ def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): Args: clouds_config: The openstack clouds.yaml in dict format. cloud: The name of cloud to use in the clouds.yaml. - prefix: Prefix attached to names of resource managed by this instance. Used for + prefix: Prefix attached to names of resource managed by this instance. Used for identifying which resource belongs to this instance. """ self._clouds_config = clouds_config @@ -109,7 +111,9 @@ def launch_instance( full_name = self._get_instance_name(name) logger.info("Creating openstack server with %s", full_name) - with _get_openstack_connection(clouds_config=self._clouds_config, cloud=self._cloud) as conn: + with _get_openstack_connection( + clouds_config=self._clouds_config, cloud=self._cloud + ) as conn: security_group = OpenstackCloud._ensure_security_group(conn) keypair = OpenstackCloud._setup_key_pair(conn, full_name) @@ -131,7 +135,9 @@ def delete_instance(self, name: str): full_name = self._get_instance_name(full_name) logger.info("Deleting openstack server with %s", full_name) - with _get_openstack_connection(clouds_config=self._clouds_config, cloud=self._cloud) as conn: + with _get_openstack_connection( + clouds_config=self._clouds_config, cloud=self._cloud + ) as conn: server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) server.delete() OpenstackCloud._delete_key_pair(conn, full_name) @@ -176,7 +182,9 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: def get_instances(self) -> list[OpenstackInstance]: logger.info("Getting all openstack servers managed by the charm") - with _get_openstack_connection(clouds_config=self._clouds_config, cloud=self._cloud) as conn: + with _get_openstack_connection( + clouds_config=self._clouds_config, cloud=self._cloud + ) as conn: servers = self._get_openstack_instances(conn) 
server_names = set(server.name for server in servers) return [ @@ -354,7 +362,7 @@ def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: else: existing_rules = security_group.security_group_rules for rule in existing_rules: - if rule.protocol == "icmp": + if rule["protocol"] == "icmp": logger.debug( "Found ICMP rule in existing security group %s of ID %s", _SECURITY_GROUP_NAME, @@ -362,7 +370,7 @@ def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: ) rule_exists_icmp = True if ( - rule.protocol == "tcp" + rule["protocol"] == "tcp" and rule["port_range_min"] == rule["port_range_max"] == 22 ): logger.debug( @@ -372,7 +380,7 @@ def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: ) rule_exists_ssh = True if ( - rule.protocol == "tcp" + rule["protocol"] == "tcp" and rule["port_range_min"] == rule["port_range_max"] == 10022 ): logger.debug( diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index de17d412f..f5fb1f0f1 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -1525,4 +1525,4 @@ def _create_connection(cloud_config: dict[str, dict]) -> Iterator[OpenstackConne # pylint thinks this isn't an exception, but does inherit from Exception class. except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause logger.exception("OpenStack API call failure") - raise OpenStackError("Failed OpenStack API call") from exc \ No newline at end of file + raise OpenStackError("Failed OpenStack API call") from exc diff --git a/tests/integration/test_openstack_cloud.py b/tests/integration/test_openstack_cloud.py index 91a45b52a..321fbe1fa 100644 --- a/tests/integration/test_openstack_cloud.py +++ b/tests/integration/test_openstack_cloud.py @@ -73,7 +73,11 @@ async def test_launch_instance_and_delete( # 1. 
instance = base_openstack_cloud.launch_instance( - name=instance_name, image=openstack_test_image, flavor=openstack_test_flavor, network=network_name,userdata="" + name=instance_name, + image=openstack_test_image, + flavor=openstack_test_flavor, + network=network_name, + userdata="", ) assert instance is not None @@ -100,6 +104,7 @@ async def test_instance_ssh_connection( openstack_cloud: OpenstackCloud, openstack_test_image: str, openstack_test_flavor: str, + network_name: str, ) -> None: """ arrange: One instance on OpenStack. @@ -111,7 +116,11 @@ async def test_instance_ssh_connection( rand_chars = f"{token_hex(10)}" instance_name = f"{token_hex(2)}" instance = openstack_cloud.launch_instance( - name=instance_name, image=openstack_test_image, flavor=openstack_test_flavor, userdata="" + name=instance_name, + image=openstack_test_image, + flavor=openstack_test_flavor, + network=network_name, + userdata="", ) ssh_conn = openstack_cloud.get_ssh_connection(instance) From a7252ab52bd233bdec87ac63caf54f2b1568a5f4 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 30 Jul 2024 13:40:10 +0800 Subject: [PATCH 012/278] Fix SSH path --- src/openstack_cloud/openstack_cloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 14d205b7b..bb7c3940a 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -32,7 +32,7 @@ _CREATE_SERVER_TIMEOUT = 5 * 60 _SSH_TIMEOUT = 30 -_SSH_KEY_PATH = "/home/ubuntu/.ssh" +_SSH_KEY_PATH = Path("/home/ubuntu/.ssh") _TEST_STRING = "test_string" From 0ef1d0a263084a210f07fd7ebb5906ffc8951ac8 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:16:20 +0800 Subject: [PATCH 013/278] Fix wrong variable name --- src-docs/openstack_cloud.openstack_cloud.md | 14 ++++++++++++++ src/openstack_cloud/openstack_cloud.py | 
6 +++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 0831662e7..9aa364477 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -76,6 +76,20 @@ delete_instance(name: str) +--- + + + +### method `get_instance_name` + +```python +get_instance_name(name: str) → str +``` + + + + + --- diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index bb7c3940a..d41fb722a 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -108,7 +108,7 @@ def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): def launch_instance( self, name: str, image: str, flavor: str, network: str, userdata: str ) -> OpenstackInstance: - full_name = self._get_instance_name(name) + full_name = self.get_instance_name(name) logger.info("Creating openstack server with %s", full_name) with _get_openstack_connection( @@ -132,7 +132,7 @@ def launch_instance( return OpenstackInstance(server) def delete_instance(self, name: str): - full_name = self._get_instance_name(full_name) + full_name = self.get_instance_name(name) logger.info("Deleting openstack server with %s", full_name) with _get_openstack_connection( @@ -257,7 +257,7 @@ def _clean_up_openstack_keypairs( key.name, ) - def _get_instance_name(self, name: str) -> str: + def get_instance_name(self, name: str) -> str: return f"{self.prefix}-{name}" def _get_openstack_instances(self, conn: OpenstackConnection) -> list[OpenstackServer]: From a31f31085e31079188215b178b80a38671fdc854 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 31 Jul 2024 14:27:37 +0800 Subject: [PATCH 014/278] Fix server deletion --- src/openstack_cloud/openstack_cloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index d41fb722a..90b7191ad 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -139,7 +139,7 @@ def delete_instance(self, name: str): clouds_config=self._clouds_config, cloud=self._cloud ) as conn: server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) - server.delete() + conn.delete_server(name_or_id=server.id) OpenstackCloud._delete_key_pair(conn, full_name) def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: From acfa8c9a65ecc546b3dbfe4ad9dd37aa70dfaef9 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 31 Jul 2024 16:04:41 +0800 Subject: [PATCH 015/278] Add error handling of create openstack instance --- src-docs/openstack_cloud.openstack_cloud.md | 8 ++-- src/openstack_cloud/openstack_cloud.py | 41 ++++++++++++++------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 9aa364477..8956f7c8c 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -64,7 +64,7 @@ Create a OpenstackCloud instance. 
--- - + ### method `delete_instance` @@ -78,7 +78,7 @@ delete_instance(name: str) --- - + ### method `get_instance_name` @@ -92,7 +92,7 @@ get_instance_name(name: str) → str --- - + ### method `get_instances` @@ -106,7 +106,7 @@ get_instances() → list[OpenstackInstance] --- - + ### method `get_ssh_connection` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 90b7191ad..9070d6dac 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -117,18 +117,33 @@ def launch_instance( security_group = OpenstackCloud._ensure_security_group(conn) keypair = OpenstackCloud._setup_key_pair(conn, full_name) - server = conn.create_server( - name=full_name, - image=image, - key_name=keypair.name, - flavor=flavor, - network=network, - security_groups=[security_group.id], - userdata=userdata, - auto_ip=False, - timeout=_CREATE_SERVER_TIMEOUT, - wait=True, - ) + try: + server = conn.create_server( + name=full_name, + image=image, + key_name=keypair.name, + flavor=flavor, + network=network, + security_groups=[security_group.id], + userdata=userdata, + auto_ip=False, + timeout=_CREATE_SERVER_TIMEOUT, + wait=True, + ) + except openstack.exceptions.ResourceTimeout as err: + logger.exception("Timeout creating openstack server %s", full_name) + logger.info("Attempting clean up of openstack server %s that timeout during creation", full_name) + try: + conn.delete_server(name_or_id=full_name, wait=True) + except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout) as err: + logger.exception("Failed to cleanup openstack server %s that timeout during creation", full_name) + self._delete_key_pair(conn, name) + raise OpenStackError(f"Timeout creating openstack server {full_name}") from err + except openstack.exceptions.SDKException as err: + logger.exception("Failed to create openstack server %s", full_name) + self._delete_key_pair(conn, name) + raise OpenStackError(f"Failed to create 
openstack server {full_name}") from err + return OpenstackInstance(server) def delete_instance(self, name: str): @@ -330,7 +345,7 @@ def _delete_key_pair(conn: OpenstackConnection, name: str) -> None: # Keypair have unique names, access by ID is not needed. if not conn.delete_keypair(name): logger.warning("Unable to delete keypair for %s", name) - except openstack.exceptions.SDKException: + except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout) as err: logger.warning("Unable to delete keypair for %s", name, stack_info=True) key_path = OpenstackCloud._get_key_path(name) From 3f39a3e195064080c453b0cc1a76b3512df0cbd0 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 1 Aug 2024 15:22:32 +0800 Subject: [PATCH 016/278] Initial openstack runner manager refactor --- src-docs/openstack_cloud.openstack_cloud.md | 36 +- ...penstack_cloud.openstack_runner_manager.md | 138 +++++++ src/manager/cloud_runner_manager.py | 75 ++++ src/manager/runner_manager.py | 20 ++ src/openstack_cloud/openstack_cloud.py | 36 +- .../openstack_runner_manager.py | 339 ++++++++++++++++++ 6 files changed, 625 insertions(+), 19 deletions(-) create mode 100644 src-docs/openstack_cloud.openstack_runner_manager.md create mode 100644 src/manager/cloud_runner_manager.py create mode 100644 src/manager/runner_manager.py create mode 100644 src/openstack_cloud/openstack_runner_manager.py diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 8956f7c8c..126a4413e 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -14,14 +14,14 @@ ## class `OpenstackInstance` -OpenstackInstance(server: openstack.compute.v2.server.Server) +OpenstackInstance(server: openstack.compute.v2.server.Server, prefix: str) - + ### method `__init__` ```python -__init__(server: Server) +__init__(server: Server, prefix: str) ``` @@ -34,14 +34,14 @@ 
__init__(server: Server) --- - + ## class `OpenstackCloud` - + ### method `__init__` @@ -64,12 +64,12 @@ Create a OpenstackCloud instance. --- - + ### method `delete_instance` ```python -delete_instance(name: str) +delete_instance(name: str) → None ``` @@ -78,7 +78,21 @@ delete_instance(name: str) --- - + + +### method `get_instance` + +```python +get_instance(name: str) → OpenstackInstance +``` + + + + + +--- + + ### method `get_instance_name` @@ -92,7 +106,7 @@ get_instance_name(name: str) → str --- - + ### method `get_instances` @@ -106,7 +120,7 @@ get_instances() → list[OpenstackInstance] --- - + ### method `get_ssh_connection` @@ -120,7 +134,7 @@ get_ssh_connection(instance: OpenstackInstance) → Connection --- - + ### method `launch_instance` diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md new file mode 100644 index 000000000..02a1e2e4f --- /dev/null +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -0,0 +1,138 @@ + + + + +# module `openstack_cloud.openstack_runner_manager` + + + + +**Global Variables** +--------------- +- **BUILD_OPENSTACK_IMAGE_SCRIPT_FILENAME** +- **MAX_METRICS_FILE_SIZE** +- **RUNNER_STARTUP_PROCESS** +- **RUNNER_LISTENER_PROCESS** +- **RUNNER_WORKER_PROCESS** +- **CREATE_SERVER_TIMEOUT** + + +--- + + + +## class `OpenstackRunnerManagerConfig` +OpenstackRunnerManagerConfig(image: str, flavor: str, network: str, github_path: charm_state.GithubOrg | charm_state.GithubRepo, labels: list[str], proxy_config: charm_state.ProxyConfig | None, dockerhub_mirror: str | None, ssh_debug_connections: list[charm_state.SSHDebugConnection], repo_policy_url: str, repo_policy_token: str, clouds_config: dict[str, dict], cloud: str) + + + +### method `__init__` + +```python +__init__( + image: str, + flavor: str, + network: str, + github_path: GithubOrg | GithubRepo, + labels: list[str], + proxy_config: ProxyConfig | None, + dockerhub_mirror: str | None, + 
ssh_debug_connections: list[SSHDebugConnection], + repo_policy_url: str, + repo_policy_token: str, + clouds_config: dict[str, dict], + cloud: str +) → None +``` + + + + + + + + + +--- + + + +## class `OpenstackRunnerManager` + + + + + + +### method `__init__` + +```python +__init__(runner_flavor: str, config: OpenstackRunnerManagerConfig) → None +``` + + + + + + + + +--- + + + +### method `create_runner` + +```python +create_runner(registration_token: str) → str +``` + + + + + +--- + + + +### method `delete_runners` + +```python +delete_runners(id: str, remove_token: str) → None +``` + + + + + +--- + + + +### method `get_runner` + +```python +get_runner(id: str) → RunnerInstance | None +``` + + + + + +--- + + + +### method `get_runners` + +```python +get_runners( + cloud_runner_status: list[CloudRunnerStatus] +) → Tuple[RunnerInstance] +``` + + + + + + diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py new file mode 100644 index 000000000..39b2b6c94 --- /dev/null +++ b/src/manager/cloud_runner_manager.py @@ -0,0 +1,75 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +from abc import ABC +from dataclasses import dataclass +from enum import Enum +from typing import Tuple + +RunnerId = str + +_OPENSTACK_STATUS_SHUTOFF = "SHUTOFF" +_OPENSTACK_STATUS_ERROR = "ERROR" +_OPENSTACK_STATUS_ACTIVE = "ACTIVE" +_OPENSTACK_STATUS_BUILDING = "BUILDING" + +class CloudRunnerStatus(str, Enum): + created = "created" + active = "active" + deleted = "deleted" + error = "error" + stopped = "stopped" + unknown = "unknown" + unexpected = "unexpected" + + + def from_openstack_status(status: str) -> "CloudRunnerStatus": + """Create from openstack server status. + + The openstack server status are documented here: + https://docs.openstack.org/api-guide/compute/server_concepts.html + + Args: + status: Openstack server status. + + Returns: + The CloudRunnerStatus. 
+ """ + match status: + case "BUILD": + return CloudRunnerStatus.created + case "REBUILD": + return CloudRunnerStatus.created + case "ACTIVE": + return CloudRunnerStatus.active + case "ERROR": + return CloudRunnerStatus.error + case "STOPPED": + return CloudRunnerStatus.stopped + case "DELETED": + return CloudRunnerStatus.deleted + case "UNKNOWN": + return CloudRunnerStatus.unknown + case _: + return CloudRunnerStatus.unexpected + +@dataclass +class RunnerInstance: + name: str + id: str + status: CloudRunnerStatus + +@dataclass +class RunnerMetrics: + pass + +class CloudRunnerManager(ABC): + def create_runner(self, registration_token: str) -> RunnerId: ... + + def get_runner(self, id: RunnerId) -> RunnerInstance: ... + + def get_runners( + self, cloud_runner_status: list[CloudRunnerStatus] + ) -> Tuple[RunnerInstance]: ... + + def delete_runners(self, id: RunnerId, remove_token: str) -> None: ... diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py new file mode 100644 index 000000000..cb5ce7b05 --- /dev/null +++ b/src/manager/runner_manager.py @@ -0,0 +1,20 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +from dataclasses import dataclass +from enum import Enum + +from manager.cloud_runner_manager import CloudRunnerStatus, RunnerId + + +class GithubRunnerStatus(str, Enum): + busy = "busy" + idle = "idle" + offline = "offline" + +@dataclass +class RunnerInstance: + github_name: str + id: RunnerId + github_status: GithubRunnerStatus + cloud_status: CloudRunnerStatus \ No newline at end of file diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 9070d6dac..6ec16f11f 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -42,20 +42,28 @@ class _SshError(Exception): @dataclass class OpenstackInstance: - id: str + server_id: str + server_name: str name: str addresses: list[str] + status: str - def __init__(self, server: OpenstackServer): - self.id = server.id - self.name = server.name + def __init__(self, server: OpenstackServer, prefix: str): + self.server_id = server.id + self.server_name = server.name + self.status = server.status self.addresses = [ address["addr"] for network_addresses in server.addresses.values() for address in network_addresses ] + if not self.name.startswith(prefix): + # Should never happen. 
+ raise ValueError(f"Found openstack server {server.name} managed under prefix {prefix}, contact devs") + self.name = self.server_name[len(prefix):] + @contextmanager def _get_openstack_connection( clouds_config: dict[str, dict], cloud: str @@ -145,17 +153,29 @@ def launch_instance( raise OpenStackError(f"Failed to create openstack server {full_name}") from err return OpenstackInstance(server) + + def get_instance(self, name: str) -> OpenstackInstance: + full_name = self.get_instance_name(name) + logger.info("Getting openstack server with %s", full_name) + + with _get_openstack_connection( + clouds_config=self._clouds_config, cloud=self._cloud + ) as conn: + return OpenstackInstance(OpenstackCloud._get_and_ensure_unique_server(conn, full_name)) - def delete_instance(self, name: str): + def delete_instance(self, name: str) -> None: full_name = self.get_instance_name(name) logger.info("Deleting openstack server with %s", full_name) with _get_openstack_connection( clouds_config=self._clouds_config, cloud=self._cloud ) as conn: - server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) - conn.delete_server(name_or_id=server.id) - OpenstackCloud._delete_key_pair(conn, full_name) + try: + server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) + conn.delete_server(name_or_id=server.id) + OpenstackCloud._delete_key_pair(conn, full_name) + except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout) as err: + raise OpenStackError(f"Failed to remove openstack runner {full_name}") from err def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: key_path = OpenstackCloud._get_key_path(instance.name) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py new file mode 100644 index 000000000..57249314a --- /dev/null +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -0,0 +1,339 @@ +# Copyright 2024 Canonical Ltd. 
+# See LICENSE file for licensing details. + +import logging +from pathlib import Path +import secrets +from dataclasses import dataclass +import time +from typing import Tuple + +import jinja2 +from fabric import Connection as SshConnection +import paramiko +import paramiko.ssh_exception + +from charm_state import GithubOrg, GithubPath, ProxyConfig, SSHDebugConnection +from errors import CreateMetricsStorageError, GetMetricsStorageError, IssueMetricEventError, OpenStackError, RunnerCreateError, RunnerRemoveError +from manager.cloud_runner_manager import ( + CloudRunnerManager, + CloudRunnerStatus, + RunnerId, + RunnerInstance, + RunnerMetrics, +) +from openstack_cloud.openstack_cloud import OpenstackCloud +from openstack_cloud.openstack_manager import GithubRunnerRemoveError +from repo_policy_compliance_client import RepoPolicyComplianceClient +from metrics import events as metric_events +from metrics import github as github_metrics +from metrics import runner as runner_metrics +from metrics import storage as metrics_storage + +logger = logging.getLogger(__name__) + +BUILD_OPENSTACK_IMAGE_SCRIPT_FILENAME = "scripts/build-openstack-image.sh" +_SSH_KEY_PATH = Path("/home/ubuntu/.ssh") +_CONFIG_SCRIPT_PATH = Path("/home/ubuntu/actions-runner/config.sh") + +RUNNER_APPLICATION = Path("/home/ubuntu/actions-runner") +METRICS_EXCHANGE_PATH = Path("/home/ubuntu/metrics-exchange") +PRE_JOB_SCRIPT = RUNNER_APPLICATION / "pre-job.sh" +MAX_METRICS_FILE_SIZE = 1024 + +RUNNER_STARTUP_PROCESS = "/home/ubuntu/actions-runner/run.sh" +RUNNER_LISTENER_PROCESS = "Runner.Listener" +RUNNER_WORKER_PROCESS = "Runner.Worker" +CREATE_SERVER_TIMEOUT = 5 * 60 + +class _SshError(Exception): + """Represents an error while interacting with SSH.""" + +class _PullFileError(Exception): + """Represents an error while pulling a file from the runner instance.""" + +@dataclass +class OpenstackRunnerManagerConfig: + image: str + flavor: str + network: str + github_path: GithubPath + labels: list[str] + 
proxy_config: ProxyConfig | None + dockerhub_mirror: str | None + ssh_debug_connections: list[SSHDebugConnection] + repo_policy_url: str + repo_policy_token: str + clouds_config: dict[str, dict] + cloud: str + + +class OpenstackRunnerManager(CloudRunnerManager): + + def __init__( + self, runner_flavor: str, config: OpenstackRunnerManagerConfig + ) -> None: + self.runner_flavor = runner_flavor + self.config = config + self._openstack_cloud = OpenstackCloud(clouds_config=self.config.clouds_config, cloud=self.config.cloud, prefix=self.runner_flavor) + + def create_runner(self, registration_token: str) -> RunnerId: + start_timestamp = time.time() + id = OpenstackRunnerManager._generate_runner_id() + instance_name = self._openstack_cloud.get_instance_name(name=id) + userdata = self._generate_userdata(instance_name=instance_name,registration_token=registration_token) + try: + self._openstack_cloud.launch_instance( + name=id, + image=self.config.image, + flavor=self.config.flavor, + network=self.config.network, + userdata=userdata, + ) + except OpenStackError as err: + raise RunnerCreateError("Failed to create {instance_name} openstack runner") from err + end_timestamp = time.time() + OpenstackRunnerManager._issue_runner_installed_metric( + name=instance_name, + flavor=self.runner_flavor, + install_start_timestamp=start_timestamp, + install_end_timestamp=end_timestamp, + ) + return id + + def get_runner(self, id: RunnerId) -> RunnerInstance | None: + name = self._openstack_cloud.get_instance_name(id) + instances_list = self._openstack_cloud.get_instances() + for instance in instances_list: + if instance.name == name: + return RunnerInstance(name=name, id=id, status=CloudRunnerStatus.from_openstack_status(instance.status)) + return None + + def get_runners(self, cloud_runner_status: list[CloudRunnerStatus]) -> Tuple[RunnerInstance]: + instances_list = self._openstack_cloud.get_instances() + instances_list = [RunnerInstance(name=instance.name, 
id=self._openstack_cloud.convert_name(instance.name), status=CloudRunnerStatus.from_openstack_status(instance.status)) + for instance in instances_list] + return [instance for instance in instances_list if instance.status in cloud_runner_status] + + def delete_runners(self, id: RunnerId, remove_token: str) -> None: + instance = self._openstack_cloud.get_instance(id) + ssh_conn = self._openstack_cloud.get_ssh_connection(instance) + self._pull_runner_metrics(instance.name, ssh_conn) + try: + OpenstackRunnerManager._run_github_runner_removal_script(instance.name,ssh_conn, remove_token) + except GithubRunnerRemoveError: + logger.warning("Unable to run github runner removal script for %s", instance.name, stack_info=True) + + try: + self._openstack_cloud.delete_instance(id) + except OpenStackError: + logger.exception("Unable to delete openstack instance for runner %s", instance.name) + + def _generate_userdata(self, instance_name: str, registration_token: str) -> str: + jinja = jinja2.Environment( + loader=jinja2.FileSystemLoader("templates"), autoescape=True + ) + + env_contents = jinja.get_template("env.j2").render( + pre_job_script=str(PRE_JOB_SCRIPT), + dockerhub_mirror=self.config.dockerhub_mirror or "", + ssh_debug_info=(secrets.choice(self.config.ssh_debug_connections) if self.config.ssh_debug_connections else None), + # Proxies are handled by aproxy. 
+ proxies={}, + ) + + pre_job_contents_dict = { + "issue_metrics": True, + "metrics_exchange_path": str(METRICS_EXCHANGE_PATH), + "do_repo_policy_check": False, + } + repo_policy = self._get_repo_policy_compliance_client() + if repo_policy is not None: + pre_job_contents_dict.update( + { + "repo_policy_base_url": repo_policy.base_url, + "repo_policy_one_time_token": repo_policy.get_one_time_token(), + "do_repo_policy_check": True, + } + ) + + pre_job_contents = jinja.get_template("pre-job.j2").render(pre_job_contents_dict) + + runner_group = None + if isinstance(self.config.github_path, GithubOrg): + runner_group = self.config.github_path.group + aproxy_address = self.config.proxy_config.aproxy_address if self.config.proxy_config is not None else None + return jinja.get_template("openstack_userdata.sh.j2").render( + github_url=f"https://github.com/{self.config.github_path.path()}", + runner_group=runner_group, + token=registration_token, + instance_labels=",".join(self.config.labels), + instance_name=instance_name, + env_contents=env_contents, + pre_job_contents=pre_job_contents, + metrics_exchange_path=str(METRICS_EXCHANGE_PATH), + aproxy_address=aproxy_address, + dockerhub_mirror=self.config.dockerhub_mirror, + ) + + def _get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | None: + if self.config.repo_policy_url and self.config.repo_policy_token: + return RepoPolicyComplianceClient(self.config.repo_policy_url, self.config.repo_policy_token) + return None + + @staticmethod + def _generate_runner_id() -> RunnerId: + return secrets.token_hex(12) + + @staticmethod + def _issue_runner_installed_metric( + name: str, + flavor: str, + install_start_timestamp: float, + install_end_timestamp: float, + ) -> None: + try: + metric_events.issue_event( + event=metric_events.RunnerInstalled( + timestamp=install_start_timestamp, + flavor=flavor, + duration=install_start_timestamp - install_end_timestamp, + ) + ) + except IssueMetricEventError: + 
logger.exception("Failed to issue RunnerInstalled metric") + + try: + storage = metrics_storage.create(name) + except CreateMetricsStorageError: + logger.exception( + "Failed to create metrics storage for runner %s, " + "will not be able to issue all metrics.", + name, + ) + else: + try: + (storage.path / runner_metrics.RUNNER_INSTALLED_TS_FILE_NAME).write_text( + str(install_end_timestamp), encoding="utf-8" + ) + except FileNotFoundError: + logger.exception( + "Failed to write runner-installed.timestamp into metrics storage " + "for runner %s, will not be able to issue all metrics.", + name, + ) + + @staticmethod + def _pull_runner_metrics(name: str, ssh_conn: SshConnection) -> None: + try: + storage = metrics_storage.get(name) + except GetMetricsStorageError: + logger.exception( + "Failed to get shared metrics storage for runner %s, " + "will not be able to issue all metrics.", + name, + ) + return + + try: + OpenstackRunnerManager._ssh_pull_file( + ssh_conn=ssh_conn, + remote_path=str(METRICS_EXCHANGE_PATH / "pre-job-metrics.json"), + local_path=str(storage.path / "pre-job-metrics.json"), + max_size=MAX_METRICS_FILE_SIZE, + ) + OpenstackRunnerManager._ssh_pull_file( + ssh_conn=ssh_conn, + remote_path=str(METRICS_EXCHANGE_PATH / "post-job-metrics.json"), + local_path=str(storage.path / "post-job-metrics.json"), + max_size=MAX_METRICS_FILE_SIZE, + ) + except _PullFileError as exc: + logger.warning( + "Failed to pull metrics for %s: %s . Will not be able to issue all metrics", + name, + exc, + ) + + + @staticmethod + def _ssh_pull_file(ssh_conn: SshConnection, remote_path:str, local_path: str, max_size: int) -> None: + """Pull file from the runner instance. + + Args: + ssh_conn: The SSH connection instance. + remote_path: The file path on the runner instance. + local_path: The local path to store the file. + max_size: If the file is larger than this, it will not be pulled. + + Raises: + _PullFileError: Unable to pull the file from the runner instance. 
+ _SSHError: Issue with SSH connection. + """ + try: + result = ssh_conn.run(f"stat -c %s {remote_path}", warn=True) + except (TimeoutError, paramiko.ssh_exception.NoValidConnectionsError, paramiko.ssh_exception.SSHException) as exc: + raise _SshError(f"Unable to SSH into {ssh_conn.host}") from exc + if not result.ok: + logger.warning( + ( + "Unable to get file size of %s on instance %s, " + "exit code: %s, stdout: %s, stderr: %s" + ), + remote_path, + ssh_conn.host, + result.return_code, + result.stdout, + result.stderr, + ) + raise _PullFileError(f"Unable to get file size of {remote_path}") + + stdout = result.stdout + try: + stdout.strip() + size = int(stdout) + if size > max_size: + raise _PullFileError( f"File size of {remote_path} too large {size} > {max_size}") + except ValueError as exc: + raise _PullFileError(f"Invalid file size for {remote_path}: stdout") from exc + + try: + ssh_conn.get(remote=remote_path, local=local_path) + except (TimeoutError, paramiko.ssh_exception.NoValidConnectionsError, paramiko.ssh_exception.SSHException) as exc: + raise _SshError(f"Unable to SSH into {ssh_conn.host}") from exc + except OSError as exc: + raise _PullFileError(F"Unable to retrieve file {remote_path}") from exc + + @staticmethod + def _run_github_runner_removal_script(instance_name: str, ssh_conn: SshConnection, remove_token: str) -> None: + """Run Github runner removal script. + + Args: + ssh_conn: The SSH connection to the runner instance. + remove_token: The GitHub instance removal token. + + Raises: + GithubRunnerRemoveError: Unable to remove runner from GitHub. 
+ """ + try: + result = ssh_conn.run( + f"{_CONFIG_SCRIPT_PATH} remove --token {remove_token}", + warn=True, + ) + if result.ok: + return + + logger.warning( + ( + "Unable to run removal script on instance %s, " + "exit code: %s, stdout: %s, stderr: %s" + ), + instance_name, + result.return_code, + result.stdout, + result.stderr, + ) + raise GithubRunnerRemoveError(f"Failed to remove runner {instance_name} from Github.") + except (TimeoutError, paramiko.ssh_exception.NoValidConnectionsError, paramiko.ssh_exception.SSHException) as exc: + raise GithubRunnerRemoveError(f"Failed to remove runner {instance_name} from Github.") from exc + \ No newline at end of file From 941ae941d71232221d97b2cb5936b8078270025d Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 2 Aug 2024 22:38:23 +0800 Subject: [PATCH 017/278] Initial runner manager implementation --- src-docs/errors.md | 11 + src-docs/openstack_cloud.openstack_cloud.md | 36 ++- src/errors.py | 4 + src/manager/cloud_runner_manager.py | 66 ++--- src/manager/github_runner_manager.py | 58 +++++ src/manager/runner_manager.py | 109 ++++++++- src/openstack_cloud/openstack_cloud.py | 55 +++-- .../openstack_runner_manager.py | 227 ++++++++++++++---- 8 files changed, 443 insertions(+), 123 deletions(-) create mode 100644 src/manager/github_runner_manager.py diff --git a/src-docs/errors.md b/src-docs/errors.md index c0f190a73..d091b72f9 100644 --- a/src-docs/errors.md +++ b/src-docs/errors.md @@ -403,3 +403,14 @@ Represents an unauthorized connection to OpenStack. +--- + + + +## class `SshError` +Represents an error while interacting with SSH. 
+ + + + + diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 126a4413e..0b227b3ba 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -11,12 +11,12 @@ --- - + ## class `OpenstackInstance` OpenstackInstance(server: openstack.compute.v2.server.Server, prefix: str) - + ### method `__init__` @@ -34,14 +34,14 @@ __init__(server: Server, prefix: str) --- - + ## class `OpenstackCloud` - + ### method `__init__` @@ -64,7 +64,21 @@ Create a OpenstackCloud instance. --- - + + +### method `cleanup` + +```python +cleanup() → None +``` + + + + + +--- + + ### method `delete_instance` @@ -78,7 +92,7 @@ delete_instance(name: str) → None --- - + ### method `get_instance` @@ -92,7 +106,7 @@ get_instance(name: str) → OpenstackInstance --- - + ### method `get_instance_name` @@ -106,12 +120,12 @@ get_instance_name(name: str) → str --- - + ### method `get_instances` ```python -get_instances() → list[OpenstackInstance] +get_instances() → tuple[OpenstackInstance] ``` @@ -120,7 +134,7 @@ get_instances() → list[OpenstackInstance] --- - + ### method `get_ssh_connection` @@ -134,7 +148,7 @@ get_ssh_connection(instance: OpenstackInstance) → Connection --- - + ### method `launch_instance` diff --git a/src/errors.py b/src/errors.py index 55d84e8e1..0dab2a54a 100644 --- a/src/errors.py +++ b/src/errors.py @@ -166,3 +166,7 @@ class OpenStackInvalidConfigError(OpenStackError): class OpenStackUnauthorizedError(OpenStackError): """Represents an unauthorized connection to OpenStack.""" + + +class SshError(Exception): + """Represents an error while interacting with SSH.""" diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 39b2b6c94..621a26179 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -4,7 +4,7 @@ from abc import ABC from dataclasses import dataclass from enum import Enum -from typing import Tuple 
+from typing import Sequence, Tuple RunnerId = str @@ -13,63 +13,65 @@ _OPENSTACK_STATUS_ACTIVE = "ACTIVE" _OPENSTACK_STATUS_BUILDING = "BUILDING" -class CloudRunnerStatus(str, Enum): - created = "created" - active = "active" - deleted = "deleted" - error = "error" - stopped = "stopped" - unknown = "unknown" - unexpected = "unexpected" - - - def from_openstack_status(status: str) -> "CloudRunnerStatus": + +class CloudRunnerState(str, Enum): + CREATED = "created" + ACTIVE = "active" + DELETED = "deleted" + ERROR = "error" + STOPPED = "stopped" + UNKNOWN = "unknown" + UNEXPECTED = "unexpected" + + def __init__(openstack_server_status: str) -> None: """Create from openstack server status. - + The openstack server status are documented here: https://docs.openstack.org/api-guide/compute/server_concepts.html - + Args: status: Openstack server status. - - Returns: - The CloudRunnerStatus. """ - match status: + match openstack_server_status: case "BUILD": - return CloudRunnerStatus.created + return CloudRunnerState.CREATED case "REBUILD": - return CloudRunnerStatus.created + return CloudRunnerState.CREATED case "ACTIVE": - return CloudRunnerStatus.active + return CloudRunnerState.ACTIVE case "ERROR": - return CloudRunnerStatus.error + return CloudRunnerState.ERROR case "STOPPED": - return CloudRunnerStatus.stopped + return CloudRunnerState.STOPPED case "DELETED": - return CloudRunnerStatus.deleted + return CloudRunnerState.DELETED case "UNKNOWN": - return CloudRunnerStatus.unknown + return CloudRunnerState.UNKNOWN case _: - return CloudRunnerStatus.unexpected + return CloudRunnerState.UNEXPECTED + @dataclass -class RunnerInstance: +class CloudRunnerInstance: name: str id: str - status: CloudRunnerStatus + status: CloudRunnerState + @dataclass class RunnerMetrics: pass + class CloudRunnerManager(ABC): def create_runner(self, registration_token: str) -> RunnerId: ... - def get_runner(self, id: RunnerId) -> RunnerInstance: ... 
+ def get_runner(self, id: RunnerId) -> CloudRunnerInstance: ... def get_runners( - self, cloud_runner_status: list[CloudRunnerStatus] - ) -> Tuple[RunnerInstance]: ... + self, cloud_runner_status: Sequence[CloudRunnerState] + ) -> Tuple[CloudRunnerInstance]: ... + + def delete_runner(self, id: RunnerId, remove_token: str) -> None: ... - def delete_runners(self, id: RunnerId, remove_token: str) -> None: ... + def cleanup_runner(self, remove_token: str) -> None: ... diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py new file mode 100644 index 000000000..41bfbc20a --- /dev/null +++ b/src/manager/github_runner_manager.py @@ -0,0 +1,58 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +from enum import Enum, auto +from typing import Sequence + +from charm_state import GithubPath +from github_client import GithubClient +from github_type import GitHubRunnerStatus, SelfHostedRunner + + +class GithubRunnerState(str, Enum): + BUSY = "busy" + IDLE = "idle" + OFFLINE = "offline" + UNKNOWN = "unknown" + + def __init__(self, runner: SelfHostedRunner) -> "GithubRunnerState": + state = GithubRunnerState.OFFLINE + if runner.status == GitHubRunnerStatus.ONLINE: + if runner.busy: + state = GithubRunnerState.BUSY + if not runner.busy: + state = GithubRunnerState.IDLE + return state + + +class GithubRunnerManager: + + def __init__(self, prefix: str, token: str, path: GithubPath): + self._prefix = prefix + self._path = path + self._github = GithubClient(token) + + def get_runners(self, states: Sequence[GithubRunnerState]) -> tuple[SelfHostedRunner]: + runner_list = self._github.get_runner_github_info() + return tuple( + runner + for runner in runner_list + if GithubRunnerManager._filter_runner_state(runner, states) + ) + + def delete_runners(self, states: Sequence[GithubRunnerState]) -> None: + runner_list = self.get_runners(states) + for runner in runner_list: + self._github.delete_runner(self._path, runner.id) + + 
def get_registration_token(self) -> str: + return self._github.get_runner_registration_token(self._path) + + def get_removal_token(self) -> str: + return self._github.get_runner_remove_token(self._path) + + @staticmethod + def _filter_runner_state( + runner: SelfHostedRunner, states: Sequence[GithubRunnerState] + ) -> bool: + return GithubRunnerState(runner) in states diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index cb5ce7b05..c12fdf526 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -2,19 +2,110 @@ # See LICENSE file for licensing details. from dataclasses import dataclass -from enum import Enum +from enum import Enum, auto +from typing import Sequence -from manager.cloud_runner_manager import CloudRunnerStatus, RunnerId +from charm_state import GithubPath +from github_type import SelfHostedRunner +from manager.cloud_runner_manager import ( + CloudRunnerInstance, + CloudRunnerManager, + CloudRunnerState, + RunnerId, +) +from manager.github_runner_manager import GithubRunnerManager, GithubRunnerState -class GithubRunnerStatus(str, Enum): - busy = "busy" - idle = "idle" - offline = "offline" +class FlushMode(Enum): + """Strategy for flushing runners. + + Attributes: + FLUSH_IDLE: Flush idle runners. + FLUSH_BUSY: Flush busy runners. 
+ """ + + FLUSH_IDLE = auto() + FLUSH_BUSY = auto() + @dataclass class RunnerInstance: - github_name: str + name: str id: RunnerId - github_status: GithubRunnerStatus - cloud_status: CloudRunnerStatus \ No newline at end of file + github_state: GithubRunnerState + cloud_state: CloudRunnerState + + def __init__( + self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedRunner + ) -> "RunnerInstance": + self.name = github_info.name + self.id = cloud_instance.id + self.github_state = GithubRunnerState(SelfHostedRunner) + self.cloud_state = cloud_instance.status + + +@dataclass +class RunnerManagerConfig: + prefix: str + token: str + path: GithubPath + + +class RunnerManager: + + def __init__(self, cloud_runner_manager: CloudRunnerManager, config: RunnerManagerConfig): + self._config = config + self._cloud = cloud_runner_manager + self._github = GithubRunnerManager( + self._config.prefix, self._config.path, self._config.path + ) + + def create_runners(self, num: int) -> list[RunnerId]: + registration_token = self._github.get_registration_token() + + runner_ids = [] + for _ in range(num): + runner_ids.append(self._cloud.create_runner(registration_token=registration_token)) + + return runner_ids + + def get_runners( + self, + github_runner_state: Sequence[GithubRunnerState] = None, + cloud_runner_state: Sequence[CloudRunnerState] = None, + ) -> tuple[RunnerInstance]: + """Get information on runner filter by state. + + Args: + github_runner_state: Filter for the runners with these github states. If None all + states will be included. + cloud_runner_state: Filter for the runners with these cloud states. If None all states + will be included. + + Returns: + Information on the runners. 
+ """ + cloud_infos = self._cloud.get_runners(cloud_runner_status=cloud_runner_state) + github_infos = self._github.get_runners(github_runner_state) + cloud_infos_map = {info.name: info for info in cloud_infos} + github_infos_map = {info.name: info for info in github_infos} + return tuple( + RunnerInstance(cloud_infos_map[name], github_infos_map[name]) + for name in cloud_infos_map.keys() & github_infos_map.keys() + ) + + def delete_runners(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> None: + states = [GithubRunnerState.IDLE] + if flush_mode == FlushMode.FLUSH_BUSY: + states.append(GithubRunnerState.BUSY) + + runners_list = self.get_runners(github_runner_state=states) + remove_token = self._github.get_removal_token() + + for runner in runners_list: + self._cloud.delete_runners(id=runner.id, remove_token=remove_token) + + def cleanup(self) -> None: + self._github.delete_runners([GithubRunnerState.OFFLINE, GithubRunnerState.UNKNOWN]) + remove_token = self._github.get_removal_token() + self._cloud.cleanup_runner(remove_token) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 6ec16f11f..662014044 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -21,7 +21,7 @@ from openstack.network.v2.security_group import SecurityGroup as OpenstackSecurityGroup from paramiko.ssh_exception import NoValidConnectionsError -from errors import OpenStackError +from errors import OpenStackError, SshError logger = logging.getLogger(__name__) @@ -36,10 +36,6 @@ _TEST_STRING = "test_string" -class _SshError(Exception): - """Represents an error while interacting with SSH.""" - - @dataclass class OpenstackInstance: server_id: str @@ -60,10 +56,12 @@ def __init__(self, server: OpenstackServer, prefix: str): if not self.name.startswith(prefix): # Should never happen. 
- raise ValueError(f"Found openstack server {server.name} managed under prefix {prefix}, contact devs") - self.name = self.server_name[len(prefix):] + raise ValueError( + f"Found openstack server {server.name} managed under prefix {prefix}, contact devs" + ) + self.name = self.server_name[len(prefix) :] + - @contextmanager def _get_openstack_connection( clouds_config: dict[str, dict], cloud: str @@ -140,11 +138,20 @@ def launch_instance( ) except openstack.exceptions.ResourceTimeout as err: logger.exception("Timeout creating openstack server %s", full_name) - logger.info("Attempting clean up of openstack server %s that timeout during creation", full_name) + logger.info( + "Attempting clean up of openstack server %s that timeout during creation", + full_name, + ) try: conn.delete_server(name_or_id=full_name, wait=True) - except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout) as err: - logger.exception("Failed to cleanup openstack server %s that timeout during creation", full_name) + except ( + openstack.exceptions.SDKException, + openstack.exceptions.ResourceTimeout, + ) as err: + logger.exception( + "Failed to cleanup openstack server %s that timeout during creation", + full_name, + ) self._delete_key_pair(conn, name) raise OpenStackError(f"Timeout creating openstack server {full_name}") from err except openstack.exceptions.SDKException as err: @@ -153,7 +160,7 @@ def launch_instance( raise OpenStackError(f"Failed to create openstack server {full_name}") from err return OpenstackInstance(server) - + def get_instance(self, name: str) -> OpenstackInstance: full_name = self.get_instance_name(name) logger.info("Getting openstack server with %s", full_name) @@ -174,16 +181,19 @@ def delete_instance(self, name: str) -> None: server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) conn.delete_server(name_or_id=server.id) OpenstackCloud._delete_key_pair(conn, full_name) - except (openstack.exceptions.SDKException, 
openstack.exceptions.ResourceTimeout) as err: + except ( + openstack.exceptions.SDKException, + openstack.exceptions.ResourceTimeout, + ) as err: raise OpenStackError(f"Failed to remove openstack runner {full_name}") from err def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: key_path = OpenstackCloud._get_key_path(instance.name) if not key_path.exists(): - raise _SshError(f"Missing keyfile for server: {instance.name}, key path: {key_path}") + raise SshError(f"Missing keyfile for server: {instance.name}, key path: {key_path}") if not instance.addresses: - raise _SshError(f"No addresses found for OpenStack server {instance.name}") + raise SshError(f"No addresses found for OpenStack server {instance.name}") for ip in instance.addresses: try: @@ -209,12 +219,12 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: exc_info=True, ) continue - raise _SshError( + raise SshError( f"No connectable SSH addresses found, server: {instance.name}, " f"addresses: {instance.addresses}" ) - def get_instances(self) -> list[OpenstackInstance]: + def get_instances(self) -> tuple[OpenstackInstance]: logger.info("Getting all openstack servers managed by the charm") with _get_openstack_connection( @@ -227,6 +237,15 @@ def get_instances(self) -> list[OpenstackInstance]: for name in server_names ] + def cleanup(self) -> None: + with _get_openstack_connection( + clouds_config=self._clouds_config, cloud=self._cloud + ) as conn: + server_list = self._get_openstack_instances(conn) + exclude_list = [server.name for server in server_list] + self._cleanup_key_files(conn, exclude_list) + self._clean_up_openstack_keypairs(conn, exclude_list) + def _cleanup_key_files( self, conn: OpenstackConnection, exclude_instances: Iterable[str] ) -> None: @@ -295,7 +314,7 @@ def _clean_up_openstack_keypairs( def get_instance_name(self, name: str) -> str: return f"{self.prefix}-{name}" - def _get_openstack_instances(self, conn: OpenstackConnection) -> 
list[OpenstackServer]: + def _get_openstack_instances(self, conn: OpenstackConnection) -> tuple[OpenstackServer]: """Get the OpenStack servers managed by this unit. Args: diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 57249314a..95981063c 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -2,38 +2,49 @@ # See LICENSE file for licensing details. import logging -from pathlib import Path import secrets -from dataclasses import dataclass import time -from typing import Tuple +from dataclasses import dataclass +from pathlib import Path +from typing import Sequence, Tuple +import invoke import jinja2 -from fabric import Connection as SshConnection import paramiko import paramiko.ssh_exception +from fabric import Connection as SshConnection from charm_state import GithubOrg, GithubPath, ProxyConfig, SSHDebugConnection -from errors import CreateMetricsStorageError, GetMetricsStorageError, IssueMetricEventError, OpenStackError, RunnerCreateError, RunnerRemoveError +from errors import ( + CreateMetricsStorageError, + GetMetricsStorageError, + IssueMetricEventError, + OpenStackError, + RunnerCreateError, + RunnerError, + RunnerRemoveError, + RunnerStartError, + SshError, +) from manager.cloud_runner_manager import ( + CloudRunnerInstance, CloudRunnerManager, - CloudRunnerStatus, + CloudRunnerState, RunnerId, - RunnerInstance, RunnerMetrics, ) -from openstack_cloud.openstack_cloud import OpenstackCloud -from openstack_cloud.openstack_manager import GithubRunnerRemoveError -from repo_policy_compliance_client import RepoPolicyComplianceClient from metrics import events as metric_events from metrics import github as github_metrics from metrics import runner as runner_metrics from metrics import storage as metrics_storage +from openstack_cloud.openstack_cloud import OpenstackCloud, OpenstackInstance +from openstack_cloud.openstack_manager import 
GithubRunnerRemoveError +from repo_policy_compliance_client import RepoPolicyComplianceClient +from utilities import retry logger = logging.getLogger(__name__) BUILD_OPENSTACK_IMAGE_SCRIPT_FILENAME = "scripts/build-openstack-image.sh" -_SSH_KEY_PATH = Path("/home/ubuntu/.ssh") _CONFIG_SCRIPT_PATH = Path("/home/ubuntu/actions-runner/config.sh") RUNNER_APPLICATION = Path("/home/ubuntu/actions-runner") @@ -46,12 +57,11 @@ RUNNER_WORKER_PROCESS = "Runner.Worker" CREATE_SERVER_TIMEOUT = 5 * 60 -class _SshError(Exception): - """Represents an error while interacting with SSH.""" class _PullFileError(Exception): """Represents an error while pulling a file from the runner instance.""" + @dataclass class OpenstackRunnerManagerConfig: image: str @@ -70,20 +80,24 @@ class OpenstackRunnerManagerConfig: class OpenstackRunnerManager(CloudRunnerManager): - def __init__( - self, runner_flavor: str, config: OpenstackRunnerManagerConfig - ) -> None: + def __init__(self, runner_flavor: str, config: OpenstackRunnerManagerConfig) -> None: self.runner_flavor = runner_flavor self.config = config - self._openstack_cloud = OpenstackCloud(clouds_config=self.config.clouds_config, cloud=self.config.cloud, prefix=self.runner_flavor) + self._openstack_cloud = OpenstackCloud( + clouds_config=self.config.clouds_config, + cloud=self.config.cloud, + prefix=self.runner_flavor, + ) def create_runner(self, registration_token: str) -> RunnerId: start_timestamp = time.time() id = OpenstackRunnerManager._generate_runner_id() instance_name = self._openstack_cloud.get_instance_name(name=id) - userdata = self._generate_userdata(instance_name=instance_name,registration_token=registration_token) + userdata = self._generate_userdata( + instance_name=instance_name, registration_token=registration_token + ) try: - self._openstack_cloud.launch_instance( + instance = self._openstack_cloud.launch_instance( name=id, image=self.config.image, flavor=self.config.flavor, @@ -92,6 +106,16 @@ def create_runner(self, 
registration_token: str) -> RunnerId: ) except OpenStackError as err: raise RunnerCreateError("Failed to create {instance_name} openstack runner") from err + + try: + ssh_conn = self._openstack_cloud.get_ssh_connection(instance) + except SshError as err: + raise RunnerCreateError( + "Failed to SSH connect to {instance_name} openstack runner" + ) from err + + OpenstackRunnerManager._wait_runner_startup(ssh_conn, instance_name) + end_timestamp = time.time() OpenstackRunnerManager._issue_runner_installed_metric( name=instance_name, @@ -101,43 +125,78 @@ def create_runner(self, registration_token: str) -> RunnerId: ) return id - def get_runner(self, id: RunnerId) -> RunnerInstance | None: + def get_runner(self, id: RunnerId) -> CloudRunnerInstance | None: name = self._openstack_cloud.get_instance_name(id) instances_list = self._openstack_cloud.get_instances() for instance in instances_list: if instance.name == name: - return RunnerInstance(name=name, id=id, status=CloudRunnerStatus.from_openstack_status(instance.status)) + return CloudRunnerInstance( + name=name, + id=id, + status=CloudRunnerState(instance.status), + ) return None - def get_runners(self, cloud_runner_status: list[CloudRunnerStatus]) -> Tuple[RunnerInstance]: + def get_runners( + self, cloud_runner_status: Sequence[CloudRunnerState] + ) -> Tuple[CloudRunnerInstance]: instances_list = self._openstack_cloud.get_instances() - instances_list = [RunnerInstance(name=instance.name, id=self._openstack_cloud.convert_name(instance.name), status=CloudRunnerStatus.from_openstack_status(instance.status)) - for instance in instances_list] + instances_list = [ + CloudRunnerInstance( + name=instance.name, + id=self._openstack_cloud.convert_name(instance.name), + status=CloudRunnerState(instance.status), + ) + for instance in instances_list + ] return [instance for instance in instances_list if instance.status in cloud_runner_status] - def delete_runners(self, id: RunnerId, remove_token: str) -> None: + def 
delete_runner(self, id: RunnerId, remove_token: str) -> None: instance = self._openstack_cloud.get_instance(id) + self._delete_runner(instance, remove_token) + + def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) self._pull_runner_metrics(instance.name, ssh_conn) try: - OpenstackRunnerManager._run_github_runner_removal_script(instance.name,ssh_conn, remove_token) + OpenstackRunnerManager._run_github_runner_removal_script( + instance.name, ssh_conn, remove_token + ) except GithubRunnerRemoveError: - logger.warning("Unable to run github runner removal script for %s", instance.name, stack_info=True) + logger.warning( + "Unable to run github runner removal script for %s", instance.name, stack_info=True + ) try: self._openstack_cloud.delete_instance(id) except OpenStackError: logger.exception("Unable to delete openstack instance for runner %s", instance.name) + def cleanup(self, remove_token: str) -> None: + runner_list = self._openstack_cloud.get_instances() + + for runner in runner_list: + state = CloudRunnerState(runner.status) + if state in ( + CloudRunnerState.DELETED, + CloudRunnerState.ERROR, + CloudRunnerState.STOPPED, + ) or self._health_check(runner): + self._delete_runner(runner, remove_token) + + self._openstack_cloud.cleanup() + def _generate_userdata(self, instance_name: str, registration_token: str) -> str: - jinja = jinja2.Environment( - loader=jinja2.FileSystemLoader("templates"), autoescape=True - ) - + jinja = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"), autoescape=True) + env_contents = jinja.get_template("env.j2").render( pre_job_script=str(PRE_JOB_SCRIPT), dockerhub_mirror=self.config.dockerhub_mirror or "", - ssh_debug_info=(secrets.choice(self.config.ssh_debug_connections) if self.config.ssh_debug_connections else None), + ssh_debug_info=( + secrets.choice(self.config.ssh_debug_connections) + if self.config.ssh_debug_connections + else None 
+ ), # Proxies are handled by aproxy. proxies={}, ) @@ -158,11 +217,15 @@ def _generate_userdata(self, instance_name: str, registration_token: str) -> str ) pre_job_contents = jinja.get_template("pre-job.j2").render(pre_job_contents_dict) - + runner_group = None if isinstance(self.config.github_path, GithubOrg): runner_group = self.config.github_path.group - aproxy_address = self.config.proxy_config.aproxy_address if self.config.proxy_config is not None else None + aproxy_address = ( + self.config.proxy_config.aproxy_address + if self.config.proxy_config is not None + else None + ) return jinja.get_template("openstack_userdata.sh.j2").render( github_url=f"https://github.com/{self.config.github_path.path()}", runner_group=runner_group, @@ -175,12 +238,54 @@ def _generate_userdata(self, instance_name: str, registration_token: str) -> str aproxy_address=aproxy_address, dockerhub_mirror=self.config.dockerhub_mirror, ) - + def _get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | None: if self.config.repo_policy_url and self.config.repo_policy_token: - return RepoPolicyComplianceClient(self.config.repo_policy_url, self.config.repo_policy_token) + return RepoPolicyComplianceClient( + self.config.repo_policy_url, self.config.repo_policy_token + ) return None + def _health_check(self, instance: OpenstackInstance) -> bool: + try: + ssh_conn = self._openstack_cloud.get_ssh_connection(instance) + except SshError: + logger.exception("SSH connection failure with %s", instance.name) + return False + try: + OpenstackRunnerManager._run_health_check(ssh_conn, instance.name) + except RunnerError: + logger.exception("Health check failure for %s", instance.name) + return False + logger.info("Health check success for %s", instance.name) + return True + + @retry(tries=3, delay=60, local_logger=logger) + @staticmethod + def _run_health_check(ssh_conn: SshConnection, name: str): + result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) + if not result.ok: + 
logger.warning("SSH run of `ps aux` failed on %s", name) + raise RunnerError(f"Unable to SSH run `ps aux` on {name}") + if ( + RUNNER_WORKER_PROCESS not in result.stdout + and RUNNER_LISTENER_PROCESS not in result.stdout + ): + logger.warning("Runner process not found on %s", name) + raise RunnerError(f"Runner process not found on {name}") + + @retry(tries=10, delay=60, local_logger=logger) + @staticmethod + def _wait_runner_startup(ssh_conn: SshConnection, name: str) -> None: + result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) + if not result.ok: + logger.warning("SSH run of `ps aux` failed on %s", name) + raise RunnerStartError(f"Unable to SSH run `ps aux` on {name}") + if RUNNER_STARTUP_PROCESS not in result.stdout: + logger.warning("Runner startup process not found on %s", name) + return RunnerStartError(f"Runner startup process not found on {name}") + logger.info("Runner startup process found to be healthy on %s", name) + @staticmethod def _generate_runner_id() -> RunnerId: return secrets.token_hex(12) @@ -202,7 +307,7 @@ def _issue_runner_installed_metric( ) except IssueMetricEventError: logger.exception("Failed to issue RunnerInstalled metric") - + try: storage = metrics_storage.create(name) except CreateMetricsStorageError: @@ -223,7 +328,7 @@ def _issue_runner_installed_metric( name, ) - @staticmethod + @staticmethod def _pull_runner_metrics(name: str, ssh_conn: SshConnection) -> None: try: storage = metrics_storage.get(name) @@ -255,9 +360,10 @@ def _pull_runner_metrics(name: str, ssh_conn: SshConnection) -> None: exc, ) - @staticmethod - def _ssh_pull_file(ssh_conn: SshConnection, remote_path:str, local_path: str, max_size: int) -> None: + def _ssh_pull_file( + ssh_conn: SshConnection, remote_path: str, local_path: str, max_size: int + ) -> None: """Pull file from the runner instance. 
Args: @@ -272,8 +378,12 @@ def _ssh_pull_file(ssh_conn: SshConnection, remote_path:str, local_path: str, ma """ try: result = ssh_conn.run(f"stat -c %s {remote_path}", warn=True) - except (TimeoutError, paramiko.ssh_exception.NoValidConnectionsError, paramiko.ssh_exception.SSHException) as exc: - raise _SshError(f"Unable to SSH into {ssh_conn.host}") from exc + except ( + TimeoutError, + paramiko.ssh_exception.NoValidConnectionsError, + paramiko.ssh_exception.SSHException, + ) as exc: + raise SshError(f"Unable to SSH into {ssh_conn.host}") from exc if not result.ok: logger.warning( ( @@ -293,19 +403,25 @@ def _ssh_pull_file(ssh_conn: SshConnection, remote_path:str, local_path: str, ma stdout.strip() size = int(stdout) if size > max_size: - raise _PullFileError( f"File size of {remote_path} too large {size} > {max_size}") + raise _PullFileError(f"File size of {remote_path} too large {size} > {max_size}") except ValueError as exc: raise _PullFileError(f"Invalid file size for {remote_path}: stdout") from exc - + try: ssh_conn.get(remote=remote_path, local=local_path) - except (TimeoutError, paramiko.ssh_exception.NoValidConnectionsError, paramiko.ssh_exception.SSHException) as exc: - raise _SshError(f"Unable to SSH into {ssh_conn.host}") from exc + except ( + TimeoutError, + paramiko.ssh_exception.NoValidConnectionsError, + paramiko.ssh_exception.SSHException, + ) as exc: + raise SshError(f"Unable to SSH into {ssh_conn.host}") from exc except OSError as exc: - raise _PullFileError(F"Unable to retrieve file {remote_path}") from exc - + raise _PullFileError(f"Unable to retrieve file {remote_path}") from exc + @staticmethod - def _run_github_runner_removal_script(instance_name: str, ssh_conn: SshConnection, remove_token: str) -> None: + def _run_github_runner_removal_script( + instance_name: str, ssh_conn: SshConnection, remove_token: str + ) -> None: """Run Github runner removal script. 
Args: @@ -319,7 +435,7 @@ def _run_github_runner_removal_script(instance_name: str, ssh_conn: SshConnectio result = ssh_conn.run( f"{_CONFIG_SCRIPT_PATH} remove --token {remove_token}", warn=True, - ) + ) if result.ok: return @@ -334,6 +450,11 @@ def _run_github_runner_removal_script(instance_name: str, ssh_conn: SshConnectio result.stderr, ) raise GithubRunnerRemoveError(f"Failed to remove runner {instance_name} from Github.") - except (TimeoutError, paramiko.ssh_exception.NoValidConnectionsError, paramiko.ssh_exception.SSHException) as exc: - raise GithubRunnerRemoveError(f"Failed to remove runner {instance_name} from Github.") from exc - \ No newline at end of file + except ( + TimeoutError, + paramiko.ssh_exception.NoValidConnectionsError, + paramiko.ssh_exception.SSHException, + ) as exc: + raise GithubRunnerRemoveError( + f"Failed to remove runner {instance_name} from Github." + ) from exc From 00ec04e346eb40fc77ad00c291bee7bb42fec4d7 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Sat, 3 Aug 2024 16:11:01 +0800 Subject: [PATCH 018/278] Add inital integration test --- ...penstack_cloud.openstack_runner_manager.md | 66 ++++++++---- src/manager/cloud_runner_manager.py | 7 +- src/manager/github_runner_manager.py | 10 +- src/manager/runner_manager.py | 5 +- .../openstack_runner_manager.py | 29 ++--- tests/integration/conftest.py | 2 +- .../test_runner_manager_openstack.py | 100 ++++++++++++++++++ 7 files changed, 177 insertions(+), 42 deletions(-) create mode 100644 tests/integration/test_runner_manager_openstack.py diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 02a1e2e4f..537b4b776 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -19,10 +19,10 @@ --- - + ## class `OpenstackRunnerManagerConfig` -OpenstackRunnerManagerConfig(image: str, flavor: str, network: str, 
github_path: charm_state.GithubOrg | charm_state.GithubRepo, labels: list[str], proxy_config: charm_state.ProxyConfig | None, dockerhub_mirror: str | None, ssh_debug_connections: list[charm_state.SSHDebugConnection], repo_policy_url: str, repo_policy_token: str, clouds_config: dict[str, dict], cloud: str) +OpenstackRunnerManagerConfig(clouds_config: dict[str, dict], cloud: str, image: str, flavor: str, network: str, github_path: charm_state.GithubOrg | charm_state.GithubRepo, labels: list[str], proxy_config: charm_state.ProxyConfig | None, dockerhub_mirror: str | None, ssh_debug_connections: list[charm_state.SSHDebugConnection] | None, repo_policy_url: str | None, repo_policy_token: str | None) @@ -30,6 +30,8 @@ OpenstackRunnerManagerConfig(image: str, flavor: str, network: str, github_path: ```python __init__( + clouds_config: dict[str, dict], + cloud: str, image: str, flavor: str, network: str, @@ -37,11 +39,9 @@ __init__( labels: list[str], proxy_config: ProxyConfig | None, dockerhub_mirror: str | None, - ssh_debug_connections: list[SSHDebugConnection], - repo_policy_url: str, - repo_policy_token: str, - clouds_config: dict[str, dict], - cloud: str + ssh_debug_connections: list[SSHDebugConnection] | None, + repo_policy_url: str | None, + repo_policy_token: str | None ) → None ``` @@ -55,19 +55,19 @@ __init__( --- - + ## class `OpenstackRunnerManager` - + ### method `__init__` ```python -__init__(runner_flavor: str, config: OpenstackRunnerManagerConfig) → None +__init__(prefix: str, config: OpenstackRunnerManagerConfig) → None ``` @@ -79,7 +79,21 @@ __init__(runner_flavor: str, config: OpenstackRunnerManagerConfig) → None --- - + + +### method `cleanup` + +```python +cleanup(remove_token: str) → None +``` + + + + + +--- + + ### method `create_runner` @@ -93,12 +107,26 @@ create_runner(registration_token: str) → str --- - + + +### method `delete_runner` + +```python +delete_runner(id: str, remove_token: str) → None +``` + + + + + +--- + + -### method 
`delete_runners` +### method `get_name_prefix` ```python -delete_runners(id: str, remove_token: str) → None +get_name_prefix() → str ``` @@ -107,12 +135,12 @@ delete_runners(id: str, remove_token: str) → None --- - + ### method `get_runner` ```python -get_runner(id: str) → RunnerInstance | None +get_runner(id: str) → CloudRunnerInstance | None ``` @@ -121,14 +149,14 @@ get_runner(id: str) → RunnerInstance | None --- - + ### method `get_runners` ```python get_runners( - cloud_runner_status: list[CloudRunnerStatus] -) → Tuple[RunnerInstance] + cloud_runner_status: Sequence[CloudRunnerState] +) → Tuple[CloudRunnerInstance] ``` diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 621a26179..b342c7254 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -23,7 +23,8 @@ class CloudRunnerState(str, Enum): UNKNOWN = "unknown" UNEXPECTED = "unexpected" - def __init__(openstack_server_status: str) -> None: + @staticmethod + def from_openstack_server_status(openstack_server_status: str) -> None: """Create from openstack server status. The openstack server status are documented here: @@ -55,7 +56,7 @@ def __init__(openstack_server_status: str) -> None: class CloudRunnerInstance: name: str id: str - status: CloudRunnerState + state: CloudRunnerState @dataclass @@ -64,6 +65,8 @@ class RunnerMetrics: class CloudRunnerManager(ABC): + def get_name_prefix(self) -> str: ... + def create_runner(self, registration_token: str) -> RunnerId: ... def get_runner(self, id: RunnerId) -> CloudRunnerInstance: ... 
diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 41bfbc20a..3e8972e10 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -15,7 +15,8 @@ class GithubRunnerState(str, Enum): OFFLINE = "offline" UNKNOWN = "unknown" - def __init__(self, runner: SelfHostedRunner) -> "GithubRunnerState": + @staticmethod + def from_runner(runner: SelfHostedRunner) -> "GithubRunnerState": state = GithubRunnerState.OFFLINE if runner.status == GitHubRunnerStatus.ONLINE: if runner.busy: @@ -33,11 +34,12 @@ def __init__(self, prefix: str, token: str, path: GithubPath): self._github = GithubClient(token) def get_runners(self, states: Sequence[GithubRunnerState]) -> tuple[SelfHostedRunner]: - runner_list = self._github.get_runner_github_info() + runner_list = self._github.get_runner_github_info(self._path) return tuple( runner for runner in runner_list - if GithubRunnerManager._filter_runner_state(runner, states) + if runner.name.startswith(self._prefix) + and GithubRunnerManager._filter_runner_state(runner, states) ) def delete_runners(self, states: Sequence[GithubRunnerState]) -> None: @@ -55,4 +57,4 @@ def get_removal_token(self) -> str: def _filter_runner_state( runner: SelfHostedRunner, states: Sequence[GithubRunnerState] ) -> bool: - return GithubRunnerState(runner) in states + return GithubRunnerState.from_runner(runner) in states diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index c12fdf526..3970587e4 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -41,12 +41,11 @@ def __init__( self.name = github_info.name self.id = cloud_instance.id self.github_state = GithubRunnerState(SelfHostedRunner) - self.cloud_state = cloud_instance.status + self.cloud_state = cloud_instance.state @dataclass class RunnerManagerConfig: - prefix: str token: str path: GithubPath @@ -57,7 +56,7 @@ def __init__(self, cloud_runner_manager: CloudRunnerManager, config: 
RunnerManag self._config = config self._cloud = cloud_runner_manager self._github = GithubRunnerManager( - self._config.prefix, self._config.path, self._config.path + prefix=self._cloud.get_name_prefix(), token=self._config.token, path=self._config.path ) def create_runners(self, num: int) -> list[RunnerId]: diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 95981063c..307e72f5d 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -64,6 +64,8 @@ class _PullFileError(Exception): @dataclass class OpenstackRunnerManagerConfig: + clouds_config: dict[str, dict] + cloud: str image: str flavor: str network: str @@ -71,24 +73,25 @@ class OpenstackRunnerManagerConfig: labels: list[str] proxy_config: ProxyConfig | None dockerhub_mirror: str | None - ssh_debug_connections: list[SSHDebugConnection] - repo_policy_url: str - repo_policy_token: str - clouds_config: dict[str, dict] - cloud: str + ssh_debug_connections: list[SSHDebugConnection] | None + repo_policy_url: str | None + repo_policy_token: str | None class OpenstackRunnerManager(CloudRunnerManager): - def __init__(self, runner_flavor: str, config: OpenstackRunnerManagerConfig) -> None: - self.runner_flavor = runner_flavor + def __init__(self, prefix: str, config: OpenstackRunnerManagerConfig) -> None: + self.prefix = prefix self.config = config self._openstack_cloud = OpenstackCloud( clouds_config=self.config.clouds_config, cloud=self.config.cloud, - prefix=self.runner_flavor, + prefix=self.prefix, ) + def get_name_prefix(self) -> str: + return self.prefix + def create_runner(self, registration_token: str) -> RunnerId: start_timestamp = time.time() id = OpenstackRunnerManager._generate_runner_id() @@ -119,7 +122,7 @@ def create_runner(self, registration_token: str) -> RunnerId: end_timestamp = time.time() OpenstackRunnerManager._issue_runner_installed_metric( name=instance_name, - 
flavor=self.runner_flavor, + flavor=self.prefix, install_start_timestamp=start_timestamp, install_end_timestamp=end_timestamp, ) @@ -133,7 +136,7 @@ def get_runner(self, id: RunnerId) -> CloudRunnerInstance | None: return CloudRunnerInstance( name=name, id=id, - status=CloudRunnerState(instance.status), + state=CloudRunnerState.from_openstack_server_status(instance.status), ) return None @@ -145,11 +148,11 @@ def get_runners( CloudRunnerInstance( name=instance.name, id=self._openstack_cloud.convert_name(instance.name), - status=CloudRunnerState(instance.status), + state=CloudRunnerState.from_openstack_server_status(instance.status), ) for instance in instances_list ] - return [instance for instance in instances_list if instance.status in cloud_runner_status] + return [instance for instance in instances_list if instance.state in cloud_runner_status] def delete_runner(self, id: RunnerId, remove_token: str) -> None: instance = self._openstack_cloud.get_instance(id) @@ -176,7 +179,7 @@ def cleanup(self, remove_token: str) -> None: runner_list = self._openstack_cloud.get_instances() for runner in runner_list: - state = CloudRunnerState(runner.status) + state = (CloudRunnerState(runner.status),) if state in ( CloudRunnerState.DELETED, CloudRunnerState.ERROR, diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 25f4f1ee3..c44d374fe 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -85,7 +85,7 @@ def existing_app(pytestconfig: pytest.Config) -> Optional[str]: def app_name(existing_app: Optional[str]) -> str: """Randomized application name.""" # Randomized app name to avoid collision when runner is connecting to GitHub. 
- return existing_app or f"integration-id{secrets.token_hex(2)}" + return existing_app or f"test-{secrets.token_hex(4)}" @pytest.fixture(scope="module", name="openstack_clouds_yaml") diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py new file mode 100644 index 000000000..dfd7ee6cd --- /dev/null +++ b/tests/integration/test_runner_manager_openstack.py @@ -0,0 +1,100 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Testing the RunnerManager class with OpenStackRunnerManager as CloudManager.""" + + +import pytest +import pytest_asyncio +import yaml +from openstack.connection import Connection as OpenstackConnection + +from charm_state import GithubPath, ProxyConfig, parse_github_path +from manager.runner_manager import RunnerManager, RunnerManagerConfig +from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH +from openstack_cloud.openstack_runner_manager import ( + OpenstackRunnerManager, + OpenstackRunnerManagerConfig, +) +from tests.integration.helpers.openstack import PrivateEndpointConfigs + + +@pytest.fixture(scope="module", name="github_path") +def github_path_fixture(path: str) -> GithubPath: + return parse_github_path(path, "Default") + + +@pytest.fixture(scope="module", name="proxy_config") +def openstack_proxy_config_fixture( + openstack_http_proxy: str, openstack_https_proxy: str, openstack_no_proxy: str +) -> ProxyConfig: + use_aproxy = False + if openstack_http_proxy or openstack_https_proxy: + use_aproxy = True + openstack_http_proxy = openstack_http_proxy if openstack_http_proxy else None + openstack_https_proxy = openstack_https_proxy if openstack_https_proxy else None + return ProxyConfig( + http=openstack_http_proxy, + https=openstack_https_proxy, + no_proxy=openstack_no_proxy, + use_aproxy=use_aproxy, + ) + + +@pytest_asyncio.fixture(scope="module", name="openstack_runner_manager") +async def openstack_runner_manager_fixture( + app_name: str, 
+ private_endpoint_clouds_yaml: str, + openstack_test_image: str, + flavor_name: str, + network_name: str, + github_path: GithubPath, + proxy_config: ProxyConfig, + openstack_connection: OpenstackConnection, +) -> OpenstackRunnerManager: + """Create OpenstackRunnerManager instance. + + The prefix args of OpenstackRunnerManager set to app_name to let openstack_connection_fixture preform the cleanup of openstack resources. + """ + # TODO: Think about how to deal with this when testing locally. + # This will modify a file under home directory. + _CLOUDS_YAML_PATH.unlink() + clouds_config = yaml.safe_load(private_endpoint_clouds_yaml) + + config = OpenstackRunnerManagerConfig( + clouds_config=clouds_config, + cloud="testcloud", + image=openstack_test_image, + flavor=flavor_name, + network=network_name, + github_path=github_path, + labels=["openstack_test"], + proxy_config=proxy_config, + dockerhub_mirror=None, + ssh_debug_connections=None, + repo_policy_url=None, + repo_policy_token=None, + ) + return OpenstackRunnerManager(app_name, config) + + +@pytest_asyncio.fixture(scope="module", name="runner_manager") +async def runner_manager_fixture( + openstack_runner_manager: OpenstackRunnerManager, token: str, github_path: GithubPath +) -> RunnerManager: + config = RunnerManagerConfig(token, github_path) + return RunnerManager(openstack_runner_manager, config) + + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_get_no_runner(runner_manager: RunnerManager) -> None: + """ + Arrange: No runners on the + Act: + Assert: + """ + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert not runner_list From 9ea0abe449df6c2bee4335105650f1380a69ae7b Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Sun, 4 Aug 2024 14:15:57 +0800 Subject: [PATCH 019/278] Add create runner test --- src/openstack_cloud/openstack_cloud.py | 8 ++--- .../openstack_runner_manager.py | 2 +- 
.../test_runner_manager_openstack.py | 36 +++++++++++++++---- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 662014044..29a084664 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -54,7 +54,7 @@ def __init__(self, server: OpenstackServer, prefix: str): for address in network_addresses ] - if not self.name.startswith(prefix): + if not self.server_name.startswith(prefix): # Should never happen. raise ValueError( f"Found openstack server {server.name} managed under prefix {prefix}, contact devs" @@ -159,7 +159,7 @@ def launch_instance( self._delete_key_pair(conn, name) raise OpenStackError(f"Failed to create openstack server {full_name}") from err - return OpenstackInstance(server) + return OpenstackInstance(server, self.prefix) def get_instance(self, name: str) -> OpenstackInstance: full_name = self.get_instance_name(name) @@ -368,13 +368,13 @@ def _get_key_path(name: str) -> Path: def _setup_key_pair(conn: OpenstackConnection, name: str) -> OpenstackKeypair: key_path = OpenstackCloud._get_key_path(name) - if key_path.exists: + if key_path.exists(): logger.warning("Existing private key file for %s found, removing it.", name) key_path.unlink(missing_ok=True) keypair = conn.create_keypair(name=name) + key_path.parent.mkdir(parents=True, exist_ok=True) key_path.write_text(keypair.private_key) - shutil.chown(key_path, user="ubuntu", group="ubuntu") key_path.chmod(0o400) return keypair diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 307e72f5d..7e7757d55 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -229,7 +229,7 @@ def _generate_userdata(self, instance_name: str, registration_token: str) -> str if self.config.proxy_config is not None else None ) - return 
jinja.get_template("openstack_userdata.sh.j2").render( + return jinja.get_template("openstack-userdata.sh.j2").render( github_url=f"https://github.com/{self.config.github_path.path()}", runner_group=runner_group, token=registration_token, diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index dfd7ee6cd..b1f820a1a 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -10,6 +10,8 @@ from openstack.connection import Connection as OpenstackConnection from charm_state import GithubPath, ProxyConfig, parse_github_path +from manager.cloud_runner_manager import CloudRunnerState +from manager.github_runner_manager import GithubRunnerState from manager.runner_manager import RunnerManager, RunnerManagerConfig from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH from openstack_cloud.openstack_runner_manager import ( @@ -56,9 +58,7 @@ async def openstack_runner_manager_fixture( The prefix args of OpenstackRunnerManager set to app_name to let openstack_connection_fixture preform the cleanup of openstack resources. """ - # TODO: Think about how to deal with this when testing locally. - # This will modify a file under home directory. - _CLOUDS_YAML_PATH.unlink() + _CLOUDS_YAML_PATH.unlink(missing_ok=True) clouds_config = yaml.safe_load(private_endpoint_clouds_yaml) config = OpenstackRunnerManagerConfig( @@ -91,10 +91,34 @@ async def runner_manager_fixture( @pytest.mark.abort_on_fail async def test_get_no_runner(runner_manager: RunnerManager) -> None: """ - Arrange: No runners on the - Act: - Assert: + Arrange: RunnerManager instance with no runners. + Act: Get runners. + Assert: Empty tuple returned. 
""" runner_list = runner_manager.get_runners() assert isinstance(runner_list, tuple) assert not runner_list + + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_create_runner(runner_manager: RunnerManager) -> None: + """ + Arrange: RunnerManager instance with no runners. + Act: Create one runner. + Assert: An active idle runner. + """ + runner_id_list = runner_manager.create_runners(1) + assert isinstance(runner_id_list, tuple) + assert len(runner_id_list) == 1 + runner_id = runner_id[0] + + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert len(runner_list) == 1 + runner = runner_list[0] + assert runner.id == runner_id + assert runner.cloud_state == CloudRunnerState.ACTIVE + assert runner.github_state == GithubRunnerState.IDLE + From d47be6f44b47637cdc3311c7469bbc54f880007c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Sun, 4 Aug 2024 14:18:03 +0800 Subject: [PATCH 020/278] Update integration test to debug new tests --- .github/workflows/integration_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index a85614f3e..d71b576fe 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -35,7 +35,7 @@ jobs: test-tox-env: integration-juju3.2 # TODO: debug only remove # modules: '["test_charm_metrics_failure", "test_charm_metrics_success", "test_charm_fork_repo", "test_charm_runner", "test_reactive", "test_openstack_cloud"]' - modules: '["test_openstack_cloud"]' + modules: '["test_runner_manager_openstack"]' extra-arguments: "-m openstack" self-hosted-runner: true self-hosted-runner-label: stg-private-endpoint From f8c6c6229fc2d12990e299fad3e3f38ba98262a2 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 09:24:22 +0800 Subject: [PATCH 021/278] 
Enable debugging --- .github/workflows/integration_test.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index d71b576fe..72afcddfc 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -39,3 +39,5 @@ jobs: extra-arguments: "-m openstack" self-hosted-runner: true self-hosted-runner-label: stg-private-endpoint + tmate-debug: true + tmate-timeout: 300 From fc9d9976bc553864874c9122039eeee79f7f2758 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 10:03:37 +0800 Subject: [PATCH 022/278] Test env --- .github/workflows/e2e_test.yaml | 4 +++- .github/workflows/integration_test.yaml | 10 ++++------ .github/workflows/manual_test_env.yaml | 25 +++++++++++++++++++++++++ .github/workflows/test.yaml | 4 +++- src/manager/cloud_runner_manager.py | 6 ------ 5 files changed, 35 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/manual_test_env.yaml diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 5933451ee..bb1dada46 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,7 +1,9 @@ name: End-to-End tests on: - pull_request: + # TODO: Uncomment + #pull_request: + workflow_dispatch: jobs: # test option values defined at test/conftest.py are passed on via repository secret diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 72afcddfc..1edd98aca 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -1,7 +1,9 @@ name: integration-tests on: - pull_request: + # TODO: Uncomment + #pull_request: + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -33,11 +35,7 @@ jobs: pre-run-script: scripts/setup-lxd.sh provider: lxd test-tox-env: integration-juju3.2 - 
# TODO: debug only remove - # modules: '["test_charm_metrics_failure", "test_charm_metrics_success", "test_charm_fork_repo", "test_charm_runner", "test_reactive", "test_openstack_cloud"]' - modules: '["test_runner_manager_openstack"]' + modules: '["test_charm_metrics_failure", "test_charm_metrics_success", "test_charm_fork_repo", "test_charm_runner", "test_reactive", "test_openstack_cloud"]' extra-arguments: "-m openstack" self-hosted-runner: true self-hosted-runner-label: stg-private-endpoint - tmate-debug: true - tmate-timeout: 300 diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml new file mode 100644 index 000000000..5bd1a6254 --- /dev/null +++ b/.github/workflows/manual_test_env.yaml @@ -0,0 +1,25 @@ +name: Manual test env + +on: + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + openstack-integration-tests-private-endpoint: + name: Integration test using private-endpoint + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + secrets: inherit + with: + juju-channel: 3.2/stable + pre-run-script: scripts/setup-lxd.sh + provider: lxd + test-tox-env: integration-juju3.2 + modules: '["test_runner_manager_openstack"]' + extra-arguments: "-m openstack" + self-hosted-runner: true + self-hosted-runner-label: stg-private-endpoint + tmate-debug: true + tmate-timeout: 300 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 99e540d31..34803b2fb 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,7 +1,9 @@ name: Tests on: - pull_request: + # TODO: Uncomment + #pull_request: + workflow_dispatch: jobs: unit-tests: diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index b342c7254..1f4c8b507 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -8,12 +8,6 @@ RunnerId = str 
-_OPENSTACK_STATUS_SHUTOFF = "SHUTOFF" -_OPENSTACK_STATUS_ERROR = "ERROR" -_OPENSTACK_STATUS_ACTIVE = "ACTIVE" -_OPENSTACK_STATUS_BUILDING = "BUILDING" - - class CloudRunnerState(str, Enum): CREATED = "created" ACTIVE = "active" From fa2947292083e83b709762b75d2b8bd557ea27ea Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 10:07:19 +0800 Subject: [PATCH 023/278] Add debug --- src-docs/openstack_cloud.openstack_runner_manager.md | 8 ++++---- src/openstack_cloud/openstack_runner_manager.py | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 537b4b776..dbbc067a6 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -79,7 +79,7 @@ __init__(prefix: str, config: OpenstackRunnerManagerConfig) → None --- - + ### method `cleanup` @@ -107,7 +107,7 @@ create_runner(registration_token: str) → str --- - + ### method `delete_runner` @@ -135,7 +135,7 @@ get_name_prefix() → str --- - + ### method `get_runner` @@ -149,7 +149,7 @@ get_runner(id: str) → CloudRunnerInstance | None --- - + ### method `get_runners` diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 7e7757d55..7bbca45f0 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -109,6 +109,10 @@ def create_runner(self, registration_token: str) -> RunnerId: ) except OpenStackError as err: raise RunnerCreateError("Failed to create {instance_name} openstack runner") from err + + # TODO: Test only + import pytest + pytest.set_trace() try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) From 9c74ad97dc9bcadf187b22db223d589eb142f699 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 
2024 11:01:07 +0800 Subject: [PATCH 024/278] Fix confusing naming issue. --- src-docs/openstack_cloud.openstack_cloud.md | 6 ++-- ...penstack_cloud.openstack_runner_manager.md | 8 ++--- src/openstack_cloud/openstack_cloud.py | 32 +++++++++---------- .../openstack_runner_manager.py | 28 +++++++--------- tests/integration/test_openstack_cloud.py | 11 ++++--- 5 files changed, 41 insertions(+), 44 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 0b227b3ba..6f8fb579b 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -83,7 +83,7 @@ cleanup() → None ### method `delete_instance` ```python -delete_instance(name: str) → None +delete_instance(instance_id: str) → None ``` @@ -97,7 +97,7 @@ delete_instance(name: str) → None ### method `get_instance` ```python -get_instance(name: str) → OpenstackInstance +get_instance(instance_id: str) → OpenstackInstance ``` @@ -154,7 +154,7 @@ get_ssh_connection(instance: OpenstackInstance) → Connection ```python launch_instance( - name: str, + instance_id: str, image: str, flavor: str, network: str, diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index dbbc067a6..537b4b776 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -79,7 +79,7 @@ __init__(prefix: str, config: OpenstackRunnerManagerConfig) → None --- - + ### method `cleanup` @@ -107,7 +107,7 @@ create_runner(registration_token: str) → str --- - + ### method `delete_runner` @@ -135,7 +135,7 @@ get_name_prefix() → str --- - + ### method `get_runner` @@ -149,7 +149,7 @@ get_runner(id: str) → CloudRunnerInstance | None --- - + ### method `get_runners` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 29a084664..ab1efdd07 100644 --- 
a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -40,7 +40,7 @@ class OpenstackInstance: server_id: str server_name: str - name: str + instance_id: str addresses: list[str] status: str @@ -59,7 +59,7 @@ def __init__(self, server: OpenstackServer, prefix: str): raise ValueError( f"Found openstack server {server.name} managed under prefix {prefix}, contact devs" ) - self.name = self.server_name[len(prefix) :] + self.instance_id = self.server_name[len(prefix) :] @contextmanager @@ -112,9 +112,9 @@ def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): self.prefix = prefix def launch_instance( - self, name: str, image: str, flavor: str, network: str, userdata: str + self, instance_id: str, image: str, flavor: str, network: str, userdata: str ) -> OpenstackInstance: - full_name = self.get_instance_name(name) + full_name = self.get_instance_name(instance_id) logger.info("Creating openstack server with %s", full_name) with _get_openstack_connection( @@ -152,17 +152,17 @@ def launch_instance( "Failed to cleanup openstack server %s that timeout during creation", full_name, ) - self._delete_key_pair(conn, name) + self._delete_key_pair(conn, instance_id) raise OpenStackError(f"Timeout creating openstack server {full_name}") from err except openstack.exceptions.SDKException as err: logger.exception("Failed to create openstack server %s", full_name) - self._delete_key_pair(conn, name) + self._delete_key_pair(conn, instance_id) raise OpenStackError(f"Failed to create openstack server {full_name}") from err return OpenstackInstance(server, self.prefix) - def get_instance(self, name: str) -> OpenstackInstance: - full_name = self.get_instance_name(name) + def get_instance(self, instance_id: str) -> OpenstackInstance: + full_name = self.get_instance_name(instance_id) logger.info("Getting openstack server with %s", full_name) with _get_openstack_connection( @@ -170,8 +170,8 @@ def get_instance(self, name: str) -> 
OpenstackInstance: ) as conn: return OpenstackInstance(OpenstackCloud._get_and_ensure_unique_server(conn, full_name)) - def delete_instance(self, name: str) -> None: - full_name = self.get_instance_name(name) + def delete_instance(self, instance_id: str) -> None: + full_name = self.get_instance_name(instance_id) logger.info("Deleting openstack server with %s", full_name) with _get_openstack_connection( @@ -188,12 +188,12 @@ def delete_instance(self, name: str) -> None: raise OpenStackError(f"Failed to remove openstack runner {full_name}") from err def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: - key_path = OpenstackCloud._get_key_path(instance.name) + key_path = OpenstackCloud._get_key_path(instance.server_name) if not key_path.exists(): - raise SshError(f"Missing keyfile for server: {instance.name}, key path: {key_path}") + raise SshError(f"Missing keyfile for server: {instance.server_name}, key path: {key_path}") if not instance.addresses: - raise SshError(f"No addresses found for OpenStack server {instance.name}") + raise SshError(f"No addresses found for OpenStack server {instance.server_name}") for ip in instance.addresses: try: @@ -206,7 +206,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: result = connection.run("echo {_TEST_STRING}", warn=True, timeout=_SSH_TIMEOUT) if not result.ok: logger.warning( - "SSH test connection failed, server: %s, address: %s", instance.name, ip + "SSH test connection failed, server: %s, address: %s", instance.server_name, ip ) continue if _TEST_STRING in result.stdout: @@ -214,13 +214,13 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException): logger.warning( "Unable to SSH into %s with address %s", - instance.name, + instance.server_name, connection.host, exc_info=True, ) continue raise SshError( - f"No connectable SSH addresses found, server: {instance.name}, " 
+ f"No connectable SSH addresses found, server: {instance.server_name}, " f"addresses: {instance.addresses}" ) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 7bbca45f0..b89214ffc 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -101,7 +101,7 @@ def create_runner(self, registration_token: str) -> RunnerId: ) try: instance = self._openstack_cloud.launch_instance( - name=id, + instance_id=id, image=self.config.image, flavor=self.config.flavor, network=self.config.network, @@ -109,10 +109,6 @@ def create_runner(self, registration_token: str) -> RunnerId: ) except OpenStackError as err: raise RunnerCreateError("Failed to create {instance_name} openstack runner") from err - - # TODO: Test only - import pytest - pytest.set_trace() try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) @@ -136,7 +132,7 @@ def get_runner(self, id: RunnerId) -> CloudRunnerInstance | None: name = self._openstack_cloud.get_instance_name(id) instances_list = self._openstack_cloud.get_instances() for instance in instances_list: - if instance.name == name: + if instance.server_name == name: return CloudRunnerInstance( name=name, id=id, @@ -150,8 +146,8 @@ def get_runners( instances_list = self._openstack_cloud.get_instances() instances_list = [ CloudRunnerInstance( - name=instance.name, - id=self._openstack_cloud.convert_name(instance.name), + name=instance.server_name, + id=instance.instance_id, state=CloudRunnerState.from_openstack_server_status(instance.status), ) for instance in instances_list @@ -164,20 +160,20 @@ def delete_runner(self, id: RunnerId, remove_token: str) -> None: def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - self._pull_runner_metrics(instance.name, ssh_conn) + self._pull_runner_metrics(instance.server_name, ssh_conn) try: 
OpenstackRunnerManager._run_github_runner_removal_script( - instance.name, ssh_conn, remove_token + instance.server_name, ssh_conn, remove_token ) except GithubRunnerRemoveError: logger.warning( - "Unable to run github runner removal script for %s", instance.name, stack_info=True + "Unable to run github runner removal script for %s", instance.server_name, stack_info=True ) try: self._openstack_cloud.delete_instance(id) except OpenStackError: - logger.exception("Unable to delete openstack instance for runner %s", instance.name) + logger.exception("Unable to delete openstack instance for runner %s", instance.server_name) def cleanup(self, remove_token: str) -> None: runner_list = self._openstack_cloud.get_instances() @@ -257,14 +253,14 @@ def _health_check(self, instance: OpenstackInstance) -> bool: try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) except SshError: - logger.exception("SSH connection failure with %s", instance.name) + logger.exception("SSH connection failure with %s", instance.server_name) return False try: - OpenstackRunnerManager._run_health_check(ssh_conn, instance.name) + OpenstackRunnerManager._run_health_check(ssh_conn, instance.server_name) except RunnerError: - logger.exception("Health check failure for %s", instance.name) + logger.exception("Health check failure for %s", instance.server_name) return False - logger.info("Health check success for %s", instance.name) + logger.info("Health check success for %s", instance.server_name) return True @retry(tries=3, delay=60, local_logger=logger) diff --git a/tests/integration/test_openstack_cloud.py b/tests/integration/test_openstack_cloud.py index 321fbe1fa..926e545bb 100644 --- a/tests/integration/test_openstack_cloud.py +++ b/tests/integration/test_openstack_cloud.py @@ -26,7 +26,7 @@ async def openstack_cloud_fixture(base_openstack_cloud: OpenstackCloud) -> Opens """Ensures the OpenstackCloud object has no openstack servers.""" instances = base_openstack_cloud.get_instances() for 
instance in instances: - base_openstack_cloud.delete_instance(name=instance.name) + base_openstack_cloud.delete_instance(instance_id=instance.instance_id) return base_openstack_cloud @@ -73,7 +73,7 @@ async def test_launch_instance_and_delete( # 1. instance = base_openstack_cloud.launch_instance( - name=instance_name, + instance_id=instance_name, image=openstack_test_image, flavor=openstack_test_flavor, network=network_name, @@ -81,7 +81,8 @@ async def test_launch_instance_and_delete( ) assert instance is not None - assert instance.name is not None + assert instance.instance_id is not None + assert instance.server_name is not None assert instance.id is not None servers = openstack_connection.list_servers() @@ -92,7 +93,7 @@ async def test_launch_instance_and_delete( assert False, f"OpenStack server with {instance_name} in the name not found" # 2. - base_openstack_cloud.delete_instance(name=instance_name) + base_openstack_cloud.delete_instance(instance_id=instance_name) instances = base_openstack_cloud.get_instances() assert not instances, "Test failure: openstack instance should be deleted." 
@@ -116,7 +117,7 @@ async def test_instance_ssh_connection( rand_chars = f"{token_hex(10)}" instance_name = f"{token_hex(2)}" instance = openstack_cloud.launch_instance( - name=instance_name, + instance_id=instance_name, image=openstack_test_image, flavor=openstack_test_flavor, network=network_name, From b443be4e27bf5ba306ad4f4c6081a83604e45bcd Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 11:16:10 +0800 Subject: [PATCH 025/278] Pre-merge --- src-docs/openstack_cloud.openstack_manager.md | 26 ++-- src/openstack_cloud/openstack_manager.py | 146 ++++++++++++------ 2 files changed, 120 insertions(+), 52 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_manager.md b/src-docs/openstack_cloud.openstack_manager.md index 93cff6908..0a39a9a37 100644 --- a/src-docs/openstack_cloud.openstack_manager.md +++ b/src-docs/openstack_cloud.openstack_manager.md @@ -18,7 +18,7 @@ Module for handling interactions with OpenStack. --- - + ## function `create_instance_config` @@ -93,7 +93,7 @@ __init__( --- - + ## class `GithubRunnerRemoveError` Represents an error removing registered runner from Github. @@ -104,7 +104,7 @@ Represents an error removing registered runner from Github. --- - + ## class `OpenstackRunnerManager` Runner manager for OpenStack-based instances. @@ -117,7 +117,7 @@ Runner manager for OpenStack-based instances. - `unit_num`: The juju unit number. - `instance_name`: Prefix of the name for the set of runners. - + ### method `__init__` @@ -146,24 +146,32 @@ Construct OpenstackRunnerManager object. --- - + ### method `flush` ```python -flush() → int +flush(mode: FlushMode = ) → int ``` Flush Openstack servers. +1. Kill the processes depending on flush mode. 2. Get unhealthy runners after process purging. 3. Delete unhealthy runners. + + + +**Args:** + + - `mode`: The mode to determine which runner to flush. + **Returns:** - The number of runners flushed. + The number of runners flushed. 
--- - + ### method `get_github_runner_info` @@ -180,7 +188,7 @@ Get information on GitHub for the runners. --- - + ### method `reconcile` diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index f5fb1f0f1..f61d28b8d 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -20,6 +20,7 @@ import time from contextlib import contextmanager from dataclasses import dataclass +from datetime import datetime from multiprocessing import Pool from pathlib import Path from typing import Iterable, Iterator, Literal, Optional, cast @@ -32,7 +33,6 @@ import openstack.image.v2.image import paramiko from fabric import Connection as SshConnection -from invoke.runners import Result from openstack.compute.v2.server import Server from openstack.connection import Connection as OpenstackConnection from openstack.exceptions import SDKException @@ -61,7 +61,7 @@ from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME from repo_policy_compliance_client import RepoPolicyComplianceClient from runner_manager import IssuedMetricEventsStats -from runner_manager_type import OpenstackRunnerManagerConfig +from runner_manager_type import FlushMode, OpenstackRunnerManagerConfig from runner_type import GithubPath, RunnerByHealth, RunnerGithubInfo from utilities import retry, set_env_var @@ -149,6 +149,40 @@ class _CloudInitUserData: proxies: Optional[ProxyConfig] = None +@contextmanager +def _create_connection(cloud_config: dict[str, dict]) -> Iterator[openstack.connection.Connection]: + """Create a connection context managed object, to be used within with statements. + + This method should be called with a valid cloud_config. See _validate_cloud_config. + Also, this method assumes that the clouds.yaml exists on ~/.config/openstack/clouds.yaml. + See charm_state.py _write_openstack_config_to_disk. + + Args: + cloud_config: The configuration in clouds.yaml format to apply. 
+ + Raises: + OpenStackError: if the credentials provided is not authorized. + + Yields: + An openstack.connection.Connection object. + """ + clouds = list(cloud_config["clouds"].keys()) + if len(clouds) > 1: + logger.warning("Multiple clouds defined in clouds.yaml. Using the first one to connect.") + cloud_name = clouds[0] + + # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but + # I could not reproduce it. Therefore, no catch here for such exception. + try: + with openstack.connect(cloud=cloud_name) as conn: + conn.authorize() + yield conn + # pylint thinks this isn't an exception, but does inherit from Exception class. + except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause + logger.exception("OpenStack API call failure") + raise OpenStackError("Failed OpenStack API call") from exc + + # Disable too many arguments, as they are needed to create the dataclass. def create_instance_config( # pylint: disable=too-many-arguments app_name: str, @@ -424,14 +458,19 @@ def _get_openstack_instances(self, conn: OpenstackConnection) -> list[Server]: ] @staticmethod - def _health_check(conn: OpenstackConnection, server_name: str, startup: bool = False) -> bool: + def _health_check( + conn: OpenstackConnection, + server_name: str, + startup: bool = False, + ) -> bool: """Health check a server instance. A healthy server is defined as: 1. Openstack instance status is ACTIVE or BUILDING. - 2. Runner.Worker exists (running a job). - 3. Runner.Listener exists (waiting for job). - 3. GitHub runner status is Idle or Active. + 2. Openstack instance status is in BUILDING less than CREATE_SERVER_TIMEOUT seconds. + 3. Runner.Worker exists (running a job). + 4. Runner.Listener exists (waiting for job). + 5. GitHub runner status is Idle or Active. An undetermined server is marked as healthy when: 1. SSH fails - could be a transient network error. 
@@ -453,6 +492,11 @@ def _health_check(conn: OpenstackConnection, server_name: str, startup: bool = F return False if server.status not in (_INSTANCE_STATUS_ACTIVE, _INSTANCE_STATUS_BUILDING): return False + created_at = datetime.strptime(server.created_at, "%Y-%m-%dT%H:%M:%SZ") + current_time = datetime.now(created_at.tzinfo) + elapsed_min = (created_at - current_time).total_seconds() + if server.status == _INSTANCE_STATUS_BUILDING: + return elapsed_min < CREATE_SERVER_TIMEOUT return OpenstackRunnerManager._ssh_health_check( conn=conn, server_name=server_name, startup=startup ) @@ -494,8 +538,7 @@ def _ssh_health_check(conn: OpenstackConnection, server_name: str, startup: bool if RUNNER_WORKER_PROCESS in result.stdout or RUNNER_LISTENER_PROCESS in result.stdout: return True - logger.error("[ALERT] Health check failed for server: %s", server_name) - return True + return False @staticmethod @retry(tries=3, delay=5, max_delay=60, backoff=2, local_logger=logger) @@ -1155,7 +1198,7 @@ def _run_github_removal_script( ) from exc try: - result: Result = ssh_conn.run( + result: invoke.runners.Result = ssh_conn.run( f"{_CONFIG_SCRIPT_PATH} remove --token {remove_token}", warn=True, ) @@ -1475,54 +1518,71 @@ def _issue_reconciliation_metric( except IssueMetricEventError: logger.exception("Failed to issue Reconciliation metric") - def flush(self) -> int: + def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: """Flush Openstack servers. + 1. Kill the processes depending on flush mode. + 2. Get unhealthy runners after process purging. + 3. Delete unhealthy runners. + + Args: + mode: The mode to determine which runner to flush. + Returns: The number of runners flushed. 
""" logger.info("Flushing OpenStack all runners") with _create_connection(self._cloud_config) as conn: + self._kill_runner_processes(conn=conn, mode=mode) runner_by_health = self._get_openstack_runner_status(conn) remove_token = self._github.get_runner_remove_token(path=self._config.path) - runners_to_delete = (*runner_by_health.healthy, *runner_by_health.unhealthy) self._remove_runners( conn=conn, - instance_names=runners_to_delete, + instance_names=runner_by_health.unhealthy, remove_token=remove_token, ) - return len(runners_to_delete) + return len(runner_by_health.unhealthy) + def _kill_runner_processes(self, conn: OpenstackConnection, mode: FlushMode) -> None: + """Kill runner application that are not running any jobs. -@contextmanager -def _create_connection(cloud_config: dict[str, dict]) -> Iterator[OpenstackConnection]: - """Create a connection context managed object, to be used within with statements. - - This method should be called with a valid cloud_config. See _validate_cloud_config. - Also, this method assumes that the clouds.yaml exists on ~/.config/openstack/clouds.yaml. - See charm_state.py _write_openstack_config_to_disk. - - Args: - cloud_config: The configuration in clouds.yaml format to apply. - - Raises: - OpenStackError: if the credentials provided is not authorized. + Runners that have not picked up a job has + 1. no Runner.Worker process + 2. no pre-run.sh job process - Yields: - An openstack.connection.Connection object. - """ - clouds = list(cloud_config["clouds"].keys()) - if len(clouds) > 1: - logger.warning("Multiple clouds defined in clouds.yaml. Using the first one to connect.") - cloud_name = clouds[0] + Args: + conn: The connection object to access OpenStack cloud. + mode: The flush mode to determine which runner processes to kill. - # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but - # I could not reproduce it. Therefore, no catch here for such exception. 
- try: - with openstack.connect(cloud=cloud_name) as conn: - conn.authorize() - yield conn - # pylint thinks this isn't an exception, but does inherit from Exception class. - except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause - logger.exception("OpenStack API call failure") - raise OpenStackError("Failed OpenStack API call") from exc + Raises: + NotImplementedError: If unsupported flush mode has been passed. + """ + killer_command: str + match mode: + case FlushMode.FLUSH_IDLE: + # only kill Runner.Listener if Runner.Worker does not exist. + killer_command = ( + "! pgrep -x Runner.Worker && pgrep -x Runner.Listener && " + "kill $(pgrep -x Runner.Listener)" + ) + case FlushMode.FLUSH_BUSY: + # kill both Runner.Listener and Runner.Worker processes. + # This kills pre-job.sh, a child process of Runner.Worker. + killer_command = ( + "pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener);" + "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);" + ) + case _: + raise NotImplementedError(f"Unsupported flush mode {mode}") + + servers = self._get_openstack_instances(conn=conn) + for server in servers: + ssh_conn: SshConnection = self._get_ssh_connection(conn=conn, server_name=server.name) + result: invoke.runners.Result = ssh_conn.run( + killer_command, + warn=True, + ) + if not result.ok: + logger.warning("Failed to kill runner process. Instance: %s", server.name) + continue + logger.info("Successfully killed runner process. 
Instance: %s", server.name) From d38d818b2146862e3ab5c693a447a6421e89b0d6 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 11:33:59 +0800 Subject: [PATCH 026/278] Test manual test env --- .github/workflows/manual_test_env.yaml | 45 +++++++++++++++++--------- src-docs/runner_manager_type.md | 10 +++--- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index 5bd1a6254..a2a7ff630 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -8,18 +8,33 @@ concurrency: cancel-in-progress: true jobs: - openstack-integration-tests-private-endpoint: - name: Integration test using private-endpoint - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main - secrets: inherit - with: - juju-channel: 3.2/stable - pre-run-script: scripts/setup-lxd.sh - provider: lxd - test-tox-env: integration-juju3.2 - modules: '["test_runner_manager_openstack"]' - extra-arguments: "-m openstack" - self-hosted-runner: true - self-hosted-runner-label: stg-private-endpoint - tmate-debug: true - tmate-timeout: 300 + manual-test-env: + name: manual-test-env + runs-on: ["self-hosted", "stg-private-endpoint"] + steps: + - name: Setup operator environment + uses: charmed-kubernetes/actions-operator@main + with: + provider: lxd + juju-channel: 3.2/stable + - uses: actions/checkout@v4 + - run: bash scripts/setup-lxd.sh + - name: Tmate debugging session (self-hosted) + uses: canonical/action-tmate@main + timeout-minutes: ${{ inputs.tmate-timeout }} + + # openstack-integration-tests-private-endpoint: + # name: Integration test using private-endpoint + # uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + # secrets: inherit + # with: + # juju-channel: 3.2/stable + # pre-run-script: scripts/setup-lxd.sh + # provider: lxd + # test-tox-env: integration-juju3.2 + # 
modules: '["test_runner_manager_openstack"]' + # extra-arguments: "-m openstack" + # self-hosted-runner: true + # self-hosted-runner-label: stg-private-endpoint + # tmate-debug: true + # tmate-timeout: 300 diff --git a/src-docs/runner_manager_type.md b/src-docs/runner_manager_type.md index c3509b433..f6dd4faae 100644 --- a/src-docs/runner_manager_type.md +++ b/src-docs/runner_manager_type.md @@ -14,6 +14,8 @@ Types used by RunnerManager class. ## class `FlushMode` Strategy for flushing runners. +During pre-job (repo-check), the runners are marked as idle and if the pre-job fails, the runner falls back to being idle again. Hence wait_repo_check is required. + **Attributes:** @@ -30,7 +32,7 @@ Strategy for flushing runners. --- - + ## class `RunnerManagerClients` Clients for accessing various services. @@ -67,7 +69,7 @@ __init__( --- - + ## class `RunnerManagerConfig` Configuration of runner manager. @@ -119,7 +121,7 @@ Whether metrics for the runners should be collected. --- - + ## class `OpenstackRunnerManagerConfig` Configuration of runner manager. @@ -166,7 +168,7 @@ __init__( --- - + ## class `RunnerInfo` Information from GitHub of a runner. From 9990499f362f7d01648844c9b16e805c89f02e35 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 12:07:38 +0800 Subject: [PATCH 027/278] Debug ssh conn --- src-docs/openstack_cloud.openstack_cloud.md | 6 +++--- src/openstack_cloud/openstack_cloud.py | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 6f8fb579b..c18711cbd 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -64,7 +64,7 @@ Create a OpenstackCloud instance. 
--- - + ### method `cleanup` @@ -106,7 +106,7 @@ get_instance(instance_id: str) → OpenstackInstance --- - + ### method `get_instance_name` @@ -120,7 +120,7 @@ get_instance_name(name: str) → str --- - + ### method `get_instances` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index ab1efdd07..04314fbf2 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -194,6 +194,9 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: raise SshError(f"Missing keyfile for server: {instance.server_name}, key path: {key_path}") if not instance.addresses: raise SshError(f"No addresses found for OpenStack server {instance.server_name}") + + import pytest + pytest.set_trace() for ip in instance.addresses: try: From 2be2847f430b9e5dc58696e8987765692dc2c143 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 12:48:09 +0800 Subject: [PATCH 028/278] Revert manual test enf --- .github/workflows/manual_test_env.yaml | 58 +++++++++++++------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index a2a7ff630..fd9087999 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -8,33 +8,33 @@ concurrency: cancel-in-progress: true jobs: - manual-test-env: - name: manual-test-env - runs-on: ["self-hosted", "stg-private-endpoint"] - steps: - - name: Setup operator environment - uses: charmed-kubernetes/actions-operator@main - with: - provider: lxd - juju-channel: 3.2/stable - - uses: actions/checkout@v4 - - run: bash scripts/setup-lxd.sh - - name: Tmate debugging session (self-hosted) - uses: canonical/action-tmate@main - timeout-minutes: ${{ inputs.tmate-timeout }} + # manual-test-env: + # name: manual-test-env + # runs-on: ["self-hosted", "stg-private-endpoint"] + # steps: + # - name: Setup 
operator environment + # uses: charmed-kubernetes/actions-operator@main + # with: + # provider: lxd + # juju-channel: 3.2/stable + # - uses: actions/checkout@v4 + # - run: bash scripts/setup-lxd.sh + # - name: Tmate debugging session (self-hosted) + # uses: canonical/action-tmate@main + # timeout-minutes: ${{ inputs.tmate-timeout }} - # openstack-integration-tests-private-endpoint: - # name: Integration test using private-endpoint - # uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main - # secrets: inherit - # with: - # juju-channel: 3.2/stable - # pre-run-script: scripts/setup-lxd.sh - # provider: lxd - # test-tox-env: integration-juju3.2 - # modules: '["test_runner_manager_openstack"]' - # extra-arguments: "-m openstack" - # self-hosted-runner: true - # self-hosted-runner-label: stg-private-endpoint - # tmate-debug: true - # tmate-timeout: 300 + openstack-integration-tests-private-endpoint: + name: Integration test using private-endpoint + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + secrets: inherit + with: + juju-channel: 3.2/stable + pre-run-script: scripts/setup-lxd.sh + provider: lxd + test-tox-env: integration-juju3.2 + modules: '["test_runner_manager_openstack"]' + extra-arguments: "-m openstack" + self-hosted-runner: true + self-hosted-runner-label: stg-private-endpoint + tmate-debug: true + tmate-timeout: 300 From d9b3ffed69e84568c93775abf2c566c3ba2ba67c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 12:50:46 +0800 Subject: [PATCH 029/278] Create manual test env --- .github/workflows/manual_test_env.yaml | 46 +++++++++++++------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index fd9087999..a513cf466 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -8,33 +8,33 @@ concurrency: 
cancel-in-progress: true jobs: - # manual-test-env: - # name: manual-test-env - # runs-on: ["self-hosted", "stg-private-endpoint"] - # steps: + manual-test-env: + name: manual-test-env + runs-on: ["self-hosted", "stg-private-endpoint"] + steps: # - name: Setup operator environment # uses: charmed-kubernetes/actions-operator@main # with: # provider: lxd # juju-channel: 3.2/stable - # - uses: actions/checkout@v4 + - uses: actions/checkout@v4 # - run: bash scripts/setup-lxd.sh - # - name: Tmate debugging session (self-hosted) - # uses: canonical/action-tmate@main - # timeout-minutes: ${{ inputs.tmate-timeout }} + - name: Tmate debugging session (self-hosted) + uses: canonical/action-tmate@main + timeout-minutes: ${{ inputs.tmate-timeout }} - openstack-integration-tests-private-endpoint: - name: Integration test using private-endpoint - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main - secrets: inherit - with: - juju-channel: 3.2/stable - pre-run-script: scripts/setup-lxd.sh - provider: lxd - test-tox-env: integration-juju3.2 - modules: '["test_runner_manager_openstack"]' - extra-arguments: "-m openstack" - self-hosted-runner: true - self-hosted-runner-label: stg-private-endpoint - tmate-debug: true - tmate-timeout: 300 + # openstack-integration-tests-private-endpoint: + # name: Integration test using private-endpoint + # uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + # secrets: inherit + # with: + # juju-channel: 3.2/stable + # pre-run-script: scripts/setup-lxd.sh + # provider: lxd + # test-tox-env: integration-juju3.2 + # modules: '["test_runner_manager_openstack"]' + # extra-arguments: "-m openstack" + # self-hosted-runner: true + # self-hosted-runner-label: stg-private-endpoint + # tmate-debug: true + # tmate-timeout: 300 From 3ed04356a29cc83068a1508938876964440f8b14 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 12:51:48 +0800 Subject: 
[PATCH 030/278] Retry old manual env --- .github/workflows/manual_test_env.yaml | 50 ++++++++++++-------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index a513cf466..58c58324b 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -3,38 +3,34 @@ name: Manual test env on: pull_request: -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - jobs: - manual-test-env: - name: manual-test-env - runs-on: ["self-hosted", "stg-private-endpoint"] - steps: + # manual-test-env: + # name: manual-test-env + # runs-on: ["self-hosted", "stg-private-endpoint"] + # steps: # - name: Setup operator environment # uses: charmed-kubernetes/actions-operator@main # with: # provider: lxd # juju-channel: 3.2/stable - - uses: actions/checkout@v4 + # - uses: actions/checkout@v4 # - run: bash scripts/setup-lxd.sh - - name: Tmate debugging session (self-hosted) - uses: canonical/action-tmate@main - timeout-minutes: ${{ inputs.tmate-timeout }} + # - name: Tmate debugging session (self-hosted) + # uses: canonical/action-tmate@main + # timeout-minutes: ${{ inputs.tmate-timeout }} - # openstack-integration-tests-private-endpoint: - # name: Integration test using private-endpoint - # uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main - # secrets: inherit - # with: - # juju-channel: 3.2/stable - # pre-run-script: scripts/setup-lxd.sh - # provider: lxd - # test-tox-env: integration-juju3.2 - # modules: '["test_runner_manager_openstack"]' - # extra-arguments: "-m openstack" - # self-hosted-runner: true - # self-hosted-runner-label: stg-private-endpoint - # tmate-debug: true - # tmate-timeout: 300 + openstack-integration-tests-private-endpoint: + name: Integration test using private-endpoint + uses: 
canonical/operator-workflows/.github/workflows/integration_test.yaml@main + secrets: inherit + with: + juju-channel: 3.2/stable + pre-run-script: scripts/setup-lxd.sh + provider: lxd + test-tox-env: integration-juju3.2 + modules: '["test_runner_manager_openstack"]' + extra-arguments: "-m openstack" + self-hosted-runner: true + self-hosted-runner-label: stg-private-endpoint + tmate-debug: true + tmate-timeout: 300 From 36e0eb025e232b64400f4723583d863a44e06c2a Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 15:01:25 +0800 Subject: [PATCH 031/278] Spawn two debug runners --- .github/workflows/manual_test_env.yaml | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index 58c58324b..9dd827f36 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -4,21 +4,14 @@ on: pull_request: jobs: - # manual-test-env: - # name: manual-test-env - # runs-on: ["self-hosted", "stg-private-endpoint"] - # steps: - # - name: Setup operator environment - # uses: charmed-kubernetes/actions-operator@main - # with: - # provider: lxd - # juju-channel: 3.2/stable - # - uses: actions/checkout@v4 - # - run: bash scripts/setup-lxd.sh - # - name: Tmate debugging session (self-hosted) - # uses: canonical/action-tmate@main - # timeout-minutes: ${{ inputs.tmate-timeout }} - + manual-test-env: + name: manual-test-env + runs-on: ["self-hosted", "stg-private-endpoint", "X64"] + steps: + - uses: actions/checkout@v4 + - name: Tmate debugging session (self-hosted) + uses: canonical/action-tmate@main + timeout-minutes: ${{ inputs.tmate-timeout }} openstack-integration-tests-private-endpoint: name: Integration test using private-endpoint uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main From 0ea7156d43c28164f2a9df2b741fafce6b816075 Mon Sep 17 00:00:00 2001 From: 
yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 15:52:29 +0800 Subject: [PATCH 032/278] Fix string formatting. --- src-docs/openstack_cloud.openstack_cloud.md | 6 +++--- src-docs/openstack_cloud.openstack_runner_manager.md | 2 +- src/openstack_cloud/openstack_cloud.py | 5 +---- src/openstack_cloud/openstack_runner_manager.py | 8 ++++++-- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index c18711cbd..6f8fb579b 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -64,7 +64,7 @@ Create a OpenstackCloud instance. --- - + ### method `cleanup` @@ -106,7 +106,7 @@ get_instance(instance_id: str) → OpenstackInstance --- - + ### method `get_instance_name` @@ -120,7 +120,7 @@ get_instance_name(name: str) → str --- - + ### method `get_instances` diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 537b4b776..351e83941 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -79,7 +79,7 @@ __init__(prefix: str, config: OpenstackRunnerManagerConfig) → None --- - + ### method `cleanup` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 04314fbf2..9e24ccf98 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -195,9 +195,6 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: if not instance.addresses: raise SshError(f"No addresses found for OpenStack server {instance.server_name}") - import pytest - pytest.set_trace() - for ip in instance.addresses: try: connection = SshConnection( @@ -206,7 +203,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: connect_kwargs={"key_filename": str(key_path)}, 
connect_timeout=_SSH_TIMEOUT, ) - result = connection.run("echo {_TEST_STRING}", warn=True, timeout=_SSH_TIMEOUT) + result = connection.run(f"echo {_TEST_STRING}", warn=True, timeout=_SSH_TIMEOUT) if not result.ok: logger.warning( "SSH test connection failed, server: %s, address: %s", instance.server_name, ip diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index b89214ffc..a0d5d07b5 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -114,7 +114,7 @@ def create_runner(self, registration_token: str) -> RunnerId: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) except SshError as err: raise RunnerCreateError( - "Failed to SSH connect to {instance_name} openstack runner" + f"Failed to SSH connect to {instance_name} openstack runner" ) from err OpenstackRunnerManager._wait_runner_startup(ssh_conn, instance_name) @@ -159,7 +159,11 @@ def delete_runner(self, id: RunnerId, remove_token: str) -> None: self._delete_runner(instance, remove_token) def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) + try: + ssh_conn = self._openstack_cloud.get_ssh_connection(instance) + except SshError: + logger.exception("Failed SSH connection while removing %s", instance.server_name) + raise RunnerRemoveError(F"Failed SSH connection for {instance.server_name}") self._pull_runner_metrics(instance.server_name, ssh_conn) try: OpenstackRunnerManager._run_github_runner_removal_script( From df66ec8a8ecb69ab06716265253b6d6f1093be18 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 16:22:39 +0800 Subject: [PATCH 033/278] Fix startup health check --- ...penstack_cloud.openstack_runner_manager.md | 8 ++--- .../openstack_runner_manager.py | 29 +++++++++---------- 2 files changed, 18 insertions(+), 19 deletions(-) diff 
--git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 351e83941..e6de1dd7a 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -79,7 +79,7 @@ __init__(prefix: str, config: OpenstackRunnerManagerConfig) → None --- - + ### method `cleanup` @@ -107,7 +107,7 @@ create_runner(registration_token: str) → str --- - + ### method `delete_runner` @@ -135,7 +135,7 @@ get_name_prefix() → str --- - + ### method `get_runner` @@ -149,7 +149,7 @@ get_runner(id: str) → CloudRunnerInstance | None --- - + ### method `get_runners` diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index a0d5d07b5..8937decc9 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -110,14 +110,7 @@ def create_runner(self, registration_token: str) -> RunnerId: except OpenStackError as err: raise RunnerCreateError("Failed to create {instance_name} openstack runner") from err - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except SshError as err: - raise RunnerCreateError( - f"Failed to SSH connect to {instance_name} openstack runner" - ) from err - - OpenstackRunnerManager._wait_runner_startup(ssh_conn, instance_name) + self._wait_runner_startup(instance) end_timestamp = time.time() OpenstackRunnerManager._issue_runner_installed_metric( @@ -282,16 +275,22 @@ def _run_health_check(ssh_conn: SshConnection, name: str): raise RunnerError(f"Runner process not found on {name}") @retry(tries=10, delay=60, local_logger=logger) - @staticmethod - def _wait_runner_startup(ssh_conn: SshConnection, name: str) -> None: + def _wait_runner_startup(self, instance: OpenstackInstance) -> None: + try: + ssh_conn = self._openstack_cloud.get_ssh_connection(instance) + except SshError as err: + raise RunnerCreateError( + f"Failed to SSH 
connect to {instance.server_name} openstack runner" + ) from err + result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) if not result.ok: - logger.warning("SSH run of `ps aux` failed on %s", name) - raise RunnerStartError(f"Unable to SSH run `ps aux` on {name}") + logger.warning("SSH run of `ps aux` failed on %s", instance.server_name) + raise RunnerStartError(f"Unable to SSH run `ps aux` on {instance.server_name}") if RUNNER_STARTUP_PROCESS not in result.stdout: - logger.warning("Runner startup process not found on %s", name) - return RunnerStartError(f"Runner startup process not found on {name}") - logger.info("Runner startup process found to be healthy on %s", name) + logger.warning("Runner startup process not found on %s", instance.server_name) + return RunnerStartError(f"Runner startup process not found on {instance.server_name}") + logger.info("Runner startup process found to be healthy on %s", instance.server_name) @staticmethod def _generate_runner_id() -> RunnerId: From 2bdae7ceb8e40be67153059a1ce2e8940b68ce3b Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 5 Aug 2024 19:06:46 +0800 Subject: [PATCH 034/278] Use less runners --- .github/workflows/manual_test_env.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index 9dd827f36..53503f162 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -4,14 +4,14 @@ on: pull_request: jobs: - manual-test-env: - name: manual-test-env - runs-on: ["self-hosted", "stg-private-endpoint", "X64"] - steps: - - uses: actions/checkout@v4 - - name: Tmate debugging session (self-hosted) - uses: canonical/action-tmate@main - timeout-minutes: ${{ inputs.tmate-timeout }} + # manual-test-env: + # name: manual-test-env + # runs-on: ["self-hosted", "stg-private-endpoint", "X64"] + # steps: + # - uses: 
actions/checkout@v4 + # - name: Tmate debugging session (self-hosted) + # uses: canonical/action-tmate@main + # timeout-minutes: ${{ inputs.tmate-timeout }} openstack-integration-tests-private-endpoint: name: Integration test using private-endpoint uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main From 7156e09fa3f45e82a19f7e502b8b9f6f5847c9ab Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 09:23:20 +0800 Subject: [PATCH 035/278] Fix runner install metric --- src/openstack_cloud/openstack_runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 8937decc9..e64a45ceb 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -308,7 +308,7 @@ def _issue_runner_installed_metric( event=metric_events.RunnerInstalled( timestamp=install_start_timestamp, flavor=flavor, - duration=install_start_timestamp - install_end_timestamp, + duration=install_end_timestamp - install_start_timestamp, ) ) except IssueMetricEventError: From 06335c70e4e14b02ef1bf0e4182124b6f67d3c56 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 09:40:09 +0800 Subject: [PATCH 036/278] Create manual test env --- .github/workflows/manual_test_env.yaml | 48 ++++++++++++++------------ 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index 53503f162..fa352c259 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -4,26 +4,28 @@ on: pull_request: jobs: - # manual-test-env: - # name: manual-test-env - # runs-on: ["self-hosted", "stg-private-endpoint", "X64"] - # steps: - # - uses: actions/checkout@v4 - # - name: Tmate debugging session (self-hosted) 
- # uses: canonical/action-tmate@main - # timeout-minutes: ${{ inputs.tmate-timeout }} - openstack-integration-tests-private-endpoint: - name: Integration test using private-endpoint - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main - secrets: inherit - with: - juju-channel: 3.2/stable - pre-run-script: scripts/setup-lxd.sh - provider: lxd - test-tox-env: integration-juju3.2 - modules: '["test_runner_manager_openstack"]' - extra-arguments: "-m openstack" - self-hosted-runner: true - self-hosted-runner-label: stg-private-endpoint - tmate-debug: true - tmate-timeout: 300 + manual-test-env: + name: manual-test-env + runs-on: ["self-hosted", "stg-private-endpoint", "X64"] + steps: + - run: python -m pip install pipx-in-pipx --user + - run: pipx install tox + - uses: actions/checkout@v4 + - name: Tmate debugging session (self-hosted) + uses: canonical/action-tmate@main + timeout-minutes: ${{ inputs.tmate-timeout }} + # openstack-integration-tests-private-endpoint: + # name: Integration test using private-endpoint + # uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + # secrets: inherit + # with: + # juju-channel: 3.2/stable + # pre-run-script: scripts/setup-lxd.sh + # provider: lxd + # test-tox-env: integration-juju3.2 + # modules: '["test_runner_manager_openstack"]' + # extra-arguments: "-m openstack" + # self-hosted-runner: true + # self-hosted-runner-label: stg-private-endpoint + # tmate-debug: true + # tmate-timeout: 300 From 153e7745aefdff2d9669a62987ee9a3f65e7c6a6 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 09:44:02 +0800 Subject: [PATCH 037/278] Fix manual test env pipx installation --- .github/workflows/manual_test_env.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index fa352c259..88bd996ba 100644 --- 
a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -8,7 +8,9 @@ jobs: name: manual-test-env runs-on: ["self-hosted", "stg-private-endpoint", "X64"] steps: - - run: python -m pip install pipx-in-pipx --user + - run: sudo apt update -yq + - run: sudo apt install pipx + - run: pipx ensurepath - run: pipx install tox - uses: actions/checkout@v4 - name: Tmate debugging session (self-hosted) From 3b382590b9dc64ed5d857d9c2a004c0e52b7ea69 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 09:47:29 +0800 Subject: [PATCH 038/278] Just get a manual test env --- .github/workflows/manual_test_env.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index 88bd996ba..5559ddbdf 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -8,10 +8,10 @@ jobs: name: manual-test-env runs-on: ["self-hosted", "stg-private-endpoint", "X64"] steps: - - run: sudo apt update -yq - - run: sudo apt install pipx - - run: pipx ensurepath - - run: pipx install tox + # - run: sudo apt update -yq + # - run: sudo apt install pipx -yq + # - run: pipx ensurepath + # - run: pipx install tox - uses: actions/checkout@v4 - name: Tmate debugging session (self-hosted) uses: canonical/action-tmate@main From f32b4b12151fe6f1f94f8dcabc3da8f06092e60b Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 09:58:15 +0800 Subject: [PATCH 039/278] Patch runner log path in tests --- .github/workflows/manual_test_env.yaml | 12 +++++++----- tests/integration/test_runner_manager_openstack.py | 9 +++++++-- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index 5559ddbdf..ea6121fde 100644 --- a/.github/workflows/manual_test_env.yaml +++ 
b/.github/workflows/manual_test_env.yaml @@ -1,17 +1,19 @@ name: Manual test env on: - pull_request: + # TODO: Uncomment + #pull_request: + workflow_dispatch: jobs: manual-test-env: name: manual-test-env runs-on: ["self-hosted", "stg-private-endpoint", "X64"] steps: - # - run: sudo apt update -yq - # - run: sudo apt install pipx -yq - # - run: pipx ensurepath - # - run: pipx install tox + - run: sudo apt update -yq + - run: sudo apt install pipx -yq + - run: pipx ensurepath + - run: pipx install tox - uses: actions/checkout@v4 - name: Tmate debugging session (self-hosted) uses: canonical/action-tmate@main diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index b1f820a1a..0263050e5 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -4,6 +4,7 @@ """Testing the RunnerManager class with OpenStackRunnerManager as CloudManager.""" +from pathlib import Path import pytest import pytest_asyncio import yaml @@ -13,6 +14,7 @@ from manager.cloud_runner_manager import CloudRunnerState from manager.github_runner_manager import GithubRunnerState from manager.runner_manager import RunnerManager, RunnerManagerConfig +from metrics import runner_logs from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH from openstack_cloud.openstack_runner_manager import ( OpenstackRunnerManager, @@ -20,7 +22,6 @@ ) from tests.integration.helpers.openstack import PrivateEndpointConfigs - @pytest.fixture(scope="module", name="github_path") def github_path_fixture(path: str) -> GithubPath: return parse_github_path(path, "Default") @@ -80,8 +81,12 @@ async def openstack_runner_manager_fixture( @pytest_asyncio.fixture(scope="module", name="runner_manager") async def runner_manager_fixture( - openstack_runner_manager: OpenstackRunnerManager, token: str, github_path: GithubPath + openstack_runner_manager: OpenstackRunnerManager, token: str, github_path: 
GithubPath, log_dir_base_path: Path ) -> RunnerManager: + """ + + Import of log_dir_base_path to monkeypatch the runner logs path with tmp_path. + """ config = RunnerManagerConfig(token, github_path) return RunnerManager(openstack_runner_manager, config) From 6017b39eb8ffa7e2a1688ec14a8a08666a099e6c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:00:51 +0800 Subject: [PATCH 040/278] Add missing fixture --- tests/integration/test_runner_manager_openstack.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 0263050e5..77c5f3f29 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -22,6 +22,13 @@ ) from tests.integration.helpers.openstack import PrivateEndpointConfigs +@pytest.fixture(name="log_dir_base_path") +def log_dir_base_path_fixture(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + """Mock the log directory path and return it.""" + log_dir_base_path = tmp_path / "log_dir" + monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", log_dir_base_path) + return log_dir_base_path + @pytest.fixture(scope="module", name="github_path") def github_path_fixture(path: str) -> GithubPath: return parse_github_path(path, "Default") @@ -83,7 +90,7 @@ async def openstack_runner_manager_fixture( async def runner_manager_fixture( openstack_runner_manager: OpenstackRunnerManager, token: str, github_path: GithubPath, log_dir_base_path: Path ) -> RunnerManager: - """ + """Get RunnerManager instance. Import of log_dir_base_path to monkeypatch the runner logs path with tmp_path. 
""" From 02c6f63b1694be0b2d176171ceeb68469cd2cc22 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:01:47 +0800 Subject: [PATCH 041/278] Fix the scope of fixture --- tests/integration/test_runner_manager_openstack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 77c5f3f29..b3a5d9d64 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -22,7 +22,7 @@ ) from tests.integration.helpers.openstack import PrivateEndpointConfigs -@pytest.fixture(name="log_dir_base_path") +@pytest.fixture(scope="module", name="log_dir_base_path") def log_dir_base_path_fixture(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: """Mock the log directory path and return it.""" log_dir_base_path = tmp_path / "log_dir" From f4e819cb35194f3ef8e44320c64b147178d4ad3d Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:04:06 +0800 Subject: [PATCH 042/278] Fix tmp_path scope issue --- tests/integration/test_runner_manager_openstack.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index b3a5d9d64..8d40058e3 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -23,9 +23,9 @@ from tests.integration.helpers.openstack import PrivateEndpointConfigs @pytest.fixture(scope="module", name="log_dir_base_path") -def log_dir_base_path_fixture(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: +def log_dir_base_path_fixture(tmp_path_factory: Path, monkeypatch: pytest.MonkeyPatch) -> Path: """Mock the log directory path and return it.""" - log_dir_base_path = tmp_path / "log_dir" + 
log_dir_base_path = tmp_path_factory.mktemp("log") / "log_dir" monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", log_dir_base_path) return log_dir_base_path From 8847325655d9e7c7cd352521fb8b3d6ac81494d0 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:06:33 +0800 Subject: [PATCH 043/278] Fix monkeypatch fixture scope issue --- src-docs/openstack_cloud.openstack_cloud.md | 6 ++--- ...penstack_cloud.openstack_runner_manager.md | 2 +- src/manager/cloud_runner_manager.py | 1 + src/openstack_cloud/openstack_cloud.py | 10 ++++++--- .../openstack_runner_manager.py | 10 ++++++--- .../test_runner_manager_openstack.py | 22 ++++++++++++------- 6 files changed, 33 insertions(+), 18 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 6f8fb579b..22e151214 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -64,7 +64,7 @@ Create a OpenstackCloud instance. 
--- - + ### method `cleanup` @@ -106,7 +106,7 @@ get_instance(instance_id: str) → OpenstackInstance --- - + ### method `get_instance_name` @@ -120,7 +120,7 @@ get_instance_name(name: str) → str --- - + ### method `get_instances` diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index e6de1dd7a..8a4a6365d 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -79,7 +79,7 @@ __init__(prefix: str, config: OpenstackRunnerManagerConfig) → None --- - + ### method `cleanup` diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 1f4c8b507..8f83fc02f 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -8,6 +8,7 @@ RunnerId = str + class CloudRunnerState(str, Enum): CREATED = "created" ACTIVE = "active" diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 9e24ccf98..6b6eaf484 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -191,10 +191,12 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: key_path = OpenstackCloud._get_key_path(instance.server_name) if not key_path.exists(): - raise SshError(f"Missing keyfile for server: {instance.server_name}, key path: {key_path}") + raise SshError( + f"Missing keyfile for server: {instance.server_name}, key path: {key_path}" + ) if not instance.addresses: raise SshError(f"No addresses found for OpenStack server {instance.server_name}") - + for ip in instance.addresses: try: connection = SshConnection( @@ -206,7 +208,9 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: result = connection.run(f"echo {_TEST_STRING}", warn=True, timeout=_SSH_TIMEOUT) if not result.ok: logger.warning( - "SSH test connection failed, server: %s, address: %s", instance.server_name, ip + "SSH 
test connection failed, server: %s, address: %s", + instance.server_name, + ip, ) continue if _TEST_STRING in result.stdout: diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index e64a45ceb..a802fc6ca 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -156,7 +156,7 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) except SshError: logger.exception("Failed SSH connection while removing %s", instance.server_name) - raise RunnerRemoveError(F"Failed SSH connection for {instance.server_name}") + raise RunnerRemoveError(f"Failed SSH connection for {instance.server_name}") self._pull_runner_metrics(instance.server_name, ssh_conn) try: OpenstackRunnerManager._run_github_runner_removal_script( @@ -164,13 +164,17 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: ) except GithubRunnerRemoveError: logger.warning( - "Unable to run github runner removal script for %s", instance.server_name, stack_info=True + "Unable to run github runner removal script for %s", + instance.server_name, + stack_info=True, ) try: self._openstack_cloud.delete_instance(id) except OpenStackError: - logger.exception("Unable to delete openstack instance for runner %s", instance.server_name) + logger.exception( + "Unable to delete openstack instance for runner %s", instance.server_name + ) def cleanup(self, remove_token: str) -> None: runner_list = self._openstack_cloud.get_instances() diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 8d40058e3..70de3ab79 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -5,6 +5,7 @@ from pathlib import Path + import pytest import pytest_asyncio import yaml @@ -22,12 +23,15 @@ ) 
from tests.integration.helpers.openstack import PrivateEndpointConfigs + @pytest.fixture(scope="module", name="log_dir_base_path") -def log_dir_base_path_fixture(tmp_path_factory: Path, monkeypatch: pytest.MonkeyPatch) -> Path: +def log_dir_base_path_fixture(tmp_path_factory: Path) -> Path: """Mock the log directory path and return it.""" - log_dir_base_path = tmp_path_factory.mktemp("log") / "log_dir" - monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", log_dir_base_path) - return log_dir_base_path + with pytest.MonkeyPatch.context() as monkeypatch: + log_dir_base_path = tmp_path_factory.mktemp("log") / "log_dir" + monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", log_dir_base_path) + yield log_dir_base_path + @pytest.fixture(scope="module", name="github_path") def github_path_fixture(path: str) -> GithubPath: @@ -88,10 +92,13 @@ async def openstack_runner_manager_fixture( @pytest_asyncio.fixture(scope="module", name="runner_manager") async def runner_manager_fixture( - openstack_runner_manager: OpenstackRunnerManager, token: str, github_path: GithubPath, log_dir_base_path: Path + openstack_runner_manager: OpenstackRunnerManager, + token: str, + github_path: GithubPath, + log_dir_base_path: Path, ) -> RunnerManager: """Get RunnerManager instance. - + Import of log_dir_base_path to monkeypatch the runner logs path with tmp_path. 
""" config = RunnerManagerConfig(token, github_path) @@ -125,7 +132,7 @@ async def test_create_runner(runner_manager: RunnerManager) -> None: assert isinstance(runner_id_list, tuple) assert len(runner_id_list) == 1 runner_id = runner_id[0] - + runner_list = runner_manager.get_runners() assert isinstance(runner_list, tuple) assert len(runner_list) == 1 @@ -133,4 +140,3 @@ async def test_create_runner(runner_manager: RunnerManager) -> None: assert runner.id == runner_id assert runner.cloud_state == CloudRunnerState.ACTIVE assert runner.github_state == GithubRunnerState.IDLE - From d2499c964c898870aefc74380a7c70b00971eb53 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:13:09 +0800 Subject: [PATCH 044/278] Add patch of metric log path --- tests/integration/test_runner_manager_openstack.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 70de3ab79..b3f5140c2 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -16,6 +16,7 @@ from manager.github_runner_manager import GithubRunnerState from manager.runner_manager import RunnerManager, RunnerManagerConfig from metrics import runner_logs +from metrics import events from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH from openstack_cloud.openstack_runner_manager import ( OpenstackRunnerManager, @@ -25,12 +26,14 @@ @pytest.fixture(scope="module", name="log_dir_base_path") -def log_dir_base_path_fixture(tmp_path_factory: Path) -> Path: +def log_dir_base_path_fixture(tmp_path_factory: Path): """Mock the log directory path and return it.""" with pytest.MonkeyPatch.context() as monkeypatch: - log_dir_base_path = tmp_path_factory.mktemp("log") / "log_dir" - monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", log_dir_base_path) - yield 
log_dir_base_path + runner_log_dir_path = tmp_path_factory.mktemp("log") / "runner_log" + metric_log_path = tmp_path_factory.mktemp("log") / "runner_log" + monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", runner_log_dir_path) + monkeypatch.setattr(events, "METRICS_LOG_PATH", metric_log_path) + yield @pytest.fixture(scope="module", name="github_path") From c0c8319694d9baa1601108dd6cc9a0bdba965d44 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:18:18 +0800 Subject: [PATCH 045/278] Fix return type of create_runners --- src/manager/runner_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 3970587e4..6ce5e03bd 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -59,14 +59,14 @@ def __init__(self, cloud_runner_manager: CloudRunnerManager, config: RunnerManag prefix=self._cloud.get_name_prefix(), token=self._config.token, path=self._config.path ) - def create_runners(self, num: int) -> list[RunnerId]: + def create_runners(self, num: int) -> tuple[RunnerId]: registration_token = self._github.get_registration_token() runner_ids = [] for _ in range(num): runner_ids.append(self._cloud.create_runner(registration_token=registration_token)) - return runner_ids + return tuple(runner_ids) def get_runners( self, From 20d54b4b8b91c0eb81b2444aeeb518e31a313f88 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:42:09 +0800 Subject: [PATCH 046/278] Add health check test --- ...penstack_cloud.openstack_runner_manager.md | 6 ++--- src/manager/github_runner_manager.py | 8 ++++--- src/manager/runner_manager.py | 8 +++---- .../openstack_runner_manager.py | 6 +++-- .../test_runner_manager_openstack.py | 23 ++++++++++++++++++- 5 files changed, 38 insertions(+), 13 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md 
b/src-docs/openstack_cloud.openstack_runner_manager.md index 8a4a6365d..e0576d952 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -79,7 +79,7 @@ __init__(prefix: str, config: OpenstackRunnerManagerConfig) → None --- - + ### method `cleanup` @@ -107,7 +107,7 @@ create_runner(registration_token: str) → str --- - + ### method `delete_runner` @@ -155,7 +155,7 @@ get_runner(id: str) → CloudRunnerInstance | None ```python get_runners( - cloud_runner_status: Sequence[CloudRunnerState] + cloud_runner_state: Optional[Sequence[CloudRunnerState]] = None ) → Tuple[CloudRunnerInstance] ``` diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 3e8972e10..dcfde1acc 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -33,7 +33,7 @@ def __init__(self, prefix: str, token: str, path: GithubPath): self._path = path self._github = GithubClient(token) - def get_runners(self, states: Sequence[GithubRunnerState]) -> tuple[SelfHostedRunner]: + def get_runners(self, states: Sequence[GithubRunnerState] | None = None) -> tuple[SelfHostedRunner]: runner_list = self._github.get_runner_github_info(self._path) return tuple( runner @@ -42,7 +42,7 @@ def get_runners(self, states: Sequence[GithubRunnerState]) -> tuple[SelfHostedRu and GithubRunnerManager._filter_runner_state(runner, states) ) - def delete_runners(self, states: Sequence[GithubRunnerState]) -> None: + def delete_runners(self, states: Sequence[GithubRunnerState] | None = None) -> None: runner_list = self.get_runners(states) for runner in runner_list: self._github.delete_runner(self._path, runner.id) @@ -55,6 +55,8 @@ def get_removal_token(self) -> str: @staticmethod def _filter_runner_state( - runner: SelfHostedRunner, states: Sequence[GithubRunnerState] + runner: SelfHostedRunner, states: Sequence[GithubRunnerState] | None ) -> bool: + if states is None: + return True 
return GithubRunnerState.from_runner(runner) in states diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 6ce5e03bd..9cf2df5e8 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -70,8 +70,8 @@ def create_runners(self, num: int) -> tuple[RunnerId]: def get_runners( self, - github_runner_state: Sequence[GithubRunnerState] = None, - cloud_runner_state: Sequence[CloudRunnerState] = None, + github_runner_state: Sequence[GithubRunnerState] | None = None, + cloud_runner_state: Sequence[CloudRunnerState] | None = None, ) -> tuple[RunnerInstance]: """Get information on runner filter by state. @@ -84,10 +84,10 @@ def get_runners( Returns: Information on the runners. """ + github_infos = self._github.get_runners(github_runner_state=github_runner_state) cloud_infos = self._cloud.get_runners(cloud_runner_status=cloud_runner_state) - github_infos = self._github.get_runners(github_runner_state) - cloud_infos_map = {info.name: info for info in cloud_infos} github_infos_map = {info.name: info for info in github_infos} + cloud_infos_map = {info.name: info for info in cloud_infos} return tuple( RunnerInstance(cloud_infos_map[name], github_infos_map[name]) for name in cloud_infos_map.keys() & github_infos_map.keys() diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index a802fc6ca..7089e4542 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -134,7 +134,7 @@ def get_runner(self, id: RunnerId) -> CloudRunnerInstance | None: return None def get_runners( - self, cloud_runner_status: Sequence[CloudRunnerState] + self, cloud_runner_state: Sequence[CloudRunnerState] | None = None ) -> Tuple[CloudRunnerInstance]: instances_list = self._openstack_cloud.get_instances() instances_list = [ @@ -145,7 +145,9 @@ def get_runners( ) for instance in instances_list ] - return [instance for instance in instances_list 
if instance.state in cloud_runner_status] + if cloud_runner_state is None: + return instances_list + return [instance for instance in instances_list if instance.state in cloud_runner_state] def delete_runner(self, id: RunnerId, remove_token: str) -> None: instance = self._openstack_cloud.get_instance(id) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index b3f5140c2..27336b608 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -1,7 +1,12 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. -"""Testing the RunnerManager class with OpenStackRunnerManager as CloudManager.""" +"""Testing the RunnerManager class with OpenStackRunnerManager as CloudManager. + +To prevent consistent deletion and recreate of openstack machines, the unit are arranged in a order +to take advantage of the state from the previous tests. +Take note of the arrange condition of each test. +""" from pathlib import Path @@ -143,3 +148,19 @@ async def test_create_runner(runner_manager: RunnerManager) -> None: assert runner.id == runner_id assert runner.cloud_state == CloudRunnerState.ACTIVE assert runner.github_state == GithubRunnerState.IDLE + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_create_runner(runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager) -> None: + """ + Arrange: RunnerManager instance with one runner. + Act: Run openstack health check. + Assert: health check passes. + """ + openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() + assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." 
+ runner = openstack_instances[0] + + assert openstack_runner_manager._health_check(runner) + \ No newline at end of file From dd0a73372e3696e6e6fa931473c8daed6d5ec7ad Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:44:16 +0800 Subject: [PATCH 047/278] Fix arg naming --- src-docs/openstack_cloud.openstack_runner_manager.md | 2 +- src/manager/cloud_runner_manager.py | 2 +- src/manager/runner_manager.py | 4 ++-- src/openstack_cloud/openstack_runner_manager.py | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index e0576d952..bfca3ba95 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -155,7 +155,7 @@ get_runner(id: str) → CloudRunnerInstance | None ```python get_runners( - cloud_runner_state: Optional[Sequence[CloudRunnerState]] = None + states: Optional[Sequence[CloudRunnerState]] = None ) → Tuple[CloudRunnerInstance] ``` diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 8f83fc02f..9fbd65933 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -67,7 +67,7 @@ def create_runner(self, registration_token: str) -> RunnerId: ... def get_runner(self, id: RunnerId) -> CloudRunnerInstance: ... def get_runners( - self, cloud_runner_status: Sequence[CloudRunnerState] + self, states: Sequence[CloudRunnerState] ) -> Tuple[CloudRunnerInstance]: ... def delete_runner(self, id: RunnerId, remove_token: str) -> None: ... diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 9cf2df5e8..92680206f 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -84,8 +84,8 @@ def get_runners( Returns: Information on the runners. 
""" - github_infos = self._github.get_runners(github_runner_state=github_runner_state) - cloud_infos = self._cloud.get_runners(cloud_runner_status=cloud_runner_state) + github_infos = self._github.get_runners(github_runner_state) + cloud_infos = self._cloud.get_runners(cloud_runner_state) github_infos_map = {info.name: info for info in github_infos} cloud_infos_map = {info.name: info for info in cloud_infos} return tuple( diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 7089e4542..39d64b091 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -134,7 +134,7 @@ def get_runner(self, id: RunnerId) -> CloudRunnerInstance | None: return None def get_runners( - self, cloud_runner_state: Sequence[CloudRunnerState] | None = None + self, states: Sequence[CloudRunnerState] | None = None ) -> Tuple[CloudRunnerInstance]: instances_list = self._openstack_cloud.get_instances() instances_list = [ @@ -145,9 +145,9 @@ def get_runners( ) for instance in instances_list ] - if cloud_runner_state is None: + if states is None: return instances_list - return [instance for instance in instances_list if instance.state in cloud_runner_state] + return [instance for instance in instances_list if instance.state in states] def delete_runner(self, id: RunnerId, remove_token: str) -> None: instance = self._openstack_cloud.get_instance(id) From e12169f31ff376552cebe918f7fdc16d7c00cfa4 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 12:37:23 +0800 Subject: [PATCH 048/278] Add debug statement --- tests/integration/test_runner_manager_openstack.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 27336b608..27ca31962 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ 
b/tests/integration/test_runner_manager_openstack.py @@ -158,6 +158,8 @@ async def test_create_runner(runner_manager: RunnerManager, openstack_runner_man Act: Run openstack health check. Assert: health check passes. """ + pytest.set_trace() + openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." runner = openstack_instances[0] From 7e09cc4b03af6e377cd72f49f0e55e02ff0f0891 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 12:39:23 +0800 Subject: [PATCH 049/278] Add more debug statement --- tests/integration/test_runner_manager_openstack.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 27ca31962..225fecce3 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -148,6 +148,7 @@ async def test_create_runner(runner_manager: RunnerManager) -> None: assert runner.id == runner_id assert runner.cloud_state == CloudRunnerState.ACTIVE assert runner.github_state == GithubRunnerState.IDLE + pytest.set_trace() @pytest.mark.openstack @pytest.mark.asyncio From 2b0c95f2a0f02f1ced3bad27477c4879073937a2 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 12:41:40 +0800 Subject: [PATCH 050/278] Move debug statement --- tests/integration/test_runner_manager_openstack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 225fecce3..313da351c 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -145,10 +145,10 @@ async def test_create_runner(runner_manager: RunnerManager) -> None: assert 
isinstance(runner_list, tuple) assert len(runner_list) == 1 runner = runner_list[0] + pytest.set_trace() assert runner.id == runner_id assert runner.cloud_state == CloudRunnerState.ACTIVE assert runner.github_state == GithubRunnerState.IDLE - pytest.set_trace() @pytest.mark.openstack @pytest.mark.asyncio From 9d4e6f73273347444f33b979d9abd8df8f447e5d Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 12:44:53 +0800 Subject: [PATCH 051/278] Merge tests --- .../test_runner_manager_openstack.py | 33 +++++++------------ 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 313da351c..3bd23a4ce 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -1,12 +1,7 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. -"""Testing the RunnerManager class with OpenStackRunnerManager as CloudManager. - -To prevent consistent deletion and recreate of openstack machines, the unit are arranged in a order -to take advantage of the state from the previous tests. -Take note of the arrange condition of each test. -""" +"""Testing the RunnerManager class with OpenStackRunnerManager as CloudManager.""" from pathlib import Path @@ -130,12 +125,16 @@ async def test_get_no_runner(runner_manager: RunnerManager) -> None: @pytest.mark.openstack @pytest.mark.asyncio @pytest.mark.abort_on_fail -async def test_create_runner(runner_manager: RunnerManager) -> None: +async def test_create_runner(runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager) -> None: """ Arrange: RunnerManager instance with no runners. - Act: Create one runner. - Assert: An active idle runner. + Act: + 1. Create one runner. + 2. + Assert: + 1. An active idle runner. """ + # 1. 
runner_id_list = runner_manager.create_runners(1) assert isinstance(runner_id_list, tuple) assert len(runner_id_list) == 1 @@ -145,25 +144,15 @@ async def test_create_runner(runner_manager: RunnerManager) -> None: assert isinstance(runner_list, tuple) assert len(runner_list) == 1 runner = runner_list[0] - pytest.set_trace() assert runner.id == runner_id assert runner.cloud_state == CloudRunnerState.ACTIVE assert runner.github_state == GithubRunnerState.IDLE -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_create_runner(runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager) -> None: - """ - Arrange: RunnerManager instance with one runner. - Act: Run openstack health check. - Assert: health check passes. - """ - pytest.set_trace() - + # 2. openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." runner = openstack_instances[0] + pytest.set_trace() + assert openstack_runner_manager._health_check(runner) - \ No newline at end of file From d349ea82e12a8888557968aa7040def56ce90002 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 12:52:39 +0800 Subject: [PATCH 052/278] Handle openstack errors with delete runner --- src/openstack_cloud/openstack_cloud.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 6b6eaf484..3aeaf99dd 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -352,7 +352,10 @@ def _get_and_ensure_unique_server( ) outdated_servers = filter(lambda x: x != latest_server, servers) for server in outdated_servers: - server.delete() + try: + server.delete() + except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout): + logger.warning("Unable to delete server with 
duplicate name %s with ID %s", name, server.id, stack_info=True) return latest_server @@ -388,7 +391,7 @@ def _delete_key_pair(conn: OpenstackConnection, name: str) -> None: # Keypair have unique names, access by ID is not needed. if not conn.delete_keypair(name): logger.warning("Unable to delete keypair for %s", name) - except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout) as err: + except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout): logger.warning("Unable to delete keypair for %s", name, stack_info=True) key_path = OpenstackCloud._get_key_path(name) From 4b3c497710f982b95d944fe1e3f8a7f2be65ac50 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 12:56:58 +0800 Subject: [PATCH 053/278] Fix delete server --- src/openstack_cloud/openstack_cloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 3aeaf99dd..86bdf1e61 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -353,7 +353,7 @@ def _get_and_ensure_unique_server( outdated_servers = filter(lambda x: x != latest_server, servers) for server in outdated_servers: try: - server.delete() + conn.delete_server(name_or_id=server.id) except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout): logger.warning("Unable to delete server with duplicate name %s with ID %s", name, server.id, stack_info=True) From 1cf31901bd2ee67a3e64e0890e0cb51de860b42e Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:04:25 +0800 Subject: [PATCH 054/278] Fix test variable reference --- tests/integration/test_runner_manager_openstack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 
3bd23a4ce..8d6ee92b1 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -138,7 +138,7 @@ async def test_create_runner(runner_manager: RunnerManager, openstack_runner_man runner_id_list = runner_manager.create_runners(1) assert isinstance(runner_id_list, tuple) assert len(runner_id_list) == 1 - runner_id = runner_id[0] + runner_id = runner_id_list[0] runner_list = runner_manager.get_runners() assert isinstance(runner_list, tuple) From fe0a6bc95de1424f615c4beb62496625a013c944 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:09:06 +0800 Subject: [PATCH 055/278] Fix OpenstackInstance creation --- src/openstack_cloud/openstack_cloud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 86bdf1e61..391d2eee8 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -168,7 +168,7 @@ def get_instance(self, instance_id: str) -> OpenstackInstance: with _get_openstack_connection( clouds_config=self._clouds_config, cloud=self._cloud ) as conn: - return OpenstackInstance(OpenstackCloud._get_and_ensure_unique_server(conn, full_name)) + return OpenstackInstance(OpenstackCloud._get_and_ensure_unique_server(conn, full_name), self.prefix) def delete_instance(self, instance_id: str) -> None: full_name = self.get_instance_name(instance_id) @@ -237,7 +237,7 @@ def get_instances(self) -> tuple[OpenstackInstance]: servers = self._get_openstack_instances(conn) server_names = set(server.name for server in servers) return [ - OpenstackInstance(OpenstackCloud._get_and_ensure_unique_server(conn, name)) + OpenstackInstance(OpenstackCloud._get_and_ensure_unique_server(conn, name), self.prefix) for name in server_names ] From fa330a33afbc1140e4c128ce4c27ecddeaaf933f Mon Sep 17 00:00:00 2001 From: yhaliaw 
<43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:57:09 +0800 Subject: [PATCH 056/278] Add some docstrings --- src-docs/openstack_cloud.openstack_cloud.md | 57 ++++++++++++------- src/manager/cloud_runner_manager.py | 4 +- src/manager/github_runner_manager.py | 6 +- src/manager/runner_manager.py | 2 + src/openstack_cloud/openstack_cloud.py | 55 ++++++++++++++---- .../openstack_runner_manager.py | 6 +- .../test_runner_manager_openstack.py | 15 ++--- 7 files changed, 100 insertions(+), 45 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 22e151214..019487f67 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -3,20 +3,28 @@ # module `openstack_cloud.openstack_cloud` +Class for accessing OpenStack API for managing servers. +--- + +## class `OpenstackInstance` +Represents an OpenStack instance. ---- - -## class `OpenstackInstance` -OpenstackInstance(server: openstack.compute.v2.server.Server, prefix: str) +**Attributes:** + + - `server_id`: ID of server assigned by OpenStack. + - `server_name`: Name of the server on OpenStack. + - `instance_id`: ID used by OpenstackCloud class to manage the instances. See docs on the OpenstackCloud. + - `addresses`: IP addresses assigned to the server. + - `status`: Status of the server. - + ### method `__init__` @@ -34,14 +42,14 @@ __init__(server: Server, prefix: str) --- - + ## class `OpenstackCloud` +Client to interact with OpenStack cloud. +The OpenStack server name is managed by this cloud. Caller refers to the instances via instance_id. If the caller needs the server name, e.g., for logging, it can be queried with get_server_name. - - - + ### method `__init__` @@ -64,7 +72,7 @@ Create a OpenstackCloud instance. 
--- - + ### method `cleanup` @@ -78,7 +86,7 @@ cleanup() → None --- - + ### method `delete_instance` @@ -92,7 +100,7 @@ delete_instance(instance_id: str) → None --- - + ### method `get_instance` @@ -106,12 +114,12 @@ get_instance(instance_id: str) → OpenstackInstance --- - + -### method `get_instance_name` +### method `get_instances` ```python -get_instance_name(name: str) → str +get_instances() → tuple[OpenstackInstance] ``` @@ -120,21 +128,30 @@ get_instance_name(name: str) → str --- - + -### method `get_instances` +### method `get_server_name` ```python -get_instances() → tuple[OpenstackInstance] +get_server_name(instance_id: str) → str ``` +Get server name on OpenStack. + + + +**Args:** + + - `instance_id`: ID used to identify a instance. +**Returns:** + The OpenStack server name. --- - + ### method `get_ssh_connection` @@ -148,7 +165,7 @@ get_ssh_connection(instance: OpenstackInstance) → Connection --- - + ### method `launch_instance` diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 9fbd65933..e794df22b 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -66,9 +66,7 @@ def create_runner(self, registration_token: str) -> RunnerId: ... def get_runner(self, id: RunnerId) -> CloudRunnerInstance: ... - def get_runners( - self, states: Sequence[CloudRunnerState] - ) -> Tuple[CloudRunnerInstance]: ... + def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerInstance]: ... def delete_runner(self, id: RunnerId, remove_token: str) -> None: ... 
diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index dcfde1acc..a5af41211 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -33,7 +33,9 @@ def __init__(self, prefix: str, token: str, path: GithubPath): self._path = path self._github = GithubClient(token) - def get_runners(self, states: Sequence[GithubRunnerState] | None = None) -> tuple[SelfHostedRunner]: + def get_runners( + self, states: Sequence[GithubRunnerState] | None = None + ) -> tuple[SelfHostedRunner]: runner_list = self._github.get_runner_github_info(self._path) return tuple( runner @@ -55,7 +57,7 @@ def get_removal_token(self) -> str: @staticmethod def _filter_runner_state( - runner: SelfHostedRunner, states: Sequence[GithubRunnerState] | None + runner: SelfHostedRunner, states: Sequence[GithubRunnerState] | None ) -> bool: if states is None: return True diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 92680206f..36c8db6a0 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -1,6 +1,8 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. +"""Class for managing the runners.""" + from dataclasses import dataclass from enum import Enum, auto from typing import Sequence diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 391d2eee8..4cc61ae95 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -1,9 +1,10 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. +"""Class for accessing OpenStack API for managing servers.""" + import datetime import logging -import shutil from contextlib import contextmanager from dataclasses import dataclass from functools import reduce @@ -38,6 +39,17 @@ @dataclass class OpenstackInstance: + """Represents an OpenStack instance. + + Attributes: + server_id: ID of server assigned by OpenStack. 
+ server_name: Name of the server on OpenStack. + instance_id: ID used by OpenstackCloud class to manage the instances. See docs on the + OpenstackCloud. + addresses: IP addresses assigned to the server. + status: Status of the server. + """ + server_id: str server_name: str instance_id: str @@ -97,6 +109,12 @@ def _get_openstack_connection( class OpenstackCloud: + """Client to interact with OpenStack cloud. + + The OpenStack server name is managed by this cloud. Caller refers to the instances via + instance_id. If the caller needs the server name, e.g., for logging, it can be queried with + get_server_name. + """ def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): """Create a OpenstackCloud instance. @@ -114,7 +132,7 @@ def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): def launch_instance( self, instance_id: str, image: str, flavor: str, network: str, userdata: str ) -> OpenstackInstance: - full_name = self.get_instance_name(instance_id) + full_name = self.get_server_name(instance_id) logger.info("Creating openstack server with %s", full_name) with _get_openstack_connection( @@ -162,16 +180,18 @@ def launch_instance( return OpenstackInstance(server, self.prefix) def get_instance(self, instance_id: str) -> OpenstackInstance: - full_name = self.get_instance_name(instance_id) + full_name = self.get_server_name(instance_id) logger.info("Getting openstack server with %s", full_name) with _get_openstack_connection( clouds_config=self._clouds_config, cloud=self._cloud ) as conn: - return OpenstackInstance(OpenstackCloud._get_and_ensure_unique_server(conn, full_name), self.prefix) + return OpenstackInstance( + OpenstackCloud._get_and_ensure_unique_server(conn, full_name), self.prefix + ) def delete_instance(self, instance_id: str) -> None: - full_name = self.get_instance_name(instance_id) + full_name = self.get_server_name(instance_id) logger.info("Deleting openstack server with %s", full_name) with 
_get_openstack_connection( @@ -237,7 +257,9 @@ def get_instances(self) -> tuple[OpenstackInstance]: servers = self._get_openstack_instances(conn) server_names = set(server.name for server in servers) return [ - OpenstackInstance(OpenstackCloud._get_and_ensure_unique_server(conn, name), self.prefix) + OpenstackInstance( + OpenstackCloud._get_and_ensure_unique_server(conn, name), self.prefix + ) for name in server_names ] @@ -250,6 +272,17 @@ def cleanup(self) -> None: self._cleanup_key_files(conn, exclude_list) self._clean_up_openstack_keypairs(conn, exclude_list) + def get_server_name(self, instance_id: str) -> str: + """Get server name on OpenStack. + + Args: + instance_id: ID used to identify a instance. + + Returns: + The OpenStack server name. + """ + return f"{self.prefix}-{instance_id}" + def _cleanup_key_files( self, conn: OpenstackConnection, exclude_instances: Iterable[str] ) -> None: @@ -315,9 +348,6 @@ def _clean_up_openstack_keypairs( key.name, ) - def get_instance_name(self, name: str) -> str: - return f"{self.prefix}-{name}" - def _get_openstack_instances(self, conn: OpenstackConnection) -> tuple[OpenstackServer]: """Get the OpenStack servers managed by this unit. 
@@ -355,7 +385,12 @@ def _get_and_ensure_unique_server( try: conn.delete_server(name_or_id=server.id) except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout): - logger.warning("Unable to delete server with duplicate name %s with ID %s", name, server.id, stack_info=True) + logger.warning( + "Unable to delete server with duplicate name %s with ID %s", + name, + server.id, + stack_info=True, + ) return latest_server diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 39d64b091..023f6df25 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -95,7 +95,7 @@ def get_name_prefix(self) -> str: def create_runner(self, registration_token: str) -> RunnerId: start_timestamp = time.time() id = OpenstackRunnerManager._generate_runner_id() - instance_name = self._openstack_cloud.get_instance_name(name=id) + instance_name = self._openstack_cloud.get_server_name(instance_id=id) userdata = self._generate_userdata( instance_name=instance_name, registration_token=registration_token ) @@ -122,7 +122,7 @@ def create_runner(self, registration_token: str) -> RunnerId: return id def get_runner(self, id: RunnerId) -> CloudRunnerInstance | None: - name = self._openstack_cloud.get_instance_name(id) + name = self._openstack_cloud.get_server_name(id) instances_list = self._openstack_cloud.get_instances() for instance in instances_list: if instance.server_name == name: @@ -146,7 +146,7 @@ def get_runners( for instance in instances_list ] if states is None: - return instances_list + return instances_list return [instance for instance in instances_list if instance.state in states] def delete_runner(self, id: RunnerId, remove_token: str) -> None: diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 8d6ee92b1..178656b92 100644 --- 
a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -15,8 +15,7 @@ from manager.cloud_runner_manager import CloudRunnerState from manager.github_runner_manager import GithubRunnerState from manager.runner_manager import RunnerManager, RunnerManagerConfig -from metrics import runner_logs -from metrics import events +from metrics import events, runner_logs from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH from openstack_cloud.openstack_runner_manager import ( OpenstackRunnerManager, @@ -125,13 +124,15 @@ async def test_get_no_runner(runner_manager: RunnerManager) -> None: @pytest.mark.openstack @pytest.mark.asyncio @pytest.mark.abort_on_fail -async def test_create_runner(runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager) -> None: +async def test_create_runner( + runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager +) -> None: """ Arrange: RunnerManager instance with no runners. - Act: + Act: 1. Create one runner. - 2. - Assert: + 2. + Assert: 1. An active idle runner. """ # 1. @@ -152,7 +153,7 @@ async def test_create_runner(runner_manager: RunnerManager, openstack_runner_man openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." 
runner = openstack_instances[0] - + pytest.set_trace() assert openstack_runner_manager._health_check(runner) From 8a4b5d766aa284ac0df8a896c6ecb32a5e96f98e Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 14:42:54 +0800 Subject: [PATCH 057/278] Fix args issues with RunnerInstance --- src/manager/runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 36c8db6a0..987f0a562 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -42,7 +42,7 @@ def __init__( ) -> "RunnerInstance": self.name = github_info.name self.id = cloud_instance.id - self.github_state = GithubRunnerState(SelfHostedRunner) + self.github_state = GithubRunnerState(github_info) self.cloud_state = cloud_instance.state From f11705ebf5259ae11d8f9171bebb7592c5953289 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:07:12 +0800 Subject: [PATCH 058/278] Add more docs --- src/manager/runner_manager.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 987f0a562..fdf8b0239 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -32,6 +32,14 @@ class FlushMode(Enum): @dataclass class RunnerInstance: + """Represents an instance of runner. + + Attributes: + name: Full name of the runner. Managed by the cloud runner manager. + id: ID of the runner. Managed by the runner manager. + github_state: State on github. + cloud_state: State on cloud. + """ name: str id: RunnerId github_state: GithubRunnerState @@ -40,6 +48,14 @@ class RunnerInstance: def __init__( self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedRunner ) -> "RunnerInstance": + """Construct an instance. + + Args: + cloud_instance: Information on the cloud instance. 
+ github_info: Information on the GitHub of the runner. + Returns: + A RunnerInstance object. + """ self.name = github_info.name self.id = cloud_instance.id self.github_state = GithubRunnerState(github_info) @@ -48,13 +64,26 @@ def __init__( @dataclass class RunnerManagerConfig: + """Configuration for the runner manager. + + Attributes: + token: GitHub personal access token to query GitHub API. + path: Path to GitHub repository or organization to registry the runners. + """ token: str path: GithubPath class RunnerManager: + """Manage the runners.""" def __init__(self, cloud_runner_manager: CloudRunnerManager, config: RunnerManagerConfig): + """Construct the object. + + Args: + cloud_runner_manager: For managing the cloud instance of the runner. + config: Configuration of this class. + """ self._config = config self._cloud = cloud_runner_manager self._github = GithubRunnerManager( From da2929f993121336c80e50b977029749c203ee63 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:09:42 +0800 Subject: [PATCH 059/278] Fix GithubRunnerState construction --- src/manager/runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index fdf8b0239..22b6b2e5c 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -58,7 +58,7 @@ def __init__( """ self.name = github_info.name self.id = cloud_instance.id - self.github_state = GithubRunnerState(github_info) + self.github_state = GithubRunnerState.from_runner(github_info) self.cloud_state = cloud_instance.state From b0eeb17a302578d7fcb5e53eb97e261b826f10cb Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:21:28 +0800 Subject: [PATCH 060/278] Fix instance-id parsing from full name --- src/manager/runner_manager.py | 12 +++++++----- src/openstack_cloud/openstack_cloud.py | 4 ++-- 2 files changed, 9 
insertions(+), 7 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 22b6b2e5c..1e9994154 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -33,13 +33,14 @@ class FlushMode(Enum): @dataclass class RunnerInstance: """Represents an instance of runner. - + Attributes: name: Full name of the runner. Managed by the cloud runner manager. id: ID of the runner. Managed by the runner manager. github_state: State on github. - cloud_state: State on cloud. + cloud_state: State on cloud. """ + name: str id: RunnerId github_state: GithubRunnerState @@ -49,7 +50,7 @@ def __init__( self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedRunner ) -> "RunnerInstance": """Construct an instance. - + Args: cloud_instance: Information on the cloud instance. github_info: Information on the GitHub of the runner. @@ -65,11 +66,12 @@ def __init__( @dataclass class RunnerManagerConfig: """Configuration for the runner manager. - + Attributes: token: GitHub personal access token to query GitHub API. path: Path to GitHub repository or organization to registry the runners. """ + token: str path: GithubPath @@ -79,7 +81,7 @@ class RunnerManager: def __init__(self, cloud_runner_manager: CloudRunnerManager, config: RunnerManagerConfig): """Construct the object. - + Args: cloud_runner_manager: For managing the cloud instance of the runner. config: Configuration of this class. diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 4cc61ae95..a384e8bb2 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -66,12 +66,12 @@ def __init__(self, server: OpenstackServer, prefix: str): for address in network_addresses ] - if not self.server_name.startswith(prefix): + if not self.server_name.startswith(f"{prefix}-"): # Should never happen. 
raise ValueError( f"Found openstack server {server.name} managed under prefix {prefix}, contact devs" ) - self.instance_id = self.server_name[len(prefix) :] + self.instance_id = self.server_name[len(prefix) + 1 :] @contextmanager From b38cd1163459877613266ca47c08bd96dbfd9102 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:29:33 +0800 Subject: [PATCH 061/278] Add delete idle runner test. --- .../integration/test_runner_manager_openstack.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 178656b92..092aab85a 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -124,16 +124,19 @@ async def test_get_no_runner(runner_manager: RunnerManager) -> None: @pytest.mark.openstack @pytest.mark.asyncio @pytest.mark.abort_on_fail -async def test_create_runner( +async def test_runner_normal_lifecycle( runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager ) -> None: """ Arrange: RunnerManager instance with no runners. Act: 1. Create one runner. - 2. + 2. Run health check on the runner. + 3. Delete all idle runner. Assert: 1. An active idle runner. + 2. Health check passes. + 3. No runners. """ # 1. runner_id_list = runner_manager.create_runners(1) @@ -154,6 +157,11 @@ async def test_create_runner( assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." runner = openstack_instances[0] - pytest.set_trace() - assert openstack_runner_manager._health_check(runner) + + # 3. 
+ runner_manager.delete_runners() + + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert len(runner_list) == 0 From f37b7a8cdcc44b1fc5b2a8a13b30773ccba4b246 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:54:20 +0800 Subject: [PATCH 062/278] Add busy flush to test --- .../test_runner_manager_openstack.py | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 092aab85a..103648849 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -14,7 +14,7 @@ from charm_state import GithubPath, ProxyConfig, parse_github_path from manager.cloud_runner_manager import CloudRunnerState from manager.github_runner_manager import GithubRunnerState -from manager.runner_manager import RunnerManager, RunnerManagerConfig +from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig from metrics import events, runner_logs from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH from openstack_cloud.openstack_runner_manager import ( @@ -124,7 +124,7 @@ async def test_get_no_runner(runner_manager: RunnerManager) -> None: @pytest.mark.openstack @pytest.mark.asyncio @pytest.mark.abort_on_fail -async def test_runner_normal_lifecycle( +async def test_runner_normal_idle_lifecycle( runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager ) -> None: """ @@ -132,11 +132,13 @@ async def test_runner_normal_lifecycle( Act: 1. Create one runner. 2. Run health check on the runner. - 3. Delete all idle runner. + 4. Delete all busy runner. + 4. Delete all idle runner. Assert: 1. An active idle runner. 2. Health check passes. - 3. No runners. + 3. An active idle runner. + 4. No runners. """ # 1. 
runner_id_list = runner_manager.create_runners(1) @@ -158,10 +160,28 @@ async def test_runner_normal_lifecycle( runner = openstack_instances[0] assert openstack_runner_manager._health_check(runner) - + # 3. - runner_manager.delete_runners() + runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert len(runner_list) == 1 + runner = runner_list[0] + assert runner.id == runner_id + assert runner.cloud_state == CloudRunnerState.ACTIVE + assert runner.github_state == GithubRunnerState.IDLE + # 4. + runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) runner_list = runner_manager.get_runners() assert isinstance(runner_list, tuple) assert len(runner_list) == 0 + + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_runner_normal_busy_lifecycle( + runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager +): + pass \ No newline at end of file From 207913f360213d61fcd8dc3fc176164adb61b8b3 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:00:18 +0800 Subject: [PATCH 063/278] Spawn a manual test env --- .github/workflows/manual_test_env.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index ea6121fde..1568e4757 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -2,8 +2,8 @@ name: Manual test env on: # TODO: Uncomment - #pull_request: - workflow_dispatch: + pull_request: + # workflow_dispatch: jobs: manual-test-env: From 7af058967b5136743009eb008e764df7463d6a36 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:00:38 +0800 Subject: [PATCH 064/278] Disable spawning on manual test env --- .github/workflows/manual_test_env.yaml | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index 1568e4757..c1060f3fb 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -2,8 +2,8 @@ name: Manual test env on: # TODO: Uncomment - pull_request: - # workflow_dispatch: + # pull_request: + workflow_dispatch: jobs: manual-test-env: From 1c8ea0d103390775c128cae7be3f83c7c8e7edd4 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:02:35 +0800 Subject: [PATCH 065/278] Remove useless class --- src/manager/cloud_runner_manager.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index e794df22b..7a5976176 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -54,11 +54,6 @@ class CloudRunnerInstance: state: CloudRunnerState -@dataclass -class RunnerMetrics: - pass - - class CloudRunnerManager(ABC): def get_name_prefix(self) -> str: ... 
From ce05cf59d20c33be6d77a5d9c45aba25ee61694c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:24:08 +0800 Subject: [PATCH 066/278] Fix runner deletion --- src/manager/cloud_runner_manager.py | 10 +++++++++- src/manager/github_runner_manager.py | 9 +++++++++ src/manager/runner_manager.py | 16 +++++++++++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 7a5976176..f91bfb801 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -10,6 +10,7 @@ class CloudRunnerState(str, Enum): + """Represent state of the instance hosting the runner.""" CREATED = "created" ACTIVE = "active" DELETED = "deleted" @@ -26,7 +27,7 @@ def from_openstack_server_status(openstack_server_status: str) -> None: https://docs.openstack.org/api-guide/compute/server_concepts.html Args: - status: Openstack server status. + openstack_server_status: Openstack server status. """ match openstack_server_status: case "BUILD": @@ -49,6 +50,13 @@ def from_openstack_server_status(openstack_server_status: str) -> None: @dataclass class CloudRunnerInstance: + """Information on the runner on the cloud. + + Attributes: + name: Name of the instance hosting the runner. + id: ID of the instance. + state: State of the instance hosting the runner. 
+ """ name: str id: str state: CloudRunnerState diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index a5af41211..b7bb54616 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -10,6 +10,7 @@ class GithubRunnerState(str, Enum): + """State of the runner on GitHub.""" BUSY = "busy" IDLE = "idle" OFFLINE = "offline" @@ -17,6 +18,14 @@ class GithubRunnerState(str, Enum): @staticmethod def from_runner(runner: SelfHostedRunner) -> "GithubRunnerState": + """Construct the object from GtiHub runner information. + + Args: + runner: Information on the GitHub self-hosted runner. + + Returns: + The state of runner. + """ state = GithubRunnerState.OFFLINE if runner.status == GitHubRunnerStatus.ONLINE: if runner.busy: diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 1e9994154..b1cba9709 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -93,6 +93,14 @@ def __init__(self, cloud_runner_manager: CloudRunnerManager, config: RunnerManag ) def create_runners(self, num: int) -> tuple[RunnerId]: + """Create runners. + + Args: + num: Number of runners to create. + + Returns: + List of instance ID of the runners. + """ registration_token = self._github.get_registration_token() runner_ids = [] @@ -127,6 +135,11 @@ def get_runners( ) def delete_runners(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> None: + """Delete the runners. + + Args: + flush_mode: The type of runners affect by the deletion. 
+ """ states = [GithubRunnerState.IDLE] if flush_mode == FlushMode.FLUSH_BUSY: states.append(GithubRunnerState.BUSY) @@ -135,9 +148,10 @@ def delete_runners(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> None: remove_token = self._github.get_removal_token() for runner in runners_list: - self._cloud.delete_runners(id=runner.id, remove_token=remove_token) + self._cloud.delete_runner(id=runner.id, remove_token=remove_token) def cleanup(self) -> None: + """Runs cleanup of the runners and other resources.""" self._github.delete_runners([GithubRunnerState.OFFLINE, GithubRunnerState.UNKNOWN]) remove_token = self._github.get_removal_token() self._cloud.cleanup_runner(remove_token) From ea1a726a92904ff8accba2a9385a350e6d9471fc Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:34:31 +0800 Subject: [PATCH 067/278] Fix import error --- ...openstack_cloud.openstack_runner_manager.md | 18 +++++++++--------- src/manager/cloud_runner_manager.py | 6 ++++-- src/manager/github_runner_manager.py | 5 +++-- .../openstack_runner_manager.py | 1 - .../test_runner_manager_openstack.py | 2 +- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index bfca3ba95..7de71ebdf 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -19,7 +19,7 @@ --- - + ## class `OpenstackRunnerManagerConfig` OpenstackRunnerManagerConfig(clouds_config: dict[str, dict], cloud: str, image: str, flavor: str, network: str, github_path: charm_state.GithubOrg | charm_state.GithubRepo, labels: list[str], proxy_config: charm_state.ProxyConfig | None, dockerhub_mirror: str | None, ssh_debug_connections: list[charm_state.SSHDebugConnection] | None, repo_policy_url: str | None, repo_policy_token: str | None) @@ -55,14 +55,14 @@ __init__( --- - + ## class 
`OpenstackRunnerManager` - + ### method `__init__` @@ -79,7 +79,7 @@ __init__(prefix: str, config: OpenstackRunnerManagerConfig) → None --- - + ### method `cleanup` @@ -93,7 +93,7 @@ cleanup(remove_token: str) → None --- - + ### method `create_runner` @@ -107,7 +107,7 @@ create_runner(registration_token: str) → str --- - + ### method `delete_runner` @@ -121,7 +121,7 @@ delete_runner(id: str, remove_token: str) → None --- - + ### method `get_name_prefix` @@ -135,7 +135,7 @@ get_name_prefix() → str --- - + ### method `get_runner` @@ -149,7 +149,7 @@ get_runner(id: str) → CloudRunnerInstance | None --- - + ### method `get_runners` diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index f91bfb801..3222bfcea 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -11,6 +11,7 @@ class CloudRunnerState(str, Enum): """Represent state of the instance hosting the runner.""" + CREATED = "created" ACTIVE = "active" DELETED = "deleted" @@ -51,12 +52,13 @@ def from_openstack_server_status(openstack_server_status: str) -> None: @dataclass class CloudRunnerInstance: """Information on the runner on the cloud. - + Attributes: name: Name of the instance hosting the runner. - id: ID of the instance. + id: ID of the instance. state: State of the instance hosting the runner. """ + name: str id: str state: CloudRunnerState diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index b7bb54616..15beed1d5 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -11,6 +11,7 @@ class GithubRunnerState(str, Enum): """State of the runner on GitHub.""" + BUSY = "busy" IDLE = "idle" OFFLINE = "offline" @@ -19,10 +20,10 @@ class GithubRunnerState(str, Enum): @staticmethod def from_runner(runner: SelfHostedRunner) -> "GithubRunnerState": """Construct the object from GtiHub runner information. 
- + Args: runner: Information on the GitHub self-hosted runner. - + Returns: The state of runner. """ diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 023f6df25..a26d0e161 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -31,7 +31,6 @@ CloudRunnerManager, CloudRunnerState, RunnerId, - RunnerMetrics, ) from metrics import events as metric_events from metrics import github as github_metrics diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 103648849..f9e6f8f62 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -184,4 +184,4 @@ async def test_runner_normal_idle_lifecycle( async def test_runner_normal_busy_lifecycle( runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager ): - pass \ No newline at end of file + pass From 05a90169be5913f90360420ead17adf6d31efcf3 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:51:15 +0800 Subject: [PATCH 068/278] Add more docs --- src/manager/github_runner_manager.py | 10 ++++++++++ src/manager/runner_manager.py | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 15beed1d5..5c23a0d65 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -1,6 +1,8 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. 
+"""Client for managing self-hosted runner on GitHub side.""" + from enum import Enum, auto from typing import Sequence @@ -37,8 +39,16 @@ def from_runner(runner: SelfHostedRunner) -> "GithubRunnerState": class GithubRunnerManager: + """Manage self-hosted runner on GitHub side.""" def __init__(self, prefix: str, token: str, path: GithubPath): + """Construct the object. + + Args: + prefix: The prefix in the name to identify the runners managed by this instance. + token: The GitHub personal access token to access the GitHub API. + path: The GitHub repository or organization to register the runners under. + """ self._prefix = prefix self._path = path self._github = GithubClient(token) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index b1cba9709..9e3df6df7 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -1,7 +1,7 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. -"""Class for managing the runners.""" +"""Class for managing the GitHub self-hosted runners hosted on cloud instances.""" from dataclasses import dataclass from enum import Enum, auto @@ -151,7 +151,7 @@ def delete_runners(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> None: self._cloud.delete_runner(id=runner.id, remove_token=remove_token) def cleanup(self) -> None: - """Runs cleanup of the runners and other resources.""" + """Run cleanup of the runners and other resources.""" self._github.delete_runners([GithubRunnerState.OFFLINE, GithubRunnerState.UNKNOWN]) remove_token = self._github.get_removal_token() self._cloud.cleanup_runner(remove_token) From 752aa0e5699905928b1afa24b73eed1110a90fd8 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:52:32 +0800 Subject: [PATCH 069/278] Fix get no-existing openstack server --- src/openstack_cloud/openstack_cloud.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/openstack_cloud/openstack_cloud.py 
b/src/openstack_cloud/openstack_cloud.py index a384e8bb2..0b97c96be 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -373,6 +373,9 @@ def _get_and_ensure_unique_server( returned. Other servers is deleted. """ servers: list[OpenstackServer] = conn.search_servers(name) + + if not servers: + return None latest_server = reduce( lambda a, b: ( From 280480b01401e927c3dbc23b1d2f3a472215dd86 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:56:45 +0800 Subject: [PATCH 070/278] Add debug statement --- src-docs/openstack_cloud.openstack_cloud.md | 8 ++++---- src/openstack_cloud/openstack_cloud.py | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 019487f67..f69a4892c 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -72,7 +72,7 @@ Create a OpenstackCloud instance. --- - + ### method `cleanup` @@ -114,7 +114,7 @@ get_instance(instance_id: str) → OpenstackInstance --- - + ### method `get_instances` @@ -128,7 +128,7 @@ get_instances() → tuple[OpenstackInstance] --- - + ### method `get_server_name` @@ -151,7 +151,7 @@ Get server name on OpenStack. 
--- - + ### method `get_ssh_connection` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 0b97c96be..efbe5968b 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -198,6 +198,9 @@ def delete_instance(self, instance_id: str) -> None: clouds_config=self._clouds_config, cloud=self._cloud ) as conn: try: + # TODO: debug + import pytest + pytest.set_trace() server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) conn.delete_server(name_or_id=server.id) OpenstackCloud._delete_key_pair(conn, full_name) From 7df180251ab05a5771c5b36da684d069092af744 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 17:02:34 +0800 Subject: [PATCH 071/278] Fix variable name and function name mixup --- src-docs/openstack_cloud.openstack_cloud.md | 8 ++++---- src/openstack_cloud/openstack_cloud.py | 3 --- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index f69a4892c..019487f67 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -72,7 +72,7 @@ Create a OpenstackCloud instance. --- - + ### method `cleanup` @@ -114,7 +114,7 @@ get_instance(instance_id: str) → OpenstackInstance --- - + ### method `get_instances` @@ -128,7 +128,7 @@ get_instances() → tuple[OpenstackInstance] --- - + ### method `get_server_name` @@ -151,7 +151,7 @@ Get server name on OpenStack. 
--- - + ### method `get_ssh_connection` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index efbe5968b..0b97c96be 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -198,9 +198,6 @@ def delete_instance(self, instance_id: str) -> None: clouds_config=self._clouds_config, cloud=self._cloud ) as conn: try: - # TODO: debug - import pytest - pytest.set_trace() server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) conn.delete_server(name_or_id=server.id) OpenstackCloud._delete_key_pair(conn, full_name) From d1ecfb693ccd228014295bcfac766a30f5128ccb Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 17:07:18 +0800 Subject: [PATCH 072/278] Fix id variable name, function name mixup --- src/openstack_cloud/openstack_runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index a26d0e161..950a280ea 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -171,7 +171,7 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: ) try: - self._openstack_cloud.delete_instance(id) + self._openstack_cloud.delete_instance(instance.instance_id) except OpenStackError: logger.exception( "Unable to delete openstack instance for runner %s", instance.server_name From 1ab8db64c20914d1389392f945b85931e397f6cc Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 17:13:36 +0800 Subject: [PATCH 073/278] Add debug statement --- tests/integration/test_runner_manager_openstack.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index f9e6f8f62..ac9885ff4 100644 --- 
a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -163,6 +163,8 @@ async def test_runner_normal_idle_lifecycle( # 3. runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) + # TODO: debug + pytest.set_trace() runner_list = runner_manager.get_runners() assert isinstance(runner_list, tuple) assert len(runner_list) == 1 From 78fdac22f0aed33bda0126824fddc0f233fe42d5 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 18:10:16 +0800 Subject: [PATCH 074/278] Move debug --- tests/integration/test_runner_manager_openstack.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index ac9885ff4..3cf2ad69b 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -161,10 +161,11 @@ async def test_runner_normal_idle_lifecycle( assert openstack_runner_manager._health_check(runner) - # 3. - runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) # TODO: debug pytest.set_trace() + + # 3. 
+ runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) runner_list = runner_manager.get_runners() assert isinstance(runner_list, tuple) assert len(runner_list) == 1 From fd628e6bd47d0027905e16daee6650e7d593e4d5 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 18:42:45 +0800 Subject: [PATCH 075/278] Add busy runner test --- src/manager/github_runner_manager.py | 2 +- src/openstack_cloud/openstack_cloud.py | 2 +- tests/integration/helpers/common.py | 10 ++- .../test_runner_manager_openstack.py | 85 ++++++++++++++----- 4 files changed, 73 insertions(+), 26 deletions(-) diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 5c23a0d65..ce6591cb4 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -43,7 +43,7 @@ class GithubRunnerManager: def __init__(self, prefix: str, token: str, path: GithubPath): """Construct the object. - + Args: prefix: The prefix in the name to identify the runners managed by this instance. token: The GitHub personal access token to access the GitHub API. diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 0b97c96be..3698ed32a 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -373,7 +373,7 @@ def _get_and_ensure_unique_server( returned. Other servers is deleted. 
""" servers: list[OpenstackServer] = conn.search_servers(name) - + if not servers: return None diff --git a/tests/integration/helpers/common.py b/tests/integration/helpers/common.py index a1f319697..16622c038 100644 --- a/tests/integration/helpers/common.py +++ b/tests/integration/helpers/common.py @@ -375,7 +375,7 @@ def _is_workflow_run_complete(run: WorkflowRun) -> bool: async def dispatch_workflow( - app: Application, + app: Application | None, branch: Branch, github_repository: Repository, conclusion: str, @@ -400,14 +400,16 @@ async def dispatch_workflow( Returns: The workflow run. """ + if dispatch_input is None: + assert app is not None, "If dispatch input not given the app cannot be None." + dispatch_input = {"runner": app.name} + start_time = datetime.now(timezone.utc) workflow = github_repository.get_workflow(id_or_file_name=workflow_id_or_name) # The `create_dispatch` returns True on success. - assert workflow.create_dispatch( - branch, dispatch_input or {"runner": app.name} - ), "Failed to create workflow" + assert workflow.create_dispatch(branch, dispatch_input), "Failed to create workflow" # There is a very small chance of selecting a run not created by the dispatch above. 
run: WorkflowRun | None = await wait_for( diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 3cf2ad69b..27dafb3b5 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -9,6 +9,8 @@ import pytest import pytest_asyncio import yaml +from github.Branch import Branch +from github.Repository import Repository from openstack.connection import Connection as OpenstackConnection from charm_state import GithubPath, ProxyConfig, parse_github_path @@ -21,6 +23,11 @@ OpenstackRunnerManager, OpenstackRunnerManagerConfig, ) +from tests.integration.helpers.common import ( + DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, + dispatch_workflow, + wait_for, +) from tests.integration.helpers.openstack import PrivateEndpointConfigs @@ -132,13 +139,11 @@ async def test_runner_normal_idle_lifecycle( Act: 1. Create one runner. 2. Run health check on the runner. - 4. Delete all busy runner. - 4. Delete all idle runner. + 3. Delete all idle runner. Assert: 1. An active idle runner. 2. Health check passes. - 3. An active idle runner. - 4. No runners. + 3. No runners. """ # 1. runner_id_list = runner_manager.create_runners(1) @@ -161,20 +166,7 @@ async def test_runner_normal_idle_lifecycle( assert openstack_runner_manager._health_check(runner) - # TODO: debug - pytest.set_trace() - # 3. - runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) - runner_list = runner_manager.get_runners() - assert isinstance(runner_list, tuple) - assert len(runner_list) == 1 - runner = runner_list[0] - assert runner.id == runner_id - assert runner.cloud_state == CloudRunnerState.ACTIVE - assert runner.github_state == GithubRunnerState.IDLE - - # 4. 
runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) runner_list = runner_manager.get_runners() assert isinstance(runner_list, tuple) @@ -184,7 +176,60 @@ async def test_runner_normal_idle_lifecycle( @pytest.mark.openstack @pytest.mark.asyncio @pytest.mark.abort_on_fail -async def test_runner_normal_busy_lifecycle( - runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager +async def test_runner_flush_busy_lifecycle( + runner_manager: RunnerManager, + openstack_runner_manager: OpenstackRunnerManager, + test_github_branch: Branch, + github_repository: Repository, ): - pass + """ + Arrange: RunnerManager with one idle runner. + Act: + 1. Run a long workflow. + 2. Run flush idle runner. + 3. Run flush busy runner. + Assert: + 1. Runner takes the job and become busy. + 2. Busy runner still exists. + 3. No runners exists. + """ + runner_manager.create_runners(1) + runner_list = runner_manager.get_runners() + assert len(runner_list) == 1, "Test arrange failed: Expect one runner" + runner = runner_list[0] + assert ( + runner.cloud_state == CloudRunnerState.ACTIVE + ), "Test arrange failed: Expect runner in active state" + assert ( + runner.github_state == GithubRunnerState.IDLE + ), "Test arrange failed: Expect runner in idle state" + + # 1. + workflow = await dispatch_workflow( + app=None, + branch=test_github_branch, + github_repository=github_repository, + conclusion="success", + workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, + dispatch_input={"runner": runner.name, "minutes": "10"}, + wait=False, + ) + await wait_for(lambda: workflow.update() or workflow.status == "in_progress") + + runner_list = runner_manager.get_runners() + assert len(runner_list) == 1 + busy_runner = runner_list[0] + assert busy_runner.cloud_state == CloudRunnerState.ACTIVE + assert busy_runner.github_state == GithubRunnerState.BUSY + + # 2. 
+ runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) + runner_list = runner_manager.get_runners() + assert len(runner_list) == 1 + busy_runner = runner_list[0] + assert busy_runner.cloud_state == CloudRunnerState.ACTIVE + assert busy_runner.github_state == GithubRunnerState.BUSY + + # 3. + runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) + assert len(runner_list) == 0 From e688b6cfbfe0a2132ec864ce9c6971ba2633538b Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 18:48:52 +0800 Subject: [PATCH 076/278] Add debug statement. --- tests/integration/test_runner_manager_openstack.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 27dafb3b5..6a9542f05 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -215,6 +215,7 @@ async def test_runner_flush_busy_lifecycle( wait=False, ) await wait_for(lambda: workflow.update() or workflow.status == "in_progress") + pytest.set_trace() runner_list = runner_manager.get_runners() assert len(runner_list) == 1 From 4296e04e855e6e6c582f2fa7fef54f3a87994d28 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 18:49:33 +0800 Subject: [PATCH 077/278] Disable some test --- tests/integration/test_runner_manager_openstack.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 6a9542f05..64642397f 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -114,9 +114,10 @@ async def runner_manager_fixture( return RunnerManager(openstack_runner_manager, config) -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail +# 
TODO: Re-enable all tests +# @pytest.mark.openstack +# @pytest.mark.asyncio +# @pytest.mark.abort_on_fail async def test_get_no_runner(runner_manager: RunnerManager) -> None: """ Arrange: RunnerManager instance with no runners. @@ -128,9 +129,9 @@ async def test_get_no_runner(runner_manager: RunnerManager) -> None: assert not runner_list -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail +# @pytest.mark.openstack +# @pytest.mark.asyncio +# @pytest.mark.abort_on_fail async def test_runner_normal_idle_lifecycle( runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager ) -> None: From 7d2bc004dbd3dd0b1ce24ead07efe5665a3b8b75 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 18:51:25 +0800 Subject: [PATCH 078/278] Disable some test --- .../test_runner_manager_openstack.py | 99 +++++++++---------- 1 file changed, 49 insertions(+), 50 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 64642397f..a6a988e1c 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -114,64 +114,63 @@ async def runner_manager_fixture( return RunnerManager(openstack_runner_manager, config) -# TODO: Re-enable all tests # @pytest.mark.openstack # @pytest.mark.asyncio # @pytest.mark.abort_on_fail -async def test_get_no_runner(runner_manager: RunnerManager) -> None: - """ - Arrange: RunnerManager instance with no runners. - Act: Get runners. - Assert: Empty tuple returned. - """ - runner_list = runner_manager.get_runners() - assert isinstance(runner_list, tuple) - assert not runner_list +# async def test_get_no_runner(runner_manager: RunnerManager) -> None: +# """ +# Arrange: RunnerManager instance with no runners. +# Act: Get runners. +# Assert: Empty tuple returned. 
+# """ +# runner_list = runner_manager.get_runners() +# assert isinstance(runner_list, tuple) +# assert not runner_list # @pytest.mark.openstack # @pytest.mark.asyncio # @pytest.mark.abort_on_fail -async def test_runner_normal_idle_lifecycle( - runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager -) -> None: - """ - Arrange: RunnerManager instance with no runners. - Act: - 1. Create one runner. - 2. Run health check on the runner. - 3. Delete all idle runner. - Assert: - 1. An active idle runner. - 2. Health check passes. - 3. No runners. - """ - # 1. - runner_id_list = runner_manager.create_runners(1) - assert isinstance(runner_id_list, tuple) - assert len(runner_id_list) == 1 - runner_id = runner_id_list[0] - - runner_list = runner_manager.get_runners() - assert isinstance(runner_list, tuple) - assert len(runner_list) == 1 - runner = runner_list[0] - assert runner.id == runner_id - assert runner.cloud_state == CloudRunnerState.ACTIVE - assert runner.github_state == GithubRunnerState.IDLE - - # 2. - openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() - assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." - runner = openstack_instances[0] - - assert openstack_runner_manager._health_check(runner) - - # 3. - runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) - runner_list = runner_manager.get_runners() - assert isinstance(runner_list, tuple) - assert len(runner_list) == 0 +# async def test_runner_normal_idle_lifecycle( +# runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager +# ) -> None: +# """ +# Arrange: RunnerManager instance with no runners. +# Act: +# 1. Create one runner. +# 2. Run health check on the runner. +# 3. Delete all idle runner. +# Assert: +# 1. An active idle runner. +# 2. Health check passes. +# 3. No runners. +# """ +# # 1. 
+# runner_id_list = runner_manager.create_runners(1) +# assert isinstance(runner_id_list, tuple) +# assert len(runner_id_list) == 1 +# runner_id = runner_id_list[0] + +# runner_list = runner_manager.get_runners() +# assert isinstance(runner_list, tuple) +# assert len(runner_list) == 1 +# runner = runner_list[0] +# assert runner.id == runner_id +# assert runner.cloud_state == CloudRunnerState.ACTIVE +# assert runner.github_state == GithubRunnerState.IDLE + +# # 2. +# openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() +# assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." +# runner = openstack_instances[0] + +# assert openstack_runner_manager._health_check(runner) + +# # 3. +# runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) +# runner_list = runner_manager.get_runners() +# assert isinstance(runner_list, tuple) +# assert len(runner_list) == 0 @pytest.mark.openstack From d90f70bd3ec276516f1b1e5c0f25ca93ecc55538 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 19:02:33 +0800 Subject: [PATCH 079/278] Fix runner label in workflow --- .../test_runner_manager_openstack.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index a6a988e1c..b6dd1ebd5 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -5,12 +5,14 @@ from pathlib import Path +from secrets import token_hex import pytest import pytest_asyncio import yaml from github.Branch import Branch from github.Repository import Repository +from github.Workflow import Workflow from openstack.connection import Connection as OpenstackConnection from charm_state import GithubPath, ProxyConfig, parse_github_path @@ -31,6 +33,11 @@ from tests.integration.helpers.openstack import 
PrivateEndpointConfigs + +@pytest.fixture(scope="module", name="runner_label") +def runner_label(): + return f"test-{token_hex(6)}" + @pytest.fixture(scope="module", name="log_dir_base_path") def log_dir_base_path_fixture(tmp_path_factory: Path): """Mock the log directory path and return it.""" @@ -73,6 +80,7 @@ async def openstack_runner_manager_fixture( network_name: str, github_path: GithubPath, proxy_config: ProxyConfig, + runner_label: str, openstack_connection: OpenstackConnection, ) -> OpenstackRunnerManager: """Create OpenstackRunnerManager instance. @@ -89,7 +97,7 @@ async def openstack_runner_manager_fixture( flavor=flavor_name, network=network_name, github_path=github_path, - labels=["openstack_test"], + labels=["openstack_test", runner_label], proxy_config=proxy_config, dockerhub_mirror=None, ssh_debug_connections=None, @@ -113,6 +121,10 @@ async def runner_manager_fixture( config = RunnerManagerConfig(token, github_path) return RunnerManager(openstack_runner_manager, config) +def workflow_in_progress(workflow: Workflow) -> bool: + workflow.update() + return workflow.status == "in_progress" + # @pytest.mark.openstack # @pytest.mark.asyncio @@ -181,6 +193,7 @@ async def test_runner_flush_busy_lifecycle( openstack_runner_manager: OpenstackRunnerManager, test_github_branch: Branch, github_repository: Repository, + runner_label: str ): """ Arrange: RunnerManager with one idle runner. 
@@ -211,11 +224,10 @@ async def test_runner_flush_busy_lifecycle( github_repository=github_repository, conclusion="success", workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, - dispatch_input={"runner": runner.name, "minutes": "10"}, + dispatch_input={"runner": runner_label, "minutes": "10"}, wait=False, ) - await wait_for(lambda: workflow.update() or workflow.status == "in_progress") - pytest.set_trace() + await wait_for(workflow_in_progress) runner_list = runner_manager.get_runners() assert len(runner_list) == 1 From a868a837b1dd60a0107e153cabbee920d1d83651 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 6 Aug 2024 19:08:15 +0800 Subject: [PATCH 080/278] Fix lambda --- tests/integration/test_runner_manager_openstack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index b6dd1ebd5..6692b4fe8 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -227,7 +227,7 @@ async def test_runner_flush_busy_lifecycle( dispatch_input={"runner": runner_label, "minutes": "10"}, wait=False, ) - await wait_for(workflow_in_progress) + await wait_for(lambda: workflow_in_progress(workflow)) runner_list = runner_manager.get_runners() assert len(runner_list) == 1 From 11b9f7896beb8ea615142a38e2a9a9e84cc3733c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:41:48 +0800 Subject: [PATCH 081/278] Debug --- tests/integration/test_runner_manager_openstack.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 6692b4fe8..c624e5986 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -245,4 +245,5 @@ 
async def test_runner_flush_busy_lifecycle( # 3. runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) - assert len(runner_list) == 0 + assert len(runner_list) == 1 + pytest.set_trace() From 4d25f2c4912e0f12087ff61e4d768310188eb6a1 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:51:38 +0800 Subject: [PATCH 082/278] Debug --- tests/integration/test_runner_manager_openstack.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index c624e5986..4839bc962 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -190,7 +190,6 @@ def workflow_in_progress(workflow: Workflow) -> bool: @pytest.mark.abort_on_fail async def test_runner_flush_busy_lifecycle( runner_manager: RunnerManager, - openstack_runner_manager: OpenstackRunnerManager, test_github_branch: Branch, github_repository: Repository, runner_label: str @@ -245,5 +244,5 @@ async def test_runner_flush_busy_lifecycle( # 3. runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) - assert len(runner_list) == 1 + runner_list = runner_manager.get_runners() pytest.set_trace() From d38d067db0ed7359aeada85d247fd5808dabdefa Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 11:16:29 +0800 Subject: [PATCH 083/278] Add debug --- src-docs/openstack_cloud.openstack_cloud.md | 4 ++-- src/openstack_cloud/openstack_cloud.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 019487f67..3bd5690bd 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -72,7 +72,7 @@ Create a OpenstackCloud instance. 
--- - + ### method `cleanup` @@ -128,7 +128,7 @@ get_instances() → tuple[OpenstackInstance] --- - + ### method `get_server_name` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 3698ed32a..59a4a1db4 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -256,6 +256,11 @@ def get_instances(self) -> tuple[OpenstackInstance]: ) as conn: servers = self._get_openstack_instances(conn) server_names = set(server.name for server in servers) + + # TODO: debug + import pytest + pytest.set_trace() + return [ OpenstackInstance( OpenstackCloud._get_and_ensure_unique_server(conn, name), self.prefix From 03dd8396bca922f590cfb26fed58ba4587867f70 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 11:17:10 +0800 Subject: [PATCH 084/278] Start new manual test env --- .github/workflows/manual_test_env.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index c1060f3fb..3d0f7ad56 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -2,7 +2,7 @@ name: Manual test env on: # TODO: Uncomment - # pull_request: + pull_request: workflow_dispatch: jobs: From d42f2b5d0cb0432abc4eb5095259df0565398739 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 11:52:28 +0800 Subject: [PATCH 085/278] Add none check --- .github/workflows/manual_test_env.yaml | 2 +- src-docs/openstack_cloud.openstack_cloud.md | 4 ++-- src/openstack_cloud/openstack_cloud.py | 18 ++++++++---------- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index 3d0f7ad56..c1060f3fb 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -2,7 +2,7 @@ name: 
Manual test env on: # TODO: Uncomment - pull_request: + # pull_request: workflow_dispatch: jobs: diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 3bd5690bd..fcbc4e4fa 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -72,7 +72,7 @@ Create a OpenstackCloud instance. --- - + ### method `cleanup` @@ -128,7 +128,7 @@ get_instances() → tuple[OpenstackInstance] --- - + ### method `get_server_name` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 59a4a1db4..77c4caf08 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -257,16 +257,14 @@ def get_instances(self) -> tuple[OpenstackInstance]: servers = self._get_openstack_instances(conn) server_names = set(server.name for server in servers) - # TODO: debug - import pytest - pytest.set_trace() - - return [ - OpenstackInstance( - OpenstackCloud._get_and_ensure_unique_server(conn, name), self.prefix - ) - for name in server_names - ] + instances = [] + for name in server_names: + # The server can be deleted between the `_get_openstack_instances` call and this + # line. This is an issues during tests. Hence the need for None check. 
+ server = OpenstackCloud._get_and_ensure_unique_server(conn, name) + if server is not None: + instances.append(OpenstackInstance(server)) + return instances def cleanup(self) -> None: with _get_openstack_connection( From 6aff0f13a6d0dce2d75b0d36d8ddd5e0ff71fe03 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 11:54:45 +0800 Subject: [PATCH 086/278] Fix missing prefix --- src/openstack_cloud/openstack_cloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 77c4caf08..cf390790a 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -263,7 +263,7 @@ def get_instances(self) -> tuple[OpenstackInstance]: # line. This is an issues during tests. Hence the need for None check. server = OpenstackCloud._get_and_ensure_unique_server(conn, name) if server is not None: - instances.append(OpenstackInstance(server)) + instances.append(OpenstackInstance(server, self.prefix)) return instances def cleanup(self) -> None: From c106fd20b6222861ba9ad6147a2c48d5b183cdd2 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:55:50 +0800 Subject: [PATCH 087/278] Add more logging --- src/manager/runner_manager.py | 33 ++++- src/openstack_cloud/openstack_cloud.py | 6 +- .../test_runner_manager_openstack.py | 127 +++++++++--------- 3 files changed, 98 insertions(+), 68 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 9e3df6df7..4510787ea 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -3,6 +3,7 @@ """Class for managing the GitHub self-hosted runners hosted on cloud instances.""" +import logging from dataclasses import dataclass from enum import Enum, auto from typing import Sequence @@ -17,6 +18,8 @@ ) from manager.github_runner_manager import 
GithubRunnerManager, GithubRunnerState +logger = logging.getLogger(__name__) + class FlushMode(Enum): """Strategy for flushing runners. @@ -101,6 +104,7 @@ def create_runners(self, num: int) -> tuple[RunnerId]: Returns: List of instance ID of the runners. """ + logger.info("Creating %s runners", num) registration_token = self._github.get_registration_token() runner_ids = [] @@ -125,13 +129,28 @@ def get_runners( Returns: Information on the runners. """ + logger.info("Getting runners...") github_infos = self._github.get_runners(github_runner_state) cloud_infos = self._cloud.get_runners(cloud_runner_state) github_infos_map = {info.name: info for info in github_infos} cloud_infos_map = {info.name: info for info in cloud_infos} + runner_names = cloud_infos_map.keys() & github_infos_map.keys() + logger.info("Found following runners: %s", runner_names) + + cloud_only = cloud_infos_map.keys() - runner_names + github_only = github_infos_map.keys() - runner_names + if cloud_only: + logger.warning( + "Found runner instance on cloud but not registered on GitHub: %s", cloud_only + ) + if github_only: + logger.warning( + "Found self-hosted runner on GitHub but no matching runner instance on cloud: %s", + github_only, + ) + return tuple( - RunnerInstance(cloud_infos_map[name], github_infos_map[name]) - for name in cloud_infos_map.keys() & github_infos_map.keys() + RunnerInstance(cloud_infos_map[name], github_infos_map[name]) for name in runner_names ) def delete_runners(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> None: @@ -140,11 +159,21 @@ def delete_runners(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> None: Args: flush_mode: The type of runners affect by the deletion. 
""" + match flush_mode: + case FlushMode.FLUSH_IDLE: + logger.info("Deleting idle runners...") + case FlushMode.FLUSH_BUSY: + logger.info("Deleting idle and busy runners...") + case _: + logger.critical("Unknown flush mode %s encountered, contact developers", flush_mode) + states = [GithubRunnerState.IDLE] if flush_mode == FlushMode.FLUSH_BUSY: states.append(GithubRunnerState.BUSY) runners_list = self.get_runners(github_runner_state=states) + runner_names = [runner.name for runner in runners_list] + logger.info("Deleting runners: %s", runner_names) remove_token = self._github.get_removal_token() for runner in runners_list: diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index cf390790a..56146a33d 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -256,10 +256,10 @@ def get_instances(self) -> tuple[OpenstackInstance]: ) as conn: servers = self._get_openstack_instances(conn) server_names = set(server.name for server in servers) - - instances = [] + + instances = [] for name in server_names: - # The server can be deleted between the `_get_openstack_instances` call and this + # The server can be deleted between the `_get_openstack_instances` call and this # line. This is an issues during tests. Hence the need for None check. 
server = OpenstackCloud._get_and_ensure_unique_server(conn, name) if server is not None: diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 4839bc962..8ae890e1a 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -33,11 +33,11 @@ from tests.integration.helpers.openstack import PrivateEndpointConfigs - @pytest.fixture(scope="module", name="runner_label") def runner_label(): return f"test-{token_hex(6)}" + @pytest.fixture(scope="module", name="log_dir_base_path") def log_dir_base_path_fixture(tmp_path_factory: Path): """Mock the log directory path and return it.""" @@ -121,8 +121,9 @@ async def runner_manager_fixture( config = RunnerManagerConfig(token, github_path) return RunnerManager(openstack_runner_manager, config) + def workflow_in_progress(workflow: Workflow) -> bool: - workflow.update() + workflow.update() return workflow.status == "in_progress" @@ -185,64 +186,64 @@ def workflow_in_progress(workflow: Workflow) -> bool: # assert len(runner_list) == 0 -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_runner_flush_busy_lifecycle( - runner_manager: RunnerManager, - test_github_branch: Branch, - github_repository: Repository, - runner_label: str -): - """ - Arrange: RunnerManager with one idle runner. - Act: - 1. Run a long workflow. - 2. Run flush idle runner. - 3. Run flush busy runner. - Assert: - 1. Runner takes the job and become busy. - 2. Busy runner still exists. - 3. No runners exists. 
- """ - runner_manager.create_runners(1) - runner_list = runner_manager.get_runners() - assert len(runner_list) == 1, "Test arrange failed: Expect one runner" - runner = runner_list[0] - assert ( - runner.cloud_state == CloudRunnerState.ACTIVE - ), "Test arrange failed: Expect runner in active state" - assert ( - runner.github_state == GithubRunnerState.IDLE - ), "Test arrange failed: Expect runner in idle state" - - # 1. - workflow = await dispatch_workflow( - app=None, - branch=test_github_branch, - github_repository=github_repository, - conclusion="success", - workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, - dispatch_input={"runner": runner_label, "minutes": "10"}, - wait=False, - ) - await wait_for(lambda: workflow_in_progress(workflow)) - - runner_list = runner_manager.get_runners() - assert len(runner_list) == 1 - busy_runner = runner_list[0] - assert busy_runner.cloud_state == CloudRunnerState.ACTIVE - assert busy_runner.github_state == GithubRunnerState.BUSY - - # 2. - runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) - runner_list = runner_manager.get_runners() - assert len(runner_list) == 1 - busy_runner = runner_list[0] - assert busy_runner.cloud_state == CloudRunnerState.ACTIVE - assert busy_runner.github_state == GithubRunnerState.BUSY - - # 3. - runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) - runner_list = runner_manager.get_runners() - pytest.set_trace() +# @pytest.mark.openstack +# @pytest.mark.asyncio +# @pytest.mark.abort_on_fail +# async def test_runner_flush_busy_lifecycle( +# runner_manager: RunnerManager, +# test_github_branch: Branch, +# github_repository: Repository, +# runner_label: str, +# ): +# """ +# Arrange: RunnerManager with one idle runner. +# Act: +# 1. Run a long workflow. +# 2. Run flush idle runner. +# 3. Run flush busy runner. +# Assert: +# 1. Runner takes the job and become busy. +# 2. Busy runner still exists. +# 3. No runners exists. 
+# """ +# runner_manager.create_runners(1) +# runner_list = runner_manager.get_runners() +# assert len(runner_list) == 1, "Test arrange failed: Expect one runner" +# runner = runner_list[0] +# assert ( +# runner.cloud_state == CloudRunnerState.ACTIVE +# ), "Test arrange failed: Expect runner in active state" +# assert ( +# runner.github_state == GithubRunnerState.IDLE +# ), "Test arrange failed: Expect runner in idle state" + +# # 1. +# workflow = await dispatch_workflow( +# app=None, +# branch=test_github_branch, +# github_repository=github_repository, +# conclusion="success", +# workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, +# dispatch_input={"runner": runner_label, "minutes": "10"}, +# wait=False, +# ) +# await wait_for(lambda: workflow_in_progress(workflow)) + +# runner_list = runner_manager.get_runners() +# assert len(runner_list) == 1 +# busy_runner = runner_list[0] +# assert busy_runner.cloud_state == CloudRunnerState.ACTIVE +# assert busy_runner.github_state == GithubRunnerState.BUSY + +# # 2. +# runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) +# runner_list = runner_manager.get_runners() +# assert len(runner_list) == 1 +# busy_runner = runner_list[0] +# assert busy_runner.cloud_state == CloudRunnerState.ACTIVE +# assert busy_runner.github_state == GithubRunnerState.BUSY + +# # 3. 
+# runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) +# runner_list = runner_manager.get_runners() +# pytest.set_trace() From 0f5b6cee7adb7cbea05bf5944db058505ca583ba Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 16:37:36 +0800 Subject: [PATCH 088/278] Refactor runner manager one runner fixture --- src/manager/runner_manager.py | 16 +- .../test_runner_manager_openstack.py | 168 +++++++++++------- 2 files changed, 111 insertions(+), 73 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 4510787ea..e18cd61fc 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -160,13 +160,15 @@ def delete_runners(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> None: flush_mode: The type of runners affect by the deletion. """ match flush_mode: - case FlushMode.FLUSH_IDLE: - logger.info("Deleting idle runners...") - case FlushMode.FLUSH_BUSY: - logger.info("Deleting idle and busy runners...") - case _: - logger.critical("Unknown flush mode %s encountered, contact developers", flush_mode) - + case FlushMode.FLUSH_IDLE: + logger.info("Deleting idle runners...") + case FlushMode.FLUSH_BUSY: + logger.info("Deleting idle and busy runners...") + case _: + logger.critical( + "Unknown flush mode %s encountered, contact developers", flush_mode + ) + states = [GithubRunnerState.IDLE] if flush_mode == FlushMode.FLUSH_BUSY: states.append(GithubRunnerState.BUSY) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 8ae890e1a..984a5c640 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -6,6 +6,7 @@ from pathlib import Path from secrets import token_hex +from typing import Iterator import pytest import pytest_asyncio @@ -39,14 +40,14 @@ def runner_label(): @pytest.fixture(scope="module", 
name="log_dir_base_path") -def log_dir_base_path_fixture(tmp_path_factory: Path): +def log_dir_base_path_fixture(tmp_path_factory: Path) -> Iterator[dict[str, Path]]: """Mock the log directory path and return it.""" with pytest.MonkeyPatch.context() as monkeypatch: runner_log_dir_path = tmp_path_factory.mktemp("log") / "runner_log" metric_log_path = tmp_path_factory.mktemp("log") / "runner_log" monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", runner_log_dir_path) monkeypatch.setattr(events, "METRICS_LOG_PATH", metric_log_path) - yield + yield {"runner_logs_dir": runner_log_dir_path, "metric_log": metric_log_path} @pytest.fixture(scope="module", name="github_path") @@ -112,7 +113,7 @@ async def runner_manager_fixture( openstack_runner_manager: OpenstackRunnerManager, token: str, github_path: GithubPath, - log_dir_base_path: Path, + log_dir_base_path: dict[str, Path], ) -> RunnerManager: """Get RunnerManager instance. @@ -122,9 +123,24 @@ async def runner_manager_fixture( return RunnerManager(openstack_runner_manager, config) -def workflow_in_progress(workflow: Workflow) -> bool: +@pytest_asyncio.fixture(scope="module", name="runner_manager_with_one_runner") +async def runner_manager_with_one_runner_fixture(runner_manager: RunnerManager) -> RunnerManager: + runner_manager.create_runners(1) + runner_list = runner_manager.get_runners() + assert len(runner_list) == 1, "Test arrange failed: Expect one runner" + runner = runner_list[0] + assert ( + runner.cloud_state == CloudRunnerState.ACTIVE + ), "Test arrange failed: Expect runner in active state" + assert ( + runner.github_state == GithubRunnerState.IDLE + ), "Test arrange failed: Expect runner in idle state" + return runner_manager + + +def workflow_is_status(workflow: Workflow, status: str) -> bool: workflow.update() - return workflow.status == "in_progress" + return workflow.status == status # @pytest.mark.openstack @@ -186,64 +202,84 @@ def workflow_in_progress(workflow: Workflow) -> bool: # assert 
len(runner_list) == 0 -# @pytest.mark.openstack -# @pytest.mark.asyncio -# @pytest.mark.abort_on_fail -# async def test_runner_flush_busy_lifecycle( -# runner_manager: RunnerManager, -# test_github_branch: Branch, -# github_repository: Repository, -# runner_label: str, -# ): -# """ -# Arrange: RunnerManager with one idle runner. -# Act: -# 1. Run a long workflow. -# 2. Run flush idle runner. -# 3. Run flush busy runner. -# Assert: -# 1. Runner takes the job and become busy. -# 2. Busy runner still exists. -# 3. No runners exists. -# """ -# runner_manager.create_runners(1) -# runner_list = runner_manager.get_runners() -# assert len(runner_list) == 1, "Test arrange failed: Expect one runner" -# runner = runner_list[0] -# assert ( -# runner.cloud_state == CloudRunnerState.ACTIVE -# ), "Test arrange failed: Expect runner in active state" -# assert ( -# runner.github_state == GithubRunnerState.IDLE -# ), "Test arrange failed: Expect runner in idle state" - -# # 1. -# workflow = await dispatch_workflow( -# app=None, -# branch=test_github_branch, -# github_repository=github_repository, -# conclusion="success", -# workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, -# dispatch_input={"runner": runner_label, "minutes": "10"}, -# wait=False, -# ) -# await wait_for(lambda: workflow_in_progress(workflow)) - -# runner_list = runner_manager.get_runners() -# assert len(runner_list) == 1 -# busy_runner = runner_list[0] -# assert busy_runner.cloud_state == CloudRunnerState.ACTIVE -# assert busy_runner.github_state == GithubRunnerState.BUSY - -# # 2. -# runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) -# runner_list = runner_manager.get_runners() -# assert len(runner_list) == 1 -# busy_runner = runner_list[0] -# assert busy_runner.cloud_state == CloudRunnerState.ACTIVE -# assert busy_runner.github_state == GithubRunnerState.BUSY - -# # 3. 
-# runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) -# runner_list = runner_manager.get_runners() -# pytest.set_trace() +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_runner_flush_busy_lifecycle( + runner_manager_with_one_runner: RunnerManager, + test_github_branch: Branch, + github_repository: Repository, + runner_label: str, +): + """ + Arrange: RunnerManager with one idle runner. + Act: + 1. Run a long workflow. + 2. Run flush idle runner. + 3. Run flush busy runner. + Assert: + 1. Runner takes the job and become busy. + 2. Busy runner still exists. + 3. No runners exists. + """ + # 1. + workflow = await dispatch_workflow( + app=None, + branch=test_github_branch, + github_repository=github_repository, + conclusion="success", + workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, + dispatch_input={"runner": runner_label, "minutes": "10"}, + wait=False, + ) + await wait_for(lambda: workflow_is_status(workflow, "in_progress")) + + runner_list = runner_manager_with_one_runner.get_runners() + assert len(runner_list) == 1 + busy_runner = runner_list[0] + assert busy_runner.cloud_state == CloudRunnerState.ACTIVE + assert busy_runner.github_state == GithubRunnerState.BUSY + + # 2. + runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) + runner_list = runner_manager_with_one_runner.get_runners() + assert len(runner_list) == 1 + busy_runner = runner_list[0] + assert busy_runner.cloud_state == CloudRunnerState.ACTIVE + assert busy_runner.github_state == GithubRunnerState.BUSY + + # 3. 
+ runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) + runner_list = runner_manager_with_one_runner.get_runners() + + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_runner_normal_lifecycle( + runner_manager_with_one_runner: RunnerManager, + test_github_branch: Branch, + github_repository: Repository, + runner_label: str, + log_dir_base_path: dict[str, Path], +): + """ + Arrange: RunnerManager with one runner. + Act: + 1. Start a test workflow for the runner. + 2. Run cleanup. + Assert: + 1. The workflow complete successfully. + 2. The runner should be deleted. The metrics should be recorded. + """ + workflow = await dispatch_workflow( + app=None, + branch=test_github_branch, + github_repository=github_repository, + conclusion="success", + workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, + dispatch_input={"runner": runner_label, "minutes": "0"}, + wait=False, + ) + await wait_for(lambda: workflow_is_status(workflow, "completed")) + metric_log_path = log_dir_base_path["metric_log"] From d0a3173878e2fd7df06a45aaf5147c9e8063b9db Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 16:42:09 +0800 Subject: [PATCH 089/278] Fix error string formatting --- src/openstack_cloud/openstack_runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 950a280ea..918950a68 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -107,7 +107,7 @@ def create_runner(self, registration_token: str) -> RunnerId: userdata=userdata, ) except OpenStackError as err: - raise RunnerCreateError("Failed to create {instance_name} openstack runner") from err + raise RunnerCreateError(f"Failed to create {instance_name} openstack runner") from err self._wait_runner_startup(instance) 
From dd14f44895f5d870c4ef150d73b0121bd3b543ae Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:00:19 +0800 Subject: [PATCH 090/278] Adding the docstring for github_runner_manager --- src/manager/github_runner_manager.py | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index ce6591cb4..96b1b2dff 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -56,6 +56,14 @@ def __init__(self, prefix: str, token: str, path: GithubPath): def get_runners( self, states: Sequence[GithubRunnerState] | None = None ) -> tuple[SelfHostedRunner]: + """Get info on self-hosted runners of certain states. + + Args: + states: Filter the runners for these states. If None, all runners are returned. + + Returns: + Information on the runners. + """ runner_list = self._github.get_runner_github_info(self._path) return tuple( runner @@ -65,20 +73,50 @@ def get_runners( ) def delete_runners(self, states: Sequence[GithubRunnerState] | None = None) -> None: + """Delete the self-hosted runners of certain states. + + Args: + states: Filter the runners for these states. If None, all runners are deleted. + + Returns: + Information on the runners. + """ runner_list = self.get_runners(states) for runner in runner_list: self._github.delete_runner(self._path, runner.id) def get_registration_token(self) -> str: + """Get registration token from GitHub. + + This token is used for registering self-hosted runners. + + Returns: + The registration token. + """ return self._github.get_runner_registration_token(self._path) def get_removal_token(self) -> str: + """Get removal token from GitHub. + + This token is used for removing self-hosted runners. + + Returns: + The removal token. 
+ """ return self._github.get_runner_remove_token(self._path) @staticmethod def _filter_runner_state( runner: SelfHostedRunner, states: Sequence[GithubRunnerState] | None ) -> bool: + """Filter the runner by the state. + + Args: + states: Filter the runners for these states. If None, return true. + + Returns: + True if the runner is in one of the state, else false. + """ if states is None: return True return GithubRunnerState.from_runner(runner) in states From 8cc1325c18d03a88dbf6df5362a9f09ee96c89fb Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:03:10 +0800 Subject: [PATCH 091/278] Fix test fixture scope --- tests/integration/test_runner_manager_openstack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 984a5c640..32b5f28b2 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -123,7 +123,7 @@ async def runner_manager_fixture( return RunnerManager(openstack_runner_manager, config) -@pytest_asyncio.fixture(scope="module", name="runner_manager_with_one_runner") +@pytest_asyncio.fixture(scope="function", name="runner_manager_with_one_runner") async def runner_manager_with_one_runner_fixture(runner_manager: RunnerManager) -> RunnerManager: runner_manager.create_runners(1) runner_list = runner_manager.get_runners() From fa0313ace177d57c816fe7857cde62c6f8fd1d91 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:41:51 +0800 Subject: [PATCH 092/278] Add docstring on cloud_runner_manager --- src/manager/cloud_runner_manager.py | 59 +++++++++++++++++-- src/manager/github_runner_manager.py | 8 +-- src/manager/runner_manager.py | 6 +- .../openstack_runner_manager.py | 10 ++-- 4 files changed, 67 insertions(+), 16 deletions(-) diff --git 
a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 3222bfcea..88fbe6c0f 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -1,12 +1,14 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. +"""Interface of manager of runner instance on clouds.""" + from abc import ABC from dataclasses import dataclass from enum import Enum from typing import Sequence, Tuple -RunnerId = str +InstanceId = str class CloudRunnerState(str, Enum): @@ -65,14 +67,63 @@ class CloudRunnerInstance: class CloudRunnerManager(ABC): + """Manage runner instance on cloud.""" + def get_name_prefix(self) -> str: ... - def create_runner(self, registration_token: str) -> RunnerId: ... + """Get the name prefix of the self-hosted runners. + + Returns: + The name prefix. + """ - def get_runner(self, id: RunnerId) -> CloudRunnerInstance: ... + def create_runner(self, registration_token: str) -> InstanceId: ... + + """Create a self-hosted runner. + + Args: + registration_token: The GitHub registration token for registering runners. + + Returns: + Instance ID of the runner. + """ + + def get_runner(self, id: InstanceId) -> CloudRunnerInstance: ... + + """Get a self-hosted runner by instance id. + + Args: + id: The instance id. + + Returns: + Information on the runner instance. + """ def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerInstance]: ... - def delete_runner(self, id: RunnerId, remove_token: str) -> None: ... + """Get self-hosted runners by state. + + Args: + states: Filter for the runners with these github states. If None all states will be + included. + + Returns: + Information on the runner instances. + """ + + def delete_runner(self, id: InstanceId, remove_token: str) -> None: ... + + """Delete self-hosted runners. + + Args: + id: The instance id of the runner to delete. + remove_token: The GitHub remove token. + """ def cleanup_runner(self, remove_token: str) -> None: ... 
+ + """Cleanup runner and resource on the cloud. + + Args: + remove_token: The GitHub remove token. + """ diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 96b1b2dff..5f68b99cc 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -60,7 +60,7 @@ def get_runners( Args: states: Filter the runners for these states. If None, all runners are returned. - + Returns: Information on the runners. """ @@ -74,10 +74,10 @@ def get_runners( def delete_runners(self, states: Sequence[GithubRunnerState] | None = None) -> None: """Delete the self-hosted runners of certain states. - + Args: states: Filter the runners for these states. If None, all runners are deleted. - + Returns: Information on the runners. """ @@ -115,7 +115,7 @@ def _filter_runner_state( states: Filter the runners for these states. If None, return true. Returns: - True if the runner is in one of the state, else false. + True if the runner is in one of the state, else false. """ if states is None: return True diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index e18cd61fc..c2ffb8bee 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -14,7 +14,7 @@ CloudRunnerInstance, CloudRunnerManager, CloudRunnerState, - RunnerId, + InstanceId, ) from manager.github_runner_manager import GithubRunnerManager, GithubRunnerState @@ -45,7 +45,7 @@ class RunnerInstance: """ name: str - id: RunnerId + id: InstanceId github_state: GithubRunnerState cloud_state: CloudRunnerState @@ -95,7 +95,7 @@ def __init__(self, cloud_runner_manager: CloudRunnerManager, config: RunnerManag prefix=self._cloud.get_name_prefix(), token=self._config.token, path=self._config.path ) - def create_runners(self, num: int) -> tuple[RunnerId]: + def create_runners(self, num: int) -> tuple[InstanceId]: """Create runners. 
Args: diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 918950a68..90b80227b 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -30,7 +30,7 @@ CloudRunnerInstance, CloudRunnerManager, CloudRunnerState, - RunnerId, + InstanceId, ) from metrics import events as metric_events from metrics import github as github_metrics @@ -91,7 +91,7 @@ def __init__(self, prefix: str, config: OpenstackRunnerManagerConfig) -> None: def get_name_prefix(self) -> str: return self.prefix - def create_runner(self, registration_token: str) -> RunnerId: + def create_runner(self, registration_token: str) -> InstanceId: start_timestamp = time.time() id = OpenstackRunnerManager._generate_runner_id() instance_name = self._openstack_cloud.get_server_name(instance_id=id) @@ -120,7 +120,7 @@ def create_runner(self, registration_token: str) -> RunnerId: ) return id - def get_runner(self, id: RunnerId) -> CloudRunnerInstance | None: + def get_runner(self, id: InstanceId) -> CloudRunnerInstance | None: name = self._openstack_cloud.get_server_name(id) instances_list = self._openstack_cloud.get_instances() for instance in instances_list: @@ -148,7 +148,7 @@ def get_runners( return instances_list return [instance for instance in instances_list if instance.state in states] - def delete_runner(self, id: RunnerId, remove_token: str) -> None: + def delete_runner(self, id: InstanceId, remove_token: str) -> None: instance = self._openstack_cloud.get_instance(id) self._delete_runner(instance, remove_token) @@ -298,7 +298,7 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: logger.info("Runner startup process found to be healthy on %s", instance.server_name) @staticmethod - def _generate_runner_id() -> RunnerId: + def _generate_runner_id() -> InstanceId: return secrets.token_hex(12) @staticmethod From 66e770fd8f93a23922323d280da5811e8f891a56 Mon Sep 17 
00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:47:33 +0800 Subject: [PATCH 093/278] Add debug --- tests/integration/test_runner_manager_openstack.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 32b5f28b2..1af6d2c1f 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -283,3 +283,4 @@ async def test_runner_normal_lifecycle( ) await wait_for(lambda: workflow_is_status(workflow, "completed")) metric_log_path = log_dir_base_path["metric_log"] + pytest.set_trace() From 042f4c2619402267bd0a27159230f09a071fe936 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:54:04 +0800 Subject: [PATCH 094/278] Fix docstring for cloud runner manager --- ...penstack_cloud.openstack_runner_manager.md | 62 ++++++++++--- src/manager/cloud_runner_manager.py | 88 +++++++++---------- .../openstack_runner_manager.py | 54 ++++++++++++ 3 files changed, 150 insertions(+), 54 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 7de71ebdf..f967746dc 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -58,11 +58,9 @@ __init__( ## class `OpenstackRunnerManager` +Manage self-hosted runner on OpenStack cloud. - - - - + ### method `__init__` @@ -70,16 +68,21 @@ __init__( __init__(prefix: str, config: OpenstackRunnerManagerConfig) → None ``` +Construct the object. +**Args:** + + - `prefix`: The prefix to runner name. + - `config`: Configuration of the object. --- - + ### method `cleanup` @@ -87,13 +90,17 @@ __init__(prefix: str, config: OpenstackRunnerManagerConfig) → None cleanup(remove_token: str) → None ``` +Cleanup runner and resource on the cloud. 
+**Args:** + + - `remove_token`: The GitHub remove token. --- - + ### method `create_runner` @@ -101,13 +108,22 @@ cleanup(remove_token: str) → None create_runner(registration_token: str) → str ``` +Create a self-hosted runner. + + + +**Args:** + + - `registration_token`: The GitHub registration token for registering runners. +**Returns:** + Instance ID of the runner. --- - + ### method `delete_runner` @@ -115,13 +131,18 @@ create_runner(registration_token: str) → str delete_runner(id: str, remove_token: str) → None ``` +Delete self-hosted runners. +**Args:** + + - `id`: The instance id of the runner to delete. + - `remove_token`: The GitHub remove token. --- - + ### method `get_name_prefix` @@ -129,13 +150,16 @@ delete_runner(id: str, remove_token: str) → None get_name_prefix() → str ``` +Get the name prefix of the self-hosted runners. +**Returns:** + The name prefix. --- - + ### method `get_runner` @@ -143,13 +167,22 @@ get_name_prefix() → str get_runner(id: str) → CloudRunnerInstance | None ``` +Get a self-hosted runner by instance id. +**Args:** + + - `id`: The instance id. + + + +**Returns:** + Information on the runner instance. --- - + ### method `get_runners` @@ -159,8 +192,17 @@ get_runners( ) → Tuple[CloudRunnerInstance] ``` +Get self-hosted runners by state. + + + +**Args:** + + - `states`: Filter for the runners with these github states. If None all states will be included. +**Returns:** + Information on the runner instances. diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 88fbe6c0f..d40664d85 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -69,61 +69,61 @@ class CloudRunnerInstance: class CloudRunnerManager(ABC): """Manage runner instance on cloud.""" - def get_name_prefix(self) -> str: ... + def get_name_prefix(self) -> str: + """Get the name prefix of the self-hosted runners. - """Get the name prefix of the self-hosted runners. - - Returns: - The name prefix. 
- """ + Returns: + The name prefix. + """ + ... - def create_runner(self, registration_token: str) -> InstanceId: ... + def create_runner(self, registration_token: str) -> InstanceId: + """Create a self-hosted runner. - """Create a self-hosted runner. - - Args: - registration_token: The GitHub registration token for registering runners. + Args: + registration_token: The GitHub registration token for registering runners. - Returns: - Instance ID of the runner. - """ + Returns: + Instance ID of the runner. + """ + ... - def get_runner(self, id: InstanceId) -> CloudRunnerInstance: ... + def get_runner(self, id: InstanceId) -> CloudRunnerInstance: + """Get a self-hosted runner by instance id. - """Get a self-hosted runner by instance id. + Args: + id: The instance id. - Args: - id: The instance id. - - Returns: - Information on the runner instance. - """ + Returns: + Information on the runner instance. + """ + ... - def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerInstance]: ... + def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerInstance]: + """Get self-hosted runners by state. - """Get self-hosted runners by state. - - Args: - states: Filter for the runners with these github states. If None all states will be - included. + Args: + states: Filter for the runners with these github states. If None all states will be + included. - Returns: - Information on the runner instances. - """ + Returns: + Information on the runner instances. + """ + ... - def delete_runner(self, id: InstanceId, remove_token: str) -> None: ... + def delete_runner(self, id: InstanceId, remove_token: str) -> None: + """Delete self-hosted runners. - """Delete self-hosted runners. - - Args: - id: The instance id of the runner to delete. - remove_token: The GitHub remove token. - """ + Args: + id: The instance id of the runner to delete. + remove_token: The GitHub remove token. + """ + ... 
- def cleanup_runner(self, remove_token: str) -> None: ... + def cleanup_runner(self, remove_token: str) -> None: + """Cleanup runner and resource on the cloud. - """Cleanup runner and resource on the cloud. - - Args: - remove_token: The GitHub remove token. - """ + Args: + remove_token: The GitHub remove token. + """ + ... diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 90b80227b..9acd0c648 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -78,8 +78,15 @@ class OpenstackRunnerManagerConfig: class OpenstackRunnerManager(CloudRunnerManager): + """Manage self-hosted runner on OpenStack cloud.""" def __init__(self, prefix: str, config: OpenstackRunnerManagerConfig) -> None: + """Construct the object. + + Args: + prefix: The prefix to runner name. + config: Configuration of the object. + """ self.prefix = prefix self.config = config self._openstack_cloud = OpenstackCloud( @@ -89,9 +96,22 @@ def __init__(self, prefix: str, config: OpenstackRunnerManagerConfig) -> None: ) def get_name_prefix(self) -> str: + """Get the name prefix of the self-hosted runners. + + Returns: + The name prefix. + """ return self.prefix def create_runner(self, registration_token: str) -> InstanceId: + """Create a self-hosted runner. + + Args: + registration_token: The GitHub registration token for registering runners. + + Returns: + Instance ID of the runner. + """ start_timestamp = time.time() id = OpenstackRunnerManager._generate_runner_id() instance_name = self._openstack_cloud.get_server_name(instance_id=id) @@ -121,6 +141,14 @@ def create_runner(self, registration_token: str) -> InstanceId: return id def get_runner(self, id: InstanceId) -> CloudRunnerInstance | None: + """Get a self-hosted runner by instance id. + + Args: + id: The instance id. + + Returns: + Information on the runner instance. 
+ """ name = self._openstack_cloud.get_server_name(id) instances_list = self._openstack_cloud.get_instances() for instance in instances_list: @@ -135,6 +163,15 @@ def get_runner(self, id: InstanceId) -> CloudRunnerInstance | None: def get_runners( self, states: Sequence[CloudRunnerState] | None = None ) -> Tuple[CloudRunnerInstance]: + """Get self-hosted runners by state. + + Args: + states: Filter for the runners with these github states. If None all states will be + included. + + Returns: + Information on the runner instances. + """ instances_list = self._openstack_cloud.get_instances() instances_list = [ CloudRunnerInstance( @@ -149,10 +186,22 @@ def get_runners( return [instance for instance in instances_list if instance.state in states] def delete_runner(self, id: InstanceId, remove_token: str) -> None: + """Delete self-hosted runners. + + Args: + id: The instance id of the runner to delete. + remove_token: The GitHub remove token. + """ instance = self._openstack_cloud.get_instance(id) self._delete_runner(instance, remove_token) def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: + """Delete self-hosted runners by openstack instance. + + Args: + instance: The OpenStack instance. + remove_token: The GitHub remove token. + """ try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) except SshError: @@ -178,6 +227,11 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: ) def cleanup(self, remove_token: str) -> None: + """Cleanup runner and resource on the cloud. + + Args: + remove_token: The GitHub remove token. 
+ """ runner_list = self._openstack_cloud.get_instances() for runner in runner_list: From c91d78b98bf59a55a4af982bc01a410cad7a2824 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 7 Aug 2024 18:25:37 +0800 Subject: [PATCH 095/278] Add more docstrings --- requirements.txt | 2 - src-docs/openstack_cloud.openstack_cloud.md | 67 +++++++++++++---- ...penstack_cloud.openstack_runner_manager.md | 24 +++--- src/manager/github_runner_manager.py | 2 +- src/manager/runner_manager.py | 4 +- src/openstack_cloud/openstack_cloud.py | 74 +++++++++++++++++-- .../openstack_runner_manager.py | 6 +- .../test_runner_manager_openstack.py | 2 +- 8 files changed, 140 insertions(+), 41 deletions(-) diff --git a/requirements.txt b/requirements.txt index 927fa70c5..1046b854a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,3 @@ -# TODO 2024-07-12: PyGithub-based inteface will be replacing the ghapi in the future -PyGithub ghapi jinja2 fabric >=3,<4 diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index fcbc4e4fa..70e9d7f0e 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -32,9 +32,14 @@ Represents an OpenStack instance. __init__(server: Server, prefix: str) ``` +Construct the object. +**Args:** + + - `server`: The OpenStack server. + - `prefix`: The name prefix for the servers. @@ -42,14 +47,14 @@ __init__(server: Server, prefix: str) --- - + ## class `OpenstackCloud` Client to interact with OpenStack cloud. The OpenStack server name is managed by this cloud. Caller refers to the instances via instance_id. If the caller needs the server name, e.g., for logging, it can be queried with get_server_name. - + ### method `__init__` @@ -57,7 +62,7 @@ The OpenStack server name is managed by this cloud. 
Caller refers to the instanc __init__(clouds_config: dict[str, dict], cloud: str, prefix: str) ``` -Create a OpenstackCloud instance. +Create the object. @@ -72,7 +77,7 @@ Create a OpenstackCloud instance. --- - + ### method `cleanup` @@ -80,13 +85,11 @@ Create a OpenstackCloud instance. cleanup() → None ``` - - - +Cleanup unused openstack resources. --- - + ### method `delete_instance` @@ -94,13 +97,17 @@ cleanup() → None delete_instance(instance_id: str) → None ``` +Delete a openstack instance. +**Args:** + + - `instance_id`: The instance ID of the instance to delete. --- - + ### method `get_instance` @@ -108,13 +115,22 @@ delete_instance(instance_id: str) → None get_instance(instance_id: str) → OpenstackInstance ``` +Get OpenStack instance by instance ID. +**Args:** + + - `instance_id`: The instance ID. + + + +**Returns:** + The OpenStack instance. --- - + ### method `get_instances` @@ -122,13 +138,16 @@ get_instance(instance_id: str) → OpenstackInstance get_instances() → tuple[OpenstackInstance] ``` +Get all OpenStack instances. +**Returns:** + The OpenStack instances. --- - + ### method `get_server_name` @@ -151,7 +170,7 @@ Get server name on OpenStack. --- - + ### method `get_ssh_connection` @@ -159,13 +178,22 @@ Get server name on OpenStack. get_ssh_connection(instance: OpenstackInstance) → Connection ``` +Get SSH connection to an OpenStack instance. + +**Args:** + + - `instance`: The OpenStack instance to connect to. + +**Returns:** + SSH connection object. + --- - + ### method `launch_instance` @@ -179,8 +207,21 @@ launch_instance( ) → OpenstackInstance ``` +Create an OpenStack instance. +**Args:** + + - `instance_id`: The instance ID to form the instance name. + - `image`: The image used to create the instance. + - `flavor`: The flavor used to create the instance. + - `network`: The network used to create the instance. + - `userdata`: The cloud init userdata to startup the instance. + + + +**Returns:** + The OpenStack instance created. 
diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index f967746dc..5a6c882db 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -3,9 +3,7 @@ # module `openstack_cloud.openstack_runner_manager` - - - +Manager for self-hosted runner on OpenStack. **Global Variables** --------------- @@ -19,10 +17,10 @@ --- - + ## class `OpenstackRunnerManagerConfig` -OpenstackRunnerManagerConfig(clouds_config: dict[str, dict], cloud: str, image: str, flavor: str, network: str, github_path: charm_state.GithubOrg | charm_state.GithubRepo, labels: list[str], proxy_config: charm_state.ProxyConfig | None, dockerhub_mirror: str | None, ssh_debug_connections: list[charm_state.SSHDebugConnection] | None, repo_policy_url: str | None, repo_policy_token: str | None) +Configuration for OpenstackRunnerManager. @@ -55,12 +53,12 @@ __init__( --- - + ## class `OpenstackRunnerManager` Manage self-hosted runner on OpenStack cloud. - + ### method `__init__` @@ -82,7 +80,7 @@ Construct the object. --- - + ### method `cleanup` @@ -100,7 +98,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -123,7 +121,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -142,7 +140,7 @@ Delete self-hosted runners. --- - + ### method `get_name_prefix` @@ -159,7 +157,7 @@ Get the name prefix of the self-hosted runners. --- - + ### method `get_runner` @@ -182,7 +180,7 @@ Get a self-hosted runner by instance id. 
--- - + ### method `get_runners` diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 5f68b99cc..843c434ba 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -3,7 +3,7 @@ """Client for managing self-hosted runner on GitHub side.""" -from enum import Enum, auto +from enum import Enum from typing import Sequence from charm_state import GithubPath diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index c2ffb8bee..7f398668f 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -51,14 +51,12 @@ class RunnerInstance: def __init__( self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedRunner - ) -> "RunnerInstance": + ): """Construct an instance. Args: cloud_instance: Information on the cloud instance. github_info: Information on the GitHub of the runner. - Returns: - A RunnerInstance object. """ self.name = github_info.name self.id = cloud_instance.id diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 56146a33d..cd1843333 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -57,6 +57,12 @@ class OpenstackInstance: status: str def __init__(self, server: OpenstackServer, prefix: str): + """Construct the object. + + Args: + server: The OpenStack server. + prefix: The name prefix for the servers. + """ self.server_id = server.id self.server_name = server.name self.status = server.status @@ -117,7 +123,7 @@ class OpenstackCloud: """ def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): - """Create a OpenstackCloud instance. + """Create the object. Args: clouds_config: The openstack clouds.yaml in dict format. 
@@ -132,6 +138,18 @@ def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): def launch_instance( self, instance_id: str, image: str, flavor: str, network: str, userdata: str ) -> OpenstackInstance: + """Create an OpenStack instance. + + Args: + instance_id: The instance ID to form the instance name. + image: The image used to create the instance. + flavor: The flavor used to create the instance. + network: The network used to create the instance. + userdata: The cloud init userdata to startup the instance. + + Returns: + The OpenStack instance created. + """ full_name = self.get_server_name(instance_id) logger.info("Creating openstack server with %s", full_name) @@ -139,7 +157,7 @@ def launch_instance( clouds_config=self._clouds_config, cloud=self._cloud ) as conn: security_group = OpenstackCloud._ensure_security_group(conn) - keypair = OpenstackCloud._setup_key_pair(conn, full_name) + keypair = OpenstackCloud._setup_keypair(conn, full_name) try: server = conn.create_server( @@ -170,16 +188,24 @@ def launch_instance( "Failed to cleanup openstack server %s that timeout during creation", full_name, ) - self._delete_key_pair(conn, instance_id) + self._delete_keypair(conn, instance_id) raise OpenStackError(f"Timeout creating openstack server {full_name}") from err except openstack.exceptions.SDKException as err: logger.exception("Failed to create openstack server %s", full_name) - self._delete_key_pair(conn, instance_id) + self._delete_keypair(conn, instance_id) raise OpenStackError(f"Failed to create openstack server {full_name}") from err return OpenstackInstance(server, self.prefix) def get_instance(self, instance_id: str) -> OpenstackInstance: + """Get OpenStack instance by instance ID. + + Args: + instance_id: The instance ID. + + Returns: + The OpenStack instance. 
+ """ full_name = self.get_server_name(instance_id) logger.info("Getting openstack server with %s", full_name) @@ -191,6 +217,11 @@ def get_instance(self, instance_id: str) -> OpenstackInstance: ) def delete_instance(self, instance_id: str) -> None: + """Delete a openstack instance. + + Args: + instance_id: The instance ID of the instance to delete. + """ full_name = self.get_server_name(instance_id) logger.info("Deleting openstack server with %s", full_name) @@ -200,7 +231,7 @@ def delete_instance(self, instance_id: str) -> None: try: server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) conn.delete_server(name_or_id=server.id) - OpenstackCloud._delete_key_pair(conn, full_name) + OpenstackCloud._delete_keypair(conn, full_name) except ( openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout, @@ -208,6 +239,14 @@ def delete_instance(self, instance_id: str) -> None: raise OpenStackError(f"Failed to remove openstack runner {full_name}") from err def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: + """Get SSH connection to an OpenStack instance. + + Args: + instance: The OpenStack instance to connect to. + + Returns: + SSH connection object. + """ key_path = OpenstackCloud._get_key_path(instance.server_name) if not key_path.exists(): @@ -249,6 +288,11 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: ) def get_instances(self) -> tuple[OpenstackInstance]: + """Get all OpenStack instances. + + Returns: + The OpenStack instances. 
+ """ logger.info("Getting all openstack servers managed by the charm") with _get_openstack_connection( @@ -267,6 +311,7 @@ def get_instances(self) -> tuple[OpenstackInstance]: return instances def cleanup(self) -> None: + """Cleanup unused openstack resources.""" with _get_openstack_connection( clouds_config=self._clouds_config, cloud=self._cloud ) as conn: @@ -413,7 +458,16 @@ def _get_key_path(name: str) -> Path: return _SSH_KEY_PATH / f"{name}.key" @staticmethod - def _setup_key_pair(conn: OpenstackConnection, name: str) -> OpenstackKeypair: + def _setup_keypair(conn: OpenstackConnection, name: str) -> OpenstackKeypair: + """Create OpenStack keypair. + + Args: + conn: The connection object to access OpenStack cloud. + name: The name of the keypair. + + Returns: + The OpenStack keypair. + """ key_path = OpenstackCloud._get_key_path(name) if key_path.exists(): @@ -427,7 +481,13 @@ def _setup_key_pair(conn: OpenstackConnection, name: str) -> OpenstackKeypair: return keypair @staticmethod - def _delete_key_pair(conn: OpenstackConnection, name: str) -> None: + def _delete_keypair(conn: OpenstackConnection, name: str) -> None: + """Delete OpenStack keypair. + + Args: + conn: The connection object to access OpenStack cloud. + name: The name of the keypair. + """ try: # Keypair have unique names, access by ID is not needed. if not conn.delete_keypair(name): diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 9acd0c648..cdd4a2ee7 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -1,6 +1,8 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. 
+"""Manager for self-hosted runner on OpenStack.""" + import logging import secrets import time @@ -63,6 +65,8 @@ class _PullFileError(Exception): @dataclass class OpenstackRunnerManagerConfig: + """Configuration for OpenstackRunnerManager.""" + clouds_config: dict[str, dict] cloud: str image: str @@ -82,7 +86,7 @@ class OpenstackRunnerManager(CloudRunnerManager): def __init__(self, prefix: str, config: OpenstackRunnerManagerConfig) -> None: """Construct the object. - + Args: prefix: The prefix to runner name. config: Configuration of the object. diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 1af6d2c1f..d09090c3b 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -86,7 +86,7 @@ async def openstack_runner_manager_fixture( ) -> OpenstackRunnerManager: """Create OpenstackRunnerManager instance. - The prefix args of OpenstackRunnerManager set to app_name to let openstack_connection_fixture preform the cleanup of openstack resources. + The prefix args of OpenstackRunnerManager set to app_name to let openstack_connection_fixture perform the cleanup of openstack resources. 
""" _CLOUDS_YAML_PATH.unlink(missing_ok=True) clouds_config = yaml.safe_load(private_endpoint_clouds_yaml) From db68ce4518b74696ddcbfa373905285569b3ee8d Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 8 Aug 2024 15:25:27 +0800 Subject: [PATCH 096/278] Add metrics for deleted and cleanup runners --- src-docs/metrics.runner.md | 8 ++- ...penstack_cloud.openstack_runner_manager.md | 49 +++++++++++++--- src/manager/cloud_runner_manager.py | 15 ++++- src/manager/github_runner_manager.py | 10 ++-- src/manager/runner_manager.py | 54 +++++++++++++++--- src/metrics/runner.py | 10 +++- src/openstack_cloud/openstack_manager.py | 2 +- .../openstack_runner_manager.py | 56 +++++++++++++++---- src/runner_manager.py | 2 +- .../test_runner_manager_openstack.py | 20 +++++-- tests/unit/metrics/test_runner.py | 40 ++++--------- 11 files changed, 188 insertions(+), 78 deletions(-) diff --git a/src-docs/metrics.runner.md b/src-docs/metrics.runner.md index edf8f0a65..269d2581a 100644 --- a/src-docs/metrics.runner.md +++ b/src-docs/metrics.runner.md @@ -21,7 +21,8 @@ Classes and function to extract the metrics from storage and issue runner metric ```python extract( metrics_storage_manager: StorageManager, - ignore_runners: set[str] + runners: set[str], + include: bool = False ) → Iterator[RunnerMetrics] ``` @@ -38,7 +39,8 @@ In order to avoid DoS attacks, the file size is also checked. **Args:** - `metrics_storage_manager`: The metrics storage manager. - - `ignore_runners`: The set of runners to ignore. + - `runners`: The runners to include or exclude. + - `include`: If true the provided runners are included for metric extraction, else the provided runners are excluded. @@ -48,7 +50,7 @@ In order to avoid DoS attacks, the file size is also checked. 
--- - + ## function `issue_events` diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 5a6c882db..f486aa6c5 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -55,10 +55,36 @@ __init__( +## class `RunnerHealth` +RunnerHealth(healthy: tuple[openstack_cloud.openstack_cloud.OpenstackInstance], unhealthy: tuple[openstack_cloud.openstack_cloud.OpenstackInstance]) + + + +### method `__init__` + +```python +__init__( + healthy: tuple[OpenstackInstance], + unhealthy: tuple[OpenstackInstance] +) → None +``` + + + + + + + + + +--- + + + ## class `OpenstackRunnerManager` Manage self-hosted runner on OpenStack cloud. - + ### method `__init__` @@ -80,12 +106,12 @@ Construct the object. --- - + ### method `cleanup` ```python -cleanup(remove_token: str) → None +cleanup(remove_token: str) → RunnerMetrics ``` Cleanup runner and resource on the cloud. @@ -96,9 +122,14 @@ Cleanup runner and resource on the cloud. - `remove_token`: The GitHub remove token. + + +**Returns:** + Any metrics retrieved from cleanup runners. + --- - + ### method `create_runner` @@ -121,12 +152,12 @@ Create a self-hosted runner. --- - + ### method `delete_runner` ```python -delete_runner(id: str, remove_token: str) → None +delete_runner(id: str, remove_token: str) → RunnerMetrics ``` Delete self-hosted runners. @@ -140,7 +171,7 @@ Delete self-hosted runners. --- - + ### method `get_name_prefix` @@ -157,7 +188,7 @@ Get the name prefix of the self-hosted runners. --- - + ### method `get_runner` @@ -180,7 +211,7 @@ Get a self-hosted runner by instance id. 
--- - + ### method `get_runners` diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index d40664d85..2a25717a5 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -6,7 +6,10 @@ from abc import ABC from dataclasses import dataclass from enum import Enum -from typing import Sequence, Tuple +from typing import Iterator, Sequence, Tuple, Type + +from metrics import events as metric_events +from metrics.runner import RunnerMetrics InstanceId = str @@ -111,19 +114,25 @@ def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerIn """ ... - def delete_runner(self, id: InstanceId, remove_token: str) -> None: + def delete_runner(self, id: InstanceId, remove_token: str) -> RunnerMetrics: """Delete self-hosted runners. Args: id: The instance id of the runner to delete. remove_token: The GitHub remove token. + + Returns: + Metrics of the runner deleted. """ ... - def cleanup_runner(self, remove_token: str) -> None: + def cleanup_runner(self, remove_token: str) -> Iterator[RunnerMetrics]: """Cleanup runner and resource on the cloud. Args: remove_token: The GitHub remove token. + + Returns: + Metrics of the runners that was cleanup. """ ... diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 843c434ba..6985405b0 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -51,7 +51,7 @@ def __init__(self, prefix: str, token: str, path: GithubPath): """ self._prefix = prefix self._path = path - self._github = GithubClient(token) + self.github = GithubClient(token) def get_runners( self, states: Sequence[GithubRunnerState] | None = None @@ -64,7 +64,7 @@ def get_runners( Returns: Information on the runners. 
""" - runner_list = self._github.get_runner_github_info(self._path) + runner_list = self.github.get_runner_github_info(self._path) return tuple( runner for runner in runner_list @@ -83,7 +83,7 @@ def delete_runners(self, states: Sequence[GithubRunnerState] | None = None) -> N """ runner_list = self.get_runners(states) for runner in runner_list: - self._github.delete_runner(self._path, runner.id) + self.github.delete_runner(self._path, runner.id) def get_registration_token(self) -> str: """Get registration token from GitHub. @@ -93,7 +93,7 @@ def get_registration_token(self) -> str: Returns: The registration token. """ - return self._github.get_runner_registration_token(self._path) + return self.github.get_runner_registration_token(self._path) def get_removal_token(self) -> str: """Get removal token from GitHub. @@ -103,7 +103,7 @@ def get_removal_token(self) -> str: Returns: The removal token. """ - return self._github.get_runner_remove_token(self._path) + return self.github.get_runner_remove_token(self._path) @staticmethod def _filter_runner_state( diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 7f398668f..663419e82 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -6,9 +6,10 @@ import logging from dataclasses import dataclass from enum import Enum, auto -from typing import Sequence +from typing import Iterator, Sequence, Type from charm_state import GithubPath +from errors import GithubMetricsError from github_type import SelfHostedRunner from manager.cloud_runner_manager import ( CloudRunnerInstance, @@ -17,9 +18,15 @@ InstanceId, ) from manager.github_runner_manager import GithubRunnerManager, GithubRunnerState +from metrics import events as metric_events +from metrics import github as github_metrics +from metrics import runner as runner_metrics +from metrics.runner import RunnerMetrics logger = logging.getLogger(__name__) +IssuedMetricEventsStats = dict[Type[metric_events.Event], int] + class 
FlushMode(Enum): """Strategy for flushing runners. @@ -49,9 +56,7 @@ class RunnerInstance: github_state: GithubRunnerState cloud_state: CloudRunnerState - def __init__( - self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedRunner - ): + def __init__(self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedRunner): """Construct an instance. Args: @@ -151,7 +156,9 @@ def get_runners( RunnerInstance(cloud_infos_map[name], github_infos_map[name]) for name in runner_names ) - def delete_runners(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> None: + def delete_runners( + self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE + ) -> IssuedMetricEventsStats: """Delete the runners. Args: @@ -176,11 +183,42 @@ def delete_runners(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> None: logger.info("Deleting runners: %s", runner_names) remove_token = self._github.get_removal_token() + metrics = [] for runner in runners_list: - self._cloud.delete_runner(id=runner.id, remove_token=remove_token) + metrics.append(self._cloud.delete_runner(id=runner.id, remove_token=remove_token)) + + return self._issue_runner_metrics(metrics=iter(metric_events)) - def cleanup(self) -> None: + def cleanup(self) -> IssuedMetricEventsStats: """Run cleanup of the runners and other resources.""" self._github.delete_runners([GithubRunnerState.OFFLINE, GithubRunnerState.UNKNOWN]) remove_token = self._github.get_removal_token() - self._cloud.cleanup_runner(remove_token) + metrics = self._cloud.cleanup_runner(remove_token) + return self._issue_runner_metrics(metrics=metrics) + + def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetricEventsStats: + total_stats: IssuedMetricEventsStats = {} + + for extracted_metrics in metrics: + try: + job_metrics = github_metrics.job( + github_client=self._github.github, + pre_job_metrics=extracted_metrics.pre_job, + runner_name=extracted_metrics.runner_name, + ) + except GithubMetricsError: + logger.exception( + 
"Failed to calculate job metrics for %s", extracted_metrics.runner_name + ) + job_metrics = None + + issued_events = runner_metrics.issue_events( + runner_metrics=extracted_metrics, + job_metrics=job_metrics, + flavor=self._cloud.get_name_prefix(), + ) + + for event_type in issued_events: + total_stats[event_type] = total_stats.get(event_type, 0) + 1 + + return total_stats diff --git a/src/metrics/runner.py b/src/metrics/runner.py index dfdf11044..b0ccc191a 100644 --- a/src/metrics/runner.py +++ b/src/metrics/runner.py @@ -105,7 +105,7 @@ class RunnerMetrics(BaseModel): def extract( - metrics_storage_manager: MetricsStorageManager, ignore_runners: set[str] + metrics_storage_manager: MetricsStorageManager, runners: set[str], include: bool = False ) -> Iterator[RunnerMetrics]: """Extract metrics from runners. @@ -120,13 +120,17 @@ def extract( Args: metrics_storage_manager: The metrics storage manager. - ignore_runners: The set of runners to ignore. + runners: The runners to include or exclude. + include: If true the provided runners are included for metric extraction, else the provided + runners are excluded. Yields: Extracted runner metrics of a particular runner. 
""" for ms in metrics_storage_manager.list_all(): - if ms.runner_name not in ignore_runners: + if (include and ms.runner_name in runners) or ( + not include and ms.runner_name not in runners + ): runner_metrics = _extract_storage( metrics_storage_manager=metrics_storage_manager, metrics_storage=ms ) diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index f61d28b8d..fbc4d9ba0 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -1450,7 +1450,7 @@ def _issue_runner_metrics(self, conn: OpenstackConnection) -> IssuedMetricEvents for extracted_metrics in runner_metrics.extract( metrics_storage_manager=metrics_storage, - ignore_runners=instance_names, + runners=instance_names, ): try: job_metrics = github_metrics.job( diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index cdd4a2ee7..c540b2097 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -16,6 +16,7 @@ import paramiko.ssh_exception from fabric import Connection as SshConnection +import shared_fs from charm_state import GithubOrg, GithubPath, ProxyConfig, SSHDebugConnection from errors import ( CreateMetricsStorageError, @@ -35,7 +36,6 @@ InstanceId, ) from metrics import events as metric_events -from metrics import github as github_metrics from metrics import runner as runner_metrics from metrics import storage as metrics_storage from openstack_cloud.openstack_cloud import OpenstackCloud, OpenstackInstance @@ -81,6 +81,12 @@ class OpenstackRunnerManagerConfig: repo_policy_token: str | None +@dataclass +class RunnerHealth: + healthy: tuple[OpenstackInstance] + unhealthy: tuple[OpenstackInstance] + + class OpenstackRunnerManager(CloudRunnerManager): """Manage self-hosted runner on OpenStack cloud.""" @@ -189,7 +195,7 @@ def get_runners( return instances_list return [instance for instance in 
instances_list if instance.state in states] - def delete_runner(self, id: InstanceId, remove_token: str) -> None: + def delete_runner(self, id: InstanceId, remove_token: str) -> runner_metrics.RunnerMetrics: """Delete self-hosted runners. Args: @@ -197,7 +203,31 @@ def delete_runner(self, id: InstanceId, remove_token: str) -> None: remove_token: The GitHub remove token. """ instance = self._openstack_cloud.get_instance(id) + metric = runner_metrics.extract( + metrics_storage_manager=shared_fs, runners=instance.server_name + ) self._delete_runner(instance, remove_token) + return metric + + def cleanup(self, remove_token: str) -> runner_metrics.RunnerMetrics: + """Cleanup runner and resource on the cloud. + + Args: + remove_token: The GitHub remove token. + + Returns: + Any metrics retrieved from cleanup runners. + """ + runners = self._get_runner_health() + healthy_runner_names = [runner.server_name for runner in runners.healthy] + metrics = runner_metrics.extract( + metrics_storage_manager=shared_fs, runners=set(healthy_runner_names) + ) + for runner in runners.unhealthy: + self._delete_runner(runner, remove_token) + + self._openstack_cloud.cleanup() + return metrics def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: """Delete self-hosted runners by openstack instance. @@ -230,24 +260,26 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: "Unable to delete openstack instance for runner %s", instance.server_name ) - def cleanup(self, remove_token: str) -> None: - """Cleanup runner and resource on the cloud. + def _get_runner_health(self) -> RunnerHealth: + """Get runners by health state. - Args: - remove_token: The GitHub remove token. + Returns: + Runners by health state. 
""" runner_list = self._openstack_cloud.get_instances() + healthy, unhealthy = [], [] for runner in runner_list: - state = (CloudRunnerState(runner.status),) - if state in ( + cloud_state = CloudRunnerState(runner.status) + if cloud_state in ( CloudRunnerState.DELETED, CloudRunnerState.ERROR, CloudRunnerState.STOPPED, - ) or self._health_check(runner): - self._delete_runner(runner, remove_token) - - self._openstack_cloud.cleanup() + ) or not self._health_check(runner): + unhealthy.append(runner) + else: + healthy.append(runner) + return RunnerHealth(healthy=healthy, unhealthy=unhealthy) def _generate_userdata(self, instance_name: str, registration_token: str) -> str: jinja = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"), autoescape=True) diff --git a/src/runner_manager.py b/src/runner_manager.py index 09487f453..25aca060e 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -325,7 +325,7 @@ def _issue_runner_metrics(self) -> IssuedMetricEventsStats: total_stats: IssuedMetricEventsStats = {} for extracted_metrics in runner_metrics.extract( - metrics_storage_manager=shared_fs, ignore_runners=set(runner_states.healthy) + metrics_storage_manager=shared_fs, runners=set(runner_states.healthy) ): try: job_metrics = github_metrics.job( diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index d09090c3b..377e7980d 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -21,6 +21,7 @@ from manager.github_runner_manager import GithubRunnerState from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig from metrics import events, runner_logs +from openstack_cloud import openstack_runner_manager from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH from openstack_cloud.openstack_runner_manager import ( OpenstackRunnerManager, @@ -31,7 +32,6 @@ dispatch_workflow, wait_for, ) -from 
tests.integration.helpers.openstack import PrivateEndpointConfigs @pytest.fixture(scope="module", name="runner_label") @@ -43,11 +43,23 @@ def runner_label(): def log_dir_base_path_fixture(tmp_path_factory: Path) -> Iterator[dict[str, Path]]: """Mock the log directory path and return it.""" with pytest.MonkeyPatch.context() as monkeypatch: - runner_log_dir_path = tmp_path_factory.mktemp("log") / "runner_log" - metric_log_path = tmp_path_factory.mktemp("log") / "runner_log" + temp_log_dir = tmp_path_factory.mktemp("log") + + runner_log_dir_path = temp_log_dir / "runner_log" + metric_log_path = temp_log_dir / "metric_log" + metric_exchange_path = temp_log_dir / "metric_exchange" + monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", runner_log_dir_path) monkeypatch.setattr(events, "METRICS_LOG_PATH", metric_log_path) - yield {"runner_logs_dir": runner_log_dir_path, "metric_log": metric_log_path} + monkeypatch.setattr( + openstack_runner_manager, "METRICS_EXCHANGE_PATH", metric_exchange_path + ) + + yield { + "runner_logs_dir": runner_log_dir_path, + "metric_log": metric_log_path, + "metric_exchange": metric_exchange_path, + } @pytest.fixture(scope="module", name="github_path") diff --git a/tests/unit/metrics/test_runner.py b/tests/unit/metrics/test_runner.py index 02b5ad028..bf0a14251 100644 --- a/tests/unit/metrics/test_runner.py +++ b/tests/unit/metrics/test_runner.py @@ -170,9 +170,7 @@ def test_extract(runner_fs_base: Path): metrics_storage_manager.list_all.return_value = [runner1_fs, runner2_fs, runner3_fs] extracted_metrics = list( - runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, ignore_runners=set() - ) + runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) ) assert extracted_metrics == [ @@ -218,7 +216,7 @@ def test_extract_ignores_runners(runner_fs_base: Path): extracted_metrics = list( runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, ignore_runners=ignore_runners + 
metrics_storage_manager=metrics_storage_manager, runners=ignore_runners ) ) @@ -253,9 +251,7 @@ def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPa monkeypatch.setattr(runner_metrics, "move_to_quarantine", move_to_quarantine_mock) extracted_metrics = list( - runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, ignore_runners=set() - ) + runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) ) assert not extracted_metrics @@ -275,9 +271,7 @@ def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPa metrics_storage_manager.list_all.return_value = [runner_fs] extracted_metrics = list( - runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, ignore_runners=set() - ) + runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) ) assert not extracted_metrics move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) @@ -296,9 +290,7 @@ def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPa metrics_storage_manager.list_all.return_value = [runner_fs] extracted_metrics = list( - runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, ignore_runners=set() - ) + runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) ) assert not extracted_metrics move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) @@ -317,9 +309,7 @@ def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPa metrics_storage_manager.list_all.return_value = [runner_fs] extracted_metrics = list( - runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, ignore_runners=set() - ) + runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) ) assert not extracted_metrics @@ -357,9 +347,7 @@ def test_extract_raises_error_for_too_large_files( 
monkeypatch.setattr(runner_metrics, "move_to_quarantine", move_to_quarantine_mock) extracted_metrics = list( - runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, ignore_runners=set() - ) + runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) ) assert not extracted_metrics @@ -381,9 +369,7 @@ def test_extract_raises_error_for_too_large_files( metrics_storage_manager.list_all.return_value = [runner_fs] extracted_metrics = list( - runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, ignore_runners=set() - ) + runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) ) assert not extracted_metrics @@ -406,9 +392,7 @@ def test_extract_raises_error_for_too_large_files( metrics_storage_manager.list_all.return_value = [runner_fs] extracted_metrics = list( - runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, ignore_runners=set() - ) + runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) ) assert not extracted_metrics @@ -446,9 +430,7 @@ def test_extract_ignores_filesystems_without_ts(runner_fs_base: Path): metrics_storage_manager.list_all.return_value = [runner_fs] extracted_metrics = list( - runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, ignore_runners=set() - ) + runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) ) assert not extracted_metrics metrics_storage_manager.delete.assert_called_once_with(runner_fs.runner_name) @@ -481,7 +463,7 @@ def test_extract_ignores_failure_on_shared_fs_cleanup( ) extracted_metrics = runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, ignore_runners=set() + metrics_storage_manager=metrics_storage_manager, runners=set() ) assert list(extracted_metrics) == [runner_metrics_data] From ab6b44514a9733303da11099e6fbbc4ba2964133 Mon Sep 17 00:00:00 2001 From: yhaliaw 
<43424755+yhaliaw@users.noreply.github.com> Date: Thu, 8 Aug 2024 15:26:30 +0800 Subject: [PATCH 097/278] Enable tests again --- .../test_runner_manager_openstack.py | 114 +++++++++--------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 377e7980d..0fe57194d 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -155,63 +155,63 @@ def workflow_is_status(workflow: Workflow, status: str) -> bool: return workflow.status == status -# @pytest.mark.openstack -# @pytest.mark.asyncio -# @pytest.mark.abort_on_fail -# async def test_get_no_runner(runner_manager: RunnerManager) -> None: -# """ -# Arrange: RunnerManager instance with no runners. -# Act: Get runners. -# Assert: Empty tuple returned. -# """ -# runner_list = runner_manager.get_runners() -# assert isinstance(runner_list, tuple) -# assert not runner_list - - -# @pytest.mark.openstack -# @pytest.mark.asyncio -# @pytest.mark.abort_on_fail -# async def test_runner_normal_idle_lifecycle( -# runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager -# ) -> None: -# """ -# Arrange: RunnerManager instance with no runners. -# Act: -# 1. Create one runner. -# 2. Run health check on the runner. -# 3. Delete all idle runner. -# Assert: -# 1. An active idle runner. -# 2. Health check passes. -# 3. No runners. -# """ -# # 1. -# runner_id_list = runner_manager.create_runners(1) -# assert isinstance(runner_id_list, tuple) -# assert len(runner_id_list) == 1 -# runner_id = runner_id_list[0] - -# runner_list = runner_manager.get_runners() -# assert isinstance(runner_list, tuple) -# assert len(runner_list) == 1 -# runner = runner_list[0] -# assert runner.id == runner_id -# assert runner.cloud_state == CloudRunnerState.ACTIVE -# assert runner.github_state == GithubRunnerState.IDLE - -# # 2. 
-# openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() -# assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." -# runner = openstack_instances[0] - -# assert openstack_runner_manager._health_check(runner) - -# # 3. -# runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) -# runner_list = runner_manager.get_runners() -# assert isinstance(runner_list, tuple) -# assert len(runner_list) == 0 +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_get_no_runner(runner_manager: RunnerManager) -> None: + """ + Arrange: RunnerManager instance with no runners. + Act: Get runners. + Assert: Empty tuple returned. + """ + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert not runner_list + + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_runner_normal_idle_lifecycle( + runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager +) -> None: + """ + Arrange: RunnerManager instance with no runners. + Act: + 1. Create one runner. + 2. Run health check on the runner. + 3. Delete all idle runner. + Assert: + 1. An active idle runner. + 2. Health check passes. + 3. No runners. + """ + # 1. + runner_id_list = runner_manager.create_runners(1) + assert isinstance(runner_id_list, tuple) + assert len(runner_id_list) == 1 + runner_id = runner_id_list[0] + + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert len(runner_list) == 1 + runner = runner_list[0] + assert runner.id == runner_id + assert runner.cloud_state == CloudRunnerState.ACTIVE + assert runner.github_state == GithubRunnerState.IDLE + + # 2. + openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() + assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." 
+ runner = openstack_instances[0] + + assert openstack_runner_manager._health_check(runner) + + # 3. + runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert len(runner_list) == 0 @pytest.mark.openstack From ef4a0209ece36e18275fc5422988de0e971d0fff Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 8 Aug 2024 18:18:26 +0800 Subject: [PATCH 098/278] Add debug --- tests/integration/test_runner_manager_openstack.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 0fe57194d..f1b432970 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -188,6 +188,7 @@ async def test_runner_normal_idle_lifecycle( """ # 1. runner_id_list = runner_manager.create_runners(1) + pytest.set_trace() assert isinstance(runner_id_list, tuple) assert len(runner_id_list) == 1 runner_id = runner_id_list[0] From 4aee652d0d8f1820395ce5ed5ae8aaf3a7c7ddd0 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 8 Aug 2024 18:45:52 +0800 Subject: [PATCH 099/278] Get runner info not on GitHub --- src/manager/runner_manager.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 663419e82..aa76bf473 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -53,19 +53,19 @@ class RunnerInstance: name: str id: InstanceId - github_state: GithubRunnerState + github_state: GithubRunnerState | None cloud_state: CloudRunnerState - def __init__(self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedRunner): + def __init__(self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedRunner | None): """Construct an 
instance. Args: cloud_instance: Information on the cloud instance. github_info: Information on the GitHub of the runner. """ - self.name = github_info.name + self.name =cloud_instance.name self.id = cloud_instance.id - self.github_state = GithubRunnerState.from_runner(github_info) + self.github_state = GithubRunnerState.from_runner(github_info) if github_info is not None else None self.cloud_state = cloud_instance.state @@ -122,6 +122,8 @@ def get_runners( cloud_runner_state: Sequence[CloudRunnerState] | None = None, ) -> tuple[RunnerInstance]: """Get information on runner filter by state. + + Only runners that has cloud instance are returned. Args: github_runner_state: Filter for the runners with these github states. If None all @@ -137,9 +139,9 @@ def get_runners( cloud_infos = self._cloud.get_runners(cloud_runner_state) github_infos_map = {info.name: info for info in github_infos} cloud_infos_map = {info.name: info for info in cloud_infos} - runner_names = cloud_infos_map.keys() & github_infos_map.keys() - logger.info("Found following runners: %s", runner_names) + logger.info("Found following runners: %s", cloud_infos_map.keys() | github_infos_map.keys()) + runner_names = cloud_infos_map.keys() & github_infos_map.keys() cloud_only = cloud_infos_map.keys() - runner_names github_only = github_infos_map.keys() - runner_names if cloud_only: @@ -153,7 +155,7 @@ def get_runners( ) return tuple( - RunnerInstance(cloud_infos_map[name], github_infos_map[name]) for name in runner_names + RunnerInstance(cloud_infos_map[name], github_infos_map[name]) for name in cloud_infos_map.keys() ) def delete_runners( From 6403bedc862ebad225a89d09344e350a917cf644 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 8 Aug 2024 18:59:46 +0800 Subject: [PATCH 100/278] Fix dict access --- src/manager/runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 
aa76bf473..35704929f 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -155,7 +155,7 @@ def get_runners( ) return tuple( - RunnerInstance(cloud_infos_map[name], github_infos_map[name]) for name in cloud_infos_map.keys() + RunnerInstance(cloud_infos_map[name], github_infos_map[name] if name in github_infos_map else None) for name in cloud_infos_map.keys() ) def delete_runners( From 6149b9c53f31732d384ff70d9f80feb8a1eeee5e Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:02:57 +0800 Subject: [PATCH 101/278] Add debug of userdata --- src-docs/openstack_cloud.openstack_runner_manager.md | 8 ++++---- src/openstack_cloud/openstack_runner_manager.py | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index f486aa6c5..ffdc70dd9 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -106,7 +106,7 @@ Construct the object. --- - + ### method `cleanup` @@ -152,7 +152,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -188,7 +188,7 @@ Get the name prefix of the self-hosted runners. --- - + ### method `get_runner` @@ -211,7 +211,7 @@ Get a self-hosted runner by instance id. 
--- - + ### method `get_runners` diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index c540b2097..b1c7a0e48 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -128,6 +128,9 @@ def create_runner(self, registration_token: str) -> InstanceId: userdata = self._generate_userdata( instance_name=instance_name, registration_token=registration_token ) + # TODO: debug + import pytest + pytest.set_trace() try: instance = self._openstack_cloud.launch_instance( instance_id=id, From 178b281c03d1deea9a1387af2454f52e50714044 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:11:40 +0800 Subject: [PATCH 102/278] Fix metric path --- src-docs/openstack_cloud.openstack_runner_manager.md | 8 ++++---- src/openstack_cloud/openstack_runner_manager.py | 5 +---- tests/integration/test_runner_manager_openstack.py | 5 ----- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index ffdc70dd9..f486aa6c5 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -106,7 +106,7 @@ Construct the object. --- - + ### method `cleanup` @@ -152,7 +152,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -188,7 +188,7 @@ Get the name prefix of the self-hosted runners. --- - + ### method `get_runner` @@ -211,7 +211,7 @@ Get a self-hosted runner by instance id. 
--- - + ### method `get_runners` diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index b1c7a0e48..3e2b8265c 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -128,9 +128,6 @@ def create_runner(self, registration_token: str) -> InstanceId: userdata = self._generate_userdata( instance_name=instance_name, registration_token=registration_token ) - # TODO: debug - import pytest - pytest.set_trace() try: instance = self._openstack_cloud.launch_instance( instance_id=id, @@ -387,7 +384,7 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: raise RunnerStartError(f"Unable to SSH run `ps aux` on {instance.server_name}") if RUNNER_STARTUP_PROCESS not in result.stdout: logger.warning("Runner startup process not found on %s", instance.server_name) - return RunnerStartError(f"Runner startup process not found on {instance.server_name}") + raise RunnerStartError(f"Runner startup process not found on {instance.server_name}") logger.info("Runner startup process found to be healthy on %s", instance.server_name) @staticmethod diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index f1b432970..08013cb90 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -47,18 +47,13 @@ def log_dir_base_path_fixture(tmp_path_factory: Path) -> Iterator[dict[str, Path runner_log_dir_path = temp_log_dir / "runner_log" metric_log_path = temp_log_dir / "metric_log" - metric_exchange_path = temp_log_dir / "metric_exchange" monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", runner_log_dir_path) monkeypatch.setattr(events, "METRICS_LOG_PATH", metric_log_path) - monkeypatch.setattr( - openstack_runner_manager, "METRICS_EXCHANGE_PATH", metric_exchange_path - ) yield { "runner_logs_dir": runner_log_dir_path, "metric_log": 
metric_log_path, - "metric_exchange": metric_exchange_path, } From 131d6bc496abb40c74f18516d57b43eb1ea144df Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:21:55 +0800 Subject: [PATCH 103/278] Debug metric --- tests/integration/test_runner_manager_openstack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 08013cb90..65fd07e77 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -183,7 +183,6 @@ async def test_runner_normal_idle_lifecycle( """ # 1. runner_id_list = runner_manager.create_runners(1) - pytest.set_trace() assert isinstance(runner_id_list, tuple) assert len(runner_id_list) == 1 runner_id = runner_id_list[0] @@ -290,5 +289,6 @@ async def test_runner_normal_lifecycle( wait=False, ) await wait_for(lambda: workflow_is_status(workflow, "completed")) + runner_log_dir_path = log_dir_base_path["runner_log_dir"] metric_log_path = log_dir_base_path["metric_log"] pytest.set_trace() From 9a7d138966fe8bdbc444d87f89c664031ff4ec9b Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:29:58 +0800 Subject: [PATCH 104/278] Fix variable naming --- src/manager/runner_manager.py | 2 +- tests/integration/test_runner_manager_openstack.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 35704929f..f4e2b779c 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -189,7 +189,7 @@ def delete_runners( for runner in runners_list: metrics.append(self._cloud.delete_runner(id=runner.id, remove_token=remove_token)) - return self._issue_runner_metrics(metrics=iter(metric_events)) + return self._issue_runner_metrics(metrics=iter(metrics)) def cleanup(self) -> 
IssuedMetricEventsStats: """Run cleanup of the runners and other resources.""" diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 65fd07e77..5aa2f3142 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -289,6 +289,6 @@ async def test_runner_normal_lifecycle( wait=False, ) await wait_for(lambda: workflow_is_status(workflow, "completed")) - runner_log_dir_path = log_dir_base_path["runner_log_dir"] + runner_log_dir_path = log_dir_base_path["runner_logs_dir"] metric_log_path = log_dir_base_path["metric_log"] pytest.set_trace() From 3fe7977a75997e4b58d8933df6f1596272cff20d Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:47:08 +0800 Subject: [PATCH 105/278] Test --- .../test_runner_manager_openstack.py | 183 +++++++++--------- 1 file changed, 91 insertions(+), 92 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 5aa2f3142..ad79d202a 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -45,14 +45,11 @@ def log_dir_base_path_fixture(tmp_path_factory: Path) -> Iterator[dict[str, Path with pytest.MonkeyPatch.context() as monkeypatch: temp_log_dir = tmp_path_factory.mktemp("log") - runner_log_dir_path = temp_log_dir / "runner_log" metric_log_path = temp_log_dir / "metric_log" - monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", runner_log_dir_path) monkeypatch.setattr(events, "METRICS_LOG_PATH", metric_log_path) yield { - "runner_logs_dir": runner_log_dir_path, "metric_log": metric_log_path, } @@ -150,63 +147,63 @@ def workflow_is_status(workflow: Workflow, status: str) -> bool: return workflow.status == status -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def 
test_get_no_runner(runner_manager: RunnerManager) -> None: - """ - Arrange: RunnerManager instance with no runners. - Act: Get runners. - Assert: Empty tuple returned. - """ - runner_list = runner_manager.get_runners() - assert isinstance(runner_list, tuple) - assert not runner_list - - -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_runner_normal_idle_lifecycle( - runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager -) -> None: - """ - Arrange: RunnerManager instance with no runners. - Act: - 1. Create one runner. - 2. Run health check on the runner. - 3. Delete all idle runner. - Assert: - 1. An active idle runner. - 2. Health check passes. - 3. No runners. - """ - # 1. - runner_id_list = runner_manager.create_runners(1) - assert isinstance(runner_id_list, tuple) - assert len(runner_id_list) == 1 - runner_id = runner_id_list[0] - - runner_list = runner_manager.get_runners() - assert isinstance(runner_list, tuple) - assert len(runner_list) == 1 - runner = runner_list[0] - assert runner.id == runner_id - assert runner.cloud_state == CloudRunnerState.ACTIVE - assert runner.github_state == GithubRunnerState.IDLE - - # 2. - openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() - assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." - runner = openstack_instances[0] - - assert openstack_runner_manager._health_check(runner) - - # 3. - runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) - runner_list = runner_manager.get_runners() - assert isinstance(runner_list, tuple) - assert len(runner_list) == 0 +# @pytest.mark.openstack +# @pytest.mark.asyncio +# @pytest.mark.abort_on_fail +# async def test_get_no_runner(runner_manager: RunnerManager) -> None: +# """ +# Arrange: RunnerManager instance with no runners. +# Act: Get runners. +# Assert: Empty tuple returned. 
+# """ +# runner_list = runner_manager.get_runners() +# assert isinstance(runner_list, tuple) +# assert not runner_list + + +# @pytest.mark.openstack +# @pytest.mark.asyncio +# @pytest.mark.abort_on_fail +# async def test_runner_normal_idle_lifecycle( +# runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager +# ) -> None: +# """ +# Arrange: RunnerManager instance with no runners. +# Act: +# 1. Create one runner. +# 2. Run health check on the runner. +# 3. Delete all idle runner. +# Assert: +# 1. An active idle runner. +# 2. Health check passes. +# 3. No runners. +# """ +# # 1. +# runner_id_list = runner_manager.create_runners(1) +# assert isinstance(runner_id_list, tuple) +# assert len(runner_id_list) == 1 +# runner_id = runner_id_list[0] + +# runner_list = runner_manager.get_runners() +# assert isinstance(runner_list, tuple) +# assert len(runner_list) == 1 +# runner = runner_list[0] +# assert runner.id == runner_id +# assert runner.cloud_state == CloudRunnerState.ACTIVE +# assert runner.github_state == GithubRunnerState.IDLE + +# # 2. +# openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() +# assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." +# runner = openstack_instances[0] + +# assert openstack_runner_manager._health_check(runner) + +# # 3. 
+# runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) +# runner_list = runner_manager.get_runners() +# assert isinstance(runner_list, tuple) +# assert len(runner_list) == 0 @pytest.mark.openstack @@ -260,35 +257,37 @@ async def test_runner_flush_busy_lifecycle( runner_list = runner_manager_with_one_runner.get_runners() -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_runner_normal_lifecycle( - runner_manager_with_one_runner: RunnerManager, - test_github_branch: Branch, - github_repository: Repository, - runner_label: str, - log_dir_base_path: dict[str, Path], -): - """ - Arrange: RunnerManager with one runner. - Act: - 1. Start a test workflow for the runner. - 2. Run cleanup. - Assert: - 1. The workflow complete successfully. - 2. The runner should be deleted. The metrics should be recorded. - """ - workflow = await dispatch_workflow( - app=None, - branch=test_github_branch, - github_repository=github_repository, - conclusion="success", - workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, - dispatch_input={"runner": runner_label, "minutes": "0"}, - wait=False, - ) - await wait_for(lambda: workflow_is_status(workflow, "completed")) - runner_log_dir_path = log_dir_base_path["runner_logs_dir"] - metric_log_path = log_dir_base_path["metric_log"] - pytest.set_trace() +# @pytest.mark.openstack +# @pytest.mark.asyncio +# @pytest.mark.abort_on_fail +# async def test_runner_normal_lifecycle( +# runner_manager_with_one_runner: RunnerManager, +# test_github_branch: Branch, +# github_repository: Repository, +# runner_label: str, +# log_dir_base_path: dict[str, Path], +# ): +# """ +# Arrange: RunnerManager with one runner. Clean metric logs. +# Act: +# 1. Start a test workflow for the runner. +# 2. Run cleanup. +# Assert: +# 1. The workflow complete successfully. +# 2. The runner should be deleted. The metrics should be recorded. 
+# """ +# metric_log_path = log_dir_base_path["metric_log"] +# metric_log_path.write_text("") + +# workflow = await dispatch_workflow( +# app=None, +# branch=test_github_branch, +# github_repository=github_repository, +# conclusion="success", +# workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, +# dispatch_input={"runner": runner_label, "minutes": "0"}, +# wait=False, +# ) +# await wait_for(lambda: workflow_is_status(workflow, "completed")) + +# pytest.set_trace() From 755ddc282b9f2aa62fb6128a3273b167c23e88f3 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:53:55 +0800 Subject: [PATCH 106/278] Fix iterator --- src/manager/runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index f4e2b779c..767766192 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -189,7 +189,7 @@ def delete_runners( for runner in runners_list: metrics.append(self._cloud.delete_runner(id=runner.id, remove_token=remove_token)) - return self._issue_runner_metrics(metrics=iter(metrics)) + return self._issue_runner_metrics(metrics=metrics) def cleanup(self) -> IssuedMetricEventsStats: """Run cleanup of the runners and other resources.""" From c8b5021e5d19114fd891ef0bc8267ebc93b9e186 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:58:08 +0800 Subject: [PATCH 107/278] Debug --- src/manager/runner_manager.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 767766192..aa826a9eb 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -188,6 +188,10 @@ def delete_runners( metrics = [] for runner in runners_list: metrics.append(self._cloud.delete_runner(id=runner.id, remove_token=remove_token)) + + # TODO: DEBUG + import pytest + pytest.set_trace() return 
self._issue_runner_metrics(metrics=metrics) @@ -202,6 +206,9 @@ def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetri total_stats: IssuedMetricEventsStats = {} for extracted_metrics in metrics: + # TODO: DEBUG + import pytest + pytest.set_trace() try: job_metrics = github_metrics.job( github_client=self._github.github, From 6eb469a0208ddb0b49ba67e8990286fcc8feab3d Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 11:03:08 +0800 Subject: [PATCH 108/278] Debug --- src/manager/runner_manager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index aa826a9eb..d70a517f1 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -185,22 +185,22 @@ def delete_runners( logger.info("Deleting runners: %s", runner_names) remove_token = self._github.get_removal_token() - metrics = [] + runner_metrics = [] for runner in runners_list: - metrics.append(self._cloud.delete_runner(id=runner.id, remove_token=remove_token)) + runner_metrics.append(self._cloud.delete_runner(id=runner.id, remove_token=remove_token)) # TODO: DEBUG import pytest pytest.set_trace() - return self._issue_runner_metrics(metrics=metrics) + return self._issue_runner_metrics(metrics=iter(runner_metrics)) def cleanup(self) -> IssuedMetricEventsStats: """Run cleanup of the runners and other resources.""" self._github.delete_runners([GithubRunnerState.OFFLINE, GithubRunnerState.UNKNOWN]) remove_token = self._github.get_removal_token() - metrics = self._cloud.cleanup_runner(remove_token) - return self._issue_runner_metrics(metrics=metrics) + runner_metrics = self._cloud.cleanup_runner(remove_token) + return self._issue_runner_metrics(metrics=runner_metrics) def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetricEventsStats: total_stats: IssuedMetricEventsStats = {} From 
5e27cc3fe6f4e6005b8ea13094e74234d1b0ff8e Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 11:08:28 +0800 Subject: [PATCH 109/278] Fix for iterator return value --- src/openstack_cloud/openstack_runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 3e2b8265c..e0b9608d9 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -207,7 +207,7 @@ def delete_runner(self, id: InstanceId, remove_token: str) -> runner_metrics.Run metrics_storage_manager=shared_fs, runners=instance.server_name ) self._delete_runner(instance, remove_token) - return metric + return next(metric) def cleanup(self, remove_token: str) -> runner_metrics.RunnerMetrics: """Cleanup runner and resource on the cloud. From 26eb6b78878db070b1099e5dc9b25ca74370c59a Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 11:25:01 +0800 Subject: [PATCH 110/278] Add more log path patching --- src/openstack_cloud/openstack_runner_manager.py | 2 ++ .../integration/test_runner_manager_openstack.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index e0b9608d9..eafbdad0a 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -241,7 +241,9 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: except SshError: logger.exception("Failed SSH connection while removing %s", instance.server_name) raise RunnerRemoveError(f"Failed SSH connection for {instance.server_name}") + self._pull_runner_metrics(instance.server_name, ssh_conn) + try: OpenstackRunnerManager._run_github_runner_removal_script( 
instance.server_name, ssh_conn, remove_token diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index ad79d202a..769ff6c10 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -20,13 +20,14 @@ from manager.cloud_runner_manager import CloudRunnerState from manager.github_runner_manager import GithubRunnerState from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig -from metrics import events, runner_logs +from metrics import events, runner_logs, storage from openstack_cloud import openstack_runner_manager from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH from openstack_cloud.openstack_runner_manager import ( OpenstackRunnerManager, OpenstackRunnerManagerConfig, ) +import shared_fs from tests.integration.helpers.common import ( DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, dispatch_workflow, @@ -45,11 +46,23 @@ def log_dir_base_path_fixture(tmp_path_factory: Path) -> Iterator[dict[str, Path with pytest.MonkeyPatch.context() as monkeypatch: temp_log_dir = tmp_path_factory.mktemp("log") + filesystem_base_path = temp_log_dir / "runner-fs" + filesystem_quarantine_path = temp_log_dir / "runner-fs-quarantine" + filesystem_images_path = temp_log_dir / "runner-fs-images" + metrics_exchange_path = temp_log_dir / "metrics-exchange" metric_log_path = temp_log_dir / "metric_log" + monkeypatch.setattr(storage, "FILESYSTEM_BASE_PATH", filesystem_base_path) + monkeypatch.setattr(storage, "FILESYSTEM_QUARANTINE_PATH", filesystem_quarantine_path) + monkeypatch.setattr(shared_fs, "FILESYSTEM_IMAGES_PATH" , filesystem_images_path) + monkeypatch.setattr(openstack_runner_manager, "METRICS_EXCHANGE" , metrics_exchange_path) monkeypatch.setattr(events, "METRICS_LOG_PATH", metric_log_path) yield { + "filesystem_base_path": filesystem_base_path, + "filesystem_quarantine_path": filesystem_quarantine_path, + 
"filesystem_images_path": filesystem_images_path, + "metrics_exchange": metrics_exchange_path, "metric_log": metric_log_path, } From e620c153c73cb988e4aa627054918df0e2377e02 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 12:26:00 +0800 Subject: [PATCH 111/278] Fix path naming --- tests/integration/test_runner_manager_openstack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 769ff6c10..b6c940837 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -55,7 +55,7 @@ def log_dir_base_path_fixture(tmp_path_factory: Path) -> Iterator[dict[str, Path monkeypatch.setattr(storage, "FILESYSTEM_BASE_PATH", filesystem_base_path) monkeypatch.setattr(storage, "FILESYSTEM_QUARANTINE_PATH", filesystem_quarantine_path) monkeypatch.setattr(shared_fs, "FILESYSTEM_IMAGES_PATH" , filesystem_images_path) - monkeypatch.setattr(openstack_runner_manager, "METRICS_EXCHANGE" , metrics_exchange_path) + monkeypatch.setattr(openstack_runner_manager, "METRICS_EXCHANGE_PATH" , metrics_exchange_path) monkeypatch.setattr(events, "METRICS_LOG_PATH", metric_log_path) yield { From a65258297f076591d43b766cc9bf7f7f7ffa254a Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 12:34:39 +0800 Subject: [PATCH 112/278] Fix monkey patch --- tests/integration/test_runner_manager_openstack.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index b6c940837..d471cbb5a 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -49,20 +49,17 @@ def log_dir_base_path_fixture(tmp_path_factory: Path) -> Iterator[dict[str, Path 
filesystem_base_path = temp_log_dir / "runner-fs" filesystem_quarantine_path = temp_log_dir / "runner-fs-quarantine" filesystem_images_path = temp_log_dir / "runner-fs-images" - metrics_exchange_path = temp_log_dir / "metrics-exchange" metric_log_path = temp_log_dir / "metric_log" monkeypatch.setattr(storage, "FILESYSTEM_BASE_PATH", filesystem_base_path) monkeypatch.setattr(storage, "FILESYSTEM_QUARANTINE_PATH", filesystem_quarantine_path) monkeypatch.setattr(shared_fs, "FILESYSTEM_IMAGES_PATH" , filesystem_images_path) - monkeypatch.setattr(openstack_runner_manager, "METRICS_EXCHANGE_PATH" , metrics_exchange_path) monkeypatch.setattr(events, "METRICS_LOG_PATH", metric_log_path) yield { "filesystem_base_path": filesystem_base_path, "filesystem_quarantine_path": filesystem_quarantine_path, "filesystem_images_path": filesystem_images_path, - "metrics_exchange": metrics_exchange_path, "metric_log": metric_log_path, } From 6ab206a24e0575539617e7657cbb2043ff0b0d16 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 12:56:52 +0800 Subject: [PATCH 113/278] Start a arm64 manual test env --- .github/workflows/manual_test_env.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index c1060f3fb..ab1faa838 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -2,13 +2,13 @@ name: Manual test env on: # TODO: Uncomment - # pull_request: + pull_request: workflow_dispatch: jobs: manual-test-env: name: manual-test-env - runs-on: ["self-hosted", "stg-private-endpoint", "X64"] + runs-on: ["self-hosted", "stg-private-endpoint"] steps: - run: sudo apt update -yq - run: sudo apt install pipx -yq From 0f010e657208e71ed0b60f4aed17fe7d75bfd491 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 12:57:19 +0800 Subject: [PATCH 114/278] Not 
spawning manual test env --- .github/workflows/manual_test_env.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index ab1faa838..c1060f3fb 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -2,13 +2,13 @@ name: Manual test env on: # TODO: Uncomment - pull_request: + # pull_request: workflow_dispatch: jobs: manual-test-env: name: manual-test-env - runs-on: ["self-hosted", "stg-private-endpoint"] + runs-on: ["self-hosted", "stg-private-endpoint", "X64"] steps: - run: sudo apt update -yq - run: sudo apt install pipx -yq From 0a3869493322009dda16d5dcf45a98e6970a5062 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 13:38:57 +0800 Subject: [PATCH 115/278] Update fmt --- ...penstack_cloud.openstack_runner_manager.md | 25 ++++++++++++------- src/manager/runner_manager.py | 25 +++++++++++++------ .../openstack_runner_manager.py | 7 ++++++ .../test_runner_manager_openstack.py | 4 +-- 4 files changed, 43 insertions(+), 18 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index f486aa6c5..699d48c3b 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -56,7 +56,14 @@ __init__( ## class `RunnerHealth` -RunnerHealth(healthy: tuple[openstack_cloud.openstack_cloud.OpenstackInstance], unhealthy: tuple[openstack_cloud.openstack_cloud.OpenstackInstance]) +Runners with health state. + + + +**Attributes:** + + - `healthy`: The list of healthy runners. + - `unhealthy`: The list of unhealthy runners. @@ -79,12 +86,12 @@ __init__( --- - + ## class `OpenstackRunnerManager` Manage self-hosted runner on OpenStack cloud. - + ### method `__init__` @@ -106,7 +113,7 @@ Construct the object. 
--- - + ### method `cleanup` @@ -129,7 +136,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -152,7 +159,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -171,7 +178,7 @@ Delete self-hosted runners. --- - + ### method `get_name_prefix` @@ -188,7 +195,7 @@ Get the name prefix of the self-hosted runners. --- - + ### method `get_runner` @@ -211,7 +218,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index d70a517f1..5c01f06ed 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -63,9 +63,11 @@ def __init__(self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedR cloud_instance: Information on the cloud instance. github_info: Information on the GitHub of the runner. """ - self.name =cloud_instance.name + self.name = cloud_instance.name self.id = cloud_instance.id - self.github_state = GithubRunnerState.from_runner(github_info) if github_info is not None else None + self.github_state = ( + GithubRunnerState.from_runner(github_info) if github_info is not None else None + ) self.cloud_state = cloud_instance.state @@ -122,7 +124,7 @@ def get_runners( cloud_runner_state: Sequence[CloudRunnerState] | None = None, ) -> tuple[RunnerInstance]: """Get information on runner filter by state. - + Only runners that has cloud instance are returned. 
Args: @@ -139,7 +141,9 @@ def get_runners( cloud_infos = self._cloud.get_runners(cloud_runner_state) github_infos_map = {info.name: info for info in github_infos} cloud_infos_map = {info.name: info for info in cloud_infos} - logger.info("Found following runners: %s", cloud_infos_map.keys() | github_infos_map.keys()) + logger.info( + "Found following runners: %s", cloud_infos_map.keys() | github_infos_map.keys() + ) runner_names = cloud_infos_map.keys() & github_infos_map.keys() cloud_only = cloud_infos_map.keys() - runner_names @@ -155,7 +159,10 @@ def get_runners( ) return tuple( - RunnerInstance(cloud_infos_map[name], github_infos_map[name] if name in github_infos_map else None) for name in cloud_infos_map.keys() + RunnerInstance( + cloud_infos_map[name], github_infos_map[name] if name in github_infos_map else None + ) + for name in cloud_infos_map.keys() ) def delete_runners( @@ -187,10 +194,13 @@ def delete_runners( runner_metrics = [] for runner in runners_list: - runner_metrics.append(self._cloud.delete_runner(id=runner.id, remove_token=remove_token)) - + runner_metrics.append( + self._cloud.delete_runner(id=runner.id, remove_token=remove_token) + ) + # TODO: DEBUG import pytest + pytest.set_trace() return self._issue_runner_metrics(metrics=iter(runner_metrics)) @@ -208,6 +218,7 @@ def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetri for extracted_metrics in metrics: # TODO: DEBUG import pytest + pytest.set_trace() try: job_metrics = github_metrics.job( diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index eafbdad0a..9c4485f54 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -83,6 +83,13 @@ class OpenstackRunnerManagerConfig: @dataclass class RunnerHealth: + """Runners with health state. + + Attributes: + healthy: The list of healthy runners. + unhealthy: The list of unhealthy runners. 
+ """ + healthy: tuple[OpenstackInstance] unhealthy: tuple[OpenstackInstance] diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index d471cbb5a..e9692d033 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -16,6 +16,7 @@ from github.Workflow import Workflow from openstack.connection import Connection as OpenstackConnection +import shared_fs from charm_state import GithubPath, ProxyConfig, parse_github_path from manager.cloud_runner_manager import CloudRunnerState from manager.github_runner_manager import GithubRunnerState @@ -27,7 +28,6 @@ OpenstackRunnerManager, OpenstackRunnerManagerConfig, ) -import shared_fs from tests.integration.helpers.common import ( DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, dispatch_workflow, @@ -53,7 +53,7 @@ def log_dir_base_path_fixture(tmp_path_factory: Path) -> Iterator[dict[str, Path monkeypatch.setattr(storage, "FILESYSTEM_BASE_PATH", filesystem_base_path) monkeypatch.setattr(storage, "FILESYSTEM_QUARANTINE_PATH", filesystem_quarantine_path) - monkeypatch.setattr(shared_fs, "FILESYSTEM_IMAGES_PATH" , filesystem_images_path) + monkeypatch.setattr(shared_fs, "FILESYSTEM_IMAGES_PATH", filesystem_images_path) monkeypatch.setattr(events, "METRICS_LOG_PATH", metric_log_path) yield { From ceae00704f517670b3d3c0a5f0d104e50bcdb779 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 13:45:05 +0800 Subject: [PATCH 116/278] Fix metric storage implementation for openstack --- src/openstack_cloud/openstack_runner_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 9c4485f54..8ec2ac73c 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -84,7 +84,7 @@ class 
OpenstackRunnerManagerConfig: @dataclass class RunnerHealth: """Runners with health state. - + Attributes: healthy: The list of healthy runners. unhealthy: The list of unhealthy runners. @@ -228,7 +228,7 @@ def cleanup(self, remove_token: str) -> runner_metrics.RunnerMetrics: runners = self._get_runner_health() healthy_runner_names = [runner.server_name for runner in runners.healthy] metrics = runner_metrics.extract( - metrics_storage_manager=shared_fs, runners=set(healthy_runner_names) + metrics_storage_manager=metrics_storage, runners=set(healthy_runner_names) ) for runner in runners.unhealthy: self._delete_runner(runner, remove_token) From cd96114786845b05b24aba04c215a2bb72c83732 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 13:51:26 +0800 Subject: [PATCH 117/278] Fix metric storage provider usage in openstack runner manager --- src/openstack_cloud/openstack_runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 8ec2ac73c..4515edc84 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -211,7 +211,7 @@ def delete_runner(self, id: InstanceId, remove_token: str) -> runner_metrics.Run """ instance = self._openstack_cloud.get_instance(id) metric = runner_metrics.extract( - metrics_storage_manager=shared_fs, runners=instance.server_name + metrics_storage_manager=metrics_storage, runners=instance.server_name ) self._delete_runner(instance, remove_token) return next(metric) From 71da31ec758c7099281409f8b20c6c9ee93a7d58 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 13:59:00 +0800 Subject: [PATCH 118/278] Debug --- src-docs/openstack_cloud.openstack_runner_manager.md | 2 +- src/openstack_cloud/openstack_runner_manager.py | 3 +++ 2 files changed, 4 insertions(+), 
1 deletion(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 699d48c3b..0c53ddc25 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -113,7 +113,7 @@ Construct the object. --- - + ### method `cleanup` diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 4515edc84..7d09a664c 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -214,6 +214,9 @@ def delete_runner(self, id: InstanceId, remove_token: str) -> runner_metrics.Run metrics_storage_manager=metrics_storage, runners=instance.server_name ) self._delete_runner(instance, remove_token) + # TODO: debug + import pytest + pytest.set_trace() return next(metric) def cleanup(self, remove_token: str) -> runner_metrics.RunnerMetrics: From 12aa0b8caf43e938a6b7dbb98dca941bb8181811 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 14:05:38 +0800 Subject: [PATCH 119/278] Fix iterator --- src-docs/openstack_cloud.openstack_runner_manager.md | 6 +++--- src/manager/cloud_runner_manager.py | 4 ++-- src/openstack_cloud/openstack_runner_manager.py | 11 ++++------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 0c53ddc25..f3be9a576 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -113,12 +113,12 @@ Construct the object. --- - + ### method `cleanup` ```python -cleanup(remove_token: str) → RunnerMetrics +cleanup(remove_token: str) → Iterator[RunnerMetrics] ``` Cleanup runner and resource on the cloud. @@ -164,7 +164,7 @@ Create a self-hosted runner. 
### method `delete_runner` ```python -delete_runner(id: str, remove_token: str) → RunnerMetrics +delete_runner(id: str, remove_token: str) → RunnerMetrics | None ``` Delete self-hosted runners. diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 2a25717a5..0847ea82b 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -114,7 +114,7 @@ def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerIn """ ... - def delete_runner(self, id: InstanceId, remove_token: str) -> RunnerMetrics: + def delete_runner(self, id: InstanceId, remove_token: str) -> RunnerMetrics | None: """Delete self-hosted runners. Args: @@ -122,7 +122,7 @@ def delete_runner(self, id: InstanceId, remove_token: str) -> RunnerMetrics: remove_token: The GitHub remove token. Returns: - Metrics of the runner deleted. + Metrics of the runner deleted if any. """ ... diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 7d09a664c..c86082551 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -8,7 +8,7 @@ import time from dataclasses import dataclass from pathlib import Path -from typing import Sequence, Tuple +from typing import Iterator, Sequence, Tuple import invoke import jinja2 @@ -202,7 +202,7 @@ def get_runners( return instances_list return [instance for instance in instances_list if instance.state in states] - def delete_runner(self, id: InstanceId, remove_token: str) -> runner_metrics.RunnerMetrics: + def delete_runner(self, id: InstanceId, remove_token: str) -> runner_metrics.RunnerMetrics | None: """Delete self-hosted runners. 
Args: @@ -214,12 +214,9 @@ def delete_runner(self, id: InstanceId, remove_token: str) -> runner_metrics.Run metrics_storage_manager=metrics_storage, runners=instance.server_name ) self._delete_runner(instance, remove_token) - # TODO: debug - import pytest - pytest.set_trace() - return next(metric) + return next(metric, None) - def cleanup(self, remove_token: str) -> runner_metrics.RunnerMetrics: + def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: """Cleanup runner and resource on the cloud. Args: From 4d19b1d84671bcfe9aa1bd49688060f037b7cd59 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 14:19:27 +0800 Subject: [PATCH 120/278] Add debug --- src/manager/runner_manager.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 5c01f06ed..22cd272fc 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -197,12 +197,6 @@ def delete_runners( runner_metrics.append( self._cloud.delete_runner(id=runner.id, remove_token=remove_token) ) - - # TODO: DEBUG - import pytest - - pytest.set_trace() - return self._issue_runner_metrics(metrics=iter(runner_metrics)) def cleanup(self) -> IssuedMetricEventsStats: From 6c286bf77fe1d59065b536f62166347c250a4bc6 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 14:23:15 +0800 Subject: [PATCH 121/278] Fix None in iterator --- src/manager/runner_manager.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 22cd272fc..3479c5f7b 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -192,12 +192,12 @@ def delete_runners( logger.info("Deleting runners: %s", runner_names) remove_token = self._github.get_removal_token() - runner_metrics = [] + runner_metrics_list = [] for runner in 
runners_list: - runner_metrics.append( - self._cloud.delete_runner(id=runner.id, remove_token=remove_token) - ) - return self._issue_runner_metrics(metrics=iter(runner_metrics)) + runner_metrics = self._cloud.delete_runner(id=runner.id, remove_token=remove_token) + if runner_metrics is not None: + runner_metrics_list.append(runner_metrics) + return self._issue_runner_metrics(metrics=iter(runner_metrics_list)) def cleanup(self) -> IssuedMetricEventsStats: """Run cleanup of the runners and other resources.""" @@ -210,10 +210,6 @@ def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetri total_stats: IssuedMetricEventsStats = {} for extracted_metrics in metrics: - # TODO: DEBUG - import pytest - - pytest.set_trace() try: job_metrics = github_metrics.job( github_client=self._github.github, From e7e2811c5fa8381e43f72ade749ecdf2b6a12a23 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 14:30:44 +0800 Subject: [PATCH 122/278] Add debug --- src/manager/runner_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 3479c5f7b..07260b0bb 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -188,6 +188,9 @@ def delete_runners( states.append(GithubRunnerState.BUSY) runners_list = self.get_runners(github_runner_state=states) + # TODO: debug + import pytest + pytest.set_trace() runner_names = [runner.name for runner in runners_list] logger.info("Deleting runners: %s", runner_names) remove_token = self._github.get_removal_token() From 2a0272816c83dd009e809d8369f6c85e83b1e8d0 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 14:38:38 +0800 Subject: [PATCH 123/278] Trying fix for get runner filter --- src/manager/runner_manager.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/manager/runner_manager.py 
b/src/manager/runner_manager.py index 07260b0bb..c4f4c7403 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -158,12 +158,20 @@ def get_runners( github_only, ) - return tuple( + runner_instances = tuple( RunnerInstance( cloud_infos_map[name], github_infos_map[name] if name in github_infos_map else None ) for name in cloud_infos_map.keys() ) + if cloud_runner_state is not None: + runner_instances = [runner for runner in runner_instances if runner.cloud_state in cloud_runner_state] + if github_runner_state is not None: + runner_instances = [runner for runner in runner_instances if runner.github_state is not None and runner.github_state in github_runner_state] + # TODO: debug + import pytest + pytest.set_trace() + return runner_instances def delete_runners( self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE @@ -188,9 +196,6 @@ def delete_runners( states.append(GithubRunnerState.BUSY) runners_list = self.get_runners(github_runner_state=states) - # TODO: debug - import pytest - pytest.set_trace() runner_names = [runner.name for runner in runners_list] logger.info("Deleting runners: %s", runner_names) remove_token = self._github.get_removal_token() From f6d1ef788e56fc1f9387003d2d0bf97a2c54802c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 14:49:36 +0800 Subject: [PATCH 124/278] Add test --- src/manager/runner_manager.py | 3 - .../test_runner_manager_openstack.py | 80 +++++++++---------- 2 files changed, 37 insertions(+), 46 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index c4f4c7403..4eef09817 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -168,9 +168,6 @@ def get_runners( runner_instances = [runner for runner in runner_instances if runner.cloud_state in cloud_runner_state] if github_runner_state is not None: runner_instances = [runner for runner in runner_instances if runner.github_state is not None and 
runner.github_state in github_runner_state] - # TODO: debug - import pytest - pytest.set_trace() return runner_instances def delete_runners( diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index e9692d033..22fa7b8fb 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -46,20 +46,14 @@ def log_dir_base_path_fixture(tmp_path_factory: Path) -> Iterator[dict[str, Path with pytest.MonkeyPatch.context() as monkeypatch: temp_log_dir = tmp_path_factory.mktemp("log") - filesystem_base_path = temp_log_dir / "runner-fs" - filesystem_quarantine_path = temp_log_dir / "runner-fs-quarantine" - filesystem_images_path = temp_log_dir / "runner-fs-images" + metric_exchange_path = temp_log_dir / "metric_exchange" metric_log_path = temp_log_dir / "metric_log" - monkeypatch.setattr(storage, "FILESYSTEM_BASE_PATH", filesystem_base_path) - monkeypatch.setattr(storage, "FILESYSTEM_QUARANTINE_PATH", filesystem_quarantine_path) - monkeypatch.setattr(shared_fs, "FILESYSTEM_IMAGES_PATH", filesystem_images_path) + monkeypatch.setattr(openstack_runner_manager, "METRICS_EXCHANGE_PATH", metric_exchange_path) monkeypatch.setattr(events, "METRICS_LOG_PATH", metric_log_path) yield { - "filesystem_base_path": filesystem_base_path, - "filesystem_quarantine_path": filesystem_quarantine_path, - "filesystem_images_path": filesystem_images_path, + "metric_exchange": metric_exchange_path, "metric_log": metric_log_path, } @@ -267,37 +261,37 @@ async def test_runner_flush_busy_lifecycle( runner_list = runner_manager_with_one_runner.get_runners() -# @pytest.mark.openstack -# @pytest.mark.asyncio -# @pytest.mark.abort_on_fail -# async def test_runner_normal_lifecycle( -# runner_manager_with_one_runner: RunnerManager, -# test_github_branch: Branch, -# github_repository: Repository, -# runner_label: str, -# log_dir_base_path: dict[str, Path], -# ): -# """ -# Arrange: 
RunnerManager with one runner. Clean metric logs. -# Act: -# 1. Start a test workflow for the runner. -# 2. Run cleanup. -# Assert: -# 1. The workflow complete successfully. -# 2. The runner should be deleted. The metrics should be recorded. -# """ -# metric_log_path = log_dir_base_path["metric_log"] -# metric_log_path.write_text("") - -# workflow = await dispatch_workflow( -# app=None, -# branch=test_github_branch, -# github_repository=github_repository, -# conclusion="success", -# workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, -# dispatch_input={"runner": runner_label, "minutes": "0"}, -# wait=False, -# ) -# await wait_for(lambda: workflow_is_status(workflow, "completed")) - -# pytest.set_trace() +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_runner_normal_lifecycle( + runner_manager_with_one_runner: RunnerManager, + test_github_branch: Branch, + github_repository: Repository, + runner_label: str, + log_dir_base_path: dict[str, Path], +): + """ + Arrange: RunnerManager with one runner. Clean metric logs. + Act: + 1. Start a test workflow for the runner. + 2. Run cleanup. + Assert: + 1. The workflow complete successfully. + 2. The runner should be deleted. The metrics should be recorded. 
+ """ + metric_log_path = log_dir_base_path["metric_log"] + metric_log_path.write_text("") + + workflow = await dispatch_workflow( + app=None, + branch=test_github_branch, + github_repository=github_repository, + conclusion="success", + workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, + dispatch_input={"runner": runner_label, "minutes": "0"}, + wait=False, + ) + await wait_for(lambda: workflow_is_status(workflow, "completed")) + + pytest.set_trace() From b281a8ad42fb9c28dfd46ce952169a38673053bf Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 15:00:41 +0800 Subject: [PATCH 125/278] Patch the path for logs --- tests/integration/test_runner_manager_openstack.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 22fa7b8fb..119d6dddc 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -46,14 +46,17 @@ def log_dir_base_path_fixture(tmp_path_factory: Path) -> Iterator[dict[str, Path with pytest.MonkeyPatch.context() as monkeypatch: temp_log_dir = tmp_path_factory.mktemp("log") - metric_exchange_path = temp_log_dir / "metric_exchange" + filesystem_base_path = temp_log_dir / "runner-fs" + filesystem_quarantine_path = temp_log_dir / "runner-fs-quarantine" metric_log_path = temp_log_dir / "metric_log" - monkeypatch.setattr(openstack_runner_manager, "METRICS_EXCHANGE_PATH", metric_exchange_path) + monkeypatch.setattr(storage, "FILESYSTEM_BASE_PATH", filesystem_base_path) + monkeypatch.setattr(storage, "FILESYSTEM_QUARANTINE_PATH", filesystem_quarantine_path) monkeypatch.setattr(events, "METRICS_LOG_PATH", metric_log_path) yield { - "metric_exchange": metric_exchange_path, + "filesystem_base_path": filesystem_base_path, + "filesystem_quarantine_path": filesystem_quarantine_path, "metric_log": metric_log_path, } 
From 01d62c19ccfd25d9dc4d7a73599d430970ef0bdf Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 15:16:17 +0800 Subject: [PATCH 126/278] Add cleanup test --- tests/integration/test_runner_manager_openstack.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 119d6dddc..77ebce76e 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -284,7 +284,8 @@ async def test_runner_normal_lifecycle( 2. The runner should be deleted. The metrics should be recorded. """ metric_log_path = log_dir_base_path["metric_log"] - metric_log_path.write_text("") + filesystem_base_path = log_dir_base_path["filesystem_base_path"] + filesystem_quarantine_path = log_dir_base_path["filesystem_quarantine_path"] workflow = await dispatch_workflow( app=None, @@ -296,5 +297,7 @@ async def test_runner_normal_lifecycle( wait=False, ) await wait_for(lambda: workflow_is_status(workflow, "completed")) + + issue_metrics_events = runner_manager_with_one_runner.cleanup() pytest.set_trace() From 7333a762fbc5e045d4fbbf1aa8920e937f84e65f Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 15:38:17 +0800 Subject: [PATCH 127/278] Debug --- src-docs/openstack_cloud.openstack_cloud.md | 24 +++++++---- ...penstack_cloud.openstack_runner_manager.md | 2 +- src/manager/cloud_runner_manager.py | 41 ++++++++----------- src/manager/github_runner_manager.py | 18 +++++--- src/manager/runner_manager.py | 29 +++++++++++-- src/openstack_cloud/openstack_cloud.py | 3 ++ .../openstack_runner_manager.py | 4 +- .../test_runner_manager_openstack.py | 2 +- 8 files changed, 78 insertions(+), 45 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 
70e9d7f0e..7cb5fe9a1 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -43,18 +43,24 @@ Construct the object. +**Raises:** + + - `ValueError`: Provided server should not be managed under this prefix. + + + --- - + ## class `OpenstackCloud` Client to interact with OpenStack cloud. The OpenStack server name is managed by this cloud. Caller refers to the instances via instance_id. If the caller needs the server name, e.g., for logging, it can be queried with get_server_name. - + ### method `__init__` @@ -77,7 +83,7 @@ Create the object. --- - + ### method `cleanup` @@ -89,7 +95,7 @@ Cleanup unused openstack resources. --- - + ### method `delete_instance` @@ -107,7 +113,7 @@ Delete a openstack instance. --- - + ### method `get_instance` @@ -130,7 +136,7 @@ Get OpenStack instance by instance ID. --- - + ### method `get_instances` @@ -147,7 +153,7 @@ Get all OpenStack instances. --- - + ### method `get_server_name` @@ -170,7 +176,7 @@ Get server name on OpenStack. --- - + ### method `get_ssh_connection` @@ -193,7 +199,7 @@ Get SSH connection to an OpenStack instance. --- - + ### method `launch_instance` diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index f3be9a576..c06da20c1 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -113,7 +113,7 @@ Construct the object. 
--- - + ### method `cleanup` diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 0847ea82b..ef031b2be 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -6,16 +6,25 @@ from abc import ABC from dataclasses import dataclass from enum import Enum -from typing import Iterator, Sequence, Tuple, Type +from typing import Iterator, Sequence, Tuple -from metrics import events as metric_events from metrics.runner import RunnerMetrics InstanceId = str class CloudRunnerState(str, Enum): - """Represent state of the instance hosting the runner.""" + """Represent state of the instance hosting the runner. + + Attributes: + CREATED: The instance is created. + ACTIVE: The instance is active and running. + DELETED: The instance is deleted. + ERROR: The instance has encountered error and not running. + STOPPED: The instance has stopped. + UNKNOWN: The state of the instance is not known. + UNEXPECTED: An unknown state not accounted by the developer is encountered. + """ CREATED = "created" ACTIVE = "active" @@ -26,7 +35,7 @@ class CloudRunnerState(str, Enum): UNEXPECTED = "unexpected" @staticmethod - def from_openstack_server_status(openstack_server_status: str) -> None: + def from_openstack_server_status(openstack_server_status: str) -> "CloudRunnerState": """Create from openstack server status. The openstack server status are documented here: @@ -34,6 +43,9 @@ def from_openstack_server_status(openstack_server_status: str) -> None: Args: openstack_server_status: Openstack server status. + + Returns: + The state of the runner. """ match openstack_server_status: case "BUILD": @@ -73,11 +85,7 @@ class CloudRunnerManager(ABC): """Manage runner instance on cloud.""" def get_name_prefix(self) -> str: - """Get the name prefix of the self-hosted runners. - - Returns: - The name prefix. - """ + """Get the name prefix of the self-hosted runners.""" ... 
def create_runner(self, registration_token: str) -> InstanceId: @@ -85,9 +93,6 @@ def create_runner(self, registration_token: str) -> InstanceId: Args: registration_token: The GitHub registration token for registering runners. - - Returns: - Instance ID of the runner. """ ... @@ -96,9 +101,6 @@ def get_runner(self, id: InstanceId) -> CloudRunnerInstance: Args: id: The instance id. - - Returns: - Information on the runner instance. """ ... @@ -108,9 +110,6 @@ def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerIn Args: states: Filter for the runners with these github states. If None all states will be included. - - Returns: - Information on the runner instances. """ ... @@ -120,9 +119,6 @@ def delete_runner(self, id: InstanceId, remove_token: str) -> RunnerMetrics | No Args: id: The instance id of the runner to delete. remove_token: The GitHub remove token. - - Returns: - Metrics of the runner deleted if any. """ ... @@ -131,8 +127,5 @@ def cleanup_runner(self, remove_token: str) -> Iterator[RunnerMetrics]: Args: remove_token: The GitHub remove token. - - Returns: - Metrics of the runners that was cleanup. """ ... diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 6985405b0..85c4a8ffd 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -12,12 +12,17 @@ class GithubRunnerState(str, Enum): - """State of the runner on GitHub.""" + """State of the self-hosted runner on GitHub. + + Attributes: + BUSY: Runner is working on a job assigned by GitHub. + IDLE: Runner is waiting to take a job. + OFFLINE: Runner is not connected to GitHub. + """ BUSY = "busy" IDLE = "idle" OFFLINE = "offline" - UNKNOWN = "unknown" @staticmethod def from_runner(runner: SelfHostedRunner) -> "GithubRunnerState": @@ -77,11 +82,13 @@ def delete_runners(self, states: Sequence[GithubRunnerState] | None = None) -> N Args: states: Filter the runners for these states. 
If None, all runners are deleted. - - Returns: - Information on the runners. """ runner_list = self.get_runners(states) + + # TODO: debug + import pytest + pytest.set_trace() + for runner in runner_list: self.github.delete_runner(self._path, runner.id) @@ -112,6 +119,7 @@ def _filter_runner_state( """Filter the runner by the state. Args: + runner: Runner to filter. states: Filter the runners for these states. If None, return true. Returns: diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 4eef09817..056ab6bf7 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -165,9 +165,15 @@ def get_runners( for name in cloud_infos_map.keys() ) if cloud_runner_state is not None: - runner_instances = [runner for runner in runner_instances if runner.cloud_state in cloud_runner_state] + runner_instances = [ + runner for runner in runner_instances if runner.cloud_state in cloud_runner_state + ] if github_runner_state is not None: - runner_instances = [runner for runner in runner_instances if runner.github_state is not None and runner.github_state in github_runner_state] + runner_instances = [ + runner + for runner in runner_instances + if runner.github_state is not None and runner.github_state in github_runner_state + ] return runner_instances def delete_runners( @@ -177,6 +183,9 @@ def delete_runners( Args: flush_mode: The type of runners affect by the deletion. + + Returns: + Stats on metrics events issued during the deletion of runners. """ match flush_mode: case FlushMode.FLUSH_IDLE: @@ -205,13 +214,25 @@ def delete_runners( return self._issue_runner_metrics(metrics=iter(runner_metrics_list)) def cleanup(self) -> IssuedMetricEventsStats: - """Run cleanup of the runners and other resources.""" - self._github.delete_runners([GithubRunnerState.OFFLINE, GithubRunnerState.UNKNOWN]) + """Run cleanup of the runners and other resources. + + Returns: + Stats on metrics events issued during the cleanup of runners. 
+ """ + self._github.delete_runners([GithubRunnerState.OFFLINE]) remove_token = self._github.get_removal_token() runner_metrics = self._cloud.cleanup_runner(remove_token) return self._issue_runner_metrics(metrics=runner_metrics) def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetricEventsStats: + """Issue runner metrics. + + Args: + metrics: Runner metrics to issue. + + Returns: + Stats on runner metrics issued. + """ total_stats: IssuedMetricEventsStats = {} for extracted_metrics in metrics: diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index cd1843333..9559a405f 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -62,6 +62,9 @@ def __init__(self, server: OpenstackServer, prefix: str): Args: server: The OpenStack server. prefix: The name prefix for the servers. + + Raises: + ValueError: Provided server should not be managed under this prefix. """ self.server_id = server.id self.server_name = server.name diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index c86082551..860ca2fde 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -202,7 +202,9 @@ def get_runners( return instances_list return [instance for instance in instances_list if instance.state in states] - def delete_runner(self, id: InstanceId, remove_token: str) -> runner_metrics.RunnerMetrics | None: + def delete_runner( + self, id: InstanceId, remove_token: str + ) -> runner_metrics.RunnerMetrics | None: """Delete self-hosted runners. 
Args: diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 77ebce76e..2242e881d 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -297,7 +297,7 @@ async def test_runner_normal_lifecycle( wait=False, ) await wait_for(lambda: workflow_is_status(workflow, "completed")) - + issue_metrics_events = runner_manager_with_one_runner.cleanup() pytest.set_trace() From 163474d0f726824e6861754350de930ac59b41b4 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 15:51:15 +0800 Subject: [PATCH 128/278] Fix github state determining busy runner --- src/manager/github_runner_manager.py | 8 +++++--- .../openstack_runner_manager.py | 19 ++++++++++++++++++- .../test_runner_manager_openstack.py | 7 +++---- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 85c4a8ffd..eae395ad0 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -35,9 +35,10 @@ def from_runner(runner: SelfHostedRunner) -> "GithubRunnerState": The state of runner. """ state = GithubRunnerState.OFFLINE + # A runner that is busy and offline is possible. + if runner.busy: + state = GithubRunnerState.BUSY if runner.status == GitHubRunnerStatus.ONLINE: - if runner.busy: - state = GithubRunnerState.BUSY if not runner.busy: state = GithubRunnerState.IDLE return state @@ -84,9 +85,10 @@ def delete_runners(self, states: Sequence[GithubRunnerState] | None = None) -> N states: Filter the runners for these states. If None, all runners are deleted. 
""" runner_list = self.get_runners(states) - + # TODO: debug import pytest + pytest.set_trace() for runner in runner_list: diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 860ca2fde..748b02bfd 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -293,6 +293,17 @@ def _get_runner_health(self) -> RunnerHealth: return RunnerHealth(healthy=healthy, unhealthy=unhealthy) def _generate_userdata(self, instance_name: str, registration_token: str) -> str: + """Generate cloud init userdata. + + This is the script the openstack server runs on startup. + + Args: + instance_name: The name of the instance. + registration_token: The GitHub runner registration token. + + Returns: + The userdata for openstack instance. + """ jinja = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"), autoescape=True) env_contents = jinja.get_template("env.j2").render( @@ -442,6 +453,12 @@ def _issue_runner_installed_metric( @staticmethod def _pull_runner_metrics(name: str, ssh_conn: SshConnection) -> None: + """Pull metrics from runner. + + Args: + name: The name of the runner. + ssh_conn: The SSH connection to the runner. + """ try: storage = metrics_storage.get(name) except GetMetricsStorageError: @@ -486,7 +503,7 @@ def _ssh_pull_file( Raises: _PullFileError: Unable to pull the file from the runner instance. - _SSHError: Issue with SSH connection. + SSHError: Issue with SSH connection. 
""" try: result = ssh_conn.run(f"stat -c %s {remote_path}", warn=True) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 2242e881d..2e9ce0108 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -16,13 +16,11 @@ from github.Workflow import Workflow from openstack.connection import Connection as OpenstackConnection -import shared_fs from charm_state import GithubPath, ProxyConfig, parse_github_path from manager.cloud_runner_manager import CloudRunnerState from manager.github_runner_manager import GithubRunnerState from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig -from metrics import events, runner_logs, storage -from openstack_cloud import openstack_runner_manager +from metrics import events, storage from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH from openstack_cloud.openstack_runner_manager import ( OpenstackRunnerManager, @@ -97,7 +95,8 @@ async def openstack_runner_manager_fixture( ) -> OpenstackRunnerManager: """Create OpenstackRunnerManager instance. - The prefix args of OpenstackRunnerManager set to app_name to let openstack_connection_fixture perform the cleanup of openstack resources. + The prefix args of OpenstackRunnerManager set to app_name to let openstack_connection_fixture + perform the cleanup of openstack resources. 
""" _CLOUDS_YAML_PATH.unlink(missing_ok=True) clouds_config = yaml.safe_load(private_endpoint_clouds_yaml) From 85a5268009bf6a110161b3b316c7b66f916f42fc Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 16:12:47 +0800 Subject: [PATCH 129/278] Fix wrong naming for method in ABC --- src/manager/cloud_runner_manager.py | 2 +- src/manager/runner_manager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index ef031b2be..ca28d8a8a 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -122,7 +122,7 @@ def delete_runner(self, id: InstanceId, remove_token: str) -> RunnerMetrics | No """ ... - def cleanup_runner(self, remove_token: str) -> Iterator[RunnerMetrics]: + def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: """Cleanup runner and resource on the cloud. Args: diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 056ab6bf7..d2a3f8b24 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -221,7 +221,7 @@ def cleanup(self) -> IssuedMetricEventsStats: """ self._github.delete_runners([GithubRunnerState.OFFLINE]) remove_token = self._github.get_removal_token() - runner_metrics = self._cloud.cleanup_runner(remove_token) + runner_metrics = self._cloud.cleanup(remove_token) return self._issue_runner_metrics(metrics=runner_metrics) def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetricEventsStats: From 512101a574b740987964958f242ac91d2ff23c8e Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 16:13:57 +0800 Subject: [PATCH 130/278] Remove debugging --- src/manager/github_runner_manager.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index eae395ad0..f48330fa1 
100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -85,12 +85,6 @@ def delete_runners(self, states: Sequence[GithubRunnerState] | None = None) -> N states: Filter the runners for these states. If None, all runners are deleted. """ runner_list = self.get_runners(states) - - # TODO: debug - import pytest - - pytest.set_trace() - for runner in runner_list: self.github.delete_runner(self._path, runner.id) From f7fee4869e3fe57f7b00038bacd6fdcc9347d07b Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 16:22:50 +0800 Subject: [PATCH 131/278] Add more docstrings --- .../openstack_runner_manager.py | 42 ++++++++++++++++--- tests/integration/test_openstack_cloud.py | 1 - .../test_runner_manager_openstack.py | 9 ++++ 3 files changed, 45 insertions(+), 7 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 748b02bfd..c87f7e1b4 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -130,7 +130,7 @@ def create_runner(self, registration_token: str) -> InstanceId: Instance ID of the runner. """ start_timestamp = time.time() - id = OpenstackRunnerManager._generate_runner_id() + id = OpenstackRunnerManager._generate_instance_id() instance_name = self._openstack_cloud.get_server_name(instance_id=id) userdata = self._generate_userdata( instance_name=instance_name, registration_token=registration_token @@ -294,13 +294,13 @@ def _get_runner_health(self) -> RunnerHealth: def _generate_userdata(self, instance_name: str, registration_token: str) -> str: """Generate cloud init userdata. - + This is the script the openstack server runs on startup. Args: instance_name: The name of the instance. registration_token: The GitHub runner registration token. - + Returns: The userdata for openstack instance. 
""" @@ -357,6 +357,11 @@ def _generate_userdata(self, instance_name: str, registration_token: str) -> str ) def _get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | None: + """Get repo policy compliance client. + + Returns: + The repo policy compliance client. + """ if self.config.repo_policy_url and self.config.repo_policy_token: return RepoPolicyComplianceClient( self.config.repo_policy_url, self.config.repo_policy_token @@ -379,7 +384,13 @@ def _health_check(self, instance: OpenstackInstance) -> bool: @retry(tries=3, delay=60, local_logger=logger) @staticmethod - def _run_health_check(ssh_conn: SshConnection, name: str): + def _run_health_check(ssh_conn: SshConnection, name: str) -> None: + """Run a health check for runner process. + + Args: + ssh_conn: The SSH connection to the runner. + name: The name of the runner. + """ result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) if not result.ok: logger.warning("SSH run of `ps aux` failed on %s", name) @@ -393,6 +404,11 @@ def _run_health_check(ssh_conn: SshConnection, name: str): @retry(tries=10, delay=60, local_logger=logger) def _wait_runner_startup(self, instance: OpenstackInstance) -> None: + """Wait until runner is startup. + + Args: + instance: The runner instance. + """ try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) except SshError as err: @@ -410,7 +426,12 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: logger.info("Runner startup process found to be healthy on %s", instance.server_name) @staticmethod - def _generate_runner_id() -> InstanceId: + def _generate_instance_id() -> InstanceId: + """Generate a instance id. + + Return: + The id. + """ return secrets.token_hex(12) @staticmethod @@ -420,6 +441,14 @@ def _issue_runner_installed_metric( install_start_timestamp: float, install_end_timestamp: float, ) -> None: + """Issue metric for runner installed event. + + Args: + name: The name of the runner. 
+ flavor: The flavor of the runner. + install_start_timestamp: The timestamp of installation start. + install_end_timestamp: The timestamp of installation end. + """ try: metric_events.issue_event( event=metric_events.RunnerInstalled( @@ -503,7 +532,7 @@ def _ssh_pull_file( Raises: _PullFileError: Unable to pull the file from the runner instance. - SSHError: Issue with SSH connection. + SshError: Issue with SSH connection. """ try: result = ssh_conn.run(f"stat -c %s {remote_path}", warn=True) @@ -554,6 +583,7 @@ def _run_github_runner_removal_script( """Run Github runner removal script. Args: + instance_name: The name of the runner instance. ssh_conn: The SSH connection to the runner instance. remove_token: The GitHub instance removal token. diff --git a/tests/integration/test_openstack_cloud.py b/tests/integration/test_openstack_cloud.py index 926e545bb..f0dd8f148 100644 --- a/tests/integration/test_openstack_cloud.py +++ b/tests/integration/test_openstack_cloud.py @@ -4,7 +4,6 @@ """Test for OpenstackCloud class integration with OpenStack.""" from secrets import token_hex -from typing import AsyncIterator import pytest import pytest_asyncio diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 2e9ce0108..7bbfa2424 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -149,6 +149,15 @@ async def runner_manager_with_one_runner_fixture(runner_manager: RunnerManager) def workflow_is_status(workflow: Workflow, status: str) -> bool: + """Check if workflow in provided status. + + Args: + workflow: The workflow to check. + status: The status to check for. + + Returns: + Whether the workflow is in the status. 
+ """ workflow.update() return workflow.status == status From 535c520aafb27b873d0abbeb0f487f1a382db0d2 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 16:27:06 +0800 Subject: [PATCH 132/278] Fix runner deletion --- .../openstack_runner_manager.py | 40 ++++++++++--------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index c87f7e1b4..004cf2dbc 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -247,22 +247,21 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: """ try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except SshError: - logger.exception("Failed SSH connection while removing %s", instance.server_name) - raise RunnerRemoveError(f"Failed SSH connection for {instance.server_name}") - - self._pull_runner_metrics(instance.server_name, ssh_conn) + self._pull_runner_metrics(instance.server_name, ssh_conn) - try: - OpenstackRunnerManager._run_github_runner_removal_script( - instance.server_name, ssh_conn, remove_token - ) - except GithubRunnerRemoveError: - logger.warning( - "Unable to run github runner removal script for %s", - instance.server_name, - stack_info=True, - ) + try: + OpenstackRunnerManager._run_github_runner_removal_script( + instance.server_name, ssh_conn, remove_token + ) + except GithubRunnerRemoveError: + logger.warning( + "Unable to run github runner removal script for %s", + instance.server_name, + stack_info=True, + ) + except SshError: + logger.exception("Failed to get SSH connection while removing %s", instance.server_name) + logger.warning("Skipping runner remove script for %s due to SSH issues", instance.server_name) try: self._openstack_cloud.delete_instance(instance.instance_id) @@ -386,7 +385,7 @@ def _health_check(self, instance: OpenstackInstance) -> 
bool: @staticmethod def _run_health_check(ssh_conn: SshConnection, name: str) -> None: """Run a health check for runner process. - + Args: ssh_conn: The SSH connection to the runner. name: The name of the runner. @@ -408,11 +407,14 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: Args: instance: The runner instance. + + Raises: + RunnerStartError: The runner process was not found on the runner. """ try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) except SshError as err: - raise RunnerCreateError( + raise RunnerStartError( f"Failed to SSH connect to {instance.server_name} openstack runner" ) from err @@ -428,8 +430,8 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: @staticmethod def _generate_instance_id() -> InstanceId: """Generate a instance id. - - Return: + + Return: The id. """ return secrets.token_hex(12) From eb08ff7ccdd0a2fed14d7cce1950ed6867a981b6 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 16:44:36 +0800 Subject: [PATCH 133/278] Add more docs --- src-docs/openstack_cloud.openstack_cloud.md | 18 +++++--- ...penstack_cloud.openstack_runner_manager.md | 43 ++++++++++++++----- src/openstack_cloud/openstack_cloud.py | 5 ++- .../openstack_runner_manager.py | 35 ++++++++++++--- 4 files changed, 78 insertions(+), 23 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 7cb5fe9a1..c9271e0fa 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -83,7 +83,7 @@ Create the object. --- - + ### method `cleanup` @@ -95,7 +95,7 @@ Cleanup unused openstack resources. --- - + ### method `delete_instance` @@ -113,7 +113,7 @@ Delete a openstack instance. --- - + ### method `get_instance` @@ -136,7 +136,7 @@ Get OpenStack instance by instance ID. --- - + ### method `get_instances` @@ -153,7 +153,7 @@ Get all OpenStack instances. 
--- - + ### method `get_server_name` @@ -176,7 +176,7 @@ Get server name on OpenStack. --- - + ### method `get_ssh_connection` @@ -227,6 +227,12 @@ Create an OpenStack instance. +**Raises:** + + - `OpenstackError`: Unable to create OpenStack server for runner. + + + **Returns:** The OpenStack instance created. diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index c06da20c1..b07c74043 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -17,11 +17,28 @@ Manager for self-hosted runner on OpenStack. --- - + ## class `OpenstackRunnerManagerConfig` Configuration for OpenstackRunnerManager. + + +**Attributes:** + + - `clouds_config`: The clouds.yaml. + - `cloud`: The cloud name to connect to. + - `image`: The image name for runners to use. + - `flavor`: The flavor name for runners to use. + - `network`: The network name for runners to use. + - `github_path`: The GitHub organization or repository for runners to connect to. + - `labels`: The labels to add to runners. + - `proxy_config`: The proxy configuration. + - `dockerhub_mirror`: The dockerhub mirror to use for runners. + - `ssh_debug_connections`: The information on the ssh debug services. + - `repo_policy_url`: The URL of the repo policy service. + - `repo_policy_token`: The token to access the repo policy service. + ### method `__init__` @@ -53,7 +70,7 @@ __init__( --- - + ## class `RunnerHealth` Runners with health state. @@ -86,12 +103,12 @@ __init__( --- - + ## class `OpenstackRunnerManager` Manage self-hosted runner on OpenStack cloud. - + ### method `__init__` @@ -113,7 +130,7 @@ Construct the object. --- - + ### method `cleanup` @@ -136,7 +153,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -154,12 +171,18 @@ Create a self-hosted runner. +**Raises:** + + - `RunnerCreateError`: Unable to create runner due to OpenStack issues. 
+ + + **Returns:** Instance ID of the runner. --- - + ### method `delete_runner` @@ -178,7 +201,7 @@ Delete self-hosted runners. --- - + ### method `get_name_prefix` @@ -195,7 +218,7 @@ Get the name prefix of the self-hosted runners. --- - + ### method `get_runner` @@ -218,7 +241,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 9559a405f..6c5a87bfb 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -149,6 +149,9 @@ def launch_instance( flavor: The flavor used to create the instance. network: The network used to create the instance. userdata: The cloud init userdata to startup the instance. + + Raises: + OpenstackError: Unable to create OpenStack server for runner. Returns: The OpenStack instance created. @@ -175,7 +178,7 @@ def launch_instance( timeout=_CREATE_SERVER_TIMEOUT, wait=True, ) - except openstack.exceptions.ResourceTimeout as err: + except openstack.exceptions.ResourceTimeout: logger.exception("Timeout creating openstack server %s", full_name) logger.info( "Attempting clean up of openstack server %s that timeout during creation", diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 004cf2dbc..929bd696b 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -16,7 +16,6 @@ import paramiko.ssh_exception from fabric import Connection as SshConnection -import shared_fs from charm_state import GithubOrg, GithubPath, ProxyConfig, SSHDebugConnection from errors import ( CreateMetricsStorageError, @@ -25,7 +24,6 @@ OpenStackError, RunnerCreateError, RunnerError, - RunnerRemoveError, RunnerStartError, SshError, ) @@ -65,7 +63,22 @@ class _PullFileError(Exception): @dataclass class OpenstackRunnerManagerConfig: - """Configuration for OpenstackRunnerManager.""" + 
"""Configuration for OpenstackRunnerManager. + + Attributes: + clouds_config: The clouds.yaml. + cloud: The cloud name to connect to. + image: The image name for runners to use. + flavor: The flavor name for runners to use. + network: The network name for runners to use. + github_path: The GitHub organization or repository for runners to connect to. + labels: The labels to add to runners. + proxy_config: The proxy configuration. + dockerhub_mirror: The dockerhub mirror to use for runners. + ssh_debug_connections: The information on the ssh debug services. + repo_policy_url: The URL of the repo policy service. + repo_policy_token: The token to access the repo policy service. + """ clouds_config: dict[str, dict] cloud: str @@ -125,6 +138,9 @@ def create_runner(self, registration_token: str) -> InstanceId: Args: registration_token: The GitHub registration token for registering runners. + + Raises: + RunnerCreateError: Unable to create runner due to OpenStack issues. Returns: Instance ID of the runner. @@ -260,8 +276,12 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: stack_info=True, ) except SshError: - logger.exception("Failed to get SSH connection while removing %s", instance.server_name) - logger.warning("Skipping runner remove script for %s due to SSH issues", instance.server_name) + logger.exception( + "Failed to get SSH connection while removing %s", instance.server_name + ) + logger.warning( + "Skipping runner remove script for %s due to SSH issues", instance.server_name + ) try: self._openstack_cloud.delete_instance(instance.instance_id) @@ -389,6 +409,9 @@ def _run_health_check(ssh_conn: SshConnection, name: str) -> None: Args: ssh_conn: The SSH connection to the runner. name: The name of the runner. + + Raises: + RunnerError: Unable to SSH and find the runner process on the runner. 
""" result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) if not result.ok: @@ -407,7 +430,7 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: Args: instance: The runner instance. - + Raises: RunnerStartError: The runner process was not found on the runner. """ From fe7951dc18ef7e026bf189acad174b8beeec7a55 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 16:47:57 +0800 Subject: [PATCH 134/278] Fix typing --- src/manager/cloud_runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index ca28d8a8a..2b6010cf1 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -35,7 +35,7 @@ class CloudRunnerState(str, Enum): UNEXPECTED = "unexpected" @staticmethod - def from_openstack_server_status(openstack_server_status: str) -> "CloudRunnerState": + def from_openstack_server_status(openstack_server_status: str) -> str: """Create from openstack server status. The openstack server status are documented here: From 75bdd915cb1996a9723bf84f7484a4ff4901719e Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 16:49:29 +0800 Subject: [PATCH 135/278] Debug --- src-docs/openstack_cloud.openstack_runner_manager.md | 4 ++-- src/manager/cloud_runner_manager.py | 2 +- src/openstack_cloud/openstack_runner_manager.py | 5 +++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index b07c74043..778ba0611 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -130,7 +130,7 @@ Construct the object. --- - + ### method `cleanup` @@ -182,7 +182,7 @@ Create a self-hosted runner. 
--- - + ### method `delete_runner` diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 2b6010cf1..ca28d8a8a 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -35,7 +35,7 @@ class CloudRunnerState(str, Enum): UNEXPECTED = "unexpected" @staticmethod - def from_openstack_server_status(openstack_server_status: str) -> str: + def from_openstack_server_status(openstack_server_status: str) -> "CloudRunnerState": """Create from openstack server status. The openstack server status are documented here: diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 929bd696b..fa4318ec2 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -206,6 +206,11 @@ def get_runners( Information on the runner instances. """ instances_list = self._openstack_cloud.get_instances() + + # TODO: debug + import pytest + pytest.set_trace() + instances_list = [ CloudRunnerInstance( name=instance.server_name, From b70f1faf363e28eb1e842be5c684e34d46f223e1 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 16:55:40 +0800 Subject: [PATCH 136/278] Update SSH health check --- src-docs/openstack_cloud.openstack_manager.md | 2 +- .../openstack_runner_manager.py | 34 ++++++++----------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_manager.md b/src-docs/openstack_cloud.openstack_manager.md index 0a39a9a37..f87a1b8b4 100644 --- a/src-docs/openstack_cloud.openstack_manager.md +++ b/src-docs/openstack_cloud.openstack_manager.md @@ -146,7 +146,7 @@ Construct OpenstackRunnerManager object. 
--- - + ### method `flush` diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index fa4318ec2..d13c176c1 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -205,23 +205,23 @@ def get_runners( Returns: Information on the runner instances. """ - instances_list = self._openstack_cloud.get_instances() + instance_list = self._openstack_cloud.get_instances() # TODO: debug import pytest pytest.set_trace() - instances_list = [ + instance_list = [ CloudRunnerInstance( name=instance.server_name, id=instance.instance_id, state=CloudRunnerState.from_openstack_server_status(instance.status), ) - for instance in instances_list + for instance in instance_list ] if states is None: - return instances_list - return [instance for instance in instances_list if instance.state in states] + return instance_list + return [instance for instance in instance_list if instance.state in states] def delete_runner( self, id: InstanceId, remove_token: str @@ -392,42 +392,36 @@ def _get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | Non ) return None + @retry(tries=3, delay=5, backoff=2, local_logger=logger) def _health_check(self, instance: OpenstackInstance) -> bool: try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) except SshError: - logger.exception("SSH connection failure with %s", instance.server_name) - return False - try: - OpenstackRunnerManager._run_health_check(ssh_conn, instance.server_name) - except RunnerError: - logger.exception("Health check failure for %s", instance.server_name) - return False - logger.info("Health check success for %s", instance.server_name) - return True + logger.exception("SSH connection failure with %s during health check", instance.server_name) + raise + return OpenstackRunnerManager._run_health_check(ssh_conn, instance.server_name) - @retry(tries=3, delay=60, local_logger=logger) @staticmethod - 
def _run_health_check(ssh_conn: SshConnection, name: str) -> None: + def _run_health_check(ssh_conn: SshConnection, name: str) -> bool: """Run a health check for runner process. Args: ssh_conn: The SSH connection to the runner. name: The name of the runner. - Raises: - RunnerError: Unable to SSH and find the runner process on the runner. + Returns: + Whether the health succeed. """ result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) if not result.ok: logger.warning("SSH run of `ps aux` failed on %s", name) - raise RunnerError(f"Unable to SSH run `ps aux` on {name}") + return False if ( RUNNER_WORKER_PROCESS not in result.stdout and RUNNER_LISTENER_PROCESS not in result.stdout ): logger.warning("Runner process not found on %s", name) - raise RunnerError(f"Runner process not found on {name}") + return False @retry(tries=10, delay=60, local_logger=logger) def _wait_runner_startup(self, instance: OpenstackInstance) -> None: From 1b8fa8145f21a24e7df10c5c40ae6fbb5453f09f Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 17:06:49 +0800 Subject: [PATCH 137/278] Tmp disable a passing test --- .../test_runner_manager_openstack.py | 98 +++++++++---------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 7bbfa2424..4b64a8098 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -221,55 +221,55 @@ def workflow_is_status(workflow: Workflow, status: str) -> bool: # assert len(runner_list) == 0 -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_runner_flush_busy_lifecycle( - runner_manager_with_one_runner: RunnerManager, - test_github_branch: Branch, - github_repository: Repository, - runner_label: str, -): - """ - Arrange: RunnerManager with one idle runner. - Act: - 1. 
Run a long workflow. - 2. Run flush idle runner. - 3. Run flush busy runner. - Assert: - 1. Runner takes the job and become busy. - 2. Busy runner still exists. - 3. No runners exists. - """ - # 1. - workflow = await dispatch_workflow( - app=None, - branch=test_github_branch, - github_repository=github_repository, - conclusion="success", - workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, - dispatch_input={"runner": runner_label, "minutes": "10"}, - wait=False, - ) - await wait_for(lambda: workflow_is_status(workflow, "in_progress")) - - runner_list = runner_manager_with_one_runner.get_runners() - assert len(runner_list) == 1 - busy_runner = runner_list[0] - assert busy_runner.cloud_state == CloudRunnerState.ACTIVE - assert busy_runner.github_state == GithubRunnerState.BUSY - - # 2. - runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) - runner_list = runner_manager_with_one_runner.get_runners() - assert len(runner_list) == 1 - busy_runner = runner_list[0] - assert busy_runner.cloud_state == CloudRunnerState.ACTIVE - assert busy_runner.github_state == GithubRunnerState.BUSY - - # 3. - runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) - runner_list = runner_manager_with_one_runner.get_runners() +# @pytest.mark.openstack +# @pytest.mark.asyncio +# @pytest.mark.abort_on_fail +# async def test_runner_flush_busy_lifecycle( +# runner_manager_with_one_runner: RunnerManager, +# test_github_branch: Branch, +# github_repository: Repository, +# runner_label: str, +# ): +# """ +# Arrange: RunnerManager with one idle runner. +# Act: +# 1. Run a long workflow. +# 2. Run flush idle runner. +# 3. Run flush busy runner. +# Assert: +# 1. Runner takes the job and become busy. +# 2. Busy runner still exists. +# 3. No runners exists. +# """ +# # 1. 
+# workflow = await dispatch_workflow( +# app=None, +# branch=test_github_branch, +# github_repository=github_repository, +# conclusion="success", +# workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, +# dispatch_input={"runner": runner_label, "minutes": "10"}, +# wait=False, +# ) +# await wait_for(lambda: workflow_is_status(workflow, "in_progress")) + +# runner_list = runner_manager_with_one_runner.get_runners() +# assert len(runner_list) == 1 +# busy_runner = runner_list[0] +# assert busy_runner.cloud_state == CloudRunnerState.ACTIVE +# assert busy_runner.github_state == GithubRunnerState.BUSY + +# # 2. +# runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) +# runner_list = runner_manager_with_one_runner.get_runners() +# assert len(runner_list) == 1 +# busy_runner = runner_list[0] +# assert busy_runner.cloud_state == CloudRunnerState.ACTIVE +# assert busy_runner.github_state == GithubRunnerState.BUSY + +# # 3. +# runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) +# runner_list = runner_manager_with_one_runner.get_runners() @pytest.mark.openstack From b3dbc572570d4903eac90ab9f46fab609b616673 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 17:30:12 +0800 Subject: [PATCH 138/278] Add deubg --- src/openstack_cloud/openstack_runner_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index d13c176c1..7a6eaecc0 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -305,6 +305,9 @@ def _get_runner_health(self) -> RunnerHealth: healthy, unhealthy = [], [] for runner in runner_list: + # TODO: debug + import pytest + pytest.set_trace() cloud_state = CloudRunnerState(runner.status) if cloud_state in ( CloudRunnerState.DELETED, From 80b3d0ab360ca7bc2670458d32d66090650c3ae2 Mon Sep 17 
00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 17:38:12 +0800 Subject: [PATCH 139/278] Remove a debug --- src-docs/openstack_cloud.openstack_runner_manager.md | 4 ++-- src/openstack_cloud/openstack_runner_manager.py | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 778ba0611..b07c74043 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -130,7 +130,7 @@ Construct the object. --- - + ### method `cleanup` @@ -182,7 +182,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 7a6eaecc0..2f12f739d 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -206,11 +206,6 @@ def get_runners( Information on the runner instances. 
""" instance_list = self._openstack_cloud.get_instances() - - # TODO: debug - import pytest - pytest.set_trace() - instance_list = [ CloudRunnerInstance( name=instance.server_name, From 5a4655f56fd174db0d85bf5da95ed57cca1a181a Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 9 Aug 2024 17:39:04 +0800 Subject: [PATCH 140/278] Fix Cloud runner state init --- src/openstack_cloud/openstack_runner_manager.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 2f12f739d..01b1fa0fa 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -300,10 +300,7 @@ def _get_runner_health(self) -> RunnerHealth: healthy, unhealthy = [], [] for runner in runner_list: - # TODO: debug - import pytest - pytest.set_trace() - cloud_state = CloudRunnerState(runner.status) + cloud_state = CloudRunnerState.from_openstack_server_status(runner.status) if cloud_state in ( CloudRunnerState.DELETED, CloudRunnerState.ERROR, From cb8c4262eb90f0f32b8a608c2370c6f4af41a313 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:00:32 +0800 Subject: [PATCH 141/278] Change clean up to cleanup --- src/openstack_cloud/openstack_cloud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 6c5a87bfb..1ae8a2897 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -324,7 +324,7 @@ def cleanup(self) -> None: server_list = self._get_openstack_instances(conn) exclude_list = [server.name for server in server_list] self._cleanup_key_files(conn, exclude_list) - self._clean_up_openstack_keypairs(conn, exclude_list) + self._cleanup_openstack_keypairs(conn, exclude_list) def 
get_server_name(self, instance_id: str) -> str: """Get server name on OpenStack. @@ -377,7 +377,7 @@ def _cleanup_key_files( deleted += 1 logger.info("Found %s key files, clean up %s key files", total, deleted) - def _clean_up_openstack_keypairs( + def _cleanup_openstack_keypairs( self, conn: OpenstackConnection, exclude_instances: Iterable[str] ) -> None: """Delete all OpenStack keypairs except the specified instances. From 6c5788ea1aad0d9b63046bfc7d0de6df009a7e91 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:01:31 +0800 Subject: [PATCH 142/278] Fix attr naming issue in openstack cloud --- src/openstack_cloud/openstack_cloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 1ae8a2897..0aeb47873 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -357,7 +357,7 @@ def _cleanup_key_files( # Find key file from this application. if ( path.is_file() - and path.name.startswith(self.instance_name) + and path.name.startswith(self.prefix) and path.name.endswith(".key") ): total += 1 From d58cc6d9c2b5a0b7cfd7e7eae0ab15e2f4a21890 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:05:21 +0800 Subject: [PATCH 143/278] Fix reference to non-existing instance_name in openstack cloud --- src/openstack_cloud/openstack_cloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 0aeb47873..e1618e447 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -390,7 +390,7 @@ def _cleanup_openstack_keypairs( keypairs = conn.list_keypairs() for key in keypairs: # The `name` attribute is of resource.Body type. 
- if key.name and str(key.name).startswith(self.instance_name): + if key.name and str(key.name).startswith(self.prefix): if str(key.name) in exclude_instances: continue From 0cbd90a7494e2b5ad20492a9b89a7ebbb5f44c37 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 12:45:38 +0800 Subject: [PATCH 144/278] Add metric log processing to test --- tests/integration/test_runner_manager_openstack.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 4b64a8098..ad4e54c67 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -4,6 +4,7 @@ """Testing the RunnerManager class with OpenStackRunnerManager as CloudManager.""" +import json from pathlib import Path from secrets import token_hex from typing import Iterator @@ -292,8 +293,7 @@ async def test_runner_normal_lifecycle( 2. The runner should be deleted. The metrics should be recorded. 
""" metric_log_path = log_dir_base_path["metric_log"] - filesystem_base_path = log_dir_base_path["filesystem_base_path"] - filesystem_quarantine_path = log_dir_base_path["filesystem_quarantine_path"] + metric_log_existing_content = metric_log_path.read_text(encoding='utf-8') workflow = await dispatch_workflow( app=None, @@ -307,5 +307,12 @@ async def test_runner_normal_lifecycle( await wait_for(lambda: workflow_is_status(workflow, "completed")) issue_metrics_events = runner_manager_with_one_runner.cleanup() + assert issue_metrics_events[events.RunnerStart] == 1 + assert issue_metrics_events[events.RunnerStop] == 1 + + metric_log_full_content = metric_log_path.read_text(encoding='utf-8') + assert metric_log_full_content.startswith(metric_log_existing_content), "The metric log was modified in ways other than appending" + metric_log_new_content = metric_log_full_content[len(metric_log_existing_content):] + metric_logs = [json.loads(metric) for metric in metric_log_new_content.splitlines()] pytest.set_trace() From 11c45b19afca68fd87cbad4db033788e853b6d93 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 12:53:11 +0800 Subject: [PATCH 145/278] Enable all tests --- .../test_runner_manager_openstack.py | 223 +++++++++--------- 1 file changed, 113 insertions(+), 110 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index ad4e54c67..f378ee634 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -163,114 +163,114 @@ def workflow_is_status(workflow: Workflow, status: str) -> bool: return workflow.status == status -# @pytest.mark.openstack -# @pytest.mark.asyncio -# @pytest.mark.abort_on_fail -# async def test_get_no_runner(runner_manager: RunnerManager) -> None: -# """ -# Arrange: RunnerManager instance with no runners. -# Act: Get runners. -# Assert: Empty tuple returned. 
-# """ -# runner_list = runner_manager.get_runners() -# assert isinstance(runner_list, tuple) -# assert not runner_list - - -# @pytest.mark.openstack -# @pytest.mark.asyncio -# @pytest.mark.abort_on_fail -# async def test_runner_normal_idle_lifecycle( -# runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager -# ) -> None: -# """ -# Arrange: RunnerManager instance with no runners. -# Act: -# 1. Create one runner. -# 2. Run health check on the runner. -# 3. Delete all idle runner. -# Assert: -# 1. An active idle runner. -# 2. Health check passes. -# 3. No runners. -# """ -# # 1. -# runner_id_list = runner_manager.create_runners(1) -# assert isinstance(runner_id_list, tuple) -# assert len(runner_id_list) == 1 -# runner_id = runner_id_list[0] - -# runner_list = runner_manager.get_runners() -# assert isinstance(runner_list, tuple) -# assert len(runner_list) == 1 -# runner = runner_list[0] -# assert runner.id == runner_id -# assert runner.cloud_state == CloudRunnerState.ACTIVE -# assert runner.github_state == GithubRunnerState.IDLE - -# # 2. -# openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() -# assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." -# runner = openstack_instances[0] - -# assert openstack_runner_manager._health_check(runner) - -# # 3. -# runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) -# runner_list = runner_manager.get_runners() -# assert isinstance(runner_list, tuple) -# assert len(runner_list) == 0 - - -# @pytest.mark.openstack -# @pytest.mark.asyncio -# @pytest.mark.abort_on_fail -# async def test_runner_flush_busy_lifecycle( -# runner_manager_with_one_runner: RunnerManager, -# test_github_branch: Branch, -# github_repository: Repository, -# runner_label: str, -# ): -# """ -# Arrange: RunnerManager with one idle runner. -# Act: -# 1. Run a long workflow. -# 2. Run flush idle runner. -# 3. Run flush busy runner. -# Assert: -# 1. 
Runner takes the job and become busy. -# 2. Busy runner still exists. -# 3. No runners exists. -# """ -# # 1. -# workflow = await dispatch_workflow( -# app=None, -# branch=test_github_branch, -# github_repository=github_repository, -# conclusion="success", -# workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, -# dispatch_input={"runner": runner_label, "minutes": "10"}, -# wait=False, -# ) -# await wait_for(lambda: workflow_is_status(workflow, "in_progress")) - -# runner_list = runner_manager_with_one_runner.get_runners() -# assert len(runner_list) == 1 -# busy_runner = runner_list[0] -# assert busy_runner.cloud_state == CloudRunnerState.ACTIVE -# assert busy_runner.github_state == GithubRunnerState.BUSY - -# # 2. -# runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) -# runner_list = runner_manager_with_one_runner.get_runners() -# assert len(runner_list) == 1 -# busy_runner = runner_list[0] -# assert busy_runner.cloud_state == CloudRunnerState.ACTIVE -# assert busy_runner.github_state == GithubRunnerState.BUSY - -# # 3. -# runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) -# runner_list = runner_manager_with_one_runner.get_runners() +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_get_no_runner(runner_manager: RunnerManager) -> None: + """ + Arrange: RunnerManager instance with no runners. + Act: Get runners. + Assert: Empty tuple returned. + """ + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert not runner_list + + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_runner_normal_idle_lifecycle( + runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager +) -> None: + """ + Arrange: RunnerManager instance with no runners. + Act: + 1. Create one runner. + 2. Run health check on the runner. + 3. Delete all idle runner. + Assert: + 1. An active idle runner. + 2. 
Health check passes. + 3. No runners. + """ + # 1. + runner_id_list = runner_manager.create_runners(1) + assert isinstance(runner_id_list, tuple) + assert len(runner_id_list) == 1 + runner_id = runner_id_list[0] + + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert len(runner_list) == 1 + runner = runner_list[0] + assert runner.id == runner_id + assert runner.cloud_state == CloudRunnerState.ACTIVE + assert runner.github_state == GithubRunnerState.IDLE + + # 2. + openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() + assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." + runner = openstack_instances[0] + + assert openstack_runner_manager._health_check(runner) + + # 3. + runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert len(runner_list) == 0 + + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_runner_flush_busy_lifecycle( + runner_manager_with_one_runner: RunnerManager, + test_github_branch: Branch, + github_repository: Repository, + runner_label: str, +): + """ + Arrange: RunnerManager with one idle runner. + Act: + 1. Run a long workflow. + 2. Run flush idle runner. + 3. Run flush busy runner. + Assert: + 1. Runner takes the job and become busy. + 2. Busy runner still exists. + 3. No runners exists. + """ + # 1. 
+ workflow = await dispatch_workflow( + app=None, + branch=test_github_branch, + github_repository=github_repository, + conclusion="success", + workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, + dispatch_input={"runner": runner_label, "minutes": "10"}, + wait=False, + ) + await wait_for(lambda: workflow_is_status(workflow, "in_progress")) + + runner_list = runner_manager_with_one_runner.get_runners() + assert len(runner_list) == 1 + busy_runner = runner_list[0] + assert busy_runner.cloud_state == CloudRunnerState.ACTIVE + assert busy_runner.github_state == GithubRunnerState.BUSY + + # 2. + runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) + runner_list = runner_manager_with_one_runner.get_runners() + assert len(runner_list) == 1 + busy_runner = runner_list[0] + assert busy_runner.cloud_state == CloudRunnerState.ACTIVE + assert busy_runner.github_state == GithubRunnerState.BUSY + + # 3. + runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) + runner_list = runner_manager_with_one_runner.get_runners() @pytest.mark.openstack @@ -314,5 +314,8 @@ async def test_runner_normal_lifecycle( assert metric_log_full_content.startswith(metric_log_existing_content), "The metric log was modified in ways other than appending" metric_log_new_content = metric_log_full_content[len(metric_log_existing_content):] metric_logs = [json.loads(metric) for metric in metric_log_new_content.splitlines()] - - pytest.set_trace() + assert len(metric_logs) == 2, "Assuming two events should be runner_start and runner_stop, modify this if new events are added" + assert metric_logs[0]['event'] == "runner_start" + assert metric_logs[0]['workflow'] == "Workflow Dispatch Wait Tests" + assert metric_logs[1]['event'] == "runner_stop" + assert metric_logs[1]['workflow'] == "Workflow Dispatch Wait Tests" From f5551b676aad24bbb04f24f67e1ad0c574b2039c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 
Aug 2024 12:59:31 +0800 Subject: [PATCH 146/278] Fix health check return value --- src/openstack_cloud/openstack_runner_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 01b1fa0fa..ceb0edfe5 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -417,6 +417,7 @@ def _run_health_check(ssh_conn: SshConnection, name: str) -> bool: ): logger.warning("Runner process not found on %s", name) return False + return True @retry(tries=10, delay=60, local_logger=logger) def _wait_runner_startup(self, instance: OpenstackInstance) -> None: From 97e56857972cb367904e506634cecf26e27f0f56 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 13:21:02 +0800 Subject: [PATCH 147/278] Fix all flake8 lints --- src-docs/errors.md | 2 +- src/errors.py | 6 ++-- src/openstack_cloud/openstack_cloud.py | 32 ++++++++++++------- src/openstack_cloud/openstack_manager.py | 6 ++-- .../openstack_runner_manager.py | 31 +++++++++++++----- .../test_runner_manager_openstack.py | 27 ++++++++++------ tests/unit/test_openstack_manager.py | 4 +-- 7 files changed, 70 insertions(+), 38 deletions(-) diff --git a/src-docs/errors.md b/src-docs/errors.md index d091b72f9..1a6316046 100644 --- a/src-docs/errors.md +++ b/src-docs/errors.md @@ -374,7 +374,7 @@ Base class for all runner logs errors. -## class `OpenStackError` +## class `OpenstackError` Base class for OpenStack errors. 
diff --git a/src/errors.py b/src/errors.py index 0dab2a54a..204877cd5 100644 --- a/src/errors.py +++ b/src/errors.py @@ -156,15 +156,15 @@ class RunnerLogsError(Exception): """Base class for all runner logs errors.""" -class OpenStackError(Exception): +class OpenstackError(Exception): """Base class for OpenStack errors.""" -class OpenStackInvalidConfigError(OpenStackError): +class OpenStackInvalidConfigError(OpenstackError): """Represents an invalid OpenStack configuration.""" -class OpenStackUnauthorizedError(OpenStackError): +class OpenStackUnauthorizedError(OpenstackError): """Represents an unauthorized connection to OpenStack.""" diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index e1618e447..39cb12ea4 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -80,7 +80,8 @@ def __init__(self, server: OpenstackServer, prefix: str): raise ValueError( f"Found openstack server {server.name} managed under prefix {prefix}, contact devs" ) - self.instance_id = self.server_name[len(prefix) + 1 :] + # Disable E203 (space before :) as it conflicts with the formatter (black). + self.instance_id = self.server_name[len(prefix) + 1 :] # noqa: E203 @contextmanager @@ -92,7 +93,7 @@ def _get_openstack_connection( The file of _CLOUDS_YAML_PATH should only be modified by this function. Args: - cloud_config: The configuration in clouds.yaml format to apply. + clouds_config: The configuration in clouds.yaml format to apply. cloud: The name of cloud to use in the clouds.yaml. Raises: @@ -149,9 +150,9 @@ def launch_instance( flavor: The flavor used to create the instance. network: The network used to create the instance. userdata: The cloud init userdata to startup the instance. - + Raises: - OpenstackError: Unable to create OpenStack server for runner. + OpenStackError: Unable to create OpenStack server. Returns: The OpenStack instance created. 
@@ -178,7 +179,7 @@ def launch_instance( timeout=_CREATE_SERVER_TIMEOUT, wait=True, ) - except openstack.exceptions.ResourceTimeout: + except openstack.exceptions.ResourceTimeout as err: logger.exception("Timeout creating openstack server %s", full_name) logger.info( "Attempting clean up of openstack server %s that timeout during creation", @@ -189,7 +190,7 @@ def launch_instance( except ( openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout, - ) as err: + ): logger.exception( "Failed to cleanup openstack server %s that timeout during creation", full_name, @@ -225,6 +226,9 @@ def get_instance(self, instance_id: str) -> OpenstackInstance: def delete_instance(self, instance_id: str) -> None: """Delete a openstack instance. + Raises: + OpenStackError: Unable to delete OpenStack server. + Args: instance_id: The instance ID of the instance to delete. """ @@ -250,6 +254,9 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: Args: instance: The OpenStack instance to connect to. + Raises: + SshError: Unable to get a working SSH connection to the instance. + Returns: SSH connection object. """ @@ -355,11 +362,7 @@ def _cleanup_key_files( deleted = 0 for path in _SSH_KEY_PATH.iterdir(): # Find key file from this application. - if ( - path.is_file() - and path.name.startswith(self.prefix) - and path.name.endswith(".key") - ): + if path.is_file() and path.name.startswith(self.prefix) and path.name.endswith(".key"): total += 1 if path.name in exclude_filename: continue @@ -425,6 +428,13 @@ def _get_and_ensure_unique_server( If multiple servers with the same name is found, the latest server in creation time is returned. Other servers is deleted. + + Args: + conn: The connection to OpenStack. + name: The name of the OpenStack name. + + Returns: + A server with the name. 
""" servers: list[OpenstackServer] = conn.search_servers(name) diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index 35799d8bb..c5d2bce44 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -48,7 +48,7 @@ GithubClientError, GithubMetricsError, IssueMetricEventError, - OpenStackError, + OpenstackError, RunnerCreateError, RunnerStartError, ) @@ -161,7 +161,7 @@ def _create_connection(cloud_config: dict[str, dict]) -> Iterator[openstack.conn cloud_config: The configuration in clouds.yaml format to apply. Raises: - OpenStackError: if the credentials provided is not authorized. + OpenstackError: if the credentials provided is not authorized. Yields: An openstack.connection.Connection object. @@ -180,7 +180,7 @@ def _create_connection(cloud_config: dict[str, dict]) -> Iterator[openstack.conn # pylint thinks this isn't an exception, but does inherit from Exception class. except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause logger.exception("OpenStack API call failure") - raise OpenStackError("Failed OpenStack API call") from exc + raise OpenstackError("Failed OpenStack API call") from exc # Disable too many arguments, as they are needed to create the dataclass. diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index ceb0edfe5..4c069ed64 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -21,9 +21,8 @@ CreateMetricsStorageError, GetMetricsStorageError, IssueMetricEventError, - OpenStackError, + OpenstackError, RunnerCreateError, - RunnerError, RunnerStartError, SshError, ) @@ -64,7 +63,7 @@ class _PullFileError(Exception): @dataclass class OpenstackRunnerManagerConfig: """Configuration for OpenstackRunnerManager. - + Attributes: clouds_config: The clouds.yaml. cloud: The cloud name to connect to. 
@@ -138,7 +137,7 @@ def create_runner(self, registration_token: str) -> InstanceId: Args: registration_token: The GitHub registration token for registering runners. - + Raises: RunnerCreateError: Unable to create runner due to OpenStack issues. @@ -159,7 +158,7 @@ def create_runner(self, registration_token: str) -> InstanceId: network=self.config.network, userdata=userdata, ) - except OpenStackError as err: + except OpenstackError as err: raise RunnerCreateError(f"Failed to create {instance_name} openstack runner") from err self._wait_runner_startup(instance) @@ -226,6 +225,9 @@ def delete_runner( Args: id: The instance id of the runner to delete. remove_token: The GitHub remove token. + + Returns: + Any metrics collected during the deletion of the runner. """ instance = self._openstack_cloud.get_instance(id) metric = runner_metrics.extract( @@ -285,7 +287,7 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: try: self._openstack_cloud.delete_instance(instance.instance_id) - except OpenStackError: + except OpenstackError: logger.exception( "Unable to delete openstack instance for runner %s", instance.server_name ) @@ -389,10 +391,23 @@ def _get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | Non @retry(tries=3, delay=5, backoff=2, local_logger=logger) def _health_check(self, instance: OpenstackInstance) -> bool: + """Check whether runner is healthy. + + Args: + instance: The OpenStack instance to conduit the health check. + + Raises: + SshError: Unable to get a SSH connection to the instance. + + Returns: + Whether the runner is healthy. 
+ """ try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) except SshError: - logger.exception("SSH connection failure with %s during health check", instance.server_name) + logger.exception( + "SSH connection failure with %s during health check", instance.server_name + ) raise return OpenstackRunnerManager._run_health_check(ssh_conn, instance.server_name) @@ -403,7 +418,7 @@ def _run_health_check(ssh_conn: SshConnection, name: str) -> bool: Args: ssh_conn: The SSH connection to the runner. name: The name of the runner. - + Returns: Whether the health succeed. """ diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index f378ee634..2cb1814ae 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -293,7 +293,7 @@ async def test_runner_normal_lifecycle( 2. The runner should be deleted. The metrics should be recorded. """ metric_log_path = log_dir_base_path["metric_log"] - metric_log_existing_content = metric_log_path.read_text(encoding='utf-8') + metric_log_existing_content = metric_log_path.read_text(encoding="utf-8") workflow = await dispatch_workflow( app=None, @@ -309,13 +309,20 @@ async def test_runner_normal_lifecycle( issue_metrics_events = runner_manager_with_one_runner.cleanup() assert issue_metrics_events[events.RunnerStart] == 1 assert issue_metrics_events[events.RunnerStop] == 1 - - metric_log_full_content = metric_log_path.read_text(encoding='utf-8') - assert metric_log_full_content.startswith(metric_log_existing_content), "The metric log was modified in ways other than appending" - metric_log_new_content = metric_log_full_content[len(metric_log_existing_content):] + + metric_log_full_content = metric_log_path.read_text(encoding="utf-8") + assert metric_log_full_content.startswith( + metric_log_existing_content + ), "The metric log was modified in ways other than appending" + # Disable E203 (space before :) as 
it conflicts with the formatter (black). + metric_log_new_content = metric_log_full_content[ + len(metric_log_existing_content) : # noqa: E203 + ] metric_logs = [json.loads(metric) for metric in metric_log_new_content.splitlines()] - assert len(metric_logs) == 2, "Assuming two events should be runner_start and runner_stop, modify this if new events are added" - assert metric_logs[0]['event'] == "runner_start" - assert metric_logs[0]['workflow'] == "Workflow Dispatch Wait Tests" - assert metric_logs[1]['event'] == "runner_stop" - assert metric_logs[1]['workflow'] == "Workflow Dispatch Wait Tests" + assert ( + len(metric_logs) == 2 + ), "Assuming two events should be runner_start and runner_stop, modify this if new events are added" + assert metric_logs[0]["event"] == "runner_start" + assert metric_logs[0]["workflow"] == "Workflow Dispatch Wait Tests" + assert metric_logs[1]["event"] == "runner_stop" + assert metric_logs[1]["workflow"] == "Workflow Dispatch Wait Tests" diff --git a/tests/unit/test_openstack_manager.py b/tests/unit/test_openstack_manager.py index 445a0b8d3..5349b1570 100644 --- a/tests/unit/test_openstack_manager.py +++ b/tests/unit/test_openstack_manager.py @@ -19,7 +19,7 @@ import metrics.storage import reactive.runner_manager from charm_state import CharmState, ProxyConfig, ReactiveConfig, RepoPolicyComplianceConfig -from errors import OpenStackError, RunnerStartError +from errors import OpenstackError, RunnerStartError from github_type import GitHubRunnerStatus, RunnerApplication, SelfHostedRunner from metrics import events as metric_events from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME @@ -262,7 +262,7 @@ def test__create_connection_error(clouds_yaml: dict, openstack_connect_mock: Mag connection_mock.__enter__.return_value = connection_context openstack_connect_mock.return_value = connection_mock - with pytest.raises(OpenStackError) as exc: + with pytest.raises(OpenstackError) as exc: with 
openstack_manager._create_connection(cloud_config=clouds_yaml): pass From 3ce240e98531ad6f432b53a4054d1afec5f87e78 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 13:28:19 +0800 Subject: [PATCH 148/278] Fix test --- tests/integration/test_runner_manager_openstack.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 2cb1814ae..f40845a2a 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -272,6 +272,8 @@ async def test_runner_flush_busy_lifecycle( runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) runner_list = runner_manager_with_one_runner.get_runners() + issue_metrics_events = runner_manager_with_one_runner.cleanup() + assert issue_metrics_events[events.RunnerStart] == 1 @pytest.mark.openstack @pytest.mark.asyncio From 233741889d1def6ecb1369f2a6ba0fb6220e3a07 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:21:21 +0800 Subject: [PATCH 149/278] Fix all lints --- src-docs/openstack_cloud.openstack_cloud.md | 36 ++++++++---- ...penstack_cloud.openstack_runner_manager.md | 15 +++-- src/manager/cloud_runner_manager.py | 30 +++++----- src/manager/runner_manager.py | 20 ++++--- src/openstack_cloud/openstack_cloud.py | 56 +++++++++++-------- .../openstack_runner_manager.py | 38 ++++++++----- .../test_runner_manager_openstack.py | 13 +++-- 7 files changed, 126 insertions(+), 82 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index c9271e0fa..c431d5f28 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -53,14 +53,14 @@ Construct the object. --- - + ## class `OpenstackCloud` Client to interact with OpenStack cloud. 
The OpenStack server name is managed by this cloud. Caller refers to the instances via instance_id. If the caller needs the server name, e.g., for logging, it can be queried with get_server_name. - + ### method `__init__` @@ -83,7 +83,7 @@ Create the object. --- - + ### method `cleanup` @@ -95,7 +95,7 @@ Cleanup unused openstack resources. --- - + ### method `delete_instance` @@ -107,18 +107,24 @@ Delete a openstack instance. +**Raises:** + + - `OpenstackError`: Unable to delete OpenStack server. + + + **Args:** - `instance_id`: The instance ID of the instance to delete. --- - + ### method `get_instance` ```python -get_instance(instance_id: str) → OpenstackInstance +get_instance(instance_id: str) → OpenstackInstance | None ``` Get OpenStack instance by instance ID. @@ -132,11 +138,11 @@ Get OpenStack instance by instance ID. **Returns:** - The OpenStack instance. + The OpenStack instance if found. --- - + ### method `get_instances` @@ -153,7 +159,7 @@ Get all OpenStack instances. --- - + ### method `get_server_name` @@ -176,7 +182,7 @@ Get server name on OpenStack. --- - + ### method `get_ssh_connection` @@ -194,12 +200,18 @@ Get SSH connection to an OpenStack instance. +**Raises:** + + - `SshError`: Unable to get a working SSH connection to the instance. + + + **Returns:** SSH connection object. --- - + ### method `launch_instance` @@ -229,7 +241,7 @@ Create an OpenStack instance. **Raises:** - - `OpenstackError`: Unable to create OpenStack server for runner. + - `OpenstackError`: Unable to create OpenStack server. diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index b07c74043..029b39268 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -130,7 +130,7 @@ Construct the object. --- - + ### method `cleanup` @@ -187,7 +187,7 @@ Create a self-hosted runner. 
### method `delete_runner` ```python -delete_runner(id: str, remove_token: str) → RunnerMetrics | None +delete_runner(instance_id: str, remove_token: str) → RunnerMetrics | None ``` Delete self-hosted runners. @@ -196,9 +196,14 @@ Delete self-hosted runners. **Args:** - - `id`: The instance id of the runner to delete. + - `instance_id`: The instance id of the runner to delete. - `remove_token`: The GitHub remove token. + + +**Returns:** + Any metrics collected during the deletion of the runner. + --- @@ -223,7 +228,7 @@ Get the name prefix of the self-hosted runners. ### method `get_runner` ```python -get_runner(id: str) → CloudRunnerInstance | None +get_runner(instance_id: str) → CloudRunnerInstance | None ``` Get a self-hosted runner by instance id. @@ -232,7 +237,7 @@ Get a self-hosted runner by instance id. **Args:** - - `id`: The instance id. + - `instance_id`: The instance id. diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index ca28d8a8a..654e5663b 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -3,7 +3,7 @@ """Interface of manager of runner instance on clouds.""" -from abc import ABC +import abc from dataclasses import dataclass from enum import Enum from typing import Iterator, Sequence, Tuple @@ -34,8 +34,12 @@ class CloudRunnerState(str, Enum): UNKNOWN = "unknown" UNEXPECTED = "unexpected" + # Disable "Too many return statements" as this method is using case statement for converting + # the states, which does not cause a complexity issue. @staticmethod - def from_openstack_server_status(openstack_server_status: str) -> "CloudRunnerState": + def from_openstack_server_status( # pylint: disable=R0911 + openstack_server_status: str, + ) -> "CloudRunnerState": """Create from openstack server status. 
The openstack server status are documented here: @@ -81,29 +85,30 @@ class CloudRunnerInstance: state: CloudRunnerState -class CloudRunnerManager(ABC): +class CloudRunnerManager(abc.ABC): """Manage runner instance on cloud.""" + @abc.abstractmethod def get_name_prefix(self) -> str: """Get the name prefix of the self-hosted runners.""" - ... + @abc.abstractmethod def create_runner(self, registration_token: str) -> InstanceId: """Create a self-hosted runner. Args: registration_token: The GitHub registration token for registering runners. """ - ... - def get_runner(self, id: InstanceId) -> CloudRunnerInstance: + @abc.abstractmethod + def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance: """Get a self-hosted runner by instance id. Args: - id: The instance id. + instance_id: The instance id. """ - ... + @abc.abstractmethod def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerInstance]: """Get self-hosted runners by state. @@ -111,21 +116,20 @@ def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerIn states: Filter for the runners with these github states. If None all states will be included. """ - ... - def delete_runner(self, id: InstanceId, remove_token: str) -> RunnerMetrics | None: + @abc.abstractmethod + def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMetrics | None: """Delete self-hosted runners. Args: - id: The instance id of the runner to delete. + instance_id: The instance id of the runner to delete. remove_token: The GitHub remove token. """ - ... + @abc.abstractmethod def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: """Cleanup runner and resource on the cloud. Args: remove_token: The GitHub remove token. """ - ... 
diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index d2a3f8b24..ad5354166 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -6,7 +6,7 @@ import logging from dataclasses import dataclass from enum import Enum, auto -from typing import Iterator, Sequence, Type +from typing import Iterator, Sequence, Type, cast from charm_state import GithubPath from errors import GithubMetricsError @@ -158,12 +158,12 @@ def get_runners( github_only, ) - runner_instances = tuple( + runner_instances: list[RunnerInstance] = [ RunnerInstance( cloud_infos_map[name], github_infos_map[name] if name in github_infos_map else None ) for name in cloud_infos_map.keys() - ) + ] if cloud_runner_state is not None: runner_instances = [ runner for runner in runner_instances if runner.cloud_state in cloud_runner_state @@ -174,7 +174,7 @@ def get_runners( for runner in runner_instances if runner.github_state is not None and runner.github_state in github_runner_state ] - return runner_instances + return cast(tuple[RunnerInstance], tuple(runner_instances)) def delete_runners( self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE @@ -208,9 +208,11 @@ def delete_runners( runner_metrics_list = [] for runner in runners_list: - runner_metrics = self._cloud.delete_runner(id=runner.id, remove_token=remove_token) - if runner_metrics is not None: - runner_metrics_list.append(runner_metrics) + deleted_runner_metrics = self._cloud.delete_runner( + instance_id=runner.id, remove_token=remove_token + ) + if deleted_runner_metrics is not None: + runner_metrics_list.append(deleted_runner_metrics) return self._issue_runner_metrics(metrics=iter(runner_metrics_list)) def cleanup(self) -> IssuedMetricEventsStats: @@ -221,8 +223,8 @@ def cleanup(self) -> IssuedMetricEventsStats: """ self._github.delete_runners([GithubRunnerState.OFFLINE]) remove_token = self._github.get_removal_token() - runner_metrics = self._cloud.cleanup(remove_token) - return 
self._issue_runner_metrics(metrics=runner_metrics) + deleted_runner_metrics = self._cloud.cleanup(remove_token) + return self._issue_runner_metrics(metrics=deleted_runner_metrics) def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetricEventsStats: """Issue runner metrics. diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 39cb12ea4..888897bdc 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -3,10 +3,10 @@ """Class for accessing OpenStack API for managing servers.""" -import datetime import logging from contextlib import contextmanager from dataclasses import dataclass +from datetime import datetime from functools import reduce from pathlib import Path from typing import Iterable, Iterator, cast @@ -22,7 +22,7 @@ from openstack.network.v2.security_group import SecurityGroup as OpenstackSecurityGroup from paramiko.ssh_exception import NoValidConnectionsError -from errors import OpenStackError, SshError +from errors import OpenstackError, SshError logger = logging.getLogger(__name__) @@ -97,7 +97,7 @@ def _get_openstack_connection( cloud: The name of cloud to use in the clouds.yaml. Raises: - OpenStackError: if the credentials provided is not authorized. + OpenstackError: if the credentials provided is not authorized. Yields: An openstack.connection.Connection object. @@ -115,7 +115,7 @@ def _get_openstack_connection( # pylint thinks this isn't an exception, but does inherit from Exception class. 
except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause logger.exception("OpenStack API call failure") - raise OpenStackError("Failed OpenStack API call") from exc + raise OpenstackError("Failed OpenStack API call") from exc class OpenstackCloud: @@ -139,7 +139,9 @@ def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): self._cloud = cloud self.prefix = prefix - def launch_instance( + # Ignore "Too many arguments" as 6 args should be fine. Move to a dataclass is new args are + # added. + def launch_instance( # pylint: disable=R0913 self, instance_id: str, image: str, flavor: str, network: str, userdata: str ) -> OpenstackInstance: """Create an OpenStack instance. @@ -152,7 +154,7 @@ def launch_instance( userdata: The cloud init userdata to startup the instance. Raises: - OpenStackError: Unable to create OpenStack server. + OpenstackError: Unable to create OpenStack server. Returns: The OpenStack instance created. @@ -196,22 +198,22 @@ def launch_instance( full_name, ) self._delete_keypair(conn, instance_id) - raise OpenStackError(f"Timeout creating openstack server {full_name}") from err + raise OpenstackError(f"Timeout creating openstack server {full_name}") from err except openstack.exceptions.SDKException as err: logger.exception("Failed to create openstack server %s", full_name) self._delete_keypair(conn, instance_id) - raise OpenStackError(f"Failed to create openstack server {full_name}") from err + raise OpenstackError(f"Failed to create openstack server {full_name}") from err return OpenstackInstance(server, self.prefix) - def get_instance(self, instance_id: str) -> OpenstackInstance: + def get_instance(self, instance_id: str) -> OpenstackInstance | None: """Get OpenStack instance by instance ID. Args: instance_id: The instance ID. Returns: - The OpenStack instance. + The OpenStack instance if found. 
""" full_name = self.get_server_name(instance_id) logger.info("Getting openstack server with %s", full_name) @@ -219,15 +221,16 @@ def get_instance(self, instance_id: str) -> OpenstackInstance: with _get_openstack_connection( clouds_config=self._clouds_config, cloud=self._cloud ) as conn: - return OpenstackInstance( - OpenstackCloud._get_and_ensure_unique_server(conn, full_name), self.prefix - ) + server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) + if server is not None: + return OpenstackInstance(server, self.prefix) + return None def delete_instance(self, instance_id: str) -> None: """Delete a openstack instance. Raises: - OpenStackError: Unable to delete OpenStack server. + OpenstackError: Unable to delete OpenStack server. Args: instance_id: The instance ID of the instance to delete. @@ -240,13 +243,14 @@ def delete_instance(self, instance_id: str) -> None: ) as conn: try: server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) - conn.delete_server(name_or_id=server.id) + if server is not None: + conn.delete_server(name_or_id=server.id) OpenstackCloud._delete_keypair(conn, full_name) except ( openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout, ) as err: - raise OpenStackError(f"Failed to remove openstack runner {full_name}") from err + raise OpenstackError(f"Failed to remove openstack runner {full_name}") from err def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: """Get SSH connection to an OpenStack instance. 
@@ -321,7 +325,7 @@ def get_instances(self) -> tuple[OpenstackInstance]: server = OpenstackCloud._get_and_ensure_unique_server(conn, name) if server is not None: instances.append(OpenstackInstance(server, self.prefix)) - return instances + return cast(tuple[OpenstackInstance], tuple(instances)) def cleanup(self) -> None: """Cleanup unused openstack resources.""" @@ -414,11 +418,14 @@ def _get_openstack_instances(self, conn: OpenstackConnection) -> tuple[Openstack Returns: List of OpenStack instances. """ - return [ - server - for server in cast(list[OpenstackServer], conn.list_servers()) - if server.name.startswith(f"{self.prefix}-") - ] + return cast( + tuple[OpenstackServer], + tuple( + server + for server in cast(list[OpenstackServer], conn.list_servers()) + if server.name.startswith(f"{self.prefix}-") + ), + ) @staticmethod def _get_and_ensure_unique_server( @@ -443,7 +450,10 @@ def _get_and_ensure_unique_server( latest_server = reduce( lambda a, b: ( - a if datetime.strptime(a.created_at) < datetime.strptime(b.create_at) else b + a + if datetime.strptime(a.created_at, "a %b %d %H:%M:%S %Y") + < datetime.strptime(b.create_at, "a %b %d %H:%M:%S %Y") + else b ), servers, ) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 4c069ed64..f89ca7367 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -60,8 +60,9 @@ class _PullFileError(Exception): """Represents an error while pulling a file from the runner instance.""" +# Ignore "Too many instance attributes" as this dataclass is for passing arguments. @dataclass -class OpenstackRunnerManagerConfig: +class OpenstackRunnerManagerConfig: # pylint: disable=R0902 """Configuration for OpenstackRunnerManager. Attributes: @@ -145,14 +146,14 @@ def create_runner(self, registration_token: str) -> InstanceId: Instance ID of the runner. 
""" start_timestamp = time.time() - id = OpenstackRunnerManager._generate_instance_id() - instance_name = self._openstack_cloud.get_server_name(instance_id=id) + instance_id = OpenstackRunnerManager._generate_instance_id() + instance_name = self._openstack_cloud.get_server_name(instance_id=instance_id) userdata = self._generate_userdata( instance_name=instance_name, registration_token=registration_token ) try: instance = self._openstack_cloud.launch_instance( - instance_id=id, + instance_id=instance_id, image=self.config.image, flavor=self.config.flavor, network=self.config.network, @@ -170,24 +171,24 @@ def create_runner(self, registration_token: str) -> InstanceId: install_start_timestamp=start_timestamp, install_end_timestamp=end_timestamp, ) - return id + return instance_id - def get_runner(self, id: InstanceId) -> CloudRunnerInstance | None: + def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: """Get a self-hosted runner by instance id. Args: - id: The instance id. + instance_id: The instance id. Returns: Information on the runner instance. """ - name = self._openstack_cloud.get_server_name(id) + name = self._openstack_cloud.get_server_name(instance_id) instances_list = self._openstack_cloud.get_instances() for instance in instances_list: if instance.server_name == name: return CloudRunnerInstance( name=name, - id=id, + id=instance_id, state=CloudRunnerState.from_openstack_server_status(instance.status), ) return None @@ -215,21 +216,28 @@ def get_runners( ] if states is None: return instance_list - return [instance for instance in instance_list if instance.state in states] + return tuple(instance for instance in instance_list if instance.state in states) def delete_runner( - self, id: InstanceId, remove_token: str + self, instance_id: InstanceId, remove_token: str ) -> runner_metrics.RunnerMetrics | None: """Delete self-hosted runners. Args: - id: The instance id of the runner to delete. 
+ instance_id: The instance id of the runner to delete. remove_token: The GitHub remove token. Returns: Any metrics collected during the deletion of the runner. """ - instance = self._openstack_cloud.get_instance(id) + instance = self._openstack_cloud.get_instance(instance_id) + if instance is None: + logger.warning( + "Unable to delete instance %s as it is not found", + self._openstack_cloud.get_server_name(instance_id), + ) + return None + metric = runner_metrics.extract( metrics_storage_manager=metrics_storage, runners=instance.server_name ) @@ -256,7 +264,7 @@ def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: self._openstack_cloud.cleanup() return metrics - def _delete_runner(self, instance: OpenstackInstance, remove_token) -> None: + def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None: """Delete self-hosted runners by openstack instance. Args: @@ -311,7 +319,7 @@ def _get_runner_health(self) -> RunnerHealth: unhealthy.append(runner) else: healthy.append(runner) - return RunnerHealth(healthy=healthy, unhealthy=unhealthy) + return RunnerHealth(healthy=tuple(healthy), unhealthy=tuple(unhealthy)) def _generate_userdata(self, instance_name: str, registration_token: str) -> str: """Generate cloud init userdata. 
diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index f40845a2a..041bcad3a 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -40,7 +40,9 @@ def runner_label(): @pytest.fixture(scope="module", name="log_dir_base_path") -def log_dir_base_path_fixture(tmp_path_factory: Path) -> Iterator[dict[str, Path]]: +def log_dir_base_path_fixture( + tmp_path_factory: pytest.TempPathFactory, +) -> Iterator[dict[str, Path]]: """Mock the log directory path and return it.""" with pytest.MonkeyPatch.context() as monkeypatch: temp_log_dir = tmp_path_factory.mktemp("log") @@ -72,11 +74,11 @@ def openstack_proxy_config_fixture( use_aproxy = False if openstack_http_proxy or openstack_https_proxy: use_aproxy = True - openstack_http_proxy = openstack_http_proxy if openstack_http_proxy else None - openstack_https_proxy = openstack_https_proxy if openstack_https_proxy else None + http_proxy = openstack_http_proxy if openstack_http_proxy else None + https_proxy = openstack_https_proxy if openstack_https_proxy else None return ProxyConfig( - http=openstack_http_proxy, - https=openstack_https_proxy, + http=http_proxy, + https=https_proxy, no_proxy=openstack_no_proxy, use_aproxy=use_aproxy, ) @@ -275,6 +277,7 @@ async def test_runner_flush_busy_lifecycle( issue_metrics_events = runner_manager_with_one_runner.cleanup() assert issue_metrics_events[events.RunnerStart] == 1 + @pytest.mark.openstack @pytest.mark.asyncio @pytest.mark.abort_on_fail From 5224cd28ad41d557e88babee7791cdeb9bbd4e49 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:38:10 +0800 Subject: [PATCH 150/278] Fix unit test issue due to method sig change --- tests/unit/test_openstack_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_openstack_manager.py b/tests/unit/test_openstack_manager.py 
index 5349b1570..373f656f0 100644 --- a/tests/unit/test_openstack_manager.py +++ b/tests/unit/test_openstack_manager.py @@ -692,7 +692,7 @@ def test_reconcile_ignores_metrics_for_openstack_online_runners( openstack_manager.runner_metrics.extract.assert_called_once_with( metrics_storage_manager=metrics.storage, - ignore_runners=set(openstack_online_runner_names), + runners=set(openstack_online_runner_names), ) From 739d75b39ac82a0a52ca2d17b04671860bef7928 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:59:28 +0800 Subject: [PATCH 151/278] Ignore openstack cloud from coverage due to the test requires private endpoint --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index e7d0f789f..458b72d93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,9 @@ skips = ["*/*test.py", "*/test_*.py", "*tests/*.py"] [tool.coverage.run] branch = true omit = [ + # These are covered by `tests/integration/test_runner_manager_openstack.py`. + "src/openstack_cloud/openstack_cloud.py", + "src/openstack_cloud/openstack_runner_manager.py", # Contains interface for calling LXD. Tested in integration tests and end to end tests. "src/lxd.py", # Contains interface for calling repo policy compliance service. 
Tested in integration test From 94a60c2b103a3b20bb3adbd646a6092fe1e58af3 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 15:01:49 +0800 Subject: [PATCH 152/278] Enable all tests --- .github/workflows/e2e_test.yaml | 4 +-- .github/workflows/integration_test.yaml | 4 +-- .github/workflows/manual_test_env.yaml | 35 ------------------------- .github/workflows/openstack_test.yaml | 19 ++++++++++++++ 4 files changed, 21 insertions(+), 41 deletions(-) delete mode 100644 .github/workflows/manual_test_env.yaml create mode 100644 .github/workflows/openstack_test.yaml diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index bb1dada46..5933451ee 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,9 +1,7 @@ name: End-to-End tests on: - # TODO: Uncomment - #pull_request: - workflow_dispatch: + pull_request: jobs: # test option values defined at test/conftest.py are passed on via repository secret diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 1edd98aca..349bc302c 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -1,9 +1,7 @@ name: integration-tests on: - # TODO: Uncomment - #pull_request: - workflow_dispatch: + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml deleted file mode 100644 index c1060f3fb..000000000 --- a/.github/workflows/manual_test_env.yaml +++ /dev/null @@ -1,35 +0,0 @@ -name: Manual test env - -on: - # TODO: Uncomment - # pull_request: - workflow_dispatch: - -jobs: - manual-test-env: - name: manual-test-env - runs-on: ["self-hosted", "stg-private-endpoint", "X64"] - steps: - - run: sudo apt update -yq - - run: sudo apt install pipx -yq - - run: pipx ensurepath - - run: pipx install tox 
- - uses: actions/checkout@v4 - - name: Tmate debugging session (self-hosted) - uses: canonical/action-tmate@main - timeout-minutes: ${{ inputs.tmate-timeout }} - # openstack-integration-tests-private-endpoint: - # name: Integration test using private-endpoint - # uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main - # secrets: inherit - # with: - # juju-channel: 3.2/stable - # pre-run-script: scripts/setup-lxd.sh - # provider: lxd - # test-tox-env: integration-juju3.2 - # modules: '["test_runner_manager_openstack"]' - # extra-arguments: "-m openstack" - # self-hosted-runner: true - # self-hosted-runner-label: stg-private-endpoint - # tmate-debug: true - # tmate-timeout: 300 diff --git a/.github/workflows/openstack_test.yaml b/.github/workflows/openstack_test.yaml new file mode 100644 index 000000000..a7521c472 --- /dev/null +++ b/.github/workflows/openstack_test.yaml @@ -0,0 +1,19 @@ +name: Openstack tests + +on: + pull_request: + +jobs: + openstack-integration-tests-private-endpoint: + name: Openstack integration test using private-endpoint + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + secrets: inherit + with: + juju-channel: 3.2/stable + pre-run-script: scripts/setup-lxd.sh + provider: lxd + test-tox-env: integration-juju3.2 + modules: '["test_runner_manager_openstack"]' + extra-arguments: "-m openstack" + self-hosted-runner: true + self-hosted-runner-label: stg-private-endpoint From fa1eda367a9c92855ae994b18228512f2e254f65 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 16:42:09 +0800 Subject: [PATCH 153/278] Remove a repeated test --- .github/workflows/openstack_test.yaml | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 .github/workflows/openstack_test.yaml diff --git a/.github/workflows/openstack_test.yaml b/.github/workflows/openstack_test.yaml deleted file mode 100644 index a7521c472..000000000 --- 
a/.github/workflows/openstack_test.yaml +++ /dev/null @@ -1,19 +0,0 @@ -name: Openstack tests - -on: - pull_request: - -jobs: - openstack-integration-tests-private-endpoint: - name: Openstack integration test using private-endpoint - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main - secrets: inherit - with: - juju-channel: 3.2/stable - pre-run-script: scripts/setup-lxd.sh - provider: lxd - test-tox-env: integration-juju3.2 - modules: '["test_runner_manager_openstack"]' - extra-arguments: "-m openstack" - self-hosted-runner: true - self-hosted-runner-label: stg-private-endpoint From 098f58e2b08e55be33c661f77405926b6dc306c8 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 12 Aug 2024 16:45:35 +0800 Subject: [PATCH 154/278] Re-enable test.yaml --- .github/workflows/test.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 34803b2fb..99e540d31 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,9 +1,7 @@ name: Tests on: - # TODO: Uncomment - #pull_request: - workflow_dispatch: + pull_request: jobs: unit-tests: From 6344c7aa589118b367908739a33ba29d0ae8e997 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:09:10 +0800 Subject: [PATCH 155/278] Fix integration tests workflwo --- .github/workflows/integration_test.yaml | 15 ++- tests/integration/test_openstack_cloud.py | 130 ---------------------- 2 files changed, 14 insertions(+), 131 deletions(-) delete mode 100644 tests/integration/test_openstack_cloud.py diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 349bc302c..8e0bc700a 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -24,16 +24,29 @@ jobs: # test_debug_ssh ensures tmate SSH actions works. 
# TODO: Add OpenStack integration versions of these tests. modules: '["test_charm_scheduled_events", "test_debug_ssh"]' + openstack-interface-tests-private-endpoint: + name: openstack interface test using private-endpoint + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + secrets: inherit + with: + juju-channel: 3.2/stable + pre-run-script: scripts/setup-lxd.sh + provider: lxd + test-tox-env: integration-juju3.2 + modules: '["test_runner_manager_openstack"]' + self-hosted-runner: true + self-hosted-runner-label: stg-private-endpoint openstack-integration-tests-private-endpoint: name: Integration test using private-endpoint uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + needs: openstack-interface-tests-private-endpoint secrets: inherit with: juju-channel: 3.2/stable pre-run-script: scripts/setup-lxd.sh provider: lxd test-tox-env: integration-juju3.2 - modules: '["test_charm_metrics_failure", "test_charm_metrics_success", "test_charm_fork_repo", "test_charm_runner", "test_reactive", "test_openstack_cloud"]' + modules: '["test_charm_metrics_failure", "test_charm_metrics_success", "test_charm_fork_repo", "test_charm_runner", "test_reactive"]' extra-arguments: "-m openstack" self-hosted-runner: true self-hosted-runner-label: stg-private-endpoint diff --git a/tests/integration/test_openstack_cloud.py b/tests/integration/test_openstack_cloud.py deleted file mode 100644 index f0dd8f148..000000000 --- a/tests/integration/test_openstack_cloud.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Test for OpenstackCloud class integration with OpenStack.""" - -from secrets import token_hex - -import pytest -import pytest_asyncio -import yaml -from openstack.connection import Connection as OpenstackConnection - -from openstack_cloud.openstack_cloud import OpenstackCloud - - -@pytest_asyncio.fixture(scope="function", name="base_openstack_cloud") -async def base_openstack_cloud_fixture(private_endpoint_clouds_yaml: str) -> OpenstackCloud: - """Setup a OpenstackCloud object with connection to openstack.""" - clouds_yaml = yaml.safe_load(private_endpoint_clouds_yaml) - return OpenstackCloud(clouds_yaml, "testcloud", f"test-{token_hex(4)}") - - -@pytest_asyncio.fixture(scope="function", name="openstack_cloud") -async def openstack_cloud_fixture(base_openstack_cloud: OpenstackCloud) -> OpenstackCloud: - """Ensures the OpenstackCloud object has no openstack servers.""" - instances = base_openstack_cloud.get_instances() - for instance in instances: - base_openstack_cloud.delete_instance(instance_id=instance.instance_id) - return base_openstack_cloud - - -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_get_no_instances(base_openstack_cloud: OpenstackCloud) -> None: - """ - arrange: No instance on OpenStack. - act: Get instances on OpenStack. - assert: An empty list returned. - - Uses base_openstack_cloud as openstack_cloud_fixture relies on this test. - """ - instances = base_openstack_cloud.get_instances() - assert not instances - - -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_launch_instance_and_delete( - base_openstack_cloud: OpenstackCloud, - openstack_connection: OpenstackConnection, - openstack_test_image: str, - openstack_test_flavor: str, - network_name: str, -) -> None: - """ - arrange: No instance on OpenStack. - act: - 1. Create an openstack instance. - 2. Delete openstack instance. - assert: - 1. Instance returned. - 2. No instance exists. 
- - Uses base_openstack_cloud as openstack_cloud_fixture relies on this test. - """ - instances = base_openstack_cloud.get_instances() - assert not instances, "Test arrange failure: found existing openstack instance." - - instance_name = f"{token_hex(2)}" - - # 1. - instance = base_openstack_cloud.launch_instance( - instance_id=instance_name, - image=openstack_test_image, - flavor=openstack_test_flavor, - network=network_name, - userdata="", - ) - - assert instance is not None - assert instance.instance_id is not None - assert instance.server_name is not None - assert instance.id is not None - - servers = openstack_connection.list_servers() - for server in servers: - if instance_name in server.name: - break - else: - assert False, f"OpenStack server with {instance_name} in the name not found" - - # 2. - base_openstack_cloud.delete_instance(instance_id=instance_name) - instances = base_openstack_cloud.get_instances() - assert not instances, "Test failure: openstack instance should be deleted." - - -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_instance_ssh_connection( - openstack_cloud: OpenstackCloud, - openstack_test_image: str, - openstack_test_flavor: str, - network_name: str, -) -> None: - """ - arrange: One instance on OpenStack. - act: Get SSH connection of instance and execute command. - assert: Test SSH command executed successfully. - - This tests whether the network rules (security group) are in place. 
- """ - rand_chars = f"{token_hex(10)}" - instance_name = f"{token_hex(2)}" - instance = openstack_cloud.launch_instance( - instance_id=instance_name, - image=openstack_test_image, - flavor=openstack_test_flavor, - network=network_name, - userdata="", - ) - - ssh_conn = openstack_cloud.get_ssh_connection(instance) - result = ssh_conn.run(f"echo {rand_chars}") - - assert result.ok - assert rand_chars in result.stdout From addf218c9a5992e64f24a82c49467cf1a00b11ad Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 12:13:04 +0800 Subject: [PATCH 156/278] Add docs on cleanup method of cloud runner manager --- src/manager/cloud_runner_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 654e5663b..922368faf 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -129,6 +129,8 @@ def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMet @abc.abstractmethod def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: """Cleanup runner and resource on the cloud. + + Perform health check on runner and delete the runner if it fails. Args: remove_token: The GitHub remove token. From c90cdb3e120bee4d59339cc3ab1b0db6f30cc15a Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 12:35:35 +0800 Subject: [PATCH 157/278] Add parallel spawning of runners. 
--- src/manager/runner_manager.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index ad5354166..a15ad0d80 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -6,6 +6,7 @@ import logging from dataclasses import dataclass from enum import Enum, auto +from multiprocessing import Pool from typing import Iterator, Sequence, Type, cast from charm_state import GithubPath @@ -112,11 +113,16 @@ def create_runners(self, num: int) -> tuple[InstanceId]: logger.info("Creating %s runners", num) registration_token = self._github.get_registration_token() - runner_ids = [] - for _ in range(num): - runner_ids.append(self._cloud.create_runner(registration_token=registration_token)) + instance_ids = [] + create_runner_args = [self._cloud, registration_token] * num + with Pool(processes=min(num, 10)) as pool: + pool.map(func=RunnerManager._create_runner, iterable=create_runner_args) + + return tuple(instance_ids) - return tuple(runner_ids) + @staticmethod + def _create_runner(cloud_runner_manager: CloudRunnerManager, registration_token: str) -> InstanceId: + return cloud_runner_manager.create_runner(registration_token=registration_token) def get_runners( self, From a76e528fb02b3032313e85fd7e952571da700245 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 12:38:51 +0800 Subject: [PATCH 158/278] Enable dev testing --- .github/workflows/integration_test.yaml | 4 +++- .github/workflows/manual_test_env.yaml | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/manual_test_env.yaml diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 8e0bc700a..dfdfdf590 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -1,7 +1,9 @@ name: integration-tests on: - 
pull_request: + # TODO: Debug + #pull_request: + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml new file mode 100644 index 000000000..6b65cbfce --- /dev/null +++ b/.github/workflows/manual_test_env.yaml @@ -0,0 +1,18 @@ +name: Manual test env + +on: + pull_request: + +jobs: + manual-test-env: + name: manual-test-env + runs-on: ["self-hosted", "stg-private-endpoint"] + steps: + - run: sudo apt update -yq + - run: sudo apt install pipx -yq + - run: pipx ensurepath + - run: pipx install tox + - uses: actions/checkout@v4 + - name: Tmate debugging session (self-hosted) + uses: canonical/action-tmate@main + timeout-minutes: 300 From 91e0a0f758c3ce40bd96a6038d31eb52b124a3a3 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 12:54:13 +0800 Subject: [PATCH 159/278] Fix parallel spawn --- .github/workflows/e2e_test.yaml | 4 +++- src/manager/runner_manager.py | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 5933451ee..bca9e4476 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,7 +1,9 @@ name: End-to-End tests on: - pull_request: + # TODO: Debug + #pull_request: + workflow_dispatch: jobs: # test option values defined at test/conftest.py are passed on via repository secret diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index a15ad0d80..0c7a546cf 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -113,16 +113,12 @@ def create_runners(self, num: int) -> tuple[InstanceId]: logger.info("Creating %s runners", num) registration_token = self._github.get_registration_token() - instance_ids = [] - create_runner_args = [self._cloud, registration_token] * num + 
create_runner_args = [RunnerManager._CreateRunnerArgs(self._cloud, registration_token) for _ in range(num)] with Pool(processes=min(num, 10)) as pool: - pool.map(func=RunnerManager._create_runner, iterable=create_runner_args) + instance_ids = pool.map(func=RunnerManager._create_runner, iterable=create_runner_args) return tuple(instance_ids) - @staticmethod - def _create_runner(cloud_runner_manager: CloudRunnerManager, registration_token: str) -> InstanceId: - return cloud_runner_manager.create_runner(registration_token=registration_token) def get_runners( self, @@ -266,3 +262,14 @@ def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetri total_stats[event_type] = total_stats.get(event_type, 0) + 1 return total_stats + + @dataclass + class _CreateRunnerArgs: + cloud_runner_manager: CloudRunnerManager + registration_token: str + + @staticmethod + def _create_runner(args: _CreateRunnerArgs) -> InstanceId: + return args.cloud_runner_manager.create_runner(registration_token=args.registration_token) + + From 7f16bdcc4b33cbda904ba63ab1edefb620712119 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 13:15:19 +0800 Subject: [PATCH 160/278] Allow openstack server to take a bit of time on deletion --- src/manager/cloud_runner_manager.py | 2 +- src/manager/runner_manager.py | 11 +++++------ tests/integration/test_runner_manager_openstack.py | 8 ++++++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 922368faf..5191a934b 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -129,7 +129,7 @@ def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMet @abc.abstractmethod def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: """Cleanup runner and resource on the cloud. - + Perform health check on runner and delete the runner if it fails. 
Args: diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 0c7a546cf..24284fd10 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -113,12 +113,13 @@ def create_runners(self, num: int) -> tuple[InstanceId]: logger.info("Creating %s runners", num) registration_token = self._github.get_registration_token() - create_runner_args = [RunnerManager._CreateRunnerArgs(self._cloud, registration_token) for _ in range(num)] + create_runner_args = [ + RunnerManager._CreateRunnerArgs(self._cloud, registration_token) for _ in range(num) + ] with Pool(processes=min(num, 10)) as pool: instance_ids = pool.map(func=RunnerManager._create_runner, iterable=create_runner_args) - - return tuple(instance_ids) + return tuple(instance_ids) def get_runners( self, @@ -262,7 +263,7 @@ def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetri total_stats[event_type] = total_stats.get(event_type, 0) + 1 return total_stats - + @dataclass class _CreateRunnerArgs: cloud_runner_manager: CloudRunnerManager @@ -271,5 +272,3 @@ class _CreateRunnerArgs: @staticmethod def _create_runner(args: _CreateRunnerArgs) -> InstanceId: return args.cloud_runner_manager.create_runner(registration_token=args.registration_token) - - diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 041bcad3a..e7172ba41 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -221,6 +221,14 @@ async def test_runner_normal_idle_lifecycle( runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) runner_list = runner_manager.get_runners() assert isinstance(runner_list, tuple) + if len(runner_list) == 1: + runner = runner_list[0] + assert runner.github_state == None + + # The openstack server can take sometime to fully clean up. 
+ await wait_for(lambda: len(runner_manager.get_runners()) == 0, timeout=60) + return + assert len(runner_list) == 0 From ad05ac5c7a910bca92851239d0e6b40d6f350635 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 13:29:38 +0800 Subject: [PATCH 161/278] Refactor test detection of no runners --- .../test_runner_manager_openstack.py | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index e7172ba41..034c836f8 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -164,6 +164,24 @@ def workflow_is_status(workflow: Workflow, status: str) -> bool: workflow.update() return workflow.status == status +async def assert_no_runner(runner_manager: RunnerManager): + """Assert the runner manager has no runners. + + Retry are performed if the number of runner is not 0. Due to it may take some time for + openstack to delete the servers. + + A TimeoutError will be thrown if runners are still found after timeout. + + Args: + runner_manager: The RunnerManager to check. + """ + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + if len(runner_list) == 0: + return + + # The openstack server can take sometime to fully clean up. + await wait_for(lambda: len(runner_manager.get_runners()) == 0, timeout=60) @pytest.mark.openstack @pytest.mark.asyncio @@ -219,18 +237,7 @@ async def test_runner_normal_idle_lifecycle( # 3. runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) - runner_list = runner_manager.get_runners() - assert isinstance(runner_list, tuple) - if len(runner_list) == 1: - runner = runner_list[0] - assert runner.github_state == None - - # The openstack server can take sometime to fully clean up. 
- await wait_for(lambda: len(runner_manager.get_runners()) == 0, timeout=60) - return - - assert len(runner_list) == 0 - + assert_no_runner(runner_manager) @pytest.mark.openstack @pytest.mark.asyncio @@ -280,7 +287,7 @@ async def test_runner_flush_busy_lifecycle( # 3. runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) - runner_list = runner_manager_with_one_runner.get_runners() + assert_no_runner(runner_manager_with_one_runner) issue_metrics_events = runner_manager_with_one_runner.cleanup() assert issue_metrics_events[events.RunnerStart] == 1 @@ -339,3 +346,5 @@ async def test_runner_normal_lifecycle( assert metric_logs[0]["workflow"] == "Workflow Dispatch Wait Tests" assert metric_logs[1]["event"] == "runner_stop" assert metric_logs[1]["workflow"] == "Workflow Dispatch Wait Tests" + + assert_no_runner(runner_manager_with_one_runner) From 23bac2f1b286a66a4ee261e1940df0e8d1ce894d Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 13:37:16 +0800 Subject: [PATCH 162/278] Re-enable the tests --- .github/workflows/e2e_test.yaml | 4 +--- .github/workflows/integration_test.yaml | 4 +--- .github/workflows/manual_test_env.yaml | 18 ------------------ 3 files changed, 2 insertions(+), 24 deletions(-) delete mode 100644 .github/workflows/manual_test_env.yaml diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index bca9e4476..5933451ee 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,9 +1,7 @@ name: End-to-End tests on: - # TODO: Debug - #pull_request: - workflow_dispatch: + pull_request: jobs: # test option values defined at test/conftest.py are passed on via repository secret diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index dfdfdf590..8e0bc700a 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -1,9 +1,7 @@ name: 
integration-tests on: - # TODO: Debug - #pull_request: - workflow_dispatch: + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml deleted file mode 100644 index 6b65cbfce..000000000 --- a/.github/workflows/manual_test_env.yaml +++ /dev/null @@ -1,18 +0,0 @@ -name: Manual test env - -on: - pull_request: - -jobs: - manual-test-env: - name: manual-test-env - runs-on: ["self-hosted", "stg-private-endpoint"] - steps: - - run: sudo apt update -yq - - run: sudo apt install pipx -yq - - run: pipx ensurepath - - run: pipx install tox - - uses: actions/checkout@v4 - - name: Tmate debugging session (self-hosted) - uses: canonical/action-tmate@main - timeout-minutes: 300 From 325be1fb4b3f4730335e405e66d31f1f17297b6c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 13:52:09 +0800 Subject: [PATCH 163/278] Fix lints --- src/manager/runner_manager.py | 17 +++++++++++++++++ .../test_runner_manager_openstack.py | 17 ++++++++++------- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 24284fd10..98ee9ed9c 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -266,9 +266,26 @@ def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetri @dataclass class _CreateRunnerArgs: + """Arguments for the _create_runner function. + + Attrs: + cloud_runner_manager: For managing the cloud instance of the runner. + registration_token: The GitHub provided-token for registering runners. + """ + cloud_runner_manager: CloudRunnerManager registration_token: str @staticmethod def _create_runner(args: _CreateRunnerArgs) -> InstanceId: + """Create a single runner. + + This is a staticmethod for usage with multiprocess.Pool. + + Args: + args: The arguments. 
+ + Returns: + The instance ID of the runner created. + """ return args.cloud_runner_manager.create_runner(registration_token=args.registration_token) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 034c836f8..b884fc55a 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -164,14 +164,15 @@ def workflow_is_status(workflow: Workflow, status: str) -> bool: workflow.update() return workflow.status == status + async def assert_no_runner(runner_manager: RunnerManager): """Assert the runner manager has no runners. - Retry are performed if the number of runner is not 0. Due to it may take some time for + Retry are performed if the number of runner is not 0. Due to it may take some time for openstack to delete the servers. - + A TimeoutError will be thrown if runners are still found after timeout. - + Args: runner_manager: The RunnerManager to check. """ @@ -179,10 +180,11 @@ async def assert_no_runner(runner_manager: RunnerManager): assert isinstance(runner_list, tuple) if len(runner_list) == 0: return - + # The openstack server can take sometime to fully clean up. await wait_for(lambda: len(runner_manager.get_runners()) == 0, timeout=60) + @pytest.mark.openstack @pytest.mark.asyncio @pytest.mark.abort_on_fail @@ -237,7 +239,8 @@ async def test_runner_normal_idle_lifecycle( # 3. runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) - assert_no_runner(runner_manager) + await assert_no_runner(runner_manager) + @pytest.mark.openstack @pytest.mark.asyncio @@ -287,7 +290,7 @@ async def test_runner_flush_busy_lifecycle( # 3. 
runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) - assert_no_runner(runner_manager_with_one_runner) + await assert_no_runner(runner_manager_with_one_runner) issue_metrics_events = runner_manager_with_one_runner.cleanup() assert issue_metrics_events[events.RunnerStart] == 1 @@ -347,4 +350,4 @@ async def test_runner_normal_lifecycle( assert metric_logs[1]["event"] == "runner_stop" assert metric_logs[1]["workflow"] == "Workflow Dispatch Wait Tests" - assert_no_runner(runner_manager_with_one_runner) + await assert_no_runner(runner_manager_with_one_runner) From 4a91bc0d581be662756dee7413b14aabecfff450 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 14:01:49 +0800 Subject: [PATCH 164/278] Disable tests again --- .github/workflows/e2e_test.yaml | 4 +++- .github/workflows/integration_test.yaml | 4 +++- .github/workflows/test.yaml | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 5933451ee..c7636df33 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,7 +1,9 @@ name: End-to-End tests on: - pull_request: + #TODO: Debug + # pull_request: + workflow_dispatch: jobs: # test option values defined at test/conftest.py are passed on via repository secret diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 8e0bc700a..000aa5d2b 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -1,7 +1,9 @@ name: integration-tests on: - pull_request: + #TODO: Debug + # pull_request: + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 99e540d31..876d3f2df 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,7 +1,9 @@ name: 
Tests on: - pull_request: + #TODO: Debug + # pull_request: + workflow_dispatch: jobs: unit-tests: From d6b50a4d13c07d2cd4c02b2b8b01129c717b1f02 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 14:02:50 +0800 Subject: [PATCH 165/278] Disable some test --- .../test_runner_manager_openstack.py | 218 +++++++++--------- 1 file changed, 109 insertions(+), 109 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index b884fc55a..897272d21 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -242,112 +242,112 @@ async def test_runner_normal_idle_lifecycle( await assert_no_runner(runner_manager) -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_runner_flush_busy_lifecycle( - runner_manager_with_one_runner: RunnerManager, - test_github_branch: Branch, - github_repository: Repository, - runner_label: str, -): - """ - Arrange: RunnerManager with one idle runner. - Act: - 1. Run a long workflow. - 2. Run flush idle runner. - 3. Run flush busy runner. - Assert: - 1. Runner takes the job and become busy. - 2. Busy runner still exists. - 3. No runners exists. - """ - # 1. - workflow = await dispatch_workflow( - app=None, - branch=test_github_branch, - github_repository=github_repository, - conclusion="success", - workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, - dispatch_input={"runner": runner_label, "minutes": "10"}, - wait=False, - ) - await wait_for(lambda: workflow_is_status(workflow, "in_progress")) - - runner_list = runner_manager_with_one_runner.get_runners() - assert len(runner_list) == 1 - busy_runner = runner_list[0] - assert busy_runner.cloud_state == CloudRunnerState.ACTIVE - assert busy_runner.github_state == GithubRunnerState.BUSY - - # 2. 
- runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) - runner_list = runner_manager_with_one_runner.get_runners() - assert len(runner_list) == 1 - busy_runner = runner_list[0] - assert busy_runner.cloud_state == CloudRunnerState.ACTIVE - assert busy_runner.github_state == GithubRunnerState.BUSY - - # 3. - runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) - await assert_no_runner(runner_manager_with_one_runner) - - issue_metrics_events = runner_manager_with_one_runner.cleanup() - assert issue_metrics_events[events.RunnerStart] == 1 - - -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_runner_normal_lifecycle( - runner_manager_with_one_runner: RunnerManager, - test_github_branch: Branch, - github_repository: Repository, - runner_label: str, - log_dir_base_path: dict[str, Path], -): - """ - Arrange: RunnerManager with one runner. Clean metric logs. - Act: - 1. Start a test workflow for the runner. - 2. Run cleanup. - Assert: - 1. The workflow complete successfully. - 2. The runner should be deleted. The metrics should be recorded. 
- """ - metric_log_path = log_dir_base_path["metric_log"] - metric_log_existing_content = metric_log_path.read_text(encoding="utf-8") - - workflow = await dispatch_workflow( - app=None, - branch=test_github_branch, - github_repository=github_repository, - conclusion="success", - workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, - dispatch_input={"runner": runner_label, "minutes": "0"}, - wait=False, - ) - await wait_for(lambda: workflow_is_status(workflow, "completed")) - - issue_metrics_events = runner_manager_with_one_runner.cleanup() - assert issue_metrics_events[events.RunnerStart] == 1 - assert issue_metrics_events[events.RunnerStop] == 1 - - metric_log_full_content = metric_log_path.read_text(encoding="utf-8") - assert metric_log_full_content.startswith( - metric_log_existing_content - ), "The metric log was modified in ways other than appending" - # Disable E203 (space before :) as it conflicts with the formatter (black). - metric_log_new_content = metric_log_full_content[ - len(metric_log_existing_content) : # noqa: E203 - ] - metric_logs = [json.loads(metric) for metric in metric_log_new_content.splitlines()] - assert ( - len(metric_logs) == 2 - ), "Assuming two events should be runner_start and runner_stop, modify this if new events are added" - assert metric_logs[0]["event"] == "runner_start" - assert metric_logs[0]["workflow"] == "Workflow Dispatch Wait Tests" - assert metric_logs[1]["event"] == "runner_stop" - assert metric_logs[1]["workflow"] == "Workflow Dispatch Wait Tests" - - await assert_no_runner(runner_manager_with_one_runner) +# @pytest.mark.openstack +# @pytest.mark.asyncio +# @pytest.mark.abort_on_fail +# async def test_runner_flush_busy_lifecycle( +# runner_manager_with_one_runner: RunnerManager, +# test_github_branch: Branch, +# github_repository: Repository, +# runner_label: str, +# ): +# """ +# Arrange: RunnerManager with one idle runner. +# Act: +# 1. Run a long workflow. +# 2. Run flush idle runner. +# 3. Run flush busy runner. 
+# Assert: +# 1. Runner takes the job and become busy. +# 2. Busy runner still exists. +# 3. No runners exists. +# """ +# # 1. +# workflow = await dispatch_workflow( +# app=None, +# branch=test_github_branch, +# github_repository=github_repository, +# conclusion="success", +# workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, +# dispatch_input={"runner": runner_label, "minutes": "10"}, +# wait=False, +# ) +# await wait_for(lambda: workflow_is_status(workflow, "in_progress")) + +# runner_list = runner_manager_with_one_runner.get_runners() +# assert len(runner_list) == 1 +# busy_runner = runner_list[0] +# assert busy_runner.cloud_state == CloudRunnerState.ACTIVE +# assert busy_runner.github_state == GithubRunnerState.BUSY + +# # 2. +# runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) +# runner_list = runner_manager_with_one_runner.get_runners() +# assert len(runner_list) == 1 +# busy_runner = runner_list[0] +# assert busy_runner.cloud_state == CloudRunnerState.ACTIVE +# assert busy_runner.github_state == GithubRunnerState.BUSY + +# # 3. +# runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) +# await assert_no_runner(runner_manager_with_one_runner) + +# issue_metrics_events = runner_manager_with_one_runner.cleanup() +# assert issue_metrics_events[events.RunnerStart] == 1 + + +# @pytest.mark.openstack +# @pytest.mark.asyncio +# @pytest.mark.abort_on_fail +# async def test_runner_normal_lifecycle( +# runner_manager_with_one_runner: RunnerManager, +# test_github_branch: Branch, +# github_repository: Repository, +# runner_label: str, +# log_dir_base_path: dict[str, Path], +# ): +# """ +# Arrange: RunnerManager with one runner. Clean metric logs. +# Act: +# 1. Start a test workflow for the runner. +# 2. Run cleanup. +# Assert: +# 1. The workflow complete successfully. +# 2. The runner should be deleted. The metrics should be recorded. 
+# """ +# metric_log_path = log_dir_base_path["metric_log"] +# metric_log_existing_content = metric_log_path.read_text(encoding="utf-8") + +# workflow = await dispatch_workflow( +# app=None, +# branch=test_github_branch, +# github_repository=github_repository, +# conclusion="success", +# workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, +# dispatch_input={"runner": runner_label, "minutes": "0"}, +# wait=False, +# ) +# await wait_for(lambda: workflow_is_status(workflow, "completed")) + +# issue_metrics_events = runner_manager_with_one_runner.cleanup() +# assert issue_metrics_events[events.RunnerStart] == 1 +# assert issue_metrics_events[events.RunnerStop] == 1 + +# metric_log_full_content = metric_log_path.read_text(encoding="utf-8") +# assert metric_log_full_content.startswith( +# metric_log_existing_content +# ), "The metric log was modified in ways other than appending" +# # Disable E203 (space before :) as it conflicts with the formatter (black). +# metric_log_new_content = metric_log_full_content[ +# len(metric_log_existing_content) : # noqa: E203 +# ] +# metric_logs = [json.loads(metric) for metric in metric_log_new_content.splitlines()] +# assert ( +# len(metric_logs) == 2 +# ), "Assuming two events should be runner_start and runner_stop, modify this if new events are added" +# assert metric_logs[0]["event"] == "runner_start" +# assert metric_logs[0]["workflow"] == "Workflow Dispatch Wait Tests" +# assert metric_logs[1]["event"] == "runner_stop" +# assert metric_logs[1]["workflow"] == "Workflow Dispatch Wait Tests" + +# await assert_no_runner(runner_manager_with_one_runner) From 87f5cafb7c2d9cc016e0fbd4f56c2c157e7ff24b Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 14:15:44 +0800 Subject: [PATCH 166/278] Add wait until runner is running --- ...penstack_cloud.openstack_runner_manager.md | 8 +++--- .../openstack_runner_manager.py | 28 ++++++++++++++++++- 2 files changed, 31 insertions(+), 5 
deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 029b39268..3a85a76c1 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -130,7 +130,7 @@ Construct the object. --- - + ### method `cleanup` @@ -182,7 +182,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -223,7 +223,7 @@ Get the name prefix of the self-hosted runners. --- - + ### method `get_runner` @@ -246,7 +246,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index f89ca7367..25be0ab0d 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -163,6 +163,7 @@ def create_runner(self, registration_token: str) -> InstanceId: raise RunnerCreateError(f"Failed to create {instance_name} openstack runner") from err self._wait_runner_startup(instance) + self._wait_runner_running(instance) end_timestamp = time.time() OpenstackRunnerManager._issue_runner_installed_metric( @@ -450,7 +451,7 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: instance: The runner instance. Raises: - RunnerStartError: The runner process was not found on the runner. + RunnerStartError: The runner startup process was not found on the runner. """ try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) @@ -468,6 +469,31 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: raise RunnerStartError(f"Runner startup process not found on {instance.server_name}") logger.info("Runner startup process found to be healthy on %s", instance.server_name) + @retry(tries=5, delay=60, local_logger=logger) + def _wait_runner_running(self, instance: OpenstackInstance) -> None: + """Wait until runner is running. 
+ + Args: + instance: The runner instance. + + Raises: + RunnerStartError: The runner process was not found on the runner. + """ + try: + ssh_conn = self._openstack_cloud.get_ssh_connection(instance) + except SshError as err: + raise RunnerStartError( + f"Failed to SSH connect to {instance.server_name} openstack runner" + ) from err + + if not self._run_health_check(ssh_conn=ssh_conn, name=instance.server_name): + logger.info("Runner process not found on %s", instance.server_name) + raise RunnerStartError( + f"Runner process on {instance.server_name} failed to initialize on after starting" + ) + + logger.info("Runner process found to be healthy on %s", instance.server_name) + @staticmethod def _generate_instance_id() -> InstanceId: """Generate a instance id. From 2390478d2d4ffb66034ccf440988a993f2145695 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 14:19:43 +0800 Subject: [PATCH 167/278] Enable openstack runner manager tests --- .../openstack_runner_manager.py | 2 +- .../test_runner_manager_openstack.py | 218 +++++++++--------- 2 files changed, 110 insertions(+), 110 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 25be0ab0d..c25b25459 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -472,7 +472,7 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: @retry(tries=5, delay=60, local_logger=logger) def _wait_runner_running(self, instance: OpenstackInstance) -> None: """Wait until runner is running. - + Args: instance: The runner instance. 
diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 897272d21..b884fc55a 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -242,112 +242,112 @@ async def test_runner_normal_idle_lifecycle( await assert_no_runner(runner_manager) -# @pytest.mark.openstack -# @pytest.mark.asyncio -# @pytest.mark.abort_on_fail -# async def test_runner_flush_busy_lifecycle( -# runner_manager_with_one_runner: RunnerManager, -# test_github_branch: Branch, -# github_repository: Repository, -# runner_label: str, -# ): -# """ -# Arrange: RunnerManager with one idle runner. -# Act: -# 1. Run a long workflow. -# 2. Run flush idle runner. -# 3. Run flush busy runner. -# Assert: -# 1. Runner takes the job and become busy. -# 2. Busy runner still exists. -# 3. No runners exists. -# """ -# # 1. -# workflow = await dispatch_workflow( -# app=None, -# branch=test_github_branch, -# github_repository=github_repository, -# conclusion="success", -# workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, -# dispatch_input={"runner": runner_label, "minutes": "10"}, -# wait=False, -# ) -# await wait_for(lambda: workflow_is_status(workflow, "in_progress")) - -# runner_list = runner_manager_with_one_runner.get_runners() -# assert len(runner_list) == 1 -# busy_runner = runner_list[0] -# assert busy_runner.cloud_state == CloudRunnerState.ACTIVE -# assert busy_runner.github_state == GithubRunnerState.BUSY - -# # 2. -# runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) -# runner_list = runner_manager_with_one_runner.get_runners() -# assert len(runner_list) == 1 -# busy_runner = runner_list[0] -# assert busy_runner.cloud_state == CloudRunnerState.ACTIVE -# assert busy_runner.github_state == GithubRunnerState.BUSY - -# # 3. 
-# runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) -# await assert_no_runner(runner_manager_with_one_runner) - -# issue_metrics_events = runner_manager_with_one_runner.cleanup() -# assert issue_metrics_events[events.RunnerStart] == 1 - - -# @pytest.mark.openstack -# @pytest.mark.asyncio -# @pytest.mark.abort_on_fail -# async def test_runner_normal_lifecycle( -# runner_manager_with_one_runner: RunnerManager, -# test_github_branch: Branch, -# github_repository: Repository, -# runner_label: str, -# log_dir_base_path: dict[str, Path], -# ): -# """ -# Arrange: RunnerManager with one runner. Clean metric logs. -# Act: -# 1. Start a test workflow for the runner. -# 2. Run cleanup. -# Assert: -# 1. The workflow complete successfully. -# 2. The runner should be deleted. The metrics should be recorded. -# """ -# metric_log_path = log_dir_base_path["metric_log"] -# metric_log_existing_content = metric_log_path.read_text(encoding="utf-8") - -# workflow = await dispatch_workflow( -# app=None, -# branch=test_github_branch, -# github_repository=github_repository, -# conclusion="success", -# workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, -# dispatch_input={"runner": runner_label, "minutes": "0"}, -# wait=False, -# ) -# await wait_for(lambda: workflow_is_status(workflow, "completed")) - -# issue_metrics_events = runner_manager_with_one_runner.cleanup() -# assert issue_metrics_events[events.RunnerStart] == 1 -# assert issue_metrics_events[events.RunnerStop] == 1 - -# metric_log_full_content = metric_log_path.read_text(encoding="utf-8") -# assert metric_log_full_content.startswith( -# metric_log_existing_content -# ), "The metric log was modified in ways other than appending" -# # Disable E203 (space before :) as it conflicts with the formatter (black). 
-# metric_log_new_content = metric_log_full_content[ -# len(metric_log_existing_content) : # noqa: E203 -# ] -# metric_logs = [json.loads(metric) for metric in metric_log_new_content.splitlines()] -# assert ( -# len(metric_logs) == 2 -# ), "Assuming two events should be runner_start and runner_stop, modify this if new events are added" -# assert metric_logs[0]["event"] == "runner_start" -# assert metric_logs[0]["workflow"] == "Workflow Dispatch Wait Tests" -# assert metric_logs[1]["event"] == "runner_stop" -# assert metric_logs[1]["workflow"] == "Workflow Dispatch Wait Tests" - -# await assert_no_runner(runner_manager_with_one_runner) +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_runner_flush_busy_lifecycle( + runner_manager_with_one_runner: RunnerManager, + test_github_branch: Branch, + github_repository: Repository, + runner_label: str, +): + """ + Arrange: RunnerManager with one idle runner. + Act: + 1. Run a long workflow. + 2. Run flush idle runner. + 3. Run flush busy runner. + Assert: + 1. Runner takes the job and become busy. + 2. Busy runner still exists. + 3. No runners exists. + """ + # 1. + workflow = await dispatch_workflow( + app=None, + branch=test_github_branch, + github_repository=github_repository, + conclusion="success", + workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, + dispatch_input={"runner": runner_label, "minutes": "10"}, + wait=False, + ) + await wait_for(lambda: workflow_is_status(workflow, "in_progress")) + + runner_list = runner_manager_with_one_runner.get_runners() + assert len(runner_list) == 1 + busy_runner = runner_list[0] + assert busy_runner.cloud_state == CloudRunnerState.ACTIVE + assert busy_runner.github_state == GithubRunnerState.BUSY + + # 2. 
+ runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) + runner_list = runner_manager_with_one_runner.get_runners() + assert len(runner_list) == 1 + busy_runner = runner_list[0] + assert busy_runner.cloud_state == CloudRunnerState.ACTIVE + assert busy_runner.github_state == GithubRunnerState.BUSY + + # 3. + runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) + await assert_no_runner(runner_manager_with_one_runner) + + issue_metrics_events = runner_manager_with_one_runner.cleanup() + assert issue_metrics_events[events.RunnerStart] == 1 + + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_runner_normal_lifecycle( + runner_manager_with_one_runner: RunnerManager, + test_github_branch: Branch, + github_repository: Repository, + runner_label: str, + log_dir_base_path: dict[str, Path], +): + """ + Arrange: RunnerManager with one runner. Clean metric logs. + Act: + 1. Start a test workflow for the runner. + 2. Run cleanup. + Assert: + 1. The workflow complete successfully. + 2. The runner should be deleted. The metrics should be recorded. 
+ """ + metric_log_path = log_dir_base_path["metric_log"] + metric_log_existing_content = metric_log_path.read_text(encoding="utf-8") + + workflow = await dispatch_workflow( + app=None, + branch=test_github_branch, + github_repository=github_repository, + conclusion="success", + workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, + dispatch_input={"runner": runner_label, "minutes": "0"}, + wait=False, + ) + await wait_for(lambda: workflow_is_status(workflow, "completed")) + + issue_metrics_events = runner_manager_with_one_runner.cleanup() + assert issue_metrics_events[events.RunnerStart] == 1 + assert issue_metrics_events[events.RunnerStop] == 1 + + metric_log_full_content = metric_log_path.read_text(encoding="utf-8") + assert metric_log_full_content.startswith( + metric_log_existing_content + ), "The metric log was modified in ways other than appending" + # Disable E203 (space before :) as it conflicts with the formatter (black). + metric_log_new_content = metric_log_full_content[ + len(metric_log_existing_content) : # noqa: E203 + ] + metric_logs = [json.loads(metric) for metric in metric_log_new_content.splitlines()] + assert ( + len(metric_logs) == 2 + ), "Assuming two events should be runner_start and runner_stop, modify this if new events are added" + assert metric_logs[0]["event"] == "runner_start" + assert metric_logs[0]["workflow"] == "Workflow Dispatch Wait Tests" + assert metric_logs[1]["event"] == "runner_stop" + assert metric_logs[1]["workflow"] == "Workflow Dispatch Wait Tests" + + await assert_no_runner(runner_manager_with_one_runner) From d9500942da210a21915401ae6ca98720fbd7eb83 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 14:33:59 +0800 Subject: [PATCH 168/278] Add debug --- tests/integration/test_runner_manager_openstack.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py 
b/tests/integration/test_runner_manager_openstack.py index b884fc55a..a9e0e4da7 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -290,11 +290,15 @@ async def test_runner_flush_busy_lifecycle( # 3. runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) - await assert_no_runner(runner_manager_with_one_runner) issue_metrics_events = runner_manager_with_one_runner.cleanup() assert issue_metrics_events[events.RunnerStart] == 1 - + + # TODO: Debug + runner_list = runner_manager_with_one_runner.get_runners() + pytest.set_trace() + + await assert_no_runner(runner_manager_with_one_runner) @pytest.mark.openstack @pytest.mark.asyncio @@ -350,4 +354,8 @@ async def test_runner_normal_lifecycle( assert metric_logs[1]["event"] == "runner_stop" assert metric_logs[1]["workflow"] == "Workflow Dispatch Wait Tests" + # TODO: Debug + runner_list = runner_manager_with_one_runner.get_runners() + pytest.set_trace() + await assert_no_runner(runner_manager_with_one_runner) From 2f8c9fcdaa15b95bf57508c75746107b6aaad4b0 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 14:45:07 +0800 Subject: [PATCH 169/278] Wait for github state --- .../integration/test_runner_manager_openstack.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index a9e0e4da7..9b38ab33a 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -228,7 +228,12 @@ async def test_runner_normal_idle_lifecycle( runner = runner_list[0] assert runner.id == runner_id assert runner.cloud_state == CloudRunnerState.ACTIVE - assert runner.github_state == GithubRunnerState.IDLE + # Update on GitHub-side can take a bit of time. 
+ await wait_for( + lambda: runner_manager.get_runners()[0].github_state == GithubRunnerState.IDLE, + timeout=120, + check_interval=10, + ) # 2. openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() @@ -293,13 +298,14 @@ async def test_runner_flush_busy_lifecycle( issue_metrics_events = runner_manager_with_one_runner.cleanup() assert issue_metrics_events[events.RunnerStart] == 1 - + # TODO: Debug runner_list = runner_manager_with_one_runner.get_runners() - pytest.set_trace() - + pytest.set_trace() + await assert_no_runner(runner_manager_with_one_runner) + @pytest.mark.openstack @pytest.mark.asyncio @pytest.mark.abort_on_fail @@ -356,6 +362,6 @@ async def test_runner_normal_lifecycle( # TODO: Debug runner_list = runner_manager_with_one_runner.get_runners() - pytest.set_trace() + pytest.set_trace() await assert_no_runner(runner_manager_with_one_runner) From 0f49d156dd4282e4295959e554f7300c5bc3f836 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 14:57:27 +0800 Subject: [PATCH 170/278] Refactor wait until runner spawn --- .../test_runner_manager_openstack.py | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 9b38ab33a..54202b308 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -140,14 +140,24 @@ async def runner_manager_fixture( async def runner_manager_with_one_runner_fixture(runner_manager: RunnerManager) -> RunnerManager: runner_manager.create_runners(1) runner_list = runner_manager.get_runners() - assert len(runner_list) == 1, "Test arrange failed: Expect one runner" + try: + await assert_runner_amount(runner_manager, 1) + except TimeoutError as err: + raise AssertionError("Test arrange failed: Expect one runner") from err + + runner = runner_list[0] assert ( 
runner.cloud_state == CloudRunnerState.ACTIVE ), "Test arrange failed: Expect runner in active state" - assert ( - runner.github_state == GithubRunnerState.IDLE - ), "Test arrange failed: Expect runner in idle state" + try: + await wait_for( + lambda: runner_manager.get_runners()[0].github_state == GithubRunnerState.IDLE, + timeout=120, + check_interval=10, + ) + except TimeoutError as err: + raise AssertionError("Test arrange failed: Expect runner in idle state") from err return runner_manager @@ -165,7 +175,7 @@ def workflow_is_status(workflow: Workflow, status: str) -> bool: return workflow.status == status -async def assert_no_runner(runner_manager: RunnerManager): +async def assert_runner_amount(runner_manager: RunnerManager, num: int): """Assert the runner manager has no runners. Retry are performed if the number of runner is not 0. Due to it may take some time for @@ -178,11 +188,11 @@ async def assert_no_runner(runner_manager: RunnerManager): """ runner_list = runner_manager.get_runners() assert isinstance(runner_list, tuple) - if len(runner_list) == 0: + if len(runner_list) == num: return - # The openstack server can take sometime to fully clean up. - await wait_for(lambda: len(runner_manager.get_runners()) == 0, timeout=60) + # The openstack server can take sometime to fully clean up or create. + await wait_for(lambda: len(runner_manager.get_runners()) == num) @pytest.mark.openstack @@ -222,6 +232,11 @@ async def test_runner_normal_idle_lifecycle( assert len(runner_id_list) == 1 runner_id = runner_id_list[0] + try: + await assert_runner_amount(runner_manager, 1) + except TimeoutError as err: + raise AssertionError("Test arrange failed: Expect one runner") from err + runner_list = runner_manager.get_runners() assert isinstance(runner_list, tuple) assert len(runner_list) == 1 @@ -244,7 +259,7 @@ async def test_runner_normal_idle_lifecycle( # 3. 
runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) - await assert_no_runner(runner_manager) + await assert_runner_amount(runner_manager, 0) @pytest.mark.openstack @@ -303,7 +318,7 @@ async def test_runner_flush_busy_lifecycle( runner_list = runner_manager_with_one_runner.get_runners() pytest.set_trace() - await assert_no_runner(runner_manager_with_one_runner) + await assert_runner_amount(runner_manager_with_one_runner, 0) @pytest.mark.openstack @@ -364,4 +379,4 @@ async def test_runner_normal_lifecycle( runner_list = runner_manager_with_one_runner.get_runners() pytest.set_trace() - await assert_no_runner(runner_manager_with_one_runner) + await assert_runner_amount(runner_manager_with_one_runner, 0) From bed55b1c8b80a6a1a040bd5f81275aa73eaa3b87 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 15:12:20 +0800 Subject: [PATCH 171/278] Add keyfile erorr --- src-docs/errors.md | 11 ++++++++++ ...penstack_cloud.openstack_runner_manager.md | 20 +++++++++---------- src/errors.py | 3 +++ src/openstack_cloud/openstack_cloud.py | 4 ++-- .../openstack_runner_manager.py | 6 ++++++ 5 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src-docs/errors.md b/src-docs/errors.md index 1a6316046..6ca397b29 100644 --- a/src-docs/errors.md +++ b/src-docs/errors.md @@ -414,3 +414,14 @@ Represents an error while interacting with SSH. +--- + + + +## class `KeyfileError` +Represents missing keyfile for SSH. + + + + + diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 3a85a76c1..7c7acd442 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -17,7 +17,7 @@ Manager for self-hosted runner on OpenStack. --- - + ## class `OpenstackRunnerManagerConfig` Configuration for OpenstackRunnerManager. 
@@ -70,7 +70,7 @@ __init__( --- - + ## class `RunnerHealth` Runners with health state. @@ -103,12 +103,12 @@ __init__( --- - + ## class `OpenstackRunnerManager` Manage self-hosted runner on OpenStack cloud. - + ### method `__init__` @@ -130,7 +130,7 @@ Construct the object. --- - + ### method `cleanup` @@ -153,7 +153,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -182,7 +182,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -206,7 +206,7 @@ Delete self-hosted runners. --- - + ### method `get_name_prefix` @@ -223,7 +223,7 @@ Get the name prefix of the self-hosted runners. --- - + ### method `get_runner` @@ -246,7 +246,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src/errors.py b/src/errors.py index 204877cd5..3c639107e 100644 --- a/src/errors.py +++ b/src/errors.py @@ -170,3 +170,6 @@ class OpenStackUnauthorizedError(OpenstackError): class SshError(Exception): """Represents an error while interacting with SSH.""" + +class KeyfileError(SshError): + """Represents missing keyfile for SSH.""" diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 888897bdc..026e51980 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -22,7 +22,7 @@ from openstack.network.v2.security_group import SecurityGroup as OpenstackSecurityGroup from paramiko.ssh_exception import NoValidConnectionsError -from errors import OpenstackError, SshError +from errors import KeyfileError, OpenstackError, SshError logger = logging.getLogger(__name__) @@ -267,7 +267,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: key_path = OpenstackCloud._get_key_path(instance.server_name) if not key_path.exists(): - raise SshError( + raise KeyfileError( f"Missing keyfile for server: {instance.server_name}, key path: {key_path}" ) if not instance.addresses: diff --git 
a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index c25b25459..ae6aab863 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -21,6 +21,7 @@ CreateMetricsStorageError, GetMetricsStorageError, IssueMetricEventError, + KeyfileError, OpenstackError, RunnerCreateError, RunnerStartError, @@ -413,6 +414,11 @@ def _health_check(self, instance: OpenstackInstance) -> bool: """ try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) + except KeyfileError: + logger.exception( + "Health check failed due to unable to find keyfile for %s", instance.server_name + ) + return False except SshError: logger.exception( "SSH connection failure with %s during health check", instance.server_name From d4e11966b4bf7f8a41dc4bda971c5bc42a5087b5 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 15:24:28 +0800 Subject: [PATCH 172/278] Remove debug statement --- tests/integration/test_runner_manager_openstack.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 54202b308..608e38ccb 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -314,10 +314,6 @@ async def test_runner_flush_busy_lifecycle( issue_metrics_events = runner_manager_with_one_runner.cleanup() assert issue_metrics_events[events.RunnerStart] == 1 - # TODO: Debug - runner_list = runner_manager_with_one_runner.get_runners() - pytest.set_trace() - await assert_runner_amount(runner_manager_with_one_runner, 0) @@ -375,8 +371,4 @@ async def test_runner_normal_lifecycle( assert metric_logs[1]["event"] == "runner_stop" assert metric_logs[1]["workflow"] == "Workflow Dispatch Wait Tests" - # TODO: Debug - runner_list = runner_manager_with_one_runner.get_runners() - 
pytest.set_trace() - await assert_runner_amount(runner_manager_with_one_runner, 0) From be1cad21411514dcd422e2b9b1d0eb9406e0bbf2 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 15:28:15 +0800 Subject: [PATCH 173/278] Re-enable all tests --- .github/workflows/e2e_test.yaml | 4 +--- .github/workflows/integration_test.yaml | 4 +--- .github/workflows/test.yaml | 4 +--- src-docs/errors.md | 2 +- src-docs/openstack_cloud.openstack_cloud.md | 7 ++++--- src/errors.py | 1 + src/openstack_cloud/openstack_cloud.py | 1 + tests/integration/test_runner_manager_openstack.py | 7 ++----- 8 files changed, 12 insertions(+), 18 deletions(-) diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index c7636df33..5933451ee 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,9 +1,7 @@ name: End-to-End tests on: - #TODO: Debug - # pull_request: - workflow_dispatch: + pull_request: jobs: # test option values defined at test/conftest.py are passed on via repository secret diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 000aa5d2b..8e0bc700a 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -1,9 +1,7 @@ name: integration-tests on: - #TODO: Debug - # pull_request: - workflow_dispatch: + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 876d3f2df..99e540d31 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,9 +1,7 @@ name: Tests on: - #TODO: Debug - # pull_request: - workflow_dispatch: + pull_request: jobs: unit-tests: diff --git a/src-docs/errors.md b/src-docs/errors.md index 6ca397b29..c0a333b42 100644 --- a/src-docs/errors.md +++ b/src-docs/errors.md @@ -416,7 +416,7 @@ Represents an error while interacting 
with SSH. --- - + ## class `KeyfileError` Represents missing keyfile for SSH. diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index c431d5f28..15348333b 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -83,7 +83,7 @@ Create the object. --- - + ### method `cleanup` @@ -142,7 +142,7 @@ Get OpenStack instance by instance ID. --- - + ### method `get_instances` @@ -159,7 +159,7 @@ Get all OpenStack instances. --- - + ### method `get_server_name` @@ -203,6 +203,7 @@ Get SSH connection to an OpenStack instance. **Raises:** - `SshError`: Unable to get a working SSH connection to the instance. + - `KeyfileError`: Unable to find the keyfile to connect to the instance. diff --git a/src/errors.py b/src/errors.py index 3c639107e..adb4b7e0e 100644 --- a/src/errors.py +++ b/src/errors.py @@ -171,5 +171,6 @@ class OpenStackUnauthorizedError(OpenstackError): class SshError(Exception): """Represents an error while interacting with SSH.""" + class KeyfileError(SshError): """Represents missing keyfile for SSH.""" diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 026e51980..91d88145a 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -260,6 +260,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: Raises: SshError: Unable to get a working SSH connection to the instance. + KeyfileError: Unable to find the keyfile to connect to the instance. Returns: SSH connection object. 
diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 608e38ccb..3818ed17f 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -144,7 +144,6 @@ async def runner_manager_with_one_runner_fixture(runner_manager: RunnerManager) await assert_runner_amount(runner_manager, 1) except TimeoutError as err: raise AssertionError("Test arrange failed: Expect one runner") from err - runner = runner_list[0] assert ( @@ -176,15 +175,13 @@ def workflow_is_status(workflow: Workflow, status: str) -> bool: async def assert_runner_amount(runner_manager: RunnerManager, num: int): - """Assert the runner manager has no runners. - - Retry are performed if the number of runner is not 0. Due to it may take some time for - openstack to delete the servers. + """Assert the number of runner a runner manager has. A TimeoutError will be thrown if runners are still found after timeout. Args: runner_manager: The RunnerManager to check. + num: Number of runner to check for. 
""" runner_list = runner_manager.get_runners() assert isinstance(runner_list, tuple) From 0c6e66831b3ecb0ac6d79c4121f73efbd404cc41 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 16:20:02 +0800 Subject: [PATCH 174/278] Update src/manager/github_runner_manager.py Co-authored-by: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com> --- src/manager/github_runner_manager.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index f48330fa1..7921be7e2 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -109,15 +109,13 @@ def get_removal_token(self) -> str: return self.github.get_runner_remove_token(self._path) @staticmethod - def _filter_runner_state( - runner: SelfHostedRunner, states: Sequence[GithubRunnerState] | None + def _is_runner_in_state( + runner: SelfHostedRunner, states: Sequence[GithubRunnerState] | None ) -> bool: - """Filter the runner by the state. - + """Check that the runner is in one of the states provided. Args: runner: Runner to filter. - states: Filter the runners for these states. If None, return true. - + states: States in which to check the runner belongs to. Returns: True if the runner is in one of the state, else false. 
""" From 05f747858be7d536db727d0e94b917220f97f425 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 16:38:30 +0800 Subject: [PATCH 175/278] Update src/openstack_cloud/openstack_cloud.py Co-authored-by: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com> --- src/openstack_cloud/openstack_cloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 91d88145a..2f289d0e8 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -139,7 +139,7 @@ def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): self._cloud = cloud self.prefix = prefix - # Ignore "Too many arguments" as 6 args should be fine. Move to a dataclass is new args are + # Ignore "Too many arguments" as 6 args should be fine. Move to a dataclass if new args are # added. def launch_instance( # pylint: disable=R0913 self, instance_id: str, image: str, flavor: str, network: str, userdata: str From 248d5d15241c25f2bc60afbcfe646d817dd724dd Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 16:41:51 +0800 Subject: [PATCH 176/278] Suggestions --- src-docs/errors.md | 4 +-- src-docs/openstack_cloud.openstack_cloud.md | 6 ++-- src/errors.py | 10 +++---- src/manager/cloud_runner_manager.py | 8 ++++-- src/manager/github_runner_manager.py | 2 +- src/manager/runner_manager.py | 2 +- src/openstack_cloud/openstack_cloud.py | 22 +++++++-------- src/openstack_cloud/openstack_manager.py | 6 ++-- .../openstack_runner_manager.py | 28 +++++++++---------- tests/unit/test_openstack_manager.py | 4 +-- 10 files changed, 48 insertions(+), 44 deletions(-) diff --git a/src-docs/errors.md b/src-docs/errors.md index c0a333b42..cf7cde565 100644 --- a/src-docs/errors.md +++ b/src-docs/errors.md @@ -374,7 +374,7 @@ Base class for all runner logs errors. 
-## class `OpenstackError` +## class `OpenStackError` Base class for OpenStack errors. @@ -407,7 +407,7 @@ Represents an unauthorized connection to OpenStack. -## class `SshError` +## class `SSHError` Represents an error while interacting with SSH. diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 15348333b..beff5a141 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -109,7 +109,7 @@ Delete a openstack instance. **Raises:** - - `OpenstackError`: Unable to delete OpenStack server. + - `OpenStackError`: Unable to delete OpenStack server. @@ -202,7 +202,7 @@ Get SSH connection to an OpenStack instance. **Raises:** - - `SshError`: Unable to get a working SSH connection to the instance. + - `SSHError`: Unable to get a working SSH connection to the instance. - `KeyfileError`: Unable to find the keyfile to connect to the instance. @@ -242,7 +242,7 @@ Create an OpenStack instance. **Raises:** - - `OpenstackError`: Unable to create OpenStack server. + - `OpenStackError`: Unable to create OpenStack server. 
diff --git a/src/errors.py b/src/errors.py index adb4b7e0e..59d28a239 100644 --- a/src/errors.py +++ b/src/errors.py @@ -156,21 +156,21 @@ class RunnerLogsError(Exception): """Base class for all runner logs errors.""" -class OpenstackError(Exception): +class OpenStackError(Exception): """Base class for OpenStack errors.""" -class OpenStackInvalidConfigError(OpenstackError): +class OpenStackInvalidConfigError(OpenStackError): """Represents an invalid OpenStack configuration.""" -class OpenStackUnauthorizedError(OpenstackError): +class OpenStackUnauthorizedError(OpenStackError): """Represents an unauthorized connection to OpenStack.""" -class SshError(Exception): +class SSHError(Exception): """Represents an error while interacting with SSH.""" -class KeyfileError(SshError): +class KeyfileError(SSHError): """Represents missing keyfile for SSH.""" diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 5191a934b..dd5d5424e 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -6,13 +6,17 @@ import abc from dataclasses import dataclass from enum import Enum +import logging from typing import Iterator, Sequence, Tuple from metrics.runner import RunnerMetrics +logger = logging.getLogger(__name__) + InstanceId = str + class CloudRunnerState(str, Enum): """Represent state of the instance hosting the runner. @@ -76,12 +80,12 @@ class CloudRunnerInstance: Attributes: name: Name of the instance hosting the runner. - id: ID of the instance. + instance_id: ID of the instance. state: State of the instance hosting the runner. 
""" name: str - id: str + instance_id: InstanceId state: CloudRunnerState diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 7921be7e2..e72d6bd64 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -16,7 +16,7 @@ class GithubRunnerState(str, Enum): Attributes: BUSY: Runner is working on a job assigned by GitHub. - IDLE: Runner is waiting to take a job. + IDLE: Runner is waiting to take a job or is running pre-job tasks (i.e. repo-policy-compliance check). OFFLINE: Runner is not connected to GitHub. """ diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 98ee9ed9c..6ee4c275e 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -65,7 +65,7 @@ def __init__(self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedR github_info: Information on the GitHub of the runner. """ self.name = cloud_instance.name - self.id = cloud_instance.id + self.id = cloud_instance.instance_id self.github_state = ( GithubRunnerState.from_runner(github_info) if github_info is not None else None ) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 2f289d0e8..2c4d5e03c 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -22,7 +22,7 @@ from openstack.network.v2.security_group import SecurityGroup as OpenstackSecurityGroup from paramiko.ssh_exception import NoValidConnectionsError -from errors import KeyfileError, OpenstackError, SshError +from errors import KeyfileError, OpenStackError, SSHError logger = logging.getLogger(__name__) @@ -97,7 +97,7 @@ def _get_openstack_connection( cloud: The name of cloud to use in the clouds.yaml. Raises: - OpenstackError: if the credentials provided is not authorized. + OpenStackError: if the credentials provided is not authorized. Yields: An openstack.connection.Connection object. 
@@ -115,7 +115,7 @@ def _get_openstack_connection( # pylint thinks this isn't an exception, but does inherit from Exception class. except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause logger.exception("OpenStack API call failure") - raise OpenstackError("Failed OpenStack API call") from exc + raise OpenStackError("Failed OpenStack API call") from exc class OpenstackCloud: @@ -154,7 +154,7 @@ def launch_instance( # pylint: disable=R0913 userdata: The cloud init userdata to startup the instance. Raises: - OpenstackError: Unable to create OpenStack server. + OpenStackError: Unable to create OpenStack server. Returns: The OpenStack instance created. @@ -198,11 +198,11 @@ def launch_instance( # pylint: disable=R0913 full_name, ) self._delete_keypair(conn, instance_id) - raise OpenstackError(f"Timeout creating openstack server {full_name}") from err + raise OpenStackError(f"Timeout creating openstack server {full_name}") from err except openstack.exceptions.SDKException as err: logger.exception("Failed to create openstack server %s", full_name) self._delete_keypair(conn, instance_id) - raise OpenstackError(f"Failed to create openstack server {full_name}") from err + raise OpenStackError(f"Failed to create openstack server {full_name}") from err return OpenstackInstance(server, self.prefix) @@ -230,7 +230,7 @@ def delete_instance(self, instance_id: str) -> None: """Delete a openstack instance. Raises: - OpenstackError: Unable to delete OpenStack server. + OpenStackError: Unable to delete OpenStack server. Args: instance_id: The instance ID of the instance to delete. 
@@ -250,7 +250,7 @@ def delete_instance(self, instance_id: str) -> None: openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout, ) as err: - raise OpenstackError(f"Failed to remove openstack runner {full_name}") from err + raise OpenStackError(f"Failed to remove openstack runner {full_name}") from err def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: """Get SSH connection to an OpenStack instance. @@ -259,7 +259,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: instance: The OpenStack instance to connect to. Raises: - SshError: Unable to get a working SSH connection to the instance. + SSHError: Unable to get a working SSH connection to the instance. KeyfileError: Unable to find the keyfile to connect to the instance. Returns: @@ -272,7 +272,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: f"Missing keyfile for server: {instance.server_name}, key path: {key_path}" ) if not instance.addresses: - raise SshError(f"No addresses found for OpenStack server {instance.server_name}") + raise SSHError(f"No addresses found for OpenStack server {instance.server_name}") for ip in instance.addresses: try: @@ -300,7 +300,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: exc_info=True, ) continue - raise SshError( + raise SSHError( f"No connectable SSH addresses found, server: {instance.server_name}, " f"addresses: {instance.addresses}" ) diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index c5d2bce44..35799d8bb 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -48,7 +48,7 @@ GithubClientError, GithubMetricsError, IssueMetricEventError, - OpenstackError, + OpenStackError, RunnerCreateError, RunnerStartError, ) @@ -161,7 +161,7 @@ def _create_connection(cloud_config: dict[str, dict]) -> Iterator[openstack.conn cloud_config: The configuration in 
clouds.yaml format to apply. Raises: - OpenstackError: if the credentials provided is not authorized. + OpenStackError: if the credentials provided is not authorized. Yields: An openstack.connection.Connection object. @@ -180,7 +180,7 @@ def _create_connection(cloud_config: dict[str, dict]) -> Iterator[openstack.conn # pylint thinks this isn't an exception, but does inherit from Exception class. except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause logger.exception("OpenStack API call failure") - raise OpenstackError("Failed OpenStack API call") from exc + raise OpenStackError("Failed OpenStack API call") from exc # Disable too many arguments, as they are needed to create the dataclass. diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index ae6aab863..2d1e2bb12 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -22,10 +22,10 @@ GetMetricsStorageError, IssueMetricEventError, KeyfileError, - OpenstackError, + OpenStackError, RunnerCreateError, RunnerStartError, - SshError, + SSHError, ) from manager.cloud_runner_manager import ( CloudRunnerInstance, @@ -160,7 +160,7 @@ def create_runner(self, registration_token: str) -> InstanceId: network=self.config.network, userdata=userdata, ) - except OpenstackError as err: + except OpenStackError as err: raise RunnerCreateError(f"Failed to create {instance_name} openstack runner") from err self._wait_runner_startup(instance) @@ -190,7 +190,7 @@ def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: if instance.server_name == name: return CloudRunnerInstance( name=name, - id=instance_id, + instance_id=instance_id, state=CloudRunnerState.from_openstack_server_status(instance.status), ) return None @@ -211,7 +211,7 @@ def get_runners( instance_list = [ CloudRunnerInstance( name=instance.server_name, - id=instance.instance_id, + 
instance_id=instance.instance_id, state=CloudRunnerState.from_openstack_server_status(instance.status), ) for instance in instance_list @@ -287,7 +287,7 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None instance.server_name, stack_info=True, ) - except SshError: + except SSHError: logger.exception( "Failed to get SSH connection while removing %s", instance.server_name ) @@ -297,7 +297,7 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None try: self._openstack_cloud.delete_instance(instance.instance_id) - except OpenstackError: + except OpenStackError: logger.exception( "Unable to delete openstack instance for runner %s", instance.server_name ) @@ -407,7 +407,7 @@ def _health_check(self, instance: OpenstackInstance) -> bool: instance: The OpenStack instance to conduit the health check. Raises: - SshError: Unable to get a SSH connection to the instance. + SSHError: Unable to get a SSH connection to the instance. Returns: Whether the runner is healthy. 
@@ -419,7 +419,7 @@ def _health_check(self, instance: OpenstackInstance) -> bool: "Health check failed due to unable to find keyfile for %s", instance.server_name ) return False - except SshError: + except SSHError: logger.exception( "SSH connection failure with %s during health check", instance.server_name ) @@ -461,7 +461,7 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: """ try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except SshError as err: + except SSHError as err: raise RunnerStartError( f"Failed to SSH connect to {instance.server_name} openstack runner" ) from err @@ -487,7 +487,7 @@ def _wait_runner_running(self, instance: OpenstackInstance) -> None: """ try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except SshError as err: + except SSHError as err: raise RunnerStartError( f"Failed to SSH connect to {instance.server_name} openstack runner" ) from err @@ -607,7 +607,7 @@ def _ssh_pull_file( Raises: _PullFileError: Unable to pull the file from the runner instance. - SshError: Issue with SSH connection. + SSHError: Issue with SSH connection. 
""" try: result = ssh_conn.run(f"stat -c %s {remote_path}", warn=True) @@ -616,7 +616,7 @@ def _ssh_pull_file( paramiko.ssh_exception.NoValidConnectionsError, paramiko.ssh_exception.SSHException, ) as exc: - raise SshError(f"Unable to SSH into {ssh_conn.host}") from exc + raise SSHError(f"Unable to SSH into {ssh_conn.host}") from exc if not result.ok: logger.warning( ( @@ -647,7 +647,7 @@ def _ssh_pull_file( paramiko.ssh_exception.NoValidConnectionsError, paramiko.ssh_exception.SSHException, ) as exc: - raise SshError(f"Unable to SSH into {ssh_conn.host}") from exc + raise SSHError(f"Unable to SSH into {ssh_conn.host}") from exc except OSError as exc: raise _PullFileError(f"Unable to retrieve file {remote_path}") from exc diff --git a/tests/unit/test_openstack_manager.py b/tests/unit/test_openstack_manager.py index 373f656f0..6aabc7361 100644 --- a/tests/unit/test_openstack_manager.py +++ b/tests/unit/test_openstack_manager.py @@ -19,7 +19,7 @@ import metrics.storage import reactive.runner_manager from charm_state import CharmState, ProxyConfig, ReactiveConfig, RepoPolicyComplianceConfig -from errors import OpenstackError, RunnerStartError +from errors import OpenStackError, RunnerStartError from github_type import GitHubRunnerStatus, RunnerApplication, SelfHostedRunner from metrics import events as metric_events from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME @@ -262,7 +262,7 @@ def test__create_connection_error(clouds_yaml: dict, openstack_connect_mock: Mag connection_mock.__enter__.return_value = connection_context openstack_connect_mock.return_value = connection_mock - with pytest.raises(OpenstackError) as exc: + with pytest.raises(OpenStackError) as exc: with openstack_manager._create_connection(cloud_config=clouds_yaml): pass From e112a9b3daf1cecb636375b087c765e0a285d189 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 16:45:52 +0800 Subject: [PATCH 177/278] Refactor remove openstack server --- 
src-docs/openstack_cloud.openstack_cloud.md | 12 ++--- src/openstack_cloud/openstack_cloud.py | 44 ++++++++++--------- .../openstack_runner_manager.py | 1 + 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index beff5a141..c49f9779f 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -83,7 +83,7 @@ Create the object. --- - + ### method `cleanup` @@ -95,7 +95,7 @@ Cleanup unused openstack resources. --- - + ### method `delete_instance` @@ -119,7 +119,7 @@ Delete a openstack instance. --- - + ### method `get_instance` @@ -142,7 +142,7 @@ Get OpenStack instance by instance ID. --- - + ### method `get_instances` @@ -159,7 +159,7 @@ Get all OpenStack instances. --- - + ### method `get_server_name` @@ -182,7 +182,7 @@ Get server name on OpenStack. --- - + ### method `get_ssh_connection` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 2c4d5e03c..b9a3aceb7 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -187,17 +187,7 @@ def launch_instance( # pylint: disable=R0913 "Attempting clean up of openstack server %s that timeout during creation", full_name, ) - try: - conn.delete_server(name_or_id=full_name, wait=True) - except ( - openstack.exceptions.SDKException, - openstack.exceptions.ResourceTimeout, - ): - logger.exception( - "Failed to cleanup openstack server %s that timeout during creation", - full_name, - ) - self._delete_keypair(conn, instance_id) + self._delete_instance(conn, full_name) raise OpenStackError(f"Timeout creating openstack server {full_name}") from err except openstack.exceptions.SDKException as err: logger.exception("Failed to create openstack server %s", full_name) @@ -241,16 +231,28 @@ def delete_instance(self, instance_id: str) -> None: with _get_openstack_connection( 
clouds_config=self._clouds_config, cloud=self._cloud ) as conn: - try: - server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) - if server is not None: - conn.delete_server(name_or_id=server.id) - OpenstackCloud._delete_keypair(conn, full_name) - except ( - openstack.exceptions.SDKException, - openstack.exceptions.ResourceTimeout, - ) as err: - raise OpenStackError(f"Failed to remove openstack runner {full_name}") from err + self._delete_instance(conn, full_name) + + def _delete_instance(self, conn: OpenstackConnection, full_name: str) -> None: + """Delete a openstack instance. + + Raises: + OpenStackError: Unable to delete OpenStack server. + + Args: + conn: The openstack connection to use. + full_name: The full name of the server. + """ + try: + server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) + if server is not None: + conn.delete_server(name_or_id=server.id) + OpenstackCloud._delete_keypair(conn, full_name) + except ( + openstack.exceptions.SDKException, + openstack.exceptions.ResourceTimeout, + ) as err: + raise OpenStackError(f"Failed to remove openstack runner {full_name}") from err def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: """Get SSH connection to an OpenStack instance. diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 2d1e2bb12..455f5e396 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -317,6 +317,7 @@ def _get_runner_health(self) -> RunnerHealth: CloudRunnerState.DELETED, CloudRunnerState.ERROR, CloudRunnerState.STOPPED, + CloudRunnerState.UNKNOWN, ) or not self._health_check(runner): unhealthy.append(runner) else: From 2562939ac7d3915544f229e8f34ba10d4601ea1c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 16:57:28 +0800 Subject: [PATCH 178/278] Test spawning two runners. 
--- .github/workflows/e2e_test.yaml | 4 ++- .github/workflows/integration_test.yaml | 4 ++- src-docs/openstack_cloud.openstack_cloud.md | 14 +++----- src/manager/cloud_runner_manager.py | 3 +- src/manager/github_runner_manager.py | 9 +++-- src/manager/runner_manager.py | 19 ++++++++--- src/openstack_cloud/openstack_cloud.py | 7 ++-- .../test_runner_manager_openstack.py | 34 +++++++++++++++++++ 8 files changed, 68 insertions(+), 26 deletions(-) diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 5933451ee..f950dc646 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,7 +1,9 @@ name: End-to-End tests on: - pull_request: + # TODO: Debug only + # pull_request: + workflow_dispatch: jobs: # test option values defined at test/conftest.py are passed on via repository secret diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 8e0bc700a..41d9e5c7f 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -1,7 +1,9 @@ name: integration-tests on: - pull_request: + # TODO: Debug only + # pull_request: + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index c49f9779f..d0895577a 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -83,7 +83,7 @@ Create the object. --- - + ### method `cleanup` @@ -107,12 +107,6 @@ Delete a openstack instance. -**Raises:** - - - `OpenStackError`: Unable to delete OpenStack server. - - - **Args:** - `instance_id`: The instance ID of the instance to delete. @@ -142,7 +136,7 @@ Get OpenStack instance by instance ID. --- - + ### method `get_instances` @@ -159,7 +153,7 @@ Get all OpenStack instances. 
--- - + ### method `get_server_name` @@ -182,7 +176,7 @@ Get server name on OpenStack. --- - + ### method `get_ssh_connection` diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index dd5d5424e..78fac93be 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -4,9 +4,9 @@ """Interface of manager of runner instance on clouds.""" import abc +import logging from dataclasses import dataclass from enum import Enum -import logging from typing import Iterator, Sequence, Tuple from metrics.runner import RunnerMetrics @@ -16,7 +16,6 @@ InstanceId = str - class CloudRunnerState(str, Enum): """Represent state of the instance hosting the runner. diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index e72d6bd64..4d6ae5788 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -16,7 +16,8 @@ class GithubRunnerState(str, Enum): Attributes: BUSY: Runner is working on a job assigned by GitHub. - IDLE: Runner is waiting to take a job or is running pre-job tasks (i.e. repo-policy-compliance check). + IDLE: Runner is waiting to take a job or is running pre-job tasks (i.e. + repo-policy-compliance check). OFFLINE: Runner is not connected to GitHub. """ @@ -75,7 +76,7 @@ def get_runners( runner for runner in runner_list if runner.name.startswith(self._prefix) - and GithubRunnerManager._filter_runner_state(runner, states) + and GithubRunnerManager._is_runner_in_state(runner, states) ) def delete_runners(self, states: Sequence[GithubRunnerState] | None = None) -> None: @@ -110,12 +111,14 @@ def get_removal_token(self) -> str: @staticmethod def _is_runner_in_state( - runner: SelfHostedRunner, states: Sequence[GithubRunnerState] | None + runner: SelfHostedRunner, states: Sequence[GithubRunnerState] | None ) -> bool: """Check that the runner is in one of the states provided. + Args: runner: Runner to filter. 
states: States in which to check the runner belongs to. + Returns: True if the runner is in one of the state, else false. """ diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 6ee4c275e..dde83c00b 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -10,7 +10,7 @@ from typing import Iterator, Sequence, Type, cast from charm_state import GithubPath -from errors import GithubMetricsError +from errors import GithubMetricsError, RunnerCreateError from github_type import SelfHostedRunner from manager.cloud_runner_manager import ( CloudRunnerInstance, @@ -116,10 +116,21 @@ def create_runners(self, num: int) -> tuple[InstanceId]: create_runner_args = [ RunnerManager._CreateRunnerArgs(self._cloud, registration_token) for _ in range(num) ] + instance_id_list = [] with Pool(processes=min(num, 10)) as pool: - instance_ids = pool.map(func=RunnerManager._create_runner, iterable=create_runner_args) - - return tuple(instance_ids) + jobs = pool.imap_unordered( + func=RunnerManager._create_runner, iterable=create_runner_args + ) + for _ in range(num): + try: + instance_id = next(jobs) + except RunnerCreateError: + logger.exception("Failed to spawn a runner.") + except StopIteration: + break + else: + instance_id_list.append(instance_id) + return tuple(instance_id_list) def get_runners( self, diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index b9a3aceb7..6406a8c43 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -187,7 +187,7 @@ def launch_instance( # pylint: disable=R0913 "Attempting clean up of openstack server %s that timeout during creation", full_name, ) - self._delete_instance(conn, full_name) + self._delete_instance(conn, full_name) raise OpenStackError(f"Timeout creating openstack server {full_name}") from err except openstack.exceptions.SDKException as err: logger.exception("Failed to create openstack server %s", 
full_name) @@ -219,9 +219,6 @@ def get_instance(self, instance_id: str) -> OpenstackInstance | None: def delete_instance(self, instance_id: str) -> None: """Delete a openstack instance. - Raises: - OpenStackError: Unable to delete OpenStack server. - Args: instance_id: The instance ID of the instance to delete. """ @@ -232,7 +229,7 @@ def delete_instance(self, instance_id: str) -> None: clouds_config=self._clouds_config, cloud=self._cloud ) as conn: self._delete_instance(conn, full_name) - + def _delete_instance(self, conn: OpenstackConnection, full_name: str) -> None: """Delete a openstack instance. diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 3818ed17f..82a1bbe0b 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -369,3 +369,37 @@ async def test_runner_normal_lifecycle( assert metric_logs[1]["workflow"] == "Workflow Dispatch Wait Tests" await assert_runner_amount(runner_manager_with_one_runner, 0) + + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_runner_spawn_two( + runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager +) -> None: + """ + Arrange: RunnerManager instance with no runners. + Act: + 1. Create two runner. + 3. Delete all idle runner. + Assert: + 1. Two active idle runner. + 3. No runners. + """ + # 1. + runner_id_list = runner_manager.create_runners(1) + assert isinstance(runner_id_list, tuple) + assert len(runner_id_list) == 2 + + try: + await assert_runner_amount(runner_manager, 2) + except TimeoutError as err: + raise AssertionError("Test arrange failed: Expect two runner") from err + + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert len(runner_list) == 2 + + # 3. 
+ runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) + await assert_runner_amount(runner_manager, 0) From fc11db3424e261e412cdc6ea642420b2168c5a3e Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 17:14:25 +0800 Subject: [PATCH 179/278] Fix test --- tests/integration/test_runner_manager_openstack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 82a1bbe0b..6160b0b4a 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -387,7 +387,7 @@ async def test_runner_spawn_two( 3. No runners. """ # 1. - runner_id_list = runner_manager.create_runners(1) + runner_id_list = runner_manager.create_runners(2) assert isinstance(runner_id_list, tuple) assert len(runner_id_list) == 2 From 6dad4ab22d645a389088ff91baaa0d4e085e1125 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 14 Aug 2024 17:40:09 +0800 Subject: [PATCH 180/278] Fix naming --- .github/workflows/e2e_test.yaml | 4 +--- .github/workflows/integration_test.yaml | 4 +--- src/openstack_cloud/openstack_cloud.py | 8 +++++--- src/openstack_cloud/openstack_manager.py | 10 +++++----- src/openstack_cloud/openstack_runner_manager.py | 10 +++++----- tests/unit/test_openstack_manager.py | 16 ++++++++-------- 6 files changed, 25 insertions(+), 27 deletions(-) diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index f950dc646..5933451ee 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,9 +1,7 @@ name: End-to-End tests on: - # TODO: Debug only - # pull_request: - workflow_dispatch: + pull_request: jobs: # test option values defined at test/conftest.py are passed on via repository secret diff --git a/.github/workflows/integration_test.yaml 
b/.github/workflows/integration_test.yaml index 41d9e5c7f..8e0bc700a 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -1,9 +1,7 @@ name: integration-tests on: - # TODO: Debug only - # pull_request: - workflow_dispatch: + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 6406a8c43..6de3bfeba 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -15,7 +15,7 @@ import openstack.exceptions import paramiko import yaml -from fabric import Connection as SshConnection +from fabric import Connection as SSHConnection from openstack.compute.v2.keypair import Keypair as OpenstackKeypair from openstack.compute.v2.server import Server as OpenstackServer from openstack.connection import Connection as OpenstackConnection @@ -251,7 +251,7 @@ def _delete_instance(self, conn: OpenstackConnection, full_name: str) -> None: ) as err: raise OpenStackError(f"Failed to remove openstack runner {full_name}") from err - def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: + def get_ssh_connection(self, instance: OpenstackInstance) -> SSHConnection: """Get SSH connection to an OpenStack instance. Args: @@ -275,7 +275,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SshConnection: for ip in instance.addresses: try: - connection = SshConnection( + connection = SSHConnection( host=ip, user="ubuntu", connect_kwargs={"key_filename": str(key_path)}, @@ -448,6 +448,8 @@ def _get_and_ensure_unique_server( if not servers: return None + # 2024/08/14: The `format` arg for `strptime` is the default format. + # This is only provided to get around a bug of the function with type checking. 
latest_server = reduce( lambda a, b: ( a diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index 35799d8bb..2893d65dd 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -32,7 +32,7 @@ import openstack.exceptions import openstack.image.v2.image import paramiko -from fabric import Connection as SshConnection +from fabric import Connection as SSHConnection from openstack.compute.v2.server import Server from openstack.connection import Connection as OpenstackConnection from openstack.exceptions import SDKException @@ -556,7 +556,7 @@ def _ssh_health_check(conn: OpenstackConnection, server_name: str, startup: bool @retry(tries=3, delay=5, max_delay=60, backoff=2, local_logger=logger) def _get_ssh_connection( conn: OpenstackConnection, server_name: str, timeout: int = 30 - ) -> SshConnection: + ) -> SSHConnection: """Get a valid ssh connection within a network for a given openstack instance. The SSH connection will attempt to establish connection until the timeout configured. @@ -593,7 +593,7 @@ def _get_ssh_connection( ] for ip in server_addresses: try: - connection = SshConnection( + connection = SSHConnection( host=ip, user="ubuntu", connect_kwargs={"key_filename": str(key_path)}, @@ -1093,7 +1093,7 @@ def _pull_metrics(self, conn: OpenstackConnection, instance_name: str) -> None: return def _pull_file( - self, ssh_conn: SshConnection, remote_path: str, local_path: str, max_size: int + self, ssh_conn: SSHConnection, remote_path: str, local_path: str, max_size: int ) -> None: """Pull file from the runner instance. 
@@ -1589,7 +1589,7 @@ def _kill_runner_processes(self, conn: OpenstackConnection, mode: FlushMode) -> servers = self._get_openstack_instances(conn=conn) for server in servers: - ssh_conn: SshConnection = self._get_ssh_connection(conn=conn, server_name=server.name) + ssh_conn: SSHConnection = self._get_ssh_connection(conn=conn, server_name=server.name) result: invoke.runners.Result = ssh_conn.run( killer_command, warn=True, diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 455f5e396..5e00847c8 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -14,7 +14,7 @@ import jinja2 import paramiko import paramiko.ssh_exception -from fabric import Connection as SshConnection +from fabric import Connection as SSHConnection from charm_state import GithubOrg, GithubPath, ProxyConfig, SSHDebugConnection from errors import ( @@ -428,7 +428,7 @@ def _health_check(self, instance: OpenstackInstance) -> bool: return OpenstackRunnerManager._run_health_check(ssh_conn, instance.server_name) @staticmethod - def _run_health_check(ssh_conn: SshConnection, name: str) -> bool: + def _run_health_check(ssh_conn: SSHConnection, name: str) -> bool: """Run a health check for runner process. Args: @@ -557,7 +557,7 @@ def _issue_runner_installed_metric( ) @staticmethod - def _pull_runner_metrics(name: str, ssh_conn: SshConnection) -> None: + def _pull_runner_metrics(name: str, ssh_conn: SSHConnection) -> None: """Pull metrics from runner. Args: @@ -596,7 +596,7 @@ def _pull_runner_metrics(name: str, ssh_conn: SshConnection) -> None: @staticmethod def _ssh_pull_file( - ssh_conn: SshConnection, remote_path: str, local_path: str, max_size: int + ssh_conn: SSHConnection, remote_path: str, local_path: str, max_size: int ) -> None: """Pull file from the runner instance. 
@@ -654,7 +654,7 @@ def _ssh_pull_file( @staticmethod def _run_github_runner_removal_script( - instance_name: str, ssh_conn: SshConnection, remove_token: str + instance_name: str, ssh_conn: SSHConnection, remove_token: str ) -> None: """Run Github runner removal script. diff --git a/tests/unit/test_openstack_manager.py b/tests/unit/test_openstack_manager.py index 6aabc7361..9399cd46e 100644 --- a/tests/unit/test_openstack_manager.py +++ b/tests/unit/test_openstack_manager.py @@ -10,7 +10,7 @@ import openstack.connection import openstack.exceptions import pytest -from fabric.connection import Connection as SshConnection +from fabric.connection import Connection as SSHConnection from invoke import Result from openstack.compute.v2.keypair import Keypair from openstack.compute.v2.server import Server @@ -57,7 +57,7 @@ def patch_get_ssh_connection_health_check_fixture(monkeypatch: pytest.MonkeyPatc mock_get_ssh_connection = MagicMock( spec=openstack_manager.OpenstackRunnerManager._get_ssh_connection ) - mock_ssh_connection = MagicMock(spec=SshConnection) + mock_ssh_connection = MagicMock(spec=SSHConnection) mock_ssh_connection.host = "test host IP" mock_result = MagicMock(spec=Result) mock_result.ok = True @@ -79,7 +79,7 @@ def ssh_connection_health_check_fixture(monkeypatch: pytest.MonkeyPatch): mock_get_ssh_connection = MagicMock( spec=openstack_manager.OpenstackRunnerManager._get_ssh_connection ) - mock_ssh_connection = MagicMock(spec=SshConnection) + mock_ssh_connection = MagicMock(spec=SSHConnection) mock_ssh_connection.host = "test host IP" mock_result = MagicMock(spec=Result) mock_result.ok = True @@ -97,7 +97,7 @@ def patch_ssh_connection_error_fixture(monkeypatch: pytest.MonkeyPatch): mock_get_ssh_connection = MagicMock( spec=openstack_manager.OpenstackRunnerManager._get_ssh_connection ) - mock_ssh_connection = MagicMock(spec=SshConnection) + mock_ssh_connection = MagicMock(spec=SSHConnection) mock_result = MagicMock(spec=Result) mock_result.ok = False 
mock_result.stdout = "Mock stdout" @@ -153,7 +153,7 @@ def patched_create_connection_context_fixture(monkeypatch: pytest.MonkeyPatch): def ssh_connection_mock_fixture() -> MagicMock: """Return a mocked ssh connection.""" test_file_content = secrets.token_hex(16) - ssh_conn_mock = MagicMock(spec=openstack_manager.SshConnection) + ssh_conn_mock = MagicMock(spec=openstack_manager.SSHConnection) ssh_conn_mock.get.side_effect = lambda remote, local: Path(local).write_text(test_file_content) ssh_conn_mock.run.side_effect = lambda cmd, **kwargs: ( Result(stdout="1") if cmd.startswith("stat") else Result() @@ -862,7 +862,7 @@ def test__ssh_health_check_error(monkeypatch: pytest.MonkeyPatch, mock_server: M mock_ssh_connection = MagicMock() mock_ssh_connection.run = MagicMock(side_effect=TimeoutError) monkeypatch.setattr( - openstack_manager, "SshConnection", MagicMock(return_value=mock_ssh_connection) + openstack_manager, "SSHConnection", MagicMock(return_value=mock_ssh_connection) ) with pytest.raises(openstack_manager._SSHError) as exc: @@ -1132,7 +1132,7 @@ def test__get_ssh_connection_server_no_valid_connections( mock_ssh_connection = MagicMock() mock_ssh_connection.run = run monkeypatch.setattr( - openstack_manager, "SshConnection", MagicMock(return_value=mock_ssh_connection) + openstack_manager, "SSHConnection", MagicMock(return_value=mock_ssh_connection) ) with pytest.raises(openstack_manager._SSHError) as exc: @@ -1164,7 +1164,7 @@ def test__get_ssh_connection_server(monkeypatch: pytest.MonkeyPatch): return_value=factories.MockSSHRunResult(exited=0, stdout="hello world") ) monkeypatch.setattr( - openstack_manager, "SshConnection", MagicMock(return_value=mock_ssh_connection) + openstack_manager, "SSHConnection", MagicMock(return_value=mock_ssh_connection) ) assert ( From a4bca24d3ee7f95de6a1d9bb583ebe085b37f170 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 15 Aug 2024 14:36:25 +0800 Subject: [PATCH 181/278] Fix 
according comment --- pyproject.toml | 4 ++-- src-docs/openstack_cloud.openstack_cloud.md | 22 +++++++++---------- src/openstack_cloud/openstack_cloud.py | 11 +++++----- .../openstack_runner_manager.py | 2 +- .../test_runner_manager_openstack.py | 5 +---- 5 files changed, 20 insertions(+), 24 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 458b72d93..f4a49bd2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,9 +57,9 @@ max-doc-length = 99 max-complexity = 10 exclude = [".git", "__pycache__", ".tox", "build", "dist", "*.egg_info", "venv"] select = ["E", "W", "F", "C", "N", "R", "D", "H"] -# Ignore W503, E501 because using black creates errors with this +# Ignore W503, E501, E203 because using black creates errors with this # Ignore D107 Missing docstring in __init__ -ignore = ["W503", "E501", "D107"] +ignore = ["W503", "E501", "D107", "E203"] # D100, D101, D102, D103, D104: Ignore docstring style issues in tests # temporary disable E402 for the fix in charm.py for lp:2058335 per-file-ignores = ["src/charm.py:E402", "tests/*:D100,D101,D102,D103,D104,D205,D212"] diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index d0895577a..0fe0f5cdb 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -53,14 +53,14 @@ Construct the object. --- - + ## class `OpenstackCloud` Client to interact with OpenStack cloud. The OpenStack server name is managed by this cloud. Caller refers to the instances via instance_id. If the caller needs the server name, e.g., for logging, it can be queried with get_server_name. - + ### method `__init__` @@ -83,7 +83,7 @@ Create the object. --- - + ### method `cleanup` @@ -95,7 +95,7 @@ Cleanup unused openstack resources. --- - + ### method `delete_instance` @@ -113,7 +113,7 @@ Delete a openstack instance. --- - + ### method `get_instance` @@ -136,7 +136,7 @@ Get OpenStack instance by instance ID. 
--- - + ### method `get_instances` @@ -153,7 +153,7 @@ Get all OpenStack instances. --- - + ### method `get_server_name` @@ -176,7 +176,7 @@ Get server name on OpenStack. --- - + ### method `get_ssh_connection` @@ -206,7 +206,7 @@ Get SSH connection to an OpenStack instance. --- - + ### method `launch_instance` @@ -216,7 +216,7 @@ launch_instance( image: str, flavor: str, network: str, - userdata: str + cloud_init: str ) → OpenstackInstance ``` @@ -230,7 +230,7 @@ Create an OpenStack instance. - `image`: The image used to create the instance. - `flavor`: The flavor used to create the instance. - `network`: The network used to create the instance. - - `userdata`: The cloud init userdata to startup the instance. + - `cloud_init`: The cloud init userdata to startup the instance. diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 6de3bfeba..2990d2a9d 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) -_CLOUDS_YAML_PATH = Path(Path.home() / ".config/openstack/clouds.yaml") +_CLOUDS_YAML_PATH = Path.home() / ".config/openstack/clouds.yaml" # Update the version when the security group rules are not backward compatible. _SECURITY_GROUP_NAME = "github-runner-v1" @@ -80,8 +80,7 @@ def __init__(self, server: OpenstackServer, prefix: str): raise ValueError( f"Found openstack server {server.name} managed under prefix {prefix}, contact devs" ) - # Disable E203 (space before :) as it conflicts with the formatter (black). - self.instance_id = self.server_name[len(prefix) + 1 :] # noqa: E203 + self.instance_id = self.server_name[len(prefix) + 1 :] @contextmanager @@ -142,7 +141,7 @@ def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): # Ignore "Too many arguments" as 6 args should be fine. Move to a dataclass if new args are # added. 
def launch_instance( # pylint: disable=R0913 - self, instance_id: str, image: str, flavor: str, network: str, userdata: str + self, instance_id: str, image: str, flavor: str, network: str, cloud_init: str ) -> OpenstackInstance: """Create an OpenStack instance. @@ -151,7 +150,7 @@ def launch_instance( # pylint: disable=R0913 image: The image used to create the instance. flavor: The flavor used to create the instance. network: The network used to create the instance. - userdata: The cloud init userdata to startup the instance. + cloud_init: The cloud init userdata to startup the instance. Raises: OpenStackError: Unable to create OpenStack server. @@ -176,7 +175,7 @@ def launch_instance( # pylint: disable=R0913 flavor=flavor, network=network, security_groups=[security_group.id], - userdata=userdata, + userdata=cloud_init, auto_ip=False, timeout=_CREATE_SERVER_TIMEOUT, wait=True, diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 5e00847c8..61fc212a8 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -158,7 +158,7 @@ def create_runner(self, registration_token: str) -> InstanceId: image=self.config.image, flavor=self.config.flavor, network=self.config.network, - userdata=userdata, + cloud_init=userdata, ) except OpenStackError as err: raise RunnerCreateError(f"Failed to create {instance_name} openstack runner") from err diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 6160b0b4a..1685d6548 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -355,10 +355,7 @@ async def test_runner_normal_lifecycle( assert metric_log_full_content.startswith( metric_log_existing_content ), "The metric log was modified in ways other than appending" - # Disable E203 (space before :) as it conflicts with the formatter 
(black). - metric_log_new_content = metric_log_full_content[ - len(metric_log_existing_content) : # noqa: E203 - ] + metric_log_new_content = metric_log_full_content[len(metric_log_existing_content) :] metric_logs = [json.loads(metric) for metric in metric_log_new_content.splitlines()] assert ( len(metric_logs) == 2 From c7e0ab324e0f13084db184151ac3c83940634d1c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:07:17 +0800 Subject: [PATCH 182/278] Fix clouds yaml write issue. --- src-docs/openstack_cloud.openstack_cloud.md | 22 ++++++++++----------- src/openstack_cloud/openstack_cloud.py | 9 +++++++-- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 0fe0f5cdb..316dadba1 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -9,7 +9,7 @@ Class for accessing OpenStack API for managing servers. --- - + ## class `OpenstackInstance` Represents an OpenStack instance. @@ -24,7 +24,7 @@ Represents an OpenStack instance. - `addresses`: IP addresses assigned to the server. - `status`: Status of the server. - + ### method `__init__` @@ -53,14 +53,14 @@ Construct the object. --- - + ## class `OpenstackCloud` Client to interact with OpenStack cloud. The OpenStack server name is managed by this cloud. Caller refers to the instances via instance_id. If the caller needs the server name, e.g., for logging, it can be queried with get_server_name. - + ### method `__init__` @@ -83,7 +83,7 @@ Create the object. --- - + ### method `cleanup` @@ -95,7 +95,7 @@ Cleanup unused openstack resources. --- - + ### method `delete_instance` @@ -113,7 +113,7 @@ Delete a openstack instance. --- - + ### method `get_instance` @@ -136,7 +136,7 @@ Get OpenStack instance by instance ID. --- - + ### method `get_instances` @@ -153,7 +153,7 @@ Get all OpenStack instances. 
--- - + ### method `get_server_name` @@ -176,7 +176,7 @@ Get server name on OpenStack. --- - + ### method `get_ssh_connection` @@ -206,7 +206,7 @@ Get SSH connection to an OpenStack instance. --- - + ### method `launch_instance` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 2990d2a9d..9a27f34b5 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -23,6 +23,7 @@ from paramiko.ssh_exception import NoValidConnectionsError from errors import KeyfileError, OpenStackError, SSHError +from utilities import retry logger = logging.getLogger(__name__) @@ -82,8 +83,8 @@ def __init__(self, server: OpenstackServer, prefix: str): ) self.instance_id = self.server_name[len(prefix) + 1 :] - @contextmanager +@retry(tries=2, delay=5, local_logger=logger) def _get_openstack_connection( clouds_config: dict[str, dict], cloud: str ) -> Iterator[OpenstackConnection]: @@ -103,7 +104,11 @@ def _get_openstack_connection( """ if not _CLOUDS_YAML_PATH.exists(): _CLOUDS_YAML_PATH.parent.mkdir(parents=True, exist_ok=True) - _CLOUDS_YAML_PATH.write_text(data=yaml.dump(clouds_config), encoding="utf-8") + + # Concurrency: Very small chance for the file to be corrupted due to multiple process calling + # this function and writing the file at the same time. This should cause the `conn.authorize` + # to fail, and retry of this function would resolve this. + _CLOUDS_YAML_PATH.write_text(data=yaml.dump(clouds_config), encoding="utf-8") # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but # I could not reproduce it. Therefore, no catch here for such exception. 
From f586aee4952b56c1e127c5e4d2ae075c81a522a9 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:16:28 +0800 Subject: [PATCH 183/278] Fix format --- src-docs/openstack_cloud.openstack_cloud.md | 18 +++++++++--------- src/openstack_cloud/openstack_cloud.py | 3 ++- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 316dadba1..8f639b1e6 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -53,14 +53,14 @@ Construct the object. --- - + ## class `OpenstackCloud` Client to interact with OpenStack cloud. The OpenStack server name is managed by this cloud. Caller refers to the instances via instance_id. If the caller needs the server name, e.g., for logging, it can be queried with get_server_name. - + ### method `__init__` @@ -83,7 +83,7 @@ Create the object. --- - + ### method `cleanup` @@ -95,7 +95,7 @@ Cleanup unused openstack resources. --- - + ### method `delete_instance` @@ -113,7 +113,7 @@ Delete a openstack instance. --- - + ### method `get_instance` @@ -136,7 +136,7 @@ Get OpenStack instance by instance ID. --- - + ### method `get_instances` @@ -153,7 +153,7 @@ Get all OpenStack instances. --- - + ### method `get_server_name` @@ -176,7 +176,7 @@ Get server name on OpenStack. --- - + ### method `get_ssh_connection` @@ -206,7 +206,7 @@ Get SSH connection to an OpenStack instance. 
--- - + ### method `launch_instance` diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 9a27f34b5..567d53031 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -83,6 +83,7 @@ def __init__(self, server: OpenstackServer, prefix: str): ) self.instance_id = self.server_name[len(prefix) + 1 :] + @contextmanager @retry(tries=2, delay=5, local_logger=logger) def _get_openstack_connection( @@ -105,7 +106,7 @@ def _get_openstack_connection( if not _CLOUDS_YAML_PATH.exists(): _CLOUDS_YAML_PATH.parent.mkdir(parents=True, exist_ok=True) - # Concurrency: Very small chance for the file to be corrupted due to multiple process calling + # Concurrency: Very small chance for the file to be corrupted due to multiple process calling # this function and writing the file at the same time. This should cause the `conn.authorize` # to fail, and retry of this function would resolve this. _CLOUDS_YAML_PATH.write_text(data=yaml.dump(clouds_config), encoding="utf-8") From 95349f8eac1eade42e4c29215727b93d968bb49d Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:36:00 +0800 Subject: [PATCH 184/278] Add delete runner by amount --- src/manager/runner_manager.py | 57 ++++++++++++++----- .../test_runner_manager_openstack.py | 8 +-- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index dde83c00b..3421f2b45 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -190,10 +190,26 @@ def get_runners( ] return cast(tuple[RunnerInstance], tuple(runner_instances)) - def delete_runners( + def delete_runners(self, num: int) -> IssuedMetricEventsStats: + """Delete runners. + + Args: + num: The number of runner to delete. + + Returns: + Stats on metrics events issued during the deletion of runners. 
+ """ + logger.info("Deleting %s number of runners", num) + runners_list = self.get_runners()[:num] + runner_names = [runner.name for runner in runners_list] + logger.info("Deleting runners: %s", runner_names) + remove_token = self._github.get_removal_token() + return self._delete_runners(runners=runners_list, remove_token=remove_token) + + def flush_runners( self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE ) -> IssuedMetricEventsStats: - """Delete the runners. + """Delete runners according to state. Args: flush_mode: The type of runners affect by the deletion. @@ -203,9 +219,9 @@ def delete_runners( """ match flush_mode: case FlushMode.FLUSH_IDLE: - logger.info("Deleting idle runners...") + logger.info("Flushing idle runners...") case FlushMode.FLUSH_BUSY: - logger.info("Deleting idle and busy runners...") + logger.info("Flushing idle and busy runners...") case _: logger.critical( "Unknown flush mode %s encountered, contact developers", flush_mode @@ -217,17 +233,9 @@ def delete_runners( runners_list = self.get_runners(github_runner_state=states) runner_names = [runner.name for runner in runners_list] - logger.info("Deleting runners: %s", runner_names) + logger.info("Flushing runners: %s", runner_names) remove_token = self._github.get_removal_token() - - runner_metrics_list = [] - for runner in runners_list: - deleted_runner_metrics = self._cloud.delete_runner( - instance_id=runner.id, remove_token=remove_token - ) - if deleted_runner_metrics is not None: - runner_metrics_list.append(deleted_runner_metrics) - return self._issue_runner_metrics(metrics=iter(runner_metrics_list)) + return self._delete_runners(runners=runners_list, remove_token=remove_token) def cleanup(self) -> IssuedMetricEventsStats: """Run cleanup of the runners and other resources. 
@@ -240,6 +248,27 @@ def cleanup(self) -> IssuedMetricEventsStats: deleted_runner_metrics = self._cloud.cleanup(remove_token) return self._issue_runner_metrics(metrics=deleted_runner_metrics) + def _delete_runners( + self, runners: Sequence[RunnerInstance], remove_token: str + ) -> IssuedMetricEventsStats: + """Delete list of runners. + + Args: + runners: The runners to delete. + remove_token: The token for removing self-hosted runners. + + Returns: + Stats on metrics events issued during the deletion of runners. + """ + runner_metrics_list = [] + for runner in runners: + deleted_runner_metrics = self._cloud.delete_runner( + instance_id=runner.id, remove_token=remove_token + ) + if deleted_runner_metrics is not None: + runner_metrics_list.append(deleted_runner_metrics) + return self._issue_runner_metrics(metrics=iter(runner_metrics_list)) + def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetricEventsStats: """Issue runner metrics. diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 1685d6548..b2a75e9ed 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -255,7 +255,7 @@ async def test_runner_normal_idle_lifecycle( assert openstack_runner_manager._health_check(runner) # 3. - runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) + runner_manager.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) await assert_runner_amount(runner_manager, 0) @@ -298,7 +298,7 @@ async def test_runner_flush_busy_lifecycle( assert busy_runner.github_state == GithubRunnerState.BUSY # 2. 
- runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) + runner_manager_with_one_runner.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) runner_list = runner_manager_with_one_runner.get_runners() assert len(runner_list) == 1 busy_runner = runner_list[0] @@ -306,7 +306,7 @@ async def test_runner_flush_busy_lifecycle( assert busy_runner.github_state == GithubRunnerState.BUSY # 3. - runner_manager_with_one_runner.delete_runners(flush_mode=FlushMode.FLUSH_BUSY) + runner_manager_with_one_runner.flush_runners(flush_mode=FlushMode.FLUSH_BUSY) issue_metrics_events = runner_manager_with_one_runner.cleanup() assert issue_metrics_events[events.RunnerStart] == 1 @@ -398,5 +398,5 @@ async def test_runner_spawn_two( assert len(runner_list) == 2 # 3. - runner_manager.delete_runners(flush_mode=FlushMode.FLUSH_IDLE) + runner_manager.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) await assert_runner_amount(runner_manager, 0) From a9e6323f60d440c8506128ce9e0e4485af6812ee Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 15 Aug 2024 17:54:47 +0800 Subject: [PATCH 185/278] Add getting runner health state for metrics --- ...penstack_cloud.openstack_runner_manager.md | 37 ++++++++++++++----- src/manager/cloud_runner_manager.py | 5 +++ src/manager/runner_manager.py | 9 +++++ .../openstack_runner_manager.py | 13 +++++++ 4 files changed, 54 insertions(+), 10 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 7c7acd442..d292b0e87 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -17,7 +17,7 @@ Manager for self-hosted runner on OpenStack. --- - + ## class `OpenstackRunnerManagerConfig` Configuration for OpenstackRunnerManager. @@ -70,7 +70,7 @@ __init__( --- - + ## class `RunnerHealth` Runners with health state. 
@@ -103,12 +103,12 @@ __init__( --- - + ## class `OpenstackRunnerManager` Manage self-hosted runner on OpenStack cloud. - + ### method `__init__` @@ -130,7 +130,7 @@ Construct the object. --- - + ### method `cleanup` @@ -153,7 +153,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -182,7 +182,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -206,7 +206,7 @@ Delete self-hosted runners. --- - + ### method `get_name_prefix` @@ -223,7 +223,7 @@ Get the name prefix of the self-hosted runners. --- - + ### method `get_runner` @@ -246,7 +246,24 @@ Get a self-hosted runner by instance id. --- - + + +### method `get_runner_health` + +```python +get_runner_health() → RunnerByHealth +``` + +Get the runner health state. + + + +**Returns:** + The runners by the health state. + +--- + + ### method `get_runners` diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 78fac93be..5a0aa2602 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -10,6 +10,7 @@ from typing import Iterator, Sequence, Tuple from metrics.runner import RunnerMetrics +from runner_type import RunnerByHealth logger = logging.getLogger(__name__) @@ -138,3 +139,7 @@ def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: Args: remove_token: The GitHub remove token. 
""" + + @abc.abstractmethod + def get_runner_health(self) -> RunnerByHealth: + """Get the runners health state.""" diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 3421f2b45..4ae3b7e6e 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -23,6 +23,7 @@ from metrics import github as github_metrics from metrics import runner as runner_metrics from metrics.runner import RunnerMetrics +from runner_type import RunnerByHealth logger = logging.getLogger(__name__) @@ -248,6 +249,14 @@ def cleanup(self) -> IssuedMetricEventsStats: deleted_runner_metrics = self._cloud.cleanup(remove_token) return self._issue_runner_metrics(metrics=deleted_runner_metrics) + def get_runner_health(self) -> RunnerByHealth: + """Get the runner health state. + + Returns: + The runners by the health state. + """ + return self._cloud.get_runner_health() + def _delete_runners( self, runners: Sequence[RunnerInstance], remove_token: str ) -> IssuedMetricEventsStats: diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 61fc212a8..cdb118933 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -39,6 +39,7 @@ from openstack_cloud.openstack_cloud import OpenstackCloud, OpenstackInstance from openstack_cloud.openstack_manager import GithubRunnerRemoveError from repo_policy_compliance_client import RepoPolicyComplianceClient +from runner_type import RunnerByHealth from utilities import retry logger = logging.getLogger(__name__) @@ -266,6 +267,18 @@ def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: self._openstack_cloud.cleanup() return metrics + def get_runner_health(self) -> RunnerByHealth: + """Get the runner health state. + + Returns: + The runners by the health state. 
+ """ + runners = self._get_runner_health() + return RunnerByHealth( + tuple(runner.server_name for runner in runners.healthy), + tuple(runner.server_name for runner in runners.unhealthy), + ) + def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None: """Delete self-hosted runners by openstack instance. From 6d09cb89c26bd58b9e4a75f25bf23dedc4ed8206 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:41:10 +0800 Subject: [PATCH 186/278] Fix security group ID issues --- src/openstack_cloud/openstack_cloud.py | 6 +++--- src/openstack_cloud/openstack_runner_manager.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 567d53031..a01aecbbd 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -587,14 +587,14 @@ def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: if not rule_exists_icmp: conn.create_security_group_rule( - secgroup_name_or_id=_SECURITY_GROUP_NAME, + secgroup_name_or_id=security_group.id, protocol="icmp", direction="ingress", ethertype="IPv4", ) if not rule_exists_ssh: conn.create_security_group_rule( - secgroup_name_or_id=_SECURITY_GROUP_NAME, + secgroup_name_or_id=security_group.id, port_range_min="22", port_range_max="22", protocol="tcp", @@ -603,7 +603,7 @@ def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: ) if not rule_exists_tmate_ssh: conn.create_security_group_rule( - secgroup_name_or_id=_SECURITY_GROUP_NAME, + secgroup_name_or_id=security_group.id, port_range_min="10022", port_range_max="10022", protocol="tcp", diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index cdb118933..9a78ac37d 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py 
@@ -477,7 +477,8 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) except SSHError as err: raise RunnerStartError( - f"Failed to SSH connect to {instance.server_name} openstack runner" + f"Failed to SSH to {instance.server_name} during creation possible due to setup " + "not completed" ) from err result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) From 134aac786ee211c678a9f4c8f6ddb1600bb3787a Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 16 Aug 2024 12:57:04 +0800 Subject: [PATCH 187/278] Fix according to review --- src-docs/openstack_cloud.openstack_cloud.md | 4 +- ...penstack_cloud.openstack_runner_manager.md | 8 +-- src-docs/runner_type.md | 2 +- src/manager/cloud_runner_manager.py | 4 +- src/manager/runner_manager.py | 26 ++++---- src/openstack_cloud/openstack_cloud.py | 62 +++++++------------ src/openstack_cloud/openstack_manager.py | 12 ++-- .../openstack_runner_manager.py | 57 ++++++++--------- src/runner_manager.py | 8 +-- src/runner_type.py | 2 +- tests/unit/test_openstack_manager.py | 10 +-- tests/unit/test_runner_manager.py | 6 +- 12 files changed, 93 insertions(+), 108 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_cloud.md b/src-docs/openstack_cloud.openstack_cloud.md index 8f639b1e6..d49b62008 100644 --- a/src-docs/openstack_cloud.openstack_cloud.md +++ b/src-docs/openstack_cloud.openstack_cloud.md @@ -91,7 +91,7 @@ Create the object. cleanup() → None ``` -Cleanup unused openstack resources. +Cleanup unused key files and openstack keypairs. --- @@ -141,7 +141,7 @@ Get OpenStack instance by instance ID. ### method `get_instances` ```python -get_instances() → tuple[OpenstackInstance] +get_instances() → tuple[OpenstackInstance, ] ``` Get all OpenStack instances. 
diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index d292b0e87..a96d9d2eb 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -88,8 +88,8 @@ Runners with health state. ```python __init__( - healthy: tuple[OpenstackInstance], - unhealthy: tuple[OpenstackInstance] + healthy: tuple[OpenstackInstance, ], + unhealthy: tuple[OpenstackInstance, ] ) → None ``` @@ -251,7 +251,7 @@ Get a self-hosted runner by instance id. ### method `get_runner_health` ```python -get_runner_health() → RunnerByHealth +get_runner_health() → RunnerNameByHealth ``` Get the runner health state. @@ -259,7 +259,7 @@ Get the runner health state. **Returns:** - The runners by the health state. + The names of the runner by health state. --- diff --git a/src-docs/runner_type.md b/src-docs/runner_type.md index 481a17a62..8c9db658a 100644 --- a/src-docs/runner_type.md +++ b/src-docs/runner_type.md @@ -11,7 +11,7 @@ Types used by Runner class. -## class `RunnerByHealth` +## class `RunnerNameByHealth` Set of runners instance by health state. 
diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 5a0aa2602..c26b9e7e8 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -10,7 +10,7 @@ from typing import Iterator, Sequence, Tuple from metrics.runner import RunnerMetrics -from runner_type import RunnerByHealth +from runner_type import RunnerNameByHealth logger = logging.getLogger(__name__) @@ -141,5 +141,5 @@ def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: """ @abc.abstractmethod - def get_runner_health(self) -> RunnerByHealth: + def get_runner_health(self) -> RunnerNameByHealth: """Get the runners health state.""" diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 4ae3b7e6e..2b011db60 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -23,7 +23,7 @@ from metrics import github as github_metrics from metrics import runner as runner_metrics from metrics.runner import RunnerMetrics -from runner_type import RunnerByHealth +from runner_type import RunnerNameByHealth logger = logging.getLogger(__name__) @@ -135,25 +135,25 @@ def create_runners(self, num: int) -> tuple[InstanceId]: def get_runners( self, - github_runner_state: Sequence[GithubRunnerState] | None = None, - cloud_runner_state: Sequence[CloudRunnerState] | None = None, + github_states: Sequence[GithubRunnerState] | None = None, + cloud_states: Sequence[CloudRunnerState] | None = None, ) -> tuple[RunnerInstance]: """Get information on runner filter by state. Only runners that has cloud instance are returned. Args: - github_runner_state: Filter for the runners with these github states. If None all + github_states: Filter for the runners with these github states. If None all states will be included. - cloud_runner_state: Filter for the runners with these cloud states. If None all states + cloud_states: Filter for the runners with these cloud states. If None all states will be included. 
Returns: Information on the runners. """ logger.info("Getting runners...") - github_infos = self._github.get_runners(github_runner_state) - cloud_infos = self._cloud.get_runners(cloud_runner_state) + github_infos = self._github.get_runners(github_states) + cloud_infos = self._cloud.get_runners(cloud_states) github_infos_map = {info.name: info for info in github_infos} cloud_infos_map = {info.name: info for info in cloud_infos} logger.info( @@ -179,15 +179,15 @@ def get_runners( ) for name in cloud_infos_map.keys() ] - if cloud_runner_state is not None: + if cloud_states is not None: runner_instances = [ - runner for runner in runner_instances if runner.cloud_state in cloud_runner_state + runner for runner in runner_instances if runner.cloud_state in cloud_states ] - if github_runner_state is not None: + if github_states is not None: runner_instances = [ runner for runner in runner_instances - if runner.github_state is not None and runner.github_state in github_runner_state + if runner.github_state is not None and runner.github_state in github_states ] return cast(tuple[RunnerInstance], tuple(runner_instances)) @@ -232,7 +232,7 @@ def flush_runners( if flush_mode == FlushMode.FLUSH_BUSY: states.append(GithubRunnerState.BUSY) - runners_list = self.get_runners(github_runner_state=states) + runners_list = self.get_runners(github_states=states) runner_names = [runner.name for runner in runners_list] logger.info("Flushing runners: %s", runner_names) remove_token = self._github.get_removal_token() @@ -249,7 +249,7 @@ def cleanup(self) -> IssuedMetricEventsStats: deleted_runner_metrics = self._cloud.cleanup(remove_token) return self._issue_runner_metrics(metrics=deleted_runner_metrics) - def get_runner_health(self) -> RunnerByHealth: + def get_runner_health(self) -> RunnerNameByHealth: """Get the runner health state. 
Returns: diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index a01aecbbd..ceac7fd6d 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -309,7 +309,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SSHConnection: f"addresses: {instance.addresses}" ) - def get_instances(self) -> tuple[OpenstackInstance]: + def get_instances(self) -> tuple[OpenstackInstance, ...]: """Get all OpenStack instances. Returns: @@ -320,26 +320,26 @@ def get_instances(self) -> tuple[OpenstackInstance]: with _get_openstack_connection( clouds_config=self._clouds_config, cloud=self._cloud ) as conn: - servers = self._get_openstack_instances(conn) - server_names = set(server.name for server in servers) - - instances = [] - for name in server_names: - # The server can be deleted between the `_get_openstack_instances` call and this - # line. This is an issues during tests. Hence the need for None check. - server = OpenstackCloud._get_and_ensure_unique_server(conn, name) - if server is not None: - instances.append(OpenstackInstance(server, self.prefix)) - return cast(tuple[OpenstackInstance], tuple(instances)) + instance_list = self._get_openstack_instances(conn) + server_names = set(server.name for server in instance_list) + + server_list = [ + OpenstackCloud._get_and_ensure_unique_server(conn, name) for name in server_names + ] + return tuple( + OpenstackInstance(server, self.prefix) + for server in server_list + if server is not None + ) def cleanup(self) -> None: - """Cleanup unused openstack resources.""" + """Cleanup unused key files and openstack keypairs.""" with _get_openstack_connection( clouds_config=self._clouds_config, cloud=self._cloud ) as conn: - server_list = self._get_openstack_instances(conn) - exclude_list = [server.name for server in server_list] - self._cleanup_key_files(conn, exclude_list) + instances = self._get_openstack_instances(conn) + exclude_list = [server.name 
for server in instances] + self._cleanup_key_files(exclude_list) self._cleanup_openstack_keypairs(conn, exclude_list) def get_server_name(self, instance_id: str) -> str: @@ -353,13 +353,10 @@ def get_server_name(self, instance_id: str) -> str: """ return f"{self.prefix}-{instance_id}" - def _cleanup_key_files( - self, conn: OpenstackConnection, exclude_instances: Iterable[str] - ) -> None: + def _cleanup_key_files(self, exclude_instances: Iterable[str]) -> None: """Delete all SSH key files except the specified instances. Args: - conn: The Openstack connection instance. exclude_instances: The keys of these instance will not be deleted. """ logger.info("Cleaning up SSH key files") @@ -375,16 +372,6 @@ def _cleanup_key_files( total += 1 if path.name in exclude_filename: continue - - keypair_name = path.name.split(".")[0] - try: - conn.delete_keypair(keypair_name) - except openstack.exceptions.SDKException: - logger.warning( - "Unable to delete OpenStack keypair associated with deleted key file %s ", - path.name, - ) - path.unlink() deleted += 1 logger.info("Found %s key files, clean up %s key files", total, deleted) @@ -403,7 +390,7 @@ def _cleanup_openstack_keypairs( for key in keypairs: # The `name` attribute is of resource.Body type. if key.name and str(key.name).startswith(self.prefix): - if str(key.name) in exclude_instances: + if str(key.name) in set(exclude_instances): continue try: @@ -414,7 +401,7 @@ def _cleanup_openstack_keypairs( key.name, ) - def _get_openstack_instances(self, conn: OpenstackConnection) -> tuple[OpenstackServer]: + def _get_openstack_instances(self, conn: OpenstackConnection) -> tuple[OpenstackServer, ...]: """Get the OpenStack servers managed by this unit. Args: @@ -423,13 +410,10 @@ def _get_openstack_instances(self, conn: OpenstackConnection) -> tuple[Openstack Returns: List of OpenStack instances. 
""" - return cast( - tuple[OpenstackServer], - tuple( - server - for server in cast(list[OpenstackServer], conn.list_servers()) - if server.name.startswith(f"{self.prefix}-") - ), + return tuple( + server + for server in cast(list[OpenstackServer], conn.list_servers()) + if server.name.startswith(f"{self.prefix}-") ) @staticmethod diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index 2893d65dd..6b6b3d082 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -62,7 +62,7 @@ from repo_policy_compliance_client import RepoPolicyComplianceClient from runner_manager import IssuedMetricEventsStats from runner_manager_type import FlushMode, OpenstackRunnerManagerConfig -from runner_type import GithubPath, RunnerByHealth, RunnerGithubInfo +from runner_type import GithubPath, RunnerGithubInfo, RunnerNameByHealth from utilities import retry, set_env_var logger = logging.getLogger(__name__) @@ -419,7 +419,7 @@ def get_github_runner_info(self) -> tuple[RunnerGithubInfo, ...]: if runner["name"].startswith(f"{self.instance_name}-") ) - def _get_openstack_runner_status(self, conn: OpenstackConnection) -> RunnerByHealth: + def _get_openstack_runner_status(self, conn: OpenstackConnection) -> RunnerNameByHealth: """Get status on OpenStack of each runner. Args: @@ -440,7 +440,7 @@ def _get_openstack_runner_status(self, conn: OpenstackConnection) -> RunnerByHea else: healthy_runner.append(instance.name) - return RunnerByHealth(healthy=tuple(healthy_runner), unhealthy=tuple(unhealthy_runner)) + return RunnerNameByHealth(healthy=tuple(healthy_runner), unhealthy=tuple(unhealthy_runner)) def _get_openstack_instances(self, conn: OpenstackConnection) -> list[Server]: """Get the OpenStack servers managed by this unit. 
@@ -1302,7 +1302,7 @@ def _clean_up_openstack_keypairs( ) def _clean_up_runners( - self, conn: OpenstackConnection, runner_by_health: RunnerByHealth, remove_token: str + self, conn: OpenstackConnection, runner_by_health: RunnerNameByHealth, remove_token: str ) -> None: """Clean up offline or unhealthy runners. @@ -1355,7 +1355,7 @@ def _scale( self, quantity: int, conn: OpenstackConnection, - runner_by_health: RunnerByHealth, + runner_by_health: RunnerNameByHealth, remove_token: str, ) -> int: """Scale the number of runners. @@ -1488,7 +1488,7 @@ def _issue_reconciliation_metric( metric_stats: IssuedMetricEventsStats, reconciliation_start_ts: float, reconciliation_end_ts: float, - runner_states: RunnerByHealth, + runner_states: RunnerNameByHealth, ) -> None: """Issue reconciliation metric. diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 9a78ac37d..5019fe853 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -39,7 +39,7 @@ from openstack_cloud.openstack_cloud import OpenstackCloud, OpenstackInstance from openstack_cloud.openstack_manager import GithubRunnerRemoveError from repo_policy_compliance_client import RepoPolicyComplianceClient -from runner_type import RunnerByHealth +from runner_type import RunnerNameByHealth from utilities import retry logger = logging.getLogger(__name__) @@ -105,8 +105,8 @@ class RunnerHealth: unhealthy: The list of unhealthy runners. """ - healthy: tuple[OpenstackInstance] - unhealthy: tuple[OpenstackInstance] + healthy: tuple[OpenstackInstance, ...] + unhealthy: tuple[OpenstackInstance, ...] 
class OpenstackRunnerManager(CloudRunnerManager): @@ -150,7 +150,7 @@ def create_runner(self, registration_token: str) -> InstanceId: start_timestamp = time.time() instance_id = OpenstackRunnerManager._generate_instance_id() instance_name = self._openstack_cloud.get_server_name(instance_id=instance_id) - userdata = self._generate_userdata( + cloud_init = self._generate_cloud_init( instance_name=instance_name, registration_token=registration_token ) try: @@ -159,7 +159,7 @@ def create_runner(self, registration_token: str) -> InstanceId: image=self.config.image, flavor=self.config.flavor, network=self.config.network, - cloud_init=userdata, + cloud_init=cloud_init, ) except OpenStackError as err: raise RunnerCreateError(f"Failed to create {instance_name} openstack runner") from err @@ -185,16 +185,16 @@ def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: Returns: Information on the runner instance. """ - name = self._openstack_cloud.get_server_name(instance_id) - instances_list = self._openstack_cloud.get_instances() - for instance in instances_list: - if instance.server_name == name: - return CloudRunnerInstance( - name=name, - instance_id=instance_id, - state=CloudRunnerState.from_openstack_server_status(instance.status), - ) - return None + instance = self._openstack_cloud.get_instance(instance_id) + return ( + CloudRunnerInstance( + name=instance.server_name, + instance_id=instance_id, + state=CloudRunnerState.from_openstack_server_status(instance.status), + ) + if instance is not None + else None + ) def get_runners( self, states: Sequence[CloudRunnerState] | None = None @@ -242,7 +242,7 @@ def delete_runner( return None metric = runner_metrics.extract( - metrics_storage_manager=metrics_storage, runners=instance.server_name + metrics_storage_manager=metrics_storage, runners=set(instance.server_name) ) self._delete_runner(instance, remove_token) return next(metric, None) @@ -267,14 +267,14 @@ def cleanup(self, remove_token: str) -> 
Iterator[runner_metrics.RunnerMetrics]: self._openstack_cloud.cleanup() return metrics - def get_runner_health(self) -> RunnerByHealth: + def get_runner_health(self) -> RunnerNameByHealth: """Get the runner health state. Returns: - The runners by the health state. + The names of the runner by health state. """ runners = self._get_runner_health() - return RunnerByHealth( + return RunnerNameByHealth( tuple(runner.server_name for runner in runners.healthy), tuple(runner.server_name for runner in runners.unhealthy), ) @@ -291,7 +291,7 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None self._pull_runner_metrics(instance.server_name, ssh_conn) try: - OpenstackRunnerManager._run_github_runner_removal_script( + OpenstackRunnerManager._run_runner_removal_script( instance.server_name, ssh_conn, remove_token ) except GithubRunnerRemoveError: @@ -326,18 +326,19 @@ def _get_runner_health(self) -> RunnerHealth: healthy, unhealthy = [], [] for runner in runner_list: cloud_state = CloudRunnerState.from_openstack_server_status(runner.status) - if cloud_state in ( - CloudRunnerState.DELETED, - CloudRunnerState.ERROR, - CloudRunnerState.STOPPED, - CloudRunnerState.UNKNOWN, + if cloud_state in set( + ( + CloudRunnerState.DELETED, + CloudRunnerState.ERROR, + CloudRunnerState.STOPPED, + ) ) or not self._health_check(runner): unhealthy.append(runner) else: healthy.append(runner) return RunnerHealth(healthy=tuple(healthy), unhealthy=tuple(unhealthy)) - def _generate_userdata(self, instance_name: str, registration_token: str) -> str: + def _generate_cloud_init(self, instance_name: str, registration_token: str) -> str: """Generate cloud init userdata. This is the script the openstack server runs on startup. @@ -347,7 +348,7 @@ def _generate_userdata(self, instance_name: str, registration_token: str) -> str registration_token: The GitHub runner registration token. Returns: - The userdata for openstack instance. 
+ The cloud init userdata for openstack instance. """ jinja = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"), autoescape=True) @@ -667,7 +668,7 @@ def _ssh_pull_file( raise _PullFileError(f"Unable to retrieve file {remote_path}") from exc @staticmethod - def _run_github_runner_removal_script( + def _run_runner_removal_script( instance_name: str, ssh_conn: SSHConnection, remove_token: str ) -> None: """Run Github runner removal script. diff --git a/src/runner_manager.py b/src/runner_manager.py index 25aca060e..8d68a68c9 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -43,7 +43,7 @@ from runner import LXD_PROFILE_YAML, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus from runner_manager_type import FlushMode, RunnerInfo, RunnerManagerClients, RunnerManagerConfig from runner_type import ProxySetting as RunnerProxySetting -from runner_type import RunnerByHealth +from runner_type import RunnerNameByHealth from utilities import execute_command, retry, set_env_var REMOVED_RUNNER_LOG_STR = "Removed runner: %s" @@ -222,7 +222,7 @@ def get_github_info(self) -> Iterator[RunnerInfo]: for runner in remote_runners.values() ) - def _get_runner_health_states(self) -> RunnerByHealth: + def _get_runner_health_states(self) -> RunnerNameByHealth: """Get all runners sorted into health groups. 
Returns: @@ -247,7 +247,7 @@ def _get_runner_health_states(self) -> RunnerByHealth: else: unhealthy.append(runner.name) - return RunnerByHealth(healthy, unhealthy) + return RunnerNameByHealth(healthy, unhealthy) def _create_runner( self, registration_token: str, resources: VirtualMachineResources, runner: Runner @@ -491,7 +491,7 @@ def _remove_runners(self, count: int, runners: list[Runner]) -> None: logger.info("There are no idle runners to remove.") def _cleanup_offline_runners( - self, runner_states: RunnerByHealth, all_runners: list[Runner] + self, runner_states: RunnerNameByHealth, all_runners: list[Runner] ) -> None: """Cleanup runners that are not running the github run.sh script. diff --git a/src/runner_type.py b/src/runner_type.py index ef4ce5f07..86769eafd 100644 --- a/src/runner_type.py +++ b/src/runner_type.py @@ -12,7 +12,7 @@ @dataclass -class RunnerByHealth: +class RunnerNameByHealth: """Set of runners instance by health state. Attributes: diff --git a/tests/unit/test_openstack_manager.py b/tests/unit/test_openstack_manager.py index 9399cd46e..5e43fb518 100644 --- a/tests/unit/test_openstack_manager.py +++ b/tests/unit/test_openstack_manager.py @@ -27,7 +27,7 @@ from openstack_cloud import openstack_manager from openstack_cloud.openstack_manager import MAX_METRICS_FILE_SIZE, METRICS_EXCHANGE_PATH from runner_manager_type import FlushMode -from runner_type import RunnerByHealth, RunnerGithubInfo +from runner_type import RunnerGithubInfo, RunnerNameByHealth from tests.unit import factories FAKE_MONGODB_URI = "mongodb://example.com/db" @@ -510,7 +510,7 @@ def test_reconcile_pulls_metric_files( monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerByHealth(healthy=(), unhealthy=("test_runner",)) + 
return_value=RunnerNameByHealth(healthy=(), unhealthy=("test_runner",)) ) ssh_connection_mock.get.side_effect = MagicMock() openstack_manager_for_reconcile.reconcile(quantity=0) @@ -545,7 +545,7 @@ def test_reconcile_does_not_pull_too_large_files( Result(stdout=f"{MAX_METRICS_FILE_SIZE + 1}") if cmd.startswith("stat") else Result() ) openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerByHealth(healthy=("test_runner",), unhealthy=()) + return_value=RunnerNameByHealth(healthy=("test_runner",), unhealthy=()) ) openstack_manager_for_reconcile.reconcile(quantity=0) @@ -570,7 +570,7 @@ def test_reconcile_issue_reconciliation_metrics( monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerByHealth(healthy=("test_runner",), unhealthy=()) + return_value=RunnerNameByHealth(healthy=("test_runner",), unhealthy=()) ) openstack_manager.runner_metrics.extract.return_value = (MagicMock() for _ in range(2)) @@ -635,7 +635,7 @@ def test_reconcile_ignores_metrics_for_openstack_online_runners( ] } openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerByHealth( + return_value=RunnerNameByHealth( healthy=(runner_names["healthy_online"], runner_names["healthy_offline"]), unhealthy=( runner_names["unhealthy_online"], diff --git a/tests/unit/test_runner_manager.py b/tests/unit/test_runner_manager.py index 94d3373d4..66b09cd60 100644 --- a/tests/unit/test_runner_manager.py +++ b/tests/unit/test_runner_manager.py @@ -29,7 +29,7 @@ from metrics.storage import MetricsStorage from runner import Runner, RunnerStatus from runner_manager import BUILD_IMAGE_SCRIPT_FILENAME, RunnerManager, RunnerManagerConfig -from runner_type import RunnerByHealth +from runner_type import RunnerNameByHealth from 
tests.unit.mock import TEST_BINARY, MockLxdImageManager FAKE_MONGODB_URI = "mongodb://example.com/db" @@ -268,7 +268,7 @@ def mock_get_runners(): # Create online runners. runner_manager._get_runners = mock_get_runners - runner_manager._get_runner_health_states = lambda: RunnerByHealth( + runner_manager._get_runner_health_states = lambda: RunnerNameByHealth( ( f"{runner_manager.instance_name}-0", f"{runner_manager.instance_name}-1", @@ -433,7 +433,7 @@ def mock_get_runners(): # Create online runners. runner_manager._get_runners = mock_get_runners - runner_manager._get_runner_health_states = lambda: RunnerByHealth( + runner_manager._get_runner_health_states = lambda: RunnerNameByHealth( healthy=( online_idle_runner_name, offline_idle_runner_name, From 4727f71f706edde6e65401f5d186056ace30f27a Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 16 Aug 2024 13:49:02 +0800 Subject: [PATCH 188/278] Refactor health state for runner --- src/manager/cloud_runner_manager.py | 23 ++++---- src/manager/runner_manager.py | 34 +++++++----- .../openstack_runner_manager.py | 53 ++++++++++--------- .../test_runner_manager_openstack.py | 2 +- 4 files changed, 60 insertions(+), 52 deletions(-) diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index c26b9e7e8..6a5abd5d8 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -6,9 +6,10 @@ import abc import logging from dataclasses import dataclass -from enum import Enum +from enum import Enum, auto from typing import Iterator, Sequence, Tuple +from manager.runner_manager import HealthState from metrics.runner import RunnerMetrics from runner_type import RunnerNameByHealth @@ -30,13 +31,13 @@ class CloudRunnerState(str, Enum): UNEXPECTED: An unknown state not accounted by the developer is encountered. 
""" - CREATED = "created" - ACTIVE = "active" - DELETED = "deleted" - ERROR = "error" - STOPPED = "stopped" - UNKNOWN = "unknown" - UNEXPECTED = "unexpected" + CREATED = auto() + ACTIVE = auto() + DELETED = auto() + ERROR = auto() + STOPPED = auto() + UNKNOWN = auto() + UNEXPECTED = auto() # Disable "Too many return statements" as this method is using case statement for converting # the states, which does not cause a complexity issue. @@ -81,11 +82,13 @@ class CloudRunnerInstance: Attributes: name: Name of the instance hosting the runner. instance_id: ID of the instance. + health: Health state of the runner. state: State of the instance hosting the runner. """ name: str instance_id: InstanceId + health: HealthState state: CloudRunnerState @@ -139,7 +142,3 @@ def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: Args: remove_token: The GitHub remove token. """ - - @abc.abstractmethod - def get_runner_health(self) -> RunnerNameByHealth: - """Get the runners health state.""" diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 2b011db60..9a7e7385d 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -40,21 +40,34 @@ class FlushMode(Enum): FLUSH_IDLE = auto() FLUSH_BUSY = auto() - - + +class HealthState(Enum): + """Health state of the runners. + + Attributes: + HEALTHY: The runner is healthy. + UNHEALTHY: The runner is not healthy. + UNKNOWN: Unable to get the health state. + """ + HEALTHY = auto() + UNHEALTHY= auto() + UNKNOWN=auto() + @dataclass class RunnerInstance: """Represents an instance of runner. Attributes: name: Full name of the runner. Managed by the cloud runner manager. - id: ID of the runner. Managed by the runner manager. + instance_id: ID of the runner. Managed by the runner manager. + health: The health state of the runner. github_state: State on github. cloud_state: State on cloud. 
""" name: str - id: InstanceId + instance_id: InstanceId + health: HealthState github_state: GithubRunnerState | None cloud_state: CloudRunnerState @@ -66,7 +79,8 @@ def __init__(self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedR github_info: Information on the GitHub of the runner. """ self.name = cloud_instance.name - self.id = cloud_instance.instance_id + self.instance_id = cloud_instance.instance_id + self.health = cloud_instance.health self.github_state = ( GithubRunnerState.from_runner(github_info) if github_info is not None else None ) @@ -249,14 +263,6 @@ def cleanup(self) -> IssuedMetricEventsStats: deleted_runner_metrics = self._cloud.cleanup(remove_token) return self._issue_runner_metrics(metrics=deleted_runner_metrics) - def get_runner_health(self) -> RunnerNameByHealth: - """Get the runner health state. - - Returns: - The runners by the health state. - """ - return self._cloud.get_runner_health() - def _delete_runners( self, runners: Sequence[RunnerInstance], remove_token: str ) -> IssuedMetricEventsStats: @@ -272,7 +278,7 @@ def _delete_runners( runner_metrics_list = [] for runner in runners: deleted_runner_metrics = self._cloud.delete_runner( - instance_id=runner.id, remove_token=remove_token + instance_id=runner.instance_id, remove_token=remove_token ) if deleted_runner_metrics is not None: runner_metrics_list.append(deleted_runner_metrics) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 5019fe853..4ea2ade47 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -33,6 +33,7 @@ CloudRunnerState, InstanceId, ) +from manager.runner_manager import HealthState from metrics import events as metric_events from metrics import runner as runner_metrics from metrics import storage as metrics_storage @@ -186,10 +187,12 @@ def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: Information on 
the runner instance. """ instance = self._openstack_cloud.get_instance(instance_id) + healthy = self._runner_health_check(instance=instance) return ( CloudRunnerInstance( name=instance.server_name, instance_id=instance_id, + health=HealthState.HEALTHY if healthy else HealthState.UNHEALTHY, state=CloudRunnerState.from_openstack_server_status(instance.status), ) if instance is not None @@ -213,6 +216,7 @@ def get_runners( CloudRunnerInstance( name=instance.server_name, instance_id=instance.instance_id, + health=HealthState.HEALTHY if self._runner_health_check(instance) else HealthState.UNHEALTHY, state=CloudRunnerState.from_openstack_server_status(instance.status), ) for instance in instance_list @@ -256,7 +260,7 @@ def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: Returns: Any metrics retrieved from cleanup runners. """ - runners = self._get_runner_health() + runners = self._get_runners_health() healthy_runner_names = [runner.server_name for runner in runners.healthy] metrics = runner_metrics.extract( metrics_storage_manager=metrics_storage, runners=set(healthy_runner_names) @@ -267,18 +271,6 @@ def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: self._openstack_cloud.cleanup() return metrics - def get_runner_health(self) -> RunnerNameByHealth: - """Get the runner health state. - - Returns: - The names of the runner by health state. - """ - runners = self._get_runner_health() - return RunnerNameByHealth( - tuple(runner.server_name for runner in runners.healthy), - tuple(runner.server_name for runner in runners.unhealthy), - ) - def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None: """Delete self-hosted runners by openstack instance. 
@@ -314,8 +306,8 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None logger.exception( "Unable to delete openstack instance for runner %s", instance.server_name ) - - def _get_runner_health(self) -> RunnerHealth: + + def _get_runners_health(self) -> RunnerHealth: """Get runners by health state. Returns: @@ -325,19 +317,30 @@ def _get_runner_health(self) -> RunnerHealth: healthy, unhealthy = [], [] for runner in runner_list: - cloud_state = CloudRunnerState.from_openstack_server_status(runner.status) - if cloud_state in set( - ( - CloudRunnerState.DELETED, - CloudRunnerState.ERROR, - CloudRunnerState.STOPPED, - ) - ) or not self._health_check(runner): - unhealthy.append(runner) - else: + if self._runner_health_check(runner): healthy.append(runner) + else: + unhealthy.append(runner) return RunnerHealth(healthy=tuple(healthy), unhealthy=tuple(unhealthy)) + def _runner_health_check(self, instance: OpenstackInstance) -> bool: + """Run health check on a runner. + + Args: + instance: The instance hosting the runner to run health check on. + + Returns: + True if runner is healthy. + """ + cloud_state = CloudRunnerState.from_openstack_server_status(instance.status) + return cloud_state not in set( + ( + CloudRunnerState.DELETED, + CloudRunnerState.ERROR, + CloudRunnerState.STOPPED, + ) + ) and self._health_check(instance) + def _generate_cloud_init(self, instance_name: str, registration_token: str) -> str: """Generate cloud init userdata. 
diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index b2a75e9ed..d92fec639 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -238,7 +238,7 @@ async def test_runner_normal_idle_lifecycle( assert isinstance(runner_list, tuple) assert len(runner_list) == 1 runner = runner_list[0] - assert runner.id == runner_id + assert runner.instance_id == runner_id assert runner.cloud_state == CloudRunnerState.ACTIVE # Update on GitHub-side can take a bit of time. await wait_for( From 1d055ebf13f6413273a169e42ab905bd9d5adbe2 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:38:29 +0800 Subject: [PATCH 189/278] Fix lint issues --- ...penstack_cloud.openstack_runner_manager.md | 120 ++++++----------- src/manager/cloud_runner_manager.py | 58 +++++++- src/manager/runner_manager.py | 22 +-- .../openstack_runner_manager.py | 127 ++++++++++-------- .../test_runner_manager_openstack.py | 17 ++- 5 files changed, 182 insertions(+), 162 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index a96d9d2eb..7a0969bf5 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -17,10 +17,10 @@ Manager for self-hosted runner on OpenStack. --- - + -## class `OpenstackRunnerManagerConfig` -Configuration for OpenstackRunnerManager. +## class `OpenStackCloudConfig` +Configuration for OpenStack cloud authorisation information. @@ -28,36 +28,13 @@ Configuration for OpenstackRunnerManager. - `clouds_config`: The clouds.yaml. - `cloud`: The cloud name to connect to. - - `image`: The image name for runners to use. - - `flavor`: The flavor name for runners to use. - - `network`: The network name for runners to use. 
- - `github_path`: The GitHub organization or repository for runners to connect to. - - `labels`: The labels to add to runners. - - `proxy_config`: The proxy configuration. - - `dockerhub_mirror`: The dockerhub mirror to use for runners. - - `ssh_debug_connections`: The information on the ssh debug services. - - `repo_policy_url`: The URL of the repo policy service. - - `repo_policy_token`: The token to access the repo policy service. ### method `__init__` ```python -__init__( - clouds_config: dict[str, dict], - cloud: str, - image: str, - flavor: str, - network: str, - github_path: GithubOrg | GithubRepo, - labels: list[str], - proxy_config: ProxyConfig | None, - dockerhub_mirror: str | None, - ssh_debug_connections: list[SSHDebugConnection] | None, - repo_policy_url: str | None, - repo_policy_token: str | None -) → None +__init__(clouds_config: dict[str, dict], cloud: str) → None ``` @@ -70,27 +47,25 @@ __init__( --- - + -## class `RunnerHealth` -Runners with health state. +## class `OpenStackServerConfig` +Configuration for OpenStack server. **Attributes:** - - `healthy`: The list of healthy runners. - - `unhealthy`: The list of unhealthy runners. + - `image`: The image name for runners to use. + - `flavor`: The flavor name for runners to use. + - `network`: The network name for runners to use. ### method `__init__` ```python -__init__( - healthy: tuple[OpenstackInstance, ], - unhealthy: tuple[OpenstackInstance, ] -) → None +__init__(image: str, flavor: str, network: str) → None ``` @@ -103,17 +78,29 @@ __init__( --- - + ## class `OpenstackRunnerManager` Manage self-hosted runner on OpenStack cloud. - + + +**Attributes:** + + - `name_prefix`: The name prefix of the runners created. 
+ + ### method `__init__` ```python -__init__(prefix: str, config: OpenstackRunnerManagerConfig) → None +__init__( + prefix: str, + cloud_config: OpenStackCloudConfig, + server_config: OpenStackServerConfig, + runner_config: GitHubRunnerConfig, + service_config: SupportServiceConfig +) → None ``` Construct the object. @@ -123,14 +110,23 @@ Construct the object. **Args:** - `prefix`: The prefix to runner name. - - `config`: Configuration of the object. + - `cloud_config`: The configuration for OpenStack authorisation. + - `server_config`: The configuration for creating OpenStack server. + - `runner_config`: The configuration for the runner. + - `service_config`: The configuration of supporting services of the runners. + +--- + +#### property name_prefix + +Get the name prefix of the self-hosted runners. --- - + ### method `cleanup` @@ -153,7 +149,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -182,7 +178,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -206,24 +202,7 @@ Delete self-hosted runners. --- - - -### method `get_name_prefix` - -```python -get_name_prefix() → str -``` - -Get the name prefix of the self-hosted runners. - - - -**Returns:** - The name prefix. - ---- - - + ### method `get_runner` @@ -246,24 +225,7 @@ Get a self-hosted runner by instance id. --- - - -### method `get_runner_health` - -```python -get_runner_health() → RunnerNameByHealth -``` - -Get the runner health state. - - - -**Returns:** - The names of the runner by health state. 
- ---- - - + ### method `get_runners` diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 6a5abd5d8..acc04b542 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -9,15 +9,28 @@ from enum import Enum, auto from typing import Iterator, Sequence, Tuple -from manager.runner_manager import HealthState +from charm_state import GithubPath, ProxyConfig, SSHDebugConnection from metrics.runner import RunnerMetrics -from runner_type import RunnerNameByHealth logger = logging.getLogger(__name__) InstanceId = str +class HealthState(Enum): + """Health state of the runners. + + Attributes: + HEALTHY: The runner is healthy. + UNHEALTHY: The runner is not healthy. + UNKNOWN: Unable to get the health state. + """ + + HEALTHY = auto() + UNHEALTHY = auto() + UNKNOWN = auto() + + class CloudRunnerState(str, Enum): """Represent state of the instance hosting the runner. @@ -75,6 +88,38 @@ def from_openstack_server_status( # pylint: disable=R0911 return CloudRunnerState.UNEXPECTED +@dataclass +class GitHubRunnerConfig: + """Configuration for GitHub runner spawned. + + Attributes: + github_path: The GitHub organization or repository for runners to connect to. + labels: The labels to add to runners. + """ + + github_path: GithubPath + labels: list[str] + + +@dataclass +class SupportServiceConfig: + """Configuration for supporting services for runners. + + Attributes: + proxy_config: The proxy configuration. + dockerhub_mirror: The dockerhub mirror to use for runners. + ssh_debug_connections: The information on the ssh debug services. + repo_policy_url: The URL of the repo policy service. + repo_policy_token: The token to access the repo policy service. 
+ """ + + proxy_config: ProxyConfig | None + dockerhub_mirror: str | None + ssh_debug_connections: list[SSHDebugConnection] | None + repo_policy_url: str | None + repo_policy_token: str | None + + @dataclass class CloudRunnerInstance: """Information on the runner on the cloud. @@ -93,10 +138,15 @@ class CloudRunnerInstance: class CloudRunnerManager(abc.ABC): - """Manage runner instance on cloud.""" + """Manage runner instance on cloud. + + Attributes: + name_prefix: The name prefix of the self-hosted runners. + """ + @property @abc.abstractmethod - def get_name_prefix(self) -> str: + def name_prefix(self) -> str: """Get the name prefix of the self-hosted runners.""" @abc.abstractmethod diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 9a7e7385d..e2472e643 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -16,6 +16,7 @@ CloudRunnerInstance, CloudRunnerManager, CloudRunnerState, + HealthState, InstanceId, ) from manager.github_runner_manager import GithubRunnerManager, GithubRunnerState @@ -23,7 +24,6 @@ from metrics import github as github_metrics from metrics import runner as runner_metrics from metrics.runner import RunnerMetrics -from runner_type import RunnerNameByHealth logger = logging.getLogger(__name__) @@ -40,19 +40,8 @@ class FlushMode(Enum): FLUSH_IDLE = auto() FLUSH_BUSY = auto() - -class HealthState(Enum): - """Health state of the runners. - - Attributes: - HEALTHY: The runner is healthy. - UNHEALTHY: The runner is not healthy. - UNKNOWN: Unable to get the health state. - """ - HEALTHY = auto() - UNHEALTHY= auto() - UNKNOWN=auto() - + + @dataclass class RunnerInstance: """Represents an instance of runner. 
@@ -112,8 +101,9 @@ def __init__(self, cloud_runner_manager: CloudRunnerManager, config: RunnerManag """ self._config = config self._cloud = cloud_runner_manager + self.name_prefix = self._cloud.name_prefix self._github = GithubRunnerManager( - prefix=self._cloud.get_name_prefix(), token=self._config.token, path=self._config.path + prefix=self.name_prefix, token=self._config.token, path=self._config.path ) def create_runners(self, num: int) -> tuple[InstanceId]: @@ -311,7 +301,7 @@ def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetri issued_events = runner_metrics.issue_events( runner_metrics=extracted_metrics, job_metrics=job_metrics, - flavor=self._cloud.get_name_prefix(), + flavor=self.name_prefix, ) for event_type in issued_events: diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 4ea2ade47..733eb1419 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -16,7 +16,7 @@ import paramiko.ssh_exception from fabric import Connection as SSHConnection -from charm_state import GithubOrg, GithubPath, ProxyConfig, SSHDebugConnection +from charm_state import GithubOrg from errors import ( CreateMetricsStorageError, GetMetricsStorageError, @@ -31,7 +31,9 @@ CloudRunnerInstance, CloudRunnerManager, CloudRunnerState, + GitHubRunnerConfig, InstanceId, + SupportServiceConfig, ) from manager.runner_manager import HealthState from metrics import events as metric_events @@ -40,7 +42,6 @@ from openstack_cloud.openstack_cloud import OpenstackCloud, OpenstackInstance from openstack_cloud.openstack_manager import GithubRunnerRemoveError from repo_policy_compliance_client import RepoPolicyComplianceClient -from runner_type import RunnerNameByHealth from utilities import retry logger = logging.getLogger(__name__) @@ -63,42 +64,36 @@ class _PullFileError(Exception): """Represents an error while pulling a file from the runner 
instance.""" -# Ignore "Too many instance attributes" as this dataclass is for passing arguments. @dataclass -class OpenstackRunnerManagerConfig: # pylint: disable=R0902 - """Configuration for OpenstackRunnerManager. +class OpenStackCloudConfig: + """Configuration for OpenStack cloud authorisation information. Attributes: clouds_config: The clouds.yaml. cloud: The cloud name to connect to. + """ + + clouds_config: dict[str, dict] + cloud: str + + +@dataclass +class OpenStackServerConfig: + """Configuration for OpenStack server. + + Attributes: image: The image name for runners to use. flavor: The flavor name for runners to use. network: The network name for runners to use. - github_path: The GitHub organization or repository for runners to connect to. - labels: The labels to add to runners. - proxy_config: The proxy configuration. - dockerhub_mirror: The dockerhub mirror to use for runners. - ssh_debug_connections: The information on the ssh debug services. - repo_policy_url: The URL of the repo policy service. - repo_policy_token: The token to access the repo policy service. """ - clouds_config: dict[str, dict] - cloud: str image: str flavor: str network: str - github_path: GithubPath - labels: list[str] - proxy_config: ProxyConfig | None - dockerhub_mirror: str | None - ssh_debug_connections: list[SSHDebugConnection] | None - repo_policy_url: str | None - repo_policy_token: str | None @dataclass -class RunnerHealth: +class _RunnerHealth: """Runners with health state. Attributes: @@ -111,31 +106,41 @@ class RunnerHealth: class OpenstackRunnerManager(CloudRunnerManager): - """Manage self-hosted runner on OpenStack cloud.""" + """Manage self-hosted runner on OpenStack cloud. - def __init__(self, prefix: str, config: OpenstackRunnerManagerConfig) -> None: + Attributes: + name_prefix: The name prefix of the runners created. + """ + + # Ignore "Too many arguments", as the class requires a lot of configurations. 
+ def __init__( # pylint: disable=R0913 + self, + prefix: str, + cloud_config: OpenStackCloudConfig, + server_config: OpenStackServerConfig, + runner_config: GitHubRunnerConfig, + service_config: SupportServiceConfig, + ) -> None: """Construct the object. Args: prefix: The prefix to runner name. - config: Configuration of the object. + cloud_config: The configuration for OpenStack authorisation. + server_config: The configuration for creating OpenStack server. + runner_config: The configuration for the runner. + service_config: The configuration of supporting services of the runners. """ - self.prefix = prefix - self.config = config + self.name_prefix = prefix + self._cloud_config = cloud_config + self._server_config = server_config + self._runner_config = runner_config + self._service_config = service_config self._openstack_cloud = OpenstackCloud( - clouds_config=self.config.clouds_config, - cloud=self.config.cloud, - prefix=self.prefix, + clouds_config=self._cloud_config.clouds_config, + cloud=self._cloud_config.cloud, + prefix=self.name_prefix, ) - def get_name_prefix(self) -> str: - """Get the name prefix of the self-hosted runners. - - Returns: - The name prefix. - """ - return self.prefix - def create_runner(self, registration_token: str) -> InstanceId: """Create a self-hosted runner. 
@@ -157,9 +162,9 @@ def create_runner(self, registration_token: str) -> InstanceId: try: instance = self._openstack_cloud.launch_instance( instance_id=instance_id, - image=self.config.image, - flavor=self.config.flavor, - network=self.config.network, + image=self._server_config.image, + flavor=self._server_config.flavor, + network=self._server_config.network, cloud_init=cloud_init, ) except OpenStackError as err: @@ -171,7 +176,7 @@ def create_runner(self, registration_token: str) -> InstanceId: end_timestamp = time.time() OpenstackRunnerManager._issue_runner_installed_metric( name=instance_name, - flavor=self.prefix, + flavor=self.name_prefix, install_start_timestamp=start_timestamp, install_end_timestamp=end_timestamp, ) @@ -216,7 +221,11 @@ def get_runners( CloudRunnerInstance( name=instance.server_name, instance_id=instance.instance_id, - health=HealthState.HEALTHY if self._runner_health_check(instance) else HealthState.UNHEALTHY, + health=( + HealthState.HEALTHY + if self._runner_health_check(instance) + else HealthState.UNHEALTHY + ), state=CloudRunnerState.from_openstack_server_status(instance.status), ) for instance in instance_list @@ -306,8 +315,8 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None logger.exception( "Unable to delete openstack instance for runner %s", instance.server_name ) - - def _get_runners_health(self) -> RunnerHealth: + + def _get_runners_health(self) -> _RunnerHealth: """Get runners by health state. Returns: @@ -321,11 +330,11 @@ def _get_runners_health(self) -> RunnerHealth: healthy.append(runner) else: unhealthy.append(runner) - return RunnerHealth(healthy=tuple(healthy), unhealthy=tuple(unhealthy)) + return _RunnerHealth(healthy=tuple(healthy), unhealthy=tuple(unhealthy)) def _runner_health_check(self, instance: OpenstackInstance) -> bool: """Run health check on a runner. - + Args: instance: The instance hosting the runner to run health check on. 
@@ -357,10 +366,10 @@ def _generate_cloud_init(self, instance_name: str, registration_token: str) -> s env_contents = jinja.get_template("env.j2").render( pre_job_script=str(PRE_JOB_SCRIPT), - dockerhub_mirror=self.config.dockerhub_mirror or "", + dockerhub_mirror=self._service_config.dockerhub_mirror or "", ssh_debug_info=( - secrets.choice(self.config.ssh_debug_connections) - if self.config.ssh_debug_connections + secrets.choice(self._service_config.ssh_debug_connections) + if self._service_config.ssh_debug_connections else None ), # Proxies are handled by aproxy. @@ -385,24 +394,24 @@ def _generate_cloud_init(self, instance_name: str, registration_token: str) -> s pre_job_contents = jinja.get_template("pre-job.j2").render(pre_job_contents_dict) runner_group = None - if isinstance(self.config.github_path, GithubOrg): - runner_group = self.config.github_path.group + if isinstance(self._runner_config.github_path, GithubOrg): + runner_group = self._runner_config.github_path.group aproxy_address = ( - self.config.proxy_config.aproxy_address - if self.config.proxy_config is not None + self._service_config.proxy_config.aproxy_address + if self._service_config.proxy_config is not None else None ) return jinja.get_template("openstack-userdata.sh.j2").render( - github_url=f"https://github.com/{self.config.github_path.path()}", + github_url=f"https://github.com/{self._runner_config.github_path.path()}", runner_group=runner_group, token=registration_token, - instance_labels=",".join(self.config.labels), + instance_labels=",".join(self._runner_config.labels), instance_name=instance_name, env_contents=env_contents, pre_job_contents=pre_job_contents, metrics_exchange_path=str(METRICS_EXCHANGE_PATH), aproxy_address=aproxy_address, - dockerhub_mirror=self.config.dockerhub_mirror, + dockerhub_mirror=self._service_config.dockerhub_mirror, ) def _get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | None: @@ -411,9 +420,9 @@ def 
_get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | Non Returns: The repo policy compliance client. """ - if self.config.repo_policy_url and self.config.repo_policy_token: + if self._service_config.repo_policy_url and self._service_config.repo_policy_token: return RepoPolicyComplianceClient( - self.config.repo_policy_url, self.config.repo_policy_token + self._service_config.repo_policy_url, self._service_config.repo_policy_token ) return None diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index d92fec639..c09800ae7 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -18,14 +18,15 @@ from openstack.connection import Connection as OpenstackConnection from charm_state import GithubPath, ProxyConfig, parse_github_path -from manager.cloud_runner_manager import CloudRunnerState +from manager.cloud_runner_manager import CloudRunnerState, GitHubRunnerConfig, SupportServiceConfig from manager.github_runner_manager import GithubRunnerState from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig from metrics import events, storage from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH from openstack_cloud.openstack_runner_manager import ( + OpenStackCloudConfig, OpenstackRunnerManager, - OpenstackRunnerManagerConfig, + OpenStackServerConfig, ) from tests.integration.helpers.common import ( DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, @@ -104,21 +105,29 @@ async def openstack_runner_manager_fixture( _CLOUDS_YAML_PATH.unlink(missing_ok=True) clouds_config = yaml.safe_load(private_endpoint_clouds_yaml) - config = OpenstackRunnerManagerConfig( + cloud_config = OpenStackCloudConfig( clouds_config=clouds_config, cloud="testcloud", + ) + server_config = OpenStackServerConfig( image=openstack_test_image, flavor=flavor_name, network=network_name, + ) + runner_config = GitHubRunnerConfig( 
github_path=github_path, labels=["openstack_test", runner_label], + ) + service_config = SupportServiceConfig( proxy_config=proxy_config, dockerhub_mirror=None, ssh_debug_connections=None, repo_policy_url=None, repo_policy_token=None, ) - return OpenstackRunnerManager(app_name, config) + return OpenstackRunnerManager( + app_name, cloud_config, server_config, runner_config, service_config + ) @pytest_asyncio.fixture(scope="module", name="runner_manager") From 63835848aa3727cf88fa8635f9b48111d0e3e29a Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:39:48 +0800 Subject: [PATCH 190/278] Add missing docs --- src/manager/runner_manager.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index e2472e643..76a1f985d 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -90,7 +90,11 @@ class RunnerManagerConfig: class RunnerManager: - """Manage the runners.""" + """Manage the runners. + + Attributes: + name_prefix: The name prefix of the runners. + """ def __init__(self, cloud_runner_manager: CloudRunnerManager, config: RunnerManagerConfig): """Construct the object. 
From 7c0c2256fb7420fb1c54de1d9c8516a9b14ff337 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:50:08 +0800 Subject: [PATCH 191/278] Update the github state enum to use auto --- src/manager/github_runner_manager.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 4d6ae5788..218155b0f 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -3,7 +3,7 @@ """Client for managing self-hosted runner on GitHub side.""" -from enum import Enum +from enum import Enum, auto from typing import Sequence from charm_state import GithubPath @@ -21,9 +21,9 @@ class GithubRunnerState(str, Enum): OFFLINE: Runner is not connected to GitHub. """ - BUSY = "busy" - IDLE = "idle" - OFFLINE = "offline" + BUSY = auto() + IDLE = auto() + OFFLINE = auto() @staticmethod def from_runner(runner: SelfHostedRunner) -> "GithubRunnerState": From 3d62ef97691bed3e63ba25c5f0d93080f1155e8c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 16 Aug 2024 15:03:17 +0800 Subject: [PATCH 192/278] Rename class to fit convension --- src/manager/github_runner_manager.py | 18 +++++++++--------- src/manager/runner_manager.py | 16 ++++++++-------- .../test_runner_manager_openstack.py | 10 +++++----- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 218155b0f..0aed972bd 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -11,7 +11,7 @@ from github_type import GitHubRunnerStatus, SelfHostedRunner -class GithubRunnerState(str, Enum): +class GitHubRunnerState(str, Enum): """State of the self-hosted runner on GitHub. 
Attributes: @@ -26,7 +26,7 @@ class GithubRunnerState(str, Enum): OFFLINE = auto() @staticmethod - def from_runner(runner: SelfHostedRunner) -> "GithubRunnerState": + def from_runner(runner: SelfHostedRunner) -> "GitHubRunnerState": """Construct the object from GtiHub runner information. Args: @@ -35,13 +35,13 @@ def from_runner(runner: SelfHostedRunner) -> "GithubRunnerState": Returns: The state of runner. """ - state = GithubRunnerState.OFFLINE + state = GitHubRunnerState.OFFLINE # A runner that is busy and offline is possible. if runner.busy: - state = GithubRunnerState.BUSY + state = GitHubRunnerState.BUSY if runner.status == GitHubRunnerStatus.ONLINE: if not runner.busy: - state = GithubRunnerState.IDLE + state = GitHubRunnerState.IDLE return state @@ -61,7 +61,7 @@ def __init__(self, prefix: str, token: str, path: GithubPath): self.github = GithubClient(token) def get_runners( - self, states: Sequence[GithubRunnerState] | None = None + self, states: Sequence[GitHubRunnerState] | None = None ) -> tuple[SelfHostedRunner]: """Get info on self-hosted runners of certain states. @@ -79,7 +79,7 @@ def get_runners( and GithubRunnerManager._is_runner_in_state(runner, states) ) - def delete_runners(self, states: Sequence[GithubRunnerState] | None = None) -> None: + def delete_runners(self, states: Sequence[GitHubRunnerState] | None = None) -> None: """Delete the self-hosted runners of certain states. Args: @@ -111,7 +111,7 @@ def get_removal_token(self) -> str: @staticmethod def _is_runner_in_state( - runner: SelfHostedRunner, states: Sequence[GithubRunnerState] | None + runner: SelfHostedRunner, states: Sequence[GitHubRunnerState] | None ) -> bool: """Check that the runner is in one of the states provided. 
@@ -124,4 +124,4 @@ def _is_runner_in_state( """ if states is None: return True - return GithubRunnerState.from_runner(runner) in states + return GitHubRunnerState.from_runner(runner) in states diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 76a1f985d..98cf5f35f 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -19,7 +19,7 @@ HealthState, InstanceId, ) -from manager.github_runner_manager import GithubRunnerManager, GithubRunnerState +from manager.github_runner_manager import GithubRunnerManager, GitHubRunnerState from metrics import events as metric_events from metrics import github as github_metrics from metrics import runner as runner_metrics @@ -57,7 +57,7 @@ class RunnerInstance: name: str instance_id: InstanceId health: HealthState - github_state: GithubRunnerState | None + github_state: GitHubRunnerState | None cloud_state: CloudRunnerState def __init__(self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedRunner | None): @@ -71,7 +71,7 @@ def __init__(self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedR self.instance_id = cloud_instance.instance_id self.health = cloud_instance.health self.github_state = ( - GithubRunnerState.from_runner(github_info) if github_info is not None else None + GitHubRunnerState.from_runner(github_info) if github_info is not None else None ) self.cloud_state = cloud_instance.state @@ -91,7 +91,7 @@ class RunnerManagerConfig: class RunnerManager: """Manage the runners. - + Attributes: name_prefix: The name prefix of the runners. """ @@ -143,7 +143,7 @@ def create_runners(self, num: int) -> tuple[InstanceId]: def get_runners( self, - github_states: Sequence[GithubRunnerState] | None = None, + github_states: Sequence[GitHubRunnerState] | None = None, cloud_states: Sequence[CloudRunnerState] | None = None, ) -> tuple[RunnerInstance]: """Get information on runner filter by state. 
@@ -236,9 +236,9 @@ def flush_runners( "Unknown flush mode %s encountered, contact developers", flush_mode ) - states = [GithubRunnerState.IDLE] + states = [GitHubRunnerState.IDLE] if flush_mode == FlushMode.FLUSH_BUSY: - states.append(GithubRunnerState.BUSY) + states.append(GitHubRunnerState.BUSY) runners_list = self.get_runners(github_states=states) runner_names = [runner.name for runner in runners_list] @@ -252,7 +252,7 @@ def cleanup(self) -> IssuedMetricEventsStats: Returns: Stats on metrics events issued during the cleanup of runners. """ - self._github.delete_runners([GithubRunnerState.OFFLINE]) + self._github.delete_runners([GitHubRunnerState.OFFLINE]) remove_token = self._github.get_removal_token() deleted_runner_metrics = self._cloud.cleanup(remove_token) return self._issue_runner_metrics(metrics=deleted_runner_metrics) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index c09800ae7..523c2b5ae 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -19,7 +19,7 @@ from charm_state import GithubPath, ProxyConfig, parse_github_path from manager.cloud_runner_manager import CloudRunnerState, GitHubRunnerConfig, SupportServiceConfig -from manager.github_runner_manager import GithubRunnerState +from manager.github_runner_manager import GitHubRunnerState from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig from metrics import events, storage from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH @@ -160,7 +160,7 @@ async def runner_manager_with_one_runner_fixture(runner_manager: RunnerManager) ), "Test arrange failed: Expect runner in active state" try: await wait_for( - lambda: runner_manager.get_runners()[0].github_state == GithubRunnerState.IDLE, + lambda: runner_manager.get_runners()[0].github_state == GitHubRunnerState.IDLE, timeout=120, check_interval=10, ) @@ -251,7 +251,7 @@ async def 
test_runner_normal_idle_lifecycle( assert runner.cloud_state == CloudRunnerState.ACTIVE # Update on GitHub-side can take a bit of time. await wait_for( - lambda: runner_manager.get_runners()[0].github_state == GithubRunnerState.IDLE, + lambda: runner_manager.get_runners()[0].github_state == GitHubRunnerState.IDLE, timeout=120, check_interval=10, ) @@ -304,7 +304,7 @@ async def test_runner_flush_busy_lifecycle( assert len(runner_list) == 1 busy_runner = runner_list[0] assert busy_runner.cloud_state == CloudRunnerState.ACTIVE - assert busy_runner.github_state == GithubRunnerState.BUSY + assert busy_runner.github_state == GitHubRunnerState.BUSY # 2. runner_manager_with_one_runner.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) @@ -312,7 +312,7 @@ async def test_runner_flush_busy_lifecycle( assert len(runner_list) == 1 busy_runner = runner_list[0] assert busy_runner.cloud_state == CloudRunnerState.ACTIVE - assert busy_runner.github_state == GithubRunnerState.BUSY + assert busy_runner.github_state == GitHubRunnerState.BUSY # 3. runner_manager_with_one_runner.flush_runners(flush_mode=FlushMode.FLUSH_BUSY) From d14f92ae2e9db48f4cf8aaa461c8b4192388f5f8 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 16 Aug 2024 15:29:45 +0800 Subject: [PATCH 193/278] Fix according to review --- ...penstack_cloud.openstack_runner_manager.md | 2 +- src/manager/cloud_runner_manager.py | 22 +++++++++---------- src/openstack_cloud/openstack_cloud.py | 4 ++-- .../openstack_runner_manager.py | 12 +++++----- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 7a0969bf5..a40e7009c 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -232,7 +232,7 @@ Get a self-hosted runner by instance id. 
```python get_runners( states: Optional[Sequence[CloudRunnerState]] = None -) → Tuple[CloudRunnerInstance] +) → tuple[CloudRunnerInstance, ] ``` Get self-hosted runners by state. diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index acc04b542..7362990d0 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -52,10 +52,8 @@ class CloudRunnerState(str, Enum): UNKNOWN = auto() UNEXPECTED = auto() - # Disable "Too many return statements" as this method is using case statement for converting - # the states, which does not cause a complexity issue. @staticmethod - def from_openstack_server_status( # pylint: disable=R0911 + def from_openstack_server_status( openstack_server_status: str, ) -> "CloudRunnerState": """Create from openstack server status. @@ -69,23 +67,25 @@ def from_openstack_server_status( # pylint: disable=R0911 Returns: The state of the runner. """ + state = CloudRunnerState.UNEXPECTED match openstack_server_status: case "BUILD": - return CloudRunnerState.CREATED + state = CloudRunnerState.CREATED case "REBUILD": - return CloudRunnerState.CREATED + state = CloudRunnerState.CREATED case "ACTIVE": - return CloudRunnerState.ACTIVE + state = CloudRunnerState.ACTIVE case "ERROR": - return CloudRunnerState.ERROR + state = CloudRunnerState.ERROR case "STOPPED": - return CloudRunnerState.STOPPED + state = CloudRunnerState.STOPPED case "DELETED": - return CloudRunnerState.DELETED + state = CloudRunnerState.DELETED case "UNKNOWN": - return CloudRunnerState.UNKNOWN + state = CloudRunnerState.UNKNOWN case _: - return CloudRunnerState.UNEXPECTED + state = CloudRunnerState.UNEXPECTED + return state @dataclass diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index ceac7fd6d..462b6b46b 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -394,7 +394,7 @@ def _cleanup_openstack_keypairs( continue try: - 
conn.delete_keypair(key.name) + self._delete_keypair(conn, key.name) except openstack.exceptions.SDKException: logger.warning( "Unable to delete OpenStack keypair associated with deleted key file %s ", @@ -422,7 +422,7 @@ def _get_and_ensure_unique_server( ) -> OpenstackServer | None: """Get the latest server of the name and ensure it is unique. - If multiple servers with the same name is found, the latest server in creation time is + If multiple servers with the same name are found, the latest server in creation time is returned. Other servers is deleted. Args: diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 733eb1419..cac1187c0 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -8,7 +8,7 @@ import time from dataclasses import dataclass from pathlib import Path -from typing import Iterator, Sequence, Tuple +from typing import Iterator, Sequence import invoke import jinja2 @@ -206,7 +206,7 @@ def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: def get_runners( self, states: Sequence[CloudRunnerState] | None = None - ) -> Tuple[CloudRunnerInstance]: + ) -> tuple[CloudRunnerInstance, ...]: """Get self-hosted runners by state. 
Args: @@ -231,7 +231,7 @@ def get_runners( for instance in instance_list ] if states is None: - return instance_list + return tuple(instance_list) return tuple(instance for instance in instance_list if instance.state in states) def delete_runner( @@ -254,11 +254,11 @@ def delete_runner( ) return None - metric = runner_metrics.extract( - metrics_storage_manager=metrics_storage, runners=set(instance.server_name) + extracted_metrics = runner_metrics.extract( + metrics_storage_manager=metrics_storage, runners=set([instance.server_name]) ) self._delete_runner(instance, remove_token) - return next(metric, None) + return next(extracted_metrics, None) def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: """Cleanup runner and resource on the cloud. From 4bce4a810b517790c72fb3905d7a626149ad7c91 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:38:35 +0800 Subject: [PATCH 194/278] Fix name_prefix property cloud runner manager --- .../openstack_cloud.openstack_runner_manager.md | 17 +++++++++++------ src/openstack_cloud/openstack_runner_manager.py | 11 ++++++++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index a40e7009c..82991f1e2 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -120,13 +120,18 @@ Construct the object. #### property name_prefix -Get the name prefix of the self-hosted runners. +The prefix of runner names. + + + +**Returns:** + The prefix of the runner names managed by this class. --- - + ### method `cleanup` @@ -149,7 +154,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -178,7 +183,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -202,7 +207,7 @@ Delete self-hosted runners. 
--- - + ### method `get_runner` @@ -225,7 +230,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index cac1187c0..c58d131c0 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -130,7 +130,7 @@ def __init__( # pylint: disable=R0913 runner_config: The configuration for the runner. service_config: The configuration of supporting services of the runners. """ - self.name_prefix = prefix + self._prefix = prefix self._cloud_config = cloud_config self._server_config = server_config self._runner_config = runner_config @@ -141,6 +141,15 @@ def __init__( # pylint: disable=R0913 prefix=self.name_prefix, ) + @property + def name_prefix(self) -> str: + """The prefix of runner names. + + Returns: + The prefix of the runner names managed by this class. + """ + return self._prefix + def create_runner(self, registration_token: str) -> InstanceId: """Create a self-hosted runner. 
From 0d23dbecf23114a89955c24d8499d4fbd21ae2c0 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 19 Aug 2024 15:48:49 +0800 Subject: [PATCH 195/278] Add class for scaling runners --- src-docs/errors.md | 65 ++++---- src-docs/metrics.md | 3 - src-docs/openstack_cloud.md | 5 - ...penstack_cloud.openstack_runner_manager.md | 24 +-- src-docs/runner_manager.md | 4 +- src-docs/runner_manager_type.md | 2 +- src/charm.py | 147 ++++++++++-------- src/errors.py | 4 + src/manager/runner_scaler.py | 138 ++++++++++++++++ .../openstack_runner_manager.py | 23 +-- src/runner_manager.py | 16 +- src/runner_manager_type.py | 2 +- tests/integration/helpers/common.py | 8 +- .../test_charm_scheduled_events.py | 4 +- .../test_runner_manager_openstack.py | 12 +- tests/unit/test_charm.py | 12 +- tests/unit/test_runner_manager.py | 44 +++--- 17 files changed, 337 insertions(+), 176 deletions(-) create mode 100644 src/manager/runner_scaler.py diff --git a/src-docs/errors.md b/src-docs/errors.md index cf7cde565..ee5db5a11 100644 --- a/src-docs/errors.md +++ b/src-docs/errors.md @@ -99,6 +99,17 @@ Error for setting up aproxy. +## class `MissingServerConfigError` +Error for unable to create runner due to missing server configurations. + + + + + +--- + + + ## class `MissingRunnerBinaryError` Error for missing runner binary. @@ -108,7 +119,7 @@ Error for missing runner binary. --- - + ## class `ConfigurationError` Error for juju configuration. @@ -119,7 +130,7 @@ Error for juju configuration. --- - + ## class `MissingMongoDBError` Error for missing integration data. @@ -130,7 +141,7 @@ Error for missing integration data. --- - + ## class `LxdError` Error for executing LXD actions. @@ -141,7 +152,7 @@ Error for executing LXD actions. --- - + ## class `SubprocessError` Error for Subprocess calls. @@ -155,7 +166,7 @@ Error for Subprocess calls. - `stdout`: Content of stdout of the subprocess. - `stderr`: Content of stderr of the subprocess. 
- + ### method `__init__` @@ -185,7 +196,7 @@ Construct the subprocess error. --- - + ## class `IssueMetricEventError` Represents an error when issuing a metric event. @@ -196,7 +207,7 @@ Represents an error when issuing a metric event. --- - + ## class `LogrotateSetupError` Represents an error raised when logrotate cannot be setup. @@ -207,7 +218,7 @@ Represents an error raised when logrotate cannot be setup. --- - + ## class `MetricsStorageError` Base class for all metrics storage errors. @@ -218,7 +229,7 @@ Base class for all metrics storage errors. --- - + ## class `SharedFilesystemError` Base class for all shared filesystem errors. @@ -229,7 +240,7 @@ Base class for all shared filesystem errors. --- - + ## class `CreateMetricsStorageError` Represents an error when the metrics storage could not be created. @@ -240,7 +251,7 @@ Represents an error when the metrics storage could not be created. --- - + ## class `DeleteMetricsStorageError` Represents an error when the metrics storage could not be deleted. @@ -251,7 +262,7 @@ Represents an error when the metrics storage could not be deleted. --- - + ## class `GetMetricsStorageError` Represents an error when the metrics storage could not be retrieved. @@ -262,7 +273,7 @@ Represents an error when the metrics storage could not be retrieved. --- - + ## class `QuarantineMetricsStorageError` Represents an error when the metrics storage could not be quarantined. @@ -273,7 +284,7 @@ Represents an error when the metrics storage could not be quarantined. --- - + ## class `SharedFilesystemMountError` Represents an error related to the mounting of the shared filesystem. @@ -284,7 +295,7 @@ Represents an error related to the mounting of the shared filesystem. --- - + ## class `RunnerMetricsError` Base class for all runner metrics errors. @@ -295,7 +306,7 @@ Base class for all runner metrics errors. --- - + ## class `CorruptMetricDataError` Represents an error with the data being corrupt. 
@@ -306,7 +317,7 @@ Represents an error with the data being corrupt. --- - + ## class `GithubMetricsError` Base class for all github metrics errors. @@ -317,7 +328,7 @@ Base class for all github metrics errors. --- - + ## class `GithubClientError` Base class for all github client errors. @@ -328,7 +339,7 @@ Base class for all github client errors. --- - + ## class `GithubApiError` Represents an error when the GitHub API returns an error. @@ -339,7 +350,7 @@ Represents an error when the GitHub API returns an error. --- - + ## class `TokenError` Represents an error when the token is invalid or has not enough permissions. @@ -350,7 +361,7 @@ Represents an error when the token is invalid or has not enough permissions. --- - + ## class `JobNotFoundError` Represents an error when the job could not be found on GitHub. @@ -361,7 +372,7 @@ Represents an error when the job could not be found on GitHub. --- - + ## class `RunnerLogsError` Base class for all runner logs errors. @@ -372,7 +383,7 @@ Base class for all runner logs errors. --- - + ## class `OpenStackError` Base class for OpenStack errors. @@ -383,7 +394,7 @@ Base class for OpenStack errors. --- - + ## class `OpenStackInvalidConfigError` Represents an invalid OpenStack configuration. @@ -394,7 +405,7 @@ Represents an invalid OpenStack configuration. --- - + ## class `OpenStackUnauthorizedError` Represents an unauthorized connection to OpenStack. @@ -405,7 +416,7 @@ Represents an unauthorized connection to OpenStack. --- - + ## class `SSHError` Represents an error while interacting with SSH. @@ -416,7 +427,7 @@ Represents an error while interacting with SSH. --- - + ## class `KeyfileError` Represents missing keyfile for SSH. diff --git a/src-docs/metrics.md b/src-docs/metrics.md index e4275c340..25c650c41 100644 --- a/src-docs/metrics.md +++ b/src-docs/metrics.md @@ -22,8 +22,5 @@ Package for common metrics-related code. - **github**: # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. 
-- **runner_logs**: # Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - diff --git a/src-docs/openstack_cloud.md b/src-docs/openstack_cloud.md index 4d82f5359..80cdb146b 100644 --- a/src-docs/openstack_cloud.md +++ b/src-docs/openstack_cloud.md @@ -5,11 +5,6 @@ # module `openstack_cloud` Module for managing Openstack cloud. -**Global Variables** ---------------- -- **openstack_manager**: # Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - --- diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 82991f1e2..7499e4ce5 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -17,7 +17,7 @@ Manager for self-hosted runner on OpenStack. --- - + ## class `OpenStackCloudConfig` Configuration for OpenStack cloud authorisation information. @@ -47,7 +47,7 @@ __init__(clouds_config: dict[str, dict], cloud: str) → None --- - + ## class `OpenStackServerConfig` Configuration for OpenStack server. @@ -78,9 +78,9 @@ __init__(image: str, flavor: str, network: str) → None --- - + -## class `OpenstackRunnerManager` +## class `OpenStackRunnerManager` Manage self-hosted runner on OpenStack cloud. @@ -89,7 +89,7 @@ Manage self-hosted runner on OpenStack cloud. - `name_prefix`: The name prefix of the runners created. - + ### method `__init__` @@ -97,7 +97,7 @@ Manage self-hosted runner on OpenStack cloud. __init__( prefix: str, cloud_config: OpenStackCloudConfig, - server_config: OpenStackServerConfig, + server_config: OpenStackServerConfig | None, runner_config: GitHubRunnerConfig, service_config: SupportServiceConfig ) → None @@ -111,7 +111,7 @@ Construct the object. - `prefix`: The prefix to runner name. - `cloud_config`: The configuration for OpenStack authorisation. - - `server_config`: The configuration for creating OpenStack server. 
+ - `server_config`: The configuration for creating OpenStack server. Unable to create runner if None. - `runner_config`: The configuration for the runner. - `service_config`: The configuration of supporting services of the runners. @@ -131,7 +131,7 @@ The prefix of runner names. --- - + ### method `cleanup` @@ -154,7 +154,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -183,7 +183,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -207,7 +207,7 @@ Delete self-hosted runners. --- - + ### method `get_runner` @@ -230,7 +230,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src-docs/runner_manager.md b/src-docs/runner_manager.md index 8d1773bf0..2cf622469 100644 --- a/src-docs/runner_manager.md +++ b/src-docs/runner_manager.md @@ -15,7 +15,7 @@ Runner Manager manages the runners on LXD and GitHub. -## class `RunnerManager` +## class `LXDRunnerManager` Manage a group of runners according to configuration. @@ -33,7 +33,7 @@ Manage a group of runners according to configuration. __init__( app_name: str, unit: int, - runner_manager_config: RunnerManagerConfig + runner_manager_config: LXDRunnerManagerConfig ) → None ``` diff --git a/src-docs/runner_manager_type.md b/src-docs/runner_manager_type.md index f6dd4faae..7e0675add 100644 --- a/src-docs/runner_manager_type.md +++ b/src-docs/runner_manager_type.md @@ -71,7 +71,7 @@ __init__( -## class `RunnerManagerConfig` +## class `LXDRunnerManagerConfig` Configuration of runner manager. 
diff --git a/src/charm.py b/src/charm.py index 057982ef2..b869e3627 100755 --- a/src/charm.py +++ b/src/charm.py @@ -8,6 +8,10 @@ """Charm for creating and managing GitHub self-hosted runner instances.""" +from manager.cloud_runner_manager import GitHubRunnerConfig, SupportServiceConfig +from manager.runner_manager import RunnerManager, RunnerManagerConfig +from manager.runner_scaler import RunnerScaler +from openstack_cloud.openstack_runner_manager import OpenStackCloudConfig, OpenStackServerConfig from utilities import bytes_with_unit_to_kib, execute_command, remove_residual_venv_dirs, retry # This is a workaround for https://bugs.launchpad.net/juju/+bug/2058335 @@ -78,9 +82,9 @@ from event_timer import EventTimer, TimerStatusError from firewall import Firewall, FirewallEntry from github_type import GitHubRunnerStatus -from openstack_cloud.openstack_manager import OpenstackRunnerManager +from openstack_cloud.openstack_runner_manager import OpenStackRunnerManager from runner import LXD_PROFILE_YAML -from runner_manager import RunnerManager, RunnerManagerConfig +from runner_manager import LXDRunnerManager, LXDRunnerManagerConfig from runner_manager_type import FlushMode, OpenstackRunnerManagerConfig RECONCILE_RUNNERS_EVENT = "reconcile-runners" @@ -366,7 +370,7 @@ def _ensure_service_health(self) -> None: def _get_runner_manager( self, state: CharmState, token: str | None = None, path: GithubPath | None = None - ) -> RunnerManager: + ) -> LXDRunnerManager: """Get a RunnerManager instance. 
Args: @@ -399,10 +403,10 @@ def _get_runner_manager( app_name, unit = self.unit.name.rsplit("/", 1) - return RunnerManager( + return LXDRunnerManager( app_name, unit, - RunnerManagerConfig( + LXDRunnerManagerConfig( charm_state=state, dockerhub_mirror=state.charm_config.dockerhub_mirror, image=state.runner_config.base_image.value, @@ -495,8 +499,8 @@ def _on_start(self, _: StartEvent) -> None: if state.instance_type == InstanceType.OPENSTACK: if not self._get_set_image_ready_status(): return - openstack_runner_manager = self._get_openstack_runner_manager(state) - openstack_runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler = self._get_runner_scaler(state) + runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() return @@ -618,9 +622,9 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 if not self._get_set_image_ready_status(): return if state.charm_config.token != self._stored.token: - openstack_runner_manager = self._get_openstack_runner_manager(state) - openstack_runner_manager.flush() - openstack_runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler = self._get_runner_scaler(state) + runner_scaler.flush() + runner_scaler.reconcile(state.runner_config.virtual_machines) # TODO: 2024-04-12: Flush on token changes. self.unit.status = ActiveStatus() return @@ -639,7 +643,7 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 self.unit.status = ActiveStatus() def _check_and_update_local_lxd_dependencies( - self, runner_manager: RunnerManager, token: str, proxy_config: ProxyConfig + self, runner_manager: LXDRunnerManager, token: str, proxy_config: ProxyConfig ) -> bool: """Check and update runner binary and services for local LXD runners. 
@@ -719,8 +723,8 @@ def _trigger_reconciliation(self) -> None: if state.instance_type == InstanceType.OPENSTACK: if not self._get_set_image_ready_status(): return - runner_manager = self._get_openstack_runner_manager(state) - runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler = self._get_runner_scaler(state) + runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() return @@ -757,23 +761,8 @@ def _on_check_runners_action(self, event: ActionEvent) -> None: state = self._setup_state() if state.instance_type == InstanceType.OPENSTACK: - openstack_runner_manager = self._get_openstack_runner_manager(state) - runner_info = openstack_runner_manager.get_github_runner_info() - - for info in runner_info: - if info.online: - online += 1 - runner_names.append(info.runner_name) - else: - offline += 1 - event.set_results( - { - "online": online, - "offline": offline, - "unknown": unknown, - "runners": ", ".join(runner_names), - } - ) + runner_scaler = self._get_runner_scaler(state) + event.set_results(runner_scaler.get_runner_info()) return runner_manager = self._get_runner_manager(state) @@ -814,9 +803,9 @@ def _on_reconcile_runners_action(self, event: ActionEvent) -> None: if not self._get_set_image_ready_status(): event.fail("Openstack image not yet provided/ready.") return - runner_manager = self._get_openstack_runner_manager(state) + runner_scaler = self._get_runner_scaler(state) - delta = runner_manager.reconcile(state.runner_config.virtual_machines) + delta = runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() event.set_results({"delta": {"virtual-machines": delta}}) return @@ -847,8 +836,8 @@ def _on_flush_runners_action(self, event: ActionEvent) -> None: if state.instance_type == InstanceType.OPENSTACK: # Flushing mode not implemented for OpenStack yet. 
- runner_manager = self._get_openstack_runner_manager(state) - flushed = runner_manager.flush() + runner_scaler = self._get_runner_scaler(state) + flushed = runner_scaler.flush() event.set_results({"delta": {"virtual-machines": flushed}}) return @@ -895,15 +884,15 @@ def _on_stop(self, _: StopEvent) -> None: state = self._setup_state() if state.instance_type == InstanceType.OPENSTACK: - runner_manager = self._get_openstack_runner_manager(state) - runner_manager.flush() + runner_scaler = self._get_runner_scaler(state) + runner_scaler.flush() return runner_manager = self._get_runner_manager(state) runner_manager.flush(FlushMode.FLUSH_BUSY) def _reconcile_runners( - self, runner_manager: RunnerManager, num: int, resources: VirtualMachineResources + self, runner_manager: LXDRunnerManager, num: int, resources: VirtualMachineResources ) -> Dict[str, Any]: """Reconcile the current runners state and intended runner state. @@ -918,7 +907,7 @@ def _reconcile_runners( Returns: Changes in runner number due to reconciling runners. """ - if not RunnerManager.runner_bin_path.is_file(): + if not LXDRunnerManager.runner_bin_path.is_file(): logger.warning("Unable to reconcile due to missing runner binary") raise MissingRunnerBinaryError("Runner binary not found.") @@ -1148,10 +1137,10 @@ def _on_debug_ssh_relation_changed(self, _: ops.RelationChangedEvent) -> None: if state.instance_type == InstanceType.OPENSTACK: if not self._get_set_image_ready_status(): return - runner_manager = self._get_openstack_runner_manager(state) + runner_scaler = self._get_runner_scaler(state) # TODO: 2024-04-12: Should be flush idle. 
- runner_manager.flush() - runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler.flush() + runner_scaler.reconcile(state.runner_config.virtual_machines) return self._refresh_firewall(state) @@ -1176,10 +1165,10 @@ def _on_image_relation_changed(self, _: ops.RelationChangedEvent) -> None: if not self._get_set_image_ready_status(): return - runner_manager = self._get_openstack_runner_manager(state) + runner_scaler = self._get_runner_scaler(state) # TODO: 2024-04-12: Should be flush idle. - runner_manager.flush() - runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler.flush() + runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() return @@ -1198,10 +1187,10 @@ def _get_set_image_ready_status(self) -> bool: return False return True - def _get_openstack_runner_manager( + def _get_runner_scaler( self, state: CharmState, token: str | None = None, path: GithubPath | None = None - ) -> OpenstackRunnerManager: - """Get OpenstackRunnerManager instance. + ) -> RunnerScaler: + """Get runner scaler instance for scaling runners. TODO: 2024-07-09 Combine this with `_get_runner_manager` during the runner manager \ interface refactor. @@ -1214,39 +1203,61 @@ def _get_openstack_runner_manager( name. If None the path in charm state is used. Returns: - An instance of OpenstackRunnerManager. + An instance of RunnerScaler. """ if token is None: token = state.charm_config.token if path is None: path = state.charm_config.path - # Empty image can be passed down due to a delete only case where deletion of runners do not - # depend on the image ID being available. Make sure that the charm goes to blocked status - # in hook where a runner may be created. TODO: 2024-07-09 This logic is subject to - # refactoring. 
+ app_name, unit = self.unit.name.rsplit("/", 1) + + clouds = list(state.charm_config.openstack_clouds_yaml["clouds"].keys()) + if len(clouds) > 1: + logger.warning( + "Multiple clouds defined in clouds.yaml. Using the first one to connect." + ) + cloud_config = OpenStackCloudConfig( + clouds_config=state.charm_config.openstack_clouds_yaml, + cloud=clouds[0], + ) + server_config = None + image_labels = [] image = state.runner_config.openstack_image - image_id = image.id if image and image.id else "" - image_labels = image.tags if image and image.tags else [] + if image and image.id: + server_config = OpenStackServerConfig( + image=image.id, + flavor=state.runner_config.openstack_flavor, + network=state.runner_config.openstack_network, + ) + if image.tags: + image_labels += image.tags - app_name, unit = self.unit.name.rsplit("/", 1) - openstack_runner_manager_config = OpenstackRunnerManagerConfig( - charm_state=state, - path=path, - token=token, - labels=(*state.charm_config.labels, *image_labels), - flavor=state.runner_config.openstack_flavor, - image=image_id, - network=state.runner_config.openstack_network, + runner_config = GitHubRunnerConfig( + github_path=path, labels=(*state.charm_config.labels, *image_labels) + ) + service_config = SupportServiceConfig( + proxy_config=state.proxy_config, dockerhub_mirror=state.charm_config.dockerhub_mirror, - reactive_config=state.reactive_config, + ssh_debug_connections=state.ssh_debug_connections, + repo_policy_url=state.charm_config.repo_policy_compliance.url, + repo_policy_token=state.charm_config.repo_policy_compliance.token, ) - return OpenstackRunnerManager( + openstack_runner_manager = OpenStackRunnerManager( app_name, - unit, - openstack_runner_manager_config, - state.charm_config.openstack_clouds_yaml, + cloud_config=cloud_config, + server_config=server_config, + runner_config=runner_config, + service_config=service_config, + ) + runner_manager_config = RunnerManagerConfig( + token=token, + path=path, + ) + 
runner_manager = RunnerManager( + cloud_runner_manager=openstack_runner_manager, config=runner_manager_config ) + return RunnerScaler(runner_manager=runner_manager) if __name__ == "__main__": diff --git a/src/errors.py b/src/errors.py index 59d28a239..4285dc6e4 100644 --- a/src/errors.py +++ b/src/errors.py @@ -39,6 +39,10 @@ class RunnerAproxyError(RunnerError): """Error for setting up aproxy.""" +class MissingServerConfigError(RunnerError): + """Error for unable to create runner due to missing server configurations.""" + + class MissingRunnerBinaryError(Exception): """Error for missing runner binary.""" diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py new file mode 100644 index 000000000..7bf9b81b9 --- /dev/null +++ b/src/manager/runner_scaler.py @@ -0,0 +1,138 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + + +import logging +import time +from typing import TypedDict + +from errors import IssueMetricEventError, MissingServerConfigError +from manager.cloud_runner_manager import HealthState +from manager.github_runner_manager import GitHubRunnerState +from manager.runner_manager import FlushMode, RunnerManager +from metrics import events as metric_events + +logger = logging.getLogger(__name__) + + +@TypedDict +class RunnerInfo: + """Information on the runners. + + Attributes: + online: The number of runner in online state. + offline: The number of runner in offline state. + unknown: The number of runner in unknown state. + runners: The names of the online runners. + """ + + online: int + offline: int + unknown: int + runners: tuple[str, ...] + + +class RunnerScaler: + """Manage the reconcile of runners.""" + + def __init__(self, runner_manager: RunnerManager): + """Construct the object. + + Args: + runner_manager: The RunnerManager to preform runner reconcile. 
+ """ + self._manager = runner_manager + + def get_runner_info(self) -> RunnerInfo: + runner_list = self._manager.get_runners() + online = 0 + offline = 0 + unknown = 0 + online_runners = [] + for runner in runner_list: + match runner.github_state: + case GitHubRunnerState.BUSY: + online += 1 + online_runners.append(runner.name) + case GitHubRunnerState.IDLE: + online += 1 + online_runners.append(runner.name) + case GitHubRunnerState.OFFLINE: + offline += 1 + case _: + unknown += 1 + return RunnerInfo(online=online, offline=offline, unknown=unknown, runners=online_runners) + + def flush(self, flush_mode: FlushMode) -> None: + """Flush the runners. + + Args: + flush_mode: Determines the types of runner to be flushed. + """ + self._manager.cleanup() + self._manager.delete_runners(flush_mode=flush_mode) + + def reconcile(self, num_of_runner: int) -> int: + """Reconcile the quantity of runners. + + Args: + num_of_runner: The number of intended runners. + + Returns: + The Change in number of runners. + """ + logger.info("Start reconcile to %s runner", num_of_runner) + + start_timestamp = time.time() + delete_metric_stats = None + metric_stats = self._manager.cleanup() + runners = self._manager.get_runners() + current_num = len(runners) + logger.info("Reconcile runners from %s to %s", current_num, num_of_runner) + runner_diff = num_of_runner - current_num + if runner_diff > 0: + try: + self._manager.create_runners(runner_diff) + except MissingServerConfigError: + logging.exception( + "Unable to spawn runner due to missing server configuration, such as, image." + ) + elif runner_diff < 0: + delete_metric_stats = self._manager.delete_runners(-runner_diff) + else: + logger.info("No changes to the number of runners.") + end_timestamp = time.time() + + # Merge the two metric stats. 
+ if delete_metric_stats is not None: + metric_stats = { + delete_metric_stats.get(event_name, 0) + metric_stats.get(event_name, 0) + for event_name in set(delete_metric_stats) | set(metric_stats) + } + + runner_list = self._manager.get_runners() + idle_runners = [ + runner for runner in runner_list if runner.github_state == GitHubRunnerState.IDLE + ] + offline_healthy_runners = [ + runner + for runner in runner_list + if runner.github_state == GitHubRunnerState.OFFLINE + and runner.health == HealthState.HEALTHY + ] + + try: + metric_events.issue_event( + metric_events.Reconciliation( + timestamp=time.time(), + flavor=self._manager.name_prefix, + crashed_runners=metric_stats.get(metric_events.RunnerStart, 0) + - metric_stats.get(metric_events.RunnerStop, 0), + idle_runners=len(set(idle_runners) | set(offline_healthy_runners)), + duration=end_timestamp - start_timestamp, + ) + ) + except IssueMetricEventError: + logger.exception("Failed to issue Reconciliation metric") + + return runner_diff diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index c58d131c0..45231bdb0 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -22,6 +22,7 @@ GetMetricsStorageError, IssueMetricEventError, KeyfileError, + MissingServerConfigError, OpenStackError, RunnerCreateError, RunnerStartError, @@ -105,7 +106,7 @@ class _RunnerHealth: unhealthy: tuple[OpenstackInstance, ...] -class OpenstackRunnerManager(CloudRunnerManager): +class OpenStackRunnerManager(CloudRunnerManager): """Manage self-hosted runner on OpenStack cloud. 
Attributes: @@ -117,7 +118,7 @@ def __init__( # pylint: disable=R0913 self, prefix: str, cloud_config: OpenStackCloudConfig, - server_config: OpenStackServerConfig, + server_config: OpenStackServerConfig | None, runner_config: GitHubRunnerConfig, service_config: SupportServiceConfig, ) -> None: @@ -126,7 +127,8 @@ def __init__( # pylint: disable=R0913 Args: prefix: The prefix to runner name. cloud_config: The configuration for OpenStack authorisation. - server_config: The configuration for creating OpenStack server. + server_config: The configuration for creating OpenStack server. Unable to create + runner if None. runner_config: The configuration for the runner. service_config: The configuration of supporting services of the runners. """ @@ -162,8 +164,11 @@ def create_runner(self, registration_token: str) -> InstanceId: Returns: Instance ID of the runner. """ + if self._server_config is None: + raise MissingServerConfigError("Missing server configuration to create runners") + start_timestamp = time.time() - instance_id = OpenstackRunnerManager._generate_instance_id() + instance_id = OpenStackRunnerManager._generate_instance_id() instance_name = self._openstack_cloud.get_server_name(instance_id=instance_id) cloud_init = self._generate_cloud_init( instance_name=instance_name, registration_token=registration_token @@ -183,7 +188,7 @@ def create_runner(self, registration_token: str) -> InstanceId: self._wait_runner_running(instance) end_timestamp = time.time() - OpenstackRunnerManager._issue_runner_installed_metric( + OpenStackRunnerManager._issue_runner_installed_metric( name=instance_name, flavor=self.name_prefix, install_start_timestamp=start_timestamp, @@ -301,7 +306,7 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None self._pull_runner_metrics(instance.server_name, ssh_conn) try: - OpenstackRunnerManager._run_runner_removal_script( + OpenStackRunnerManager._run_runner_removal_script( instance.server_name, ssh_conn, remove_token ) 
except GithubRunnerRemoveError: @@ -460,7 +465,7 @@ def _health_check(self, instance: OpenstackInstance) -> bool: "SSH connection failure with %s during health check", instance.server_name ) raise - return OpenstackRunnerManager._run_health_check(ssh_conn, instance.server_name) + return OpenStackRunnerManager._run_health_check(ssh_conn, instance.server_name) @staticmethod def _run_health_check(ssh_conn: SSHConnection, name: str) -> bool: @@ -611,13 +616,13 @@ def _pull_runner_metrics(name: str, ssh_conn: SSHConnection) -> None: return try: - OpenstackRunnerManager._ssh_pull_file( + OpenStackRunnerManager._ssh_pull_file( ssh_conn=ssh_conn, remote_path=str(METRICS_EXCHANGE_PATH / "pre-job-metrics.json"), local_path=str(storage.path / "pre-job-metrics.json"), max_size=MAX_METRICS_FILE_SIZE, ) - OpenstackRunnerManager._ssh_pull_file( + OpenStackRunnerManager._ssh_pull_file( ssh_conn=ssh_conn, remote_path=str(METRICS_EXCHANGE_PATH / "post-job-metrics.json"), local_path=str(storage.path / "post-job-metrics.json"), diff --git a/src/runner_manager.py b/src/runner_manager.py index 8d68a68c9..e79d9f7a6 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -41,7 +41,7 @@ from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME from repo_policy_compliance_client import RepoPolicyComplianceClient from runner import LXD_PROFILE_YAML, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus -from runner_manager_type import FlushMode, RunnerInfo, RunnerManagerClients, RunnerManagerConfig +from runner_manager_type import FlushMode, LXDRunnerManagerConfig, RunnerInfo, RunnerManagerClients from runner_type import ProxySetting as RunnerProxySetting from runner_type import RunnerNameByHealth from utilities import execute_command, retry, set_env_var @@ -56,7 +56,7 @@ IssuedMetricEventsStats = dict[Type[metric_events.Event], int] -class RunnerManager: +class LXDRunnerManager: """Manage a group of runners according to configuration. 
Attributes: @@ -71,7 +71,7 @@ def __init__( self, app_name: str, unit: int, - runner_manager_config: RunnerManagerConfig, + runner_manager_config: LXDRunnerManagerConfig, ) -> None: """Construct RunnerManager object for creating and managing runners. @@ -159,7 +159,7 @@ def update_runner_bin(self, binary: RunnerApplication) -> None: try: # Delete old version of runner binary. - RunnerManager.runner_bin_path.unlink(missing_ok=True) + LXDRunnerManager.runner_bin_path.unlink(missing_ok=True) except OSError as err: logger.exception("Unable to perform file operation on the runner binary path") raise RunnerBinaryError("File operation failed on the runner binary path") from err @@ -182,7 +182,7 @@ def update_runner_bin(self, binary: RunnerApplication) -> None: sha256 = hashlib.sha256() - with RunnerManager.runner_bin_path.open(mode="wb") as file: + with LXDRunnerManager.runner_bin_path.open(mode="wb") as file: # Process with chunk_size of 128 KiB. for chunk in response.iter_content(chunk_size=128 * 1024, decode_unicode=False): file.write(chunk) @@ -267,7 +267,7 @@ def _create_runner( config=CreateRunnerConfig( image=self.config.image, resources=resources, - binary_path=RunnerManager.runner_bin_path, + binary_path=LXDRunnerManager.runner_bin_path, registration_token=registration_token, arch=self.config.charm_state.arch, ) @@ -309,7 +309,7 @@ def _create_runner( config=CreateRunnerConfig( image=self.config.image, resources=resources, - binary_path=RunnerManager.runner_bin_path, + binary_path=LXDRunnerManager.runner_bin_path, registration_token=registration_token, arch=self.config.charm_state.arch, ) @@ -447,7 +447,7 @@ def _spawn_new_runners(self, count: int, resources: VirtualMachineResources) -> Raises: RunnerCreateError: If there was an error spawning new runner. 
""" - if not RunnerManager.runner_bin_path.exists(): + if not LXDRunnerManager.runner_bin_path.exists(): raise RunnerCreateError("Unable to create runner due to missing runner binary.") logger.info("Getting registration token for GitHub runners.") registration_token = self._clients.github.get_runner_registration_token(self.config.path) diff --git a/src/runner_manager_type.py b/src/runner_manager_type.py index f3a2112f5..343b1eb04 100644 --- a/src/runner_manager_type.py +++ b/src/runner_manager_type.py @@ -61,7 +61,7 @@ class RunnerManagerClients: @dataclass # The instance attributes are all required. -class RunnerManagerConfig: # pylint: disable=too-many-instance-attributes +class LXDRunnerManagerConfig: # pylint: disable=too-many-instance-attributes """Configuration of runner manager. Attributes: diff --git a/tests/integration/helpers/common.py b/tests/integration/helpers/common.py index 16622c038..495c952b3 100644 --- a/tests/integration/helpers/common.py +++ b/tests/integration/helpers/common.py @@ -36,7 +36,7 @@ TOKEN_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME, ) -from runner_manager import RunnerManager +from runner_manager import LXDRunnerManager from tests.status_name import ACTIVE DISPATCH_TEST_WORKFLOW_FILENAME = "workflow_dispatch_test.yaml" @@ -93,7 +93,7 @@ async def check_runner_binary_exists(unit: Unit) -> bool: Returns: Whether the runner binary file exists in the charm. """ - return_code, _, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") + return_code, _, _ = await run_in_unit(unit, f"test -f {LXDRunnerManager.runner_bin_path}") return return_code == 0 @@ -141,10 +141,10 @@ async def remove_runner_bin(unit: Unit) -> None: Args: unit: Unit instance to check for the LXD profile. """ - await run_in_unit(unit, f"rm {RunnerManager.runner_bin_path}") + await run_in_unit(unit, f"rm {LXDRunnerManager.runner_bin_path}") # No file should exists under with the filename. 
- return_code, _, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") + return_code, _, _ = await run_in_unit(unit, f"test -f {LXDRunnerManager.runner_bin_path}") assert return_code != 0 diff --git a/tests/integration/test_charm_scheduled_events.py b/tests/integration/test_charm_scheduled_events.py index aa4a9f1b3..5e9819f23 100644 --- a/tests/integration/test_charm_scheduled_events.py +++ b/tests/integration/test_charm_scheduled_events.py @@ -13,7 +13,7 @@ from juju.application import Application from juju.model import Model -from runner_manager import RunnerManager +from runner_manager import LXDRunnerManager from tests.integration.helpers.common import check_runner_binary_exists from tests.integration.helpers.lxd import get_runner_names, run_in_unit, wait_till_num_of_runners from tests.status_name import ACTIVE @@ -40,7 +40,7 @@ async def test_update_interval(model: Model, app_scheduled_events: Application) unit = app_scheduled_events.units[0] assert await check_runner_binary_exists(unit) - ret_code, stdout, stderr = await run_in_unit(unit, f"rm -f {RunnerManager.runner_bin_path}") + ret_code, stdout, stderr = await run_in_unit(unit, f"rm -f {LXDRunnerManager.runner_bin_path}") assert ret_code == 0, f"Failed to remove runner binary {stdout} {stderr}" assert not await check_runner_binary_exists(unit) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 523c2b5ae..58410b402 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -25,7 +25,7 @@ from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH from openstack_cloud.openstack_runner_manager import ( OpenStackCloudConfig, - OpenstackRunnerManager, + OpenStackRunnerManager, OpenStackServerConfig, ) from tests.integration.helpers.common import ( @@ -96,7 +96,7 @@ async def openstack_runner_manager_fixture( proxy_config: ProxyConfig, runner_label: str, 
openstack_connection: OpenstackConnection, -) -> OpenstackRunnerManager: +) -> OpenStackRunnerManager: """Create OpenstackRunnerManager instance. The prefix args of OpenstackRunnerManager set to app_name to let openstack_connection_fixture @@ -125,14 +125,14 @@ async def openstack_runner_manager_fixture( repo_policy_url=None, repo_policy_token=None, ) - return OpenstackRunnerManager( + return OpenStackRunnerManager( app_name, cloud_config, server_config, runner_config, service_config ) @pytest_asyncio.fixture(scope="module", name="runner_manager") async def runner_manager_fixture( - openstack_runner_manager: OpenstackRunnerManager, + openstack_runner_manager: OpenStackRunnerManager, token: str, github_path: GithubPath, log_dir_base_path: dict[str, Path], @@ -219,7 +219,7 @@ async def test_get_no_runner(runner_manager: RunnerManager) -> None: @pytest.mark.asyncio @pytest.mark.abort_on_fail async def test_runner_normal_idle_lifecycle( - runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager + runner_manager: RunnerManager, openstack_runner_manager: OpenStackRunnerManager ) -> None: """ Arrange: RunnerManager instance with no runners. @@ -381,7 +381,7 @@ async def test_runner_normal_lifecycle( @pytest.mark.asyncio @pytest.mark.abort_on_fail async def test_runner_spawn_two( - runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager + runner_manager: RunnerManager, openstack_runner_manager: OpenStackRunnerManager ) -> None: """ Arrange: RunnerManager instance with no runners. 
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index a7276f078..994e0a4cf 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -47,7 +47,7 @@ from event_timer import EventTimer, TimerEnableError from firewall import FirewallEntry from github_type import GitHubRunnerStatus -from runner_manager import RunnerInfo, RunnerManagerConfig +from runner_manager import LXDRunnerManagerConfig, RunnerInfo TEST_PROXY_SERVER_URL = "http://proxy.server:1234" @@ -458,7 +458,7 @@ def test_org_register(self, run, wt, mkdir, rm): rm.assert_called_with( "github-runner", "0", - RunnerManagerConfig( + LXDRunnerManagerConfig( path=GithubOrg(org="mockorg", group="mockgroup"), token="mocktoken", image="jammy", @@ -488,7 +488,7 @@ def test_repo_register(self, run, wt, mkdir, rm): rm.assert_called_with( "github-runner", "0", - RunnerManagerConfig( + LXDRunnerManagerConfig( path=GithubRepo(owner="mockorg", repo="repo"), token="mocktoken", image="jammy", @@ -546,7 +546,7 @@ def test_update_config(self, run, wt, mkdir, rm): rm.assert_called_with( "github-runner", "0", - RunnerManagerConfig( + LXDRunnerManagerConfig( path=GithubRepo(owner="mockorg", repo="repo"), token="mocktoken", image="jammy", @@ -568,7 +568,7 @@ def test_update_config(self, run, wt, mkdir, rm): rm.assert_called_with( "github-runner", "0", - RunnerManagerConfig( + LXDRunnerManagerConfig( path=GithubRepo(owner="mockorg", repo="repo"), token="mocktoken", image="jammy", @@ -926,7 +926,7 @@ def test__on_image_relation_image_ready(): harness.charm._setup_state = MagicMock(return_value=state_mock) harness.charm._get_set_image_ready_status = MagicMock(return_value=True) runner_manager_mock = MagicMock() - harness.charm._get_openstack_runner_manager = MagicMock(return_value=runner_manager_mock) + harness.charm._get_runner_scaler = MagicMock(return_value=runner_manager_mock) harness.charm._on_image_relation_changed(MagicMock()) diff --git a/tests/unit/test_runner_manager.py 
b/tests/unit/test_runner_manager.py index 66b09cd60..7d7600825 100644 --- a/tests/unit/test_runner_manager.py +++ b/tests/unit/test_runner_manager.py @@ -28,7 +28,7 @@ from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME from metrics.storage import MetricsStorage from runner import Runner, RunnerStatus -from runner_manager import BUILD_IMAGE_SCRIPT_FILENAME, RunnerManager, RunnerManagerConfig +from runner_manager import BUILD_IMAGE_SCRIPT_FILENAME, LXDRunnerManager, LXDRunnerManagerConfig from runner_type import RunnerNameByHealth from tests.unit.mock import TEST_BINARY, MockLxdImageManager @@ -87,10 +87,10 @@ def runner_manager_fixture(request, tmp_path, monkeypatch, token, charm_state): pool_path = tmp_path / "test_storage" pool_path.mkdir(exist_ok=True) - runner_manager = RunnerManager( + runner_manager = LXDRunnerManager( "test app", "0", - RunnerManagerConfig( + LXDRunnerManagerConfig( path=request.param[0], token=token, image=IMAGE_NAME, @@ -144,7 +144,7 @@ def reactive_reconcile_fixture(monkeypatch: MonkeyPatch, tmp_path: Path) -> Magi pytest.param(Arch.X64), ], ) -def test_get_latest_runner_bin_url(runner_manager: RunnerManager, arch: Arch, charm_state): +def test_get_latest_runner_bin_url(runner_manager: LXDRunnerManager, arch: Arch, charm_state): """ arrange: Nothing. act: Get runner bin url of existing binary. @@ -168,7 +168,7 @@ def test_get_latest_runner_bin_url(runner_manager: RunnerManager, arch: Arch, ch assert runner_bin["filename"] == filename -def test_get_latest_runner_bin_url_missing_binary(runner_manager: RunnerManager): +def test_get_latest_runner_bin_url_missing_binary(runner_manager: LXDRunnerManager): """ arrange: Given a mocked GH API client that does not return any runner binaries. act: Get runner bin url of non-existing binary. 
@@ -181,7 +181,7 @@ def test_get_latest_runner_bin_url_missing_binary(runner_manager: RunnerManager) runner_manager.get_latest_runner_bin_url(os_name="not_exist") -def test_update_runner_bin(runner_manager: RunnerManager): +def test_update_runner_bin(runner_manager: LXDRunnerManager): """ arrange: Remove the existing runner binary. act: Update runner binary. @@ -222,7 +222,7 @@ def iter_content(self, *args, **kwargs): assert runner_manager.runner_bin_path.read_bytes() == TEST_BINARY -def test_reconcile_zero_count(runner_manager: RunnerManager): +def test_reconcile_zero_count(runner_manager: LXDRunnerManager): """ arrange: Nothing. act: Reconcile with the current amount of runner. @@ -234,7 +234,7 @@ def test_reconcile_zero_count(runner_manager: RunnerManager): assert delta == 0 -def test_reconcile_create_runner(runner_manager: RunnerManager): +def test_reconcile_create_runner(runner_manager: LXDRunnerManager): """ arrange: Nothing. act: Reconcile to create a runner. @@ -246,7 +246,7 @@ def test_reconcile_create_runner(runner_manager: RunnerManager): assert delta == 1 -def test_reconcile_remove_runner(runner_manager: RunnerManager): +def test_reconcile_remove_runner(runner_manager: LXDRunnerManager): """ arrange: Create online runners. act: Reconcile to remove a runner. @@ -282,7 +282,7 @@ def mock_get_runners(): assert delta == -1 -def test_reconcile(runner_manager: RunnerManager, tmp_path: Path): +def test_reconcile(runner_manager: LXDRunnerManager, tmp_path: Path): """ arrange: Setup one runner. act: Reconcile with the current amount of runner. @@ -295,7 +295,7 @@ def test_reconcile(runner_manager: RunnerManager, tmp_path: Path): assert len(runner_manager._get_runners()) == 1 -def test_empty_flush(runner_manager: RunnerManager): +def test_empty_flush(runner_manager: LXDRunnerManager): """ arrange: No initial runners. act: Perform flushing with no runners. 
@@ -305,7 +305,7 @@ def test_empty_flush(runner_manager: RunnerManager): runner_manager.flush() -def test_flush(runner_manager: RunnerManager, tmp_path: Path): +def test_flush(runner_manager: LXDRunnerManager, tmp_path: Path): """ arrange: Create some runners. act: Perform flushing. @@ -319,7 +319,7 @@ def test_flush(runner_manager: RunnerManager, tmp_path: Path): def test_reconcile_issues_runner_installed_event( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, monkeypatch: MonkeyPatch, issue_event_mock: MagicMock, charm_state: MagicMock, @@ -341,7 +341,7 @@ def test_reconcile_issues_runner_installed_event( def test_reconcile_issues_no_runner_installed_event_if_metrics_disabled( - runner_manager: RunnerManager, issue_event_mock: MagicMock, charm_state: MagicMock + runner_manager: LXDRunnerManager, issue_event_mock: MagicMock, charm_state: MagicMock ): """ arrange: Disable issuing of metrics. @@ -356,7 +356,7 @@ def test_reconcile_issues_no_runner_installed_event_if_metrics_disabled( def test_reconcile_error_on_issue_event_is_ignored( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, issue_event_mock: MagicMock, charm_state: MagicMock, ): @@ -375,7 +375,7 @@ def test_reconcile_error_on_issue_event_is_ignored( def test_reconcile_issues_reconciliation_metric_event( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, monkeypatch: MonkeyPatch, issue_event_mock: MagicMock, runner_metrics: MagicMock, @@ -458,7 +458,7 @@ def mock_get_runners(): def test_reconcile_places_timestamp_in_newly_created_runner( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, monkeypatch: MonkeyPatch, shared_fs: MagicMock, tmp_path: Path, @@ -485,7 +485,7 @@ def test_reconcile_places_timestamp_in_newly_created_runner( def test_reconcile_error_on_placing_timestamp_is_ignored( - runner_manager: RunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock + runner_manager: LXDRunnerManager, shared_fs: 
MagicMock, tmp_path: Path, charm_state: MagicMock ): """ arrange: Enable issuing of metrics and do not create the directory for the shared filesystem\ @@ -504,7 +504,7 @@ def test_reconcile_error_on_placing_timestamp_is_ignored( def test_reconcile_places_no_timestamp_in_newly_created_runner_if_metrics_disabled( - runner_manager: RunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock + runner_manager: LXDRunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock ): """ arrange: Disable issuing of metrics, mock timestamps and the shared filesystem module. @@ -522,7 +522,7 @@ def test_reconcile_places_no_timestamp_in_newly_created_runner_if_metrics_disabl def test_reconcile_reactive_mode( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, reactive_reconcile_mock: MagicMock, caplog: LogCaptureFixture, ): @@ -542,7 +542,7 @@ def test_reconcile_reactive_mode( def test_schedule_build_runner_image( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, tmp_path: Path, charm_state: CharmState, monkeypatch: MonkeyPatch, @@ -569,7 +569,7 @@ def test_schedule_build_runner_image( assert cronfile.read_text() == f"4 4,10,16,22 * * * ubuntu {cmd} jammy\n" -def test_has_runner_image(runner_manager: RunnerManager): +def test_has_runner_image(runner_manager: LXDRunnerManager): """ arrange: Multiple setups. 1. no runner image exists. 
From ab1c4b814200dc92cbdc24ce8303bebb879d66cb Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:44:59 +0800 Subject: [PATCH 196/278] Fix lints --- src-docs/charm.md | 10 +++---- src-docs/metrics.md | 3 ++ src-docs/openstack_cloud.md | 11 +++++++ ...penstack_cloud.openstack_runner_manager.md | 9 +++--- src/charm.py | 11 ++++--- src/manager/runner_scaler.py | 29 ++++++++++++++----- .../openstack_runner_manager.py | 1 + 7 files changed, 54 insertions(+), 20 deletions(-) diff --git a/src-docs/charm.md b/src-docs/charm.md index c03d1411b..9fd2aac04 100644 --- a/src-docs/charm.md +++ b/src-docs/charm.md @@ -20,7 +20,7 @@ Charm for creating and managing GitHub self-hosted runner instances. --- - + ## function `catch_charm_errors` @@ -46,7 +46,7 @@ Catch common errors in charm. --- - + ## function `catch_action_errors` @@ -72,7 +72,7 @@ Catch common errors in actions. --- - + ## class `ReconcileRunnersEvent` Event representing a periodic check to ensure runners are ok. @@ -83,7 +83,7 @@ Event representing a periodic check to ensure runners are ok. --- - + ## class `GithubRunnerCharm` Charm for managing GitHub self-hosted runners. @@ -100,7 +100,7 @@ Charm for managing GitHub self-hosted runners. - `ram_pool_path`: The path to memdisk storage. - `kernel_module_path`: The path to kernel modules. - + ### method `__init__` diff --git a/src-docs/metrics.md b/src-docs/metrics.md index 25c650c41..e4275c340 100644 --- a/src-docs/metrics.md +++ b/src-docs/metrics.md @@ -22,5 +22,8 @@ Package for common metrics-related code. - **github**: # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. +- **runner_logs**: # Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
+ diff --git a/src-docs/openstack_cloud.md b/src-docs/openstack_cloud.md index 80cdb146b..51140a4b2 100644 --- a/src-docs/openstack_cloud.md +++ b/src-docs/openstack_cloud.md @@ -5,6 +5,17 @@ # module `openstack_cloud` Module for managing Openstack cloud. +**Global Variables** +--------------- +- **openstack_cloud**: # Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +- **openstack_manager**: # Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +- **openstack_runner_manager**: # Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + --- diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 7499e4ce5..02841811f 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -131,7 +131,7 @@ The prefix of runner names. --- - + ### method `cleanup` @@ -174,6 +174,7 @@ Create a self-hosted runner. **Raises:** + - `MissingServerConfigError`: Unable to create runner due to missing configuration. - `RunnerCreateError`: Unable to create runner due to OpenStack issues. @@ -183,7 +184,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -207,7 +208,7 @@ Delete self-hosted runners. --- - + ### method `get_runner` @@ -230,7 +231,7 @@ Get a self-hosted runner by instance id. 
--- - + ### method `get_runners` diff --git a/src/charm.py b/src/charm.py index b869e3627..d3b48d44d 100755 --- a/src/charm.py +++ b/src/charm.py @@ -11,7 +11,6 @@ from manager.cloud_runner_manager import GitHubRunnerConfig, SupportServiceConfig from manager.runner_manager import RunnerManager, RunnerManagerConfig from manager.runner_scaler import RunnerScaler -from openstack_cloud.openstack_runner_manager import OpenStackCloudConfig, OpenStackServerConfig from utilities import bytes_with_unit_to_kib, execute_command, remove_residual_venv_dirs, retry # This is a workaround for https://bugs.launchpad.net/juju/+bug/2058335 @@ -82,10 +81,14 @@ from event_timer import EventTimer, TimerStatusError from firewall import Firewall, FirewallEntry from github_type import GitHubRunnerStatus -from openstack_cloud.openstack_runner_manager import OpenStackRunnerManager +from openstack_cloud.openstack_runner_manager import ( + OpenStackCloudConfig, + OpenStackRunnerManager, + OpenStackServerConfig, +) from runner import LXD_PROFILE_YAML from runner_manager import LXDRunnerManager, LXDRunnerManagerConfig -from runner_manager_type import FlushMode, OpenstackRunnerManagerConfig +from runner_manager_type import FlushMode RECONCILE_RUNNERS_EVENT = "reconcile-runners" @@ -1210,7 +1213,7 @@ def _get_runner_scaler( if path is None: path = state.charm_config.path - app_name, unit = self.unit.name.rsplit("/", 1) + app_name, _ = self.unit.name.rsplit("/", 1) clouds = list(state.charm_config.openstack_clouds_yaml["clouds"].keys()) if len(clouds) > 1: diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index 7bf9b81b9..980c542d0 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -1,6 +1,7 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. 
+"""Module for scaling the runners amount.""" import logging import time @@ -15,8 +16,7 @@ logger = logging.getLogger(__name__) -@TypedDict -class RunnerInfo: +class RunnerInfo(TypedDict): """Information on the runners. Attributes: @@ -39,11 +39,16 @@ def __init__(self, runner_manager: RunnerManager): """Construct the object. Args: - runner_manager: The RunnerManager to preform runner reconcile. + runner_manager: The RunnerManager to perform runner reconcile. """ self._manager = runner_manager def get_runner_info(self) -> RunnerInfo: + """Get information on the runners. + + Returns: + The information on the runners. + """ runner_list = self._manager.get_runners() online = 0 offline = 0 @@ -61,16 +66,26 @@ def get_runner_info(self) -> RunnerInfo: offline += 1 case _: unknown += 1 - return RunnerInfo(online=online, offline=offline, unknown=unknown, runners=online_runners) + return RunnerInfo( + online=online, offline=offline, unknown=unknown, runners=tuple(online_runners) + ) - def flush(self, flush_mode: FlushMode) -> None: + def flush(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> None: """Flush the runners. Args: flush_mode: Determines the types of runner to be flushed. + + Returns: + Number of runners flushed. """ - self._manager.cleanup() - self._manager.delete_runners(flush_mode=flush_mode) + metric_stats = self._manager.cleanup() + delete_metric_stats = self._manager.delete_runners(flush_mode=flush_mode) + metric_stats = { + delete_metric_stats.get(event_name, 0) + metric_stats.get(event_name, 0) + for event_name in set(delete_metric_stats) | set(metric_stats) + } + return metric_stats.get(metric_events.RunnerStop, 0) def reconcile(self, num_of_runner: int) -> int: """Reconcile the quantity of runners. 
diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 45231bdb0..00612ce03 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -159,6 +159,7 @@ def create_runner(self, registration_token: str) -> InstanceId: registration_token: The GitHub registration token for registering runners. Raises: + MissingServerConfigError: Unable to create runner due to missing configuration. RunnerCreateError: Unable to create runner due to OpenStack issues. Returns: From ba2800d67d92e80725454e7e2528410122b4669d Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 20 Aug 2024 13:18:37 +0800 Subject: [PATCH 197/278] Fix unit test --- tests/unit/conftest.py | 10 +++--- tests/unit/test_charm.py | 36 ++++++++++--------- ..._manager.py => test_lxd_runner_manager.py} | 4 +-- 3 files changed, 26 insertions(+), 24 deletions(-) rename tests/unit/{test_runner_manager.py => test_lxd_runner_manager.py} (99%) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 3ee0259f7..cb50275f6 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -10,7 +10,7 @@ import pytest import utilities -from openstack_cloud import openstack_manager +from manager.runner_scaler import RunnerScaler from tests.unit.mock import MockGhapiClient, MockLxdClient, MockRepoPolicyComplianceClient @@ -46,7 +46,7 @@ def disk_usage_mock(total_disk: int): @pytest.fixture(autouse=True) def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_path): - openstack_manager_mock = unittest.mock.MagicMock(spec=openstack_manager) + runner_scaler_mock = unittest.mock.MagicMock(spec=RunnerScaler) cron_path = tmp_path / "cron.d" cron_path.mkdir() @@ -61,7 +61,7 @@ def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_p monkeypatch.setattr( "charm.GithubRunnerCharm.repo_check_systemd_service", tmp_path / 
"systemd_service" ) - monkeypatch.setattr("charm.OpenstackRunnerManager", openstack_manager_mock) + monkeypatch.setattr("charm.RunnerScaler", runner_scaler_mock) monkeypatch.setattr("charm.GithubRunnerCharm.kernel_module_path", tmp_path / "modules") monkeypatch.setattr("charm.GithubRunnerCharm._update_kernel", lambda self, now: None) monkeypatch.setattr("charm.execute_command", exec_command) @@ -86,8 +86,8 @@ def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_p monkeypatch.setattr("runner_manager.LxdClient", MockLxdClient) monkeypatch.setattr("runner_manager.shared_fs", unittest.mock.MagicMock()) monkeypatch.setattr("runner_manager.execute_command", exec_command) - monkeypatch.setattr("runner_manager.RunnerManager.runner_bin_path", runner_binary_path) - monkeypatch.setattr("runner_manager.RunnerManager.cron_path", cron_path) + monkeypatch.setattr("runner_manager.LXDRunnerManager.runner_bin_path", runner_binary_path) + monkeypatch.setattr("runner_manager.LXDRunnerManager.cron_path", cron_path) monkeypatch.setattr( "runner_manager.RepoPolicyComplianceClient", MockRepoPolicyComplianceClient ) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 994e0a4cf..d44eca542 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -156,8 +156,10 @@ def stub_update_runner_bin(*args, **kwargs) -> None: harness = Harness(GithubRunnerCharm) harness.update_config({PATH_CONFIG_NAME: "mock/repo", TOKEN_CONFIG_NAME: "mocktoken"}) harness.begin() - monkeypatch.setattr("runner_manager.RunnerManager.update_runner_bin", stub_update_runner_bin) - monkeypatch.setattr("runner_manager.RunnerManager._runners_in_pre_job", lambda self: False) + monkeypatch.setattr( + "runner_manager.LXDRunnerManager.update_runner_bin", stub_update_runner_bin + ) + monkeypatch.setattr("runner_manager.LXDRunnerManager._runners_in_pre_job", lambda self: False) monkeypatch.setattr("charm.EventTimer.ensure_event_timer", MagicMock()) 
monkeypatch.setattr("charm.logrotate.setup", MagicMock()) return harness @@ -206,7 +208,7 @@ def test_common_install_code( monkeypatch.setattr("charm.logrotate.setup", setup_logrotate := MagicMock()) monkeypatch.setattr( - "runner_manager.RunnerManager.schedule_build_runner_image", + "runner_manager.LXDRunnerManager.schedule_build_runner_image", schedule_build_runner_image := MagicMock(), ) event_timer_mock = MagicMock(spec=EventTimer) @@ -241,11 +243,11 @@ def test_common_install_code_does_not_rebuild_image( assert: Image is not rebuilt. """ monkeypatch.setattr( - "runner_manager.RunnerManager.build_runner_image", + "runner_manager.LXDRunnerManager.build_runner_image", build_runner_image := MagicMock(), ) monkeypatch.setattr( - "runner_manager.RunnerManager.has_runner_image", + "runner_manager.LXDRunnerManager.has_runner_image", MagicMock(return_value=True), ) getattr(harness.charm.on, hook).emit() @@ -437,7 +439,7 @@ def test_database_integration_events_trigger_reconciliation( class TestCharm(unittest.TestCase): """Test the GithubRunner charm.""" - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -468,7 +470,7 @@ def test_org_register(self, run, wt, mkdir, rm): ), ) - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -498,7 +500,7 @@ def test_repo_register(self, run, wt, mkdir, rm): ), ) - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -525,7 +527,7 @@ def test_exceed_free_disk_size(self, run, wt, mkdir, rm): ) ) - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -582,7 +584,7 @@ def test_update_config(self, run, wt, mkdir, rm): ) 
mock_rm.reset_mock() - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -627,7 +629,7 @@ def test_on_update_status(self, run, wt, mkdir, rm): with pytest.raises(TimerEnableError): harness.charm.on.update_status.emit() - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -639,7 +641,7 @@ def test_on_stop(self, run, wt, mkdir, rm): harness.charm.on.stop.emit() mock_rm.flush.assert_called() - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -658,8 +660,8 @@ def test_on_start_failure(self, run, wt, mkdir, rm): "Failed to start runners: mock error" ) - @patch("charm.RunnerManager") - @patch("charm.OpenstackRunnerManager") + @patch("charm.LXDRunnerManager") + @patch("charm.RunnerScaler") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -698,7 +700,7 @@ def test_on_config_changed_openstack_clouds_yaml(self, run, wt, mkdir, orm, rm): assert harness.charm.unit.status == BlockedStatus("Please provide image integration.") - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -717,7 +719,7 @@ def test_check_runners_action(self, run, wt, mkdir, rm): {"online": 2, "offline": 2, "unknown": 1, "runners": "test runner 0, test runner 1"} ) - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -731,7 +733,7 @@ def test_check_runners_action_with_errors(self, run, wt, mkdir, rm): harness.charm._on_check_runners_action(mock_event) mock_event.fail.assert_called_with("Invalid Github config, Missing path configuration") - 
@patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") diff --git a/tests/unit/test_runner_manager.py b/tests/unit/test_lxd_runner_manager.py similarity index 99% rename from tests/unit/test_runner_manager.py rename to tests/unit/test_lxd_runner_manager.py index 7d7600825..829d73c9c 100644 --- a/tests/unit/test_runner_manager.py +++ b/tests/unit/test_lxd_runner_manager.py @@ -1,7 +1,7 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. -"""Test cases of RunnerManager class.""" +"""Test cases of LXDRunnerManager class.""" import random import secrets from pathlib import Path @@ -82,7 +82,7 @@ def charm_state_fixture(charm_config: MagicMock): def runner_manager_fixture(request, tmp_path, monkeypatch, token, charm_state): charm_state.proxy_config = request.param[1] monkeypatch.setattr( - "runner_manager.RunnerManager.runner_bin_path", tmp_path / "mock_runner_binary" + "runner_manager.LXDRunnerManager.runner_bin_path", tmp_path / "mock_runner_binary" ) pool_path = tmp_path / "test_storage" pool_path.mkdir(exist_ok=True) From a7979d155dc73568b1cd135f334f156791858bbe Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 20 Aug 2024 14:34:36 +0800 Subject: [PATCH 198/278] Fix according to review comment --- src-docs/openstack_cloud.openstack_manager.md | 12 +-- ...penstack_cloud.openstack_runner_manager.md | 25 +++++- src/manager/cloud_runner_manager.py | 14 +++- src/manager/runner_manager.py | 13 ++-- src/openstack_cloud/openstack_cloud.py | 3 +- src/openstack_cloud/openstack_manager.py | 2 - .../openstack_runner_manager.py | 77 ++++++++++++++++++- templates/env.j2 | 14 ---- 8 files changed, 124 insertions(+), 36 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_manager.md b/src-docs/openstack_cloud.openstack_manager.md index f87a1b8b4..a0f0a2531 100644 --- 
a/src-docs/openstack_cloud.openstack_manager.md +++ b/src-docs/openstack_cloud.openstack_manager.md @@ -93,7 +93,7 @@ __init__( --- - + ## class `GithubRunnerRemoveError` Represents an error removing registered runner from Github. @@ -104,7 +104,7 @@ Represents an error removing registered runner from Github. --- - + ## class `OpenstackRunnerManager` Runner manager for OpenStack-based instances. @@ -117,7 +117,7 @@ Runner manager for OpenStack-based instances. - `unit_num`: The juju unit number. - `instance_name`: Prefix of the name for the set of runners. - + ### method `__init__` @@ -146,7 +146,7 @@ Construct OpenstackRunnerManager object. --- - + ### method `flush` @@ -171,7 +171,7 @@ Flush Openstack servers. --- - + ### method `get_github_runner_info` @@ -188,7 +188,7 @@ Get information on GitHub for the runners. --- - + ### method `reconcile` diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 82991f1e2..752e2f9d3 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -131,7 +131,7 @@ The prefix of runner names. --- - + ### method `cleanup` @@ -207,6 +207,29 @@ Delete self-hosted runners. --- + + +### method `flush_runners` + +```python +flush_runners(remove_token: str, busy: bool = False) → Iterator[RunnerMetrics] +``` + +Remove idle and/or busy runners. + + + +**Args:** + remove_token: + - `busy`: If false, only idle runners are removed. If true, both idle and busy runners are removed. + + + +**Returns:** + Any metrics retrieved from flushed runners. 
+ +--- + ### method `get_runner` diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 7362990d0..28ed17b20 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -176,13 +176,23 @@ def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerIn @abc.abstractmethod def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMetrics | None: - """Delete self-hosted runners. + """Delete self-hosted runner. Args: instance_id: The instance id of the runner to delete. remove_token: The GitHub remove token. """ + @abc.abstractmethod + def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[RunnerMetrics]: + """Stop all runners. + + Args: + remove_token: The GitHub remove token for removing runners. + busy: If false, only idle runners are removed. If true, both idle and busy runners are + removed. + """ + @abc.abstractmethod def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: """Cleanup runner and resource on the cloud. @@ -190,5 +200,5 @@ def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: Perform health check on runner and delete the runner if it fails. Args: - remove_token: The GitHub remove token. + remove_token: The GitHub remove token for removing runners. 
""" diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 98cf5f35f..a9cff2f35 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -236,15 +236,12 @@ def flush_runners( "Unknown flush mode %s encountered, contact developers", flush_mode ) - states = [GitHubRunnerState.IDLE] - if flush_mode == FlushMode.FLUSH_BUSY: - states.append(GitHubRunnerState.BUSY) - - runners_list = self.get_runners(github_states=states) - runner_names = [runner.name for runner in runners_list] - logger.info("Flushing runners: %s", runner_names) + busy = False + if FlushMode.FLUSH_BUSY: + busy = True remove_token = self._github.get_removal_token() - return self._delete_runners(runners=runners_list, remove_token=remove_token) + stats = self._cloud.flush_runners(remove_token, busy) + return self._issue_runner_metrics(metrics=stats) def cleanup(self) -> IssuedMetricEventsStats: """Run cleanup of the runners and other resources. diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 462b6b46b..fb3d07cb0 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -386,11 +386,12 @@ def _cleanup_openstack_keypairs( exclude_instances: The keys of these instance will not be deleted. """ logger.info("Cleaning up openstack keypairs") + exclude_instance_set = set(exclude_instances) keypairs = conn.list_keypairs() for key in keypairs: # The `name` attribute is of resource.Body type. 
if key.name and str(key.name).startswith(self.prefix): - if str(key.name) in set(exclude_instances): + if str(key.name) in exclude_instance_set: continue try: diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index 6b6b3d082..379d2ae4c 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -237,8 +237,6 @@ def _generate_runner_env( pre_job_script=str(PRE_JOB_SCRIPT), dockerhub_mirror=dockerhub_mirror or "", ssh_debug_info=(secrets.choice(ssh_debug_connections) if ssh_debug_connections else None), - # Proxies are handled by aproxy. - proxies={}, ) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index c58d131c0..d95017710 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -269,6 +269,31 @@ def delete_runner( self._delete_runner(instance, remove_token) return next(extracted_metrics, None) + def flush_runners( + self, remove_token: str, busy: bool = False + ) -> Iterator[runner_metrics.RunnerMetrics]: + """Remove idle and/or busy runners. + + Args: + remove_token: + busy: If false, only idle runners are removed. If true, both idle and busy runners are + removed. + + Returns: + Any metrics retrieved from flushed runners. + """ + instance_list = self._openstack_cloud.get_instances() + for instance in instance_list: + try: + self._check_state_and_flush(instance, busy) + except SSHError: + logger.warning( + "Unable to determine state of %s and kill runner process due to SSH issues", + instance.server_name, + ) + continue + return self.cleanup(remove_token) + def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: """Cleanup runner and resource on the cloud. 
@@ -381,8 +406,6 @@ def _generate_cloud_init(self, instance_name: str, registration_token: str) -> s if self._service_config.ssh_debug_connections else None ), - # Proxies are handled by aproxy. - proxies={}, ) pre_job_contents_dict = { @@ -435,6 +458,56 @@ def _get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | Non ) return None + @retry(tries=3, delay=5, backoff=2, local_logger=logger) + def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> None: + """Kill runner process depending on idle or busy. + + Due to update to runner state has some delay with GitHub API. The state of the runner is + determined by which runner processes are running. If the Runner.Worker process is running, + the runner is deemed to be busy. + + Raises: + SSHError: Unable to check the state of the runner and kill the runner process due to + SSH failure. + + Args: + instance: The openstack instance to kill the runner process. + busy: Kill the process if runner is busy, else only kill runner + process if runner is idle. + """ + try: + ssh_conn = self._openstack_cloud.get_ssh_connection(instance) + except KeyfileError: + logger.exception( + "Health check failed due to unable to find keyfile for %s", instance.server_name + ) + return + except SSHError: + logger.exception( + "SSH connection failure with %s during health check", instance.server_name + ) + raise + + if not busy: + # only kill Runner.Listener if Runner.Worker does not exist. + kill_command = ( + "! pgrep -x Runner.Worker && pgrep -x Runner.Listener && " + "kill $(pgrep -x Runner.Listener)" + ) + else: + # kill both Runner.Listener and Runner.Worker processes. + # This kills pre-job.sh, a child process of Runner.Worker. 
+ kill_command = ( + "pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener);" + "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);" + ) + + result: invoke.runners.Result = ssh_conn.run(kill_command, warn=True) + if not result.ok: + logger.warning("Unable to SSH to kill runner process on %s", instance.name) + return + logger.info("Killed runner process on %s", instance.name) + @retry(tries=3, delay=5, backoff=2, local_logger=logger) def _health_check(self, instance: OpenstackInstance) -> bool: """Check whether runner is healthy. diff --git a/templates/env.j2 b/templates/env.j2 index c0de54aad..f7da33219 100644 --- a/templates/env.j2 +++ b/templates/env.j2 @@ -1,18 +1,4 @@ PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin -{% if proxies.http %} -HTTP_PROXY={{proxies.http}} -http_proxy={{proxies.http}} -{% endif %} -{% if proxies.https %} -HTTPS_PROXY={{proxies.https}} -https_proxy={{proxies.https}} -{% endif %} -{% if proxies.ftp_proxy %} -{% endif %} -{% if proxies.no_proxy %} -NO_PROXY={{proxies.no_proxy}} -no_proxy={{proxies.no_proxy}} -{% endif %} {% if dockerhub_mirror %} DOCKERHUB_MIRROR={{dockerhub_mirror}} CONTAINER_REGISTRY_URL={{dockerhub_mirror}} From 16025254400e0aecb02b46114b993c80fd262ad1 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 20 Aug 2024 14:37:46 +0800 Subject: [PATCH 199/278] Fix test according to comments --- .../test_runner_manager_openstack.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 523c2b5ae..cec1158e5 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -150,7 +150,7 @@ async def runner_manager_with_one_runner_fixture(runner_manager: RunnerManager) runner_manager.create_runners(1) runner_list = 
runner_manager.get_runners() try: - await assert_runner_amount(runner_manager, 1) + await wait_runner_amount(runner_manager, 1) except TimeoutError as err: raise AssertionError("Test arrange failed: Expect one runner") from err @@ -183,10 +183,10 @@ def workflow_is_status(workflow: Workflow, status: str) -> bool: return workflow.status == status -async def assert_runner_amount(runner_manager: RunnerManager, num: int): - """Assert the number of runner a runner manager has. +async def wait_runner_amount(runner_manager: RunnerManager, num: int): + """Wait until the runner manager has the number of runners. - A TimeoutError will be thrown if runners are still found after timeout. + A TimeoutError will be thrown if runners amount is not correct after timeout. Args: runner_manager: The RunnerManager to check. @@ -239,7 +239,7 @@ async def test_runner_normal_idle_lifecycle( runner_id = runner_id_list[0] try: - await assert_runner_amount(runner_manager, 1) + await wait_runner_amount(runner_manager, 1) except TimeoutError as err: raise AssertionError("Test arrange failed: Expect one runner") from err @@ -265,7 +265,7 @@ async def test_runner_normal_idle_lifecycle( # 3. 
runner_manager.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) - await assert_runner_amount(runner_manager, 0) + await wait_runner_amount(runner_manager, 0) @pytest.mark.openstack @@ -320,7 +320,7 @@ async def test_runner_flush_busy_lifecycle( issue_metrics_events = runner_manager_with_one_runner.cleanup() assert issue_metrics_events[events.RunnerStart] == 1 - await assert_runner_amount(runner_manager_with_one_runner, 0) + await wait_runner_amount(runner_manager_with_one_runner, 0) @pytest.mark.openstack @@ -374,7 +374,7 @@ async def test_runner_normal_lifecycle( assert metric_logs[1]["event"] == "runner_stop" assert metric_logs[1]["workflow"] == "Workflow Dispatch Wait Tests" - await assert_runner_amount(runner_manager_with_one_runner, 0) + await wait_runner_amount(runner_manager_with_one_runner, 0) @pytest.mark.openstack @@ -387,10 +387,10 @@ async def test_runner_spawn_two( Arrange: RunnerManager instance with no runners. Act: 1. Create two runner. - 3. Delete all idle runner. + 2. Delete all idle runner. Assert: 1. Two active idle runner. - 3. No runners. + 2. No runners. """ # 1. runner_id_list = runner_manager.create_runners(2) @@ -398,7 +398,7 @@ async def test_runner_spawn_two( assert len(runner_id_list) == 2 try: - await assert_runner_amount(runner_manager, 2) + await wait_runner_amount(runner_manager, 2) except TimeoutError as err: raise AssertionError("Test arrange failed: Expect two runner") from err @@ -408,4 +408,4 @@ async def test_runner_spawn_two( # 3. 
runner_manager.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) - await assert_runner_amount(runner_manager, 0) + await wait_runner_amount(runner_manager, 0) From 66003f400a4a830efd586b4da08a7c9719d92731 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 20 Aug 2024 14:52:31 +0800 Subject: [PATCH 200/278] Fix unit test --- tests/unit/test_openstack_manager.py | 46 +--------------------------- 1 file changed, 1 insertion(+), 45 deletions(-) diff --git a/tests/unit/test_openstack_manager.py b/tests/unit/test_openstack_manager.py index 5e43fb518..5329b1282 100644 --- a/tests/unit/test_openstack_manager.py +++ b/tests/unit/test_openstack_manager.py @@ -287,52 +287,23 @@ def test__create_connection( @pytest.mark.parametrize( - "proxy_config, dockerhub_mirror, ssh_debug_connections, expected_env_contents", + "dockerhub_mirror, ssh_debug_connections, expected_env_contents", [ pytest.param( - None, None, None, """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - - - - LANG=C.UTF-8 ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh """, id="all values empty", ), pytest.param( - openstack_manager.ProxyConfig( - http="http://test.internal", - https="https://test.internal", - no_proxy="http://no_proxy.internal", - ), - None, - None, - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - - - - - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh -""", - id="proxy value set", - ), - pytest.param( - None, "http://dockerhub_mirror.test", None, """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - - - - DOCKERHUB_MIRROR=http://dockerhub_mirror.test CONTAINER_REGISTRY_URL=http://dockerhub_mirror.test @@ -342,7 +313,6 @@ def test__create_connection( id="dockerhub mirror set", ), pytest.param( - None, None, [ 
openstack_manager.SSHDebugConnection( @@ -354,10 +324,6 @@ def test__create_connection( ], """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - - - - LANG=C.UTF-8 ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh @@ -369,11 +335,6 @@ def test__create_connection( id="ssh debug connection set", ), pytest.param( - openstack_manager.ProxyConfig( - http="http://test.internal", - https="https://test.internal", - no_proxy="http://no_proxy.internal", - ), "http://dockerhub_mirror.test", [ openstack_manager.SSHDebugConnection( @@ -385,10 +346,6 @@ def test__create_connection( ], """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - - - - DOCKERHUB_MIRROR=http://dockerhub_mirror.test CONTAINER_REGISTRY_URL=http://dockerhub_mirror.test @@ -405,7 +362,6 @@ def test__create_connection( ], ) def test__generate_runner_env( - proxy_config: Optional[openstack_manager.ProxyConfig], dockerhub_mirror: Optional[str], ssh_debug_connections: Optional[list[openstack_manager.SSHDebugConnection]], expected_env_contents: str, From 16ae84e1224eab9184bfa43d8e5e9dce28526682 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 09:37:48 +0800 Subject: [PATCH 201/278] Fix typo of attr --- src/openstack_cloud/openstack_runner_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index d95017710..25e849a9d 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -504,9 +504,9 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non result: invoke.runners.Result = ssh_conn.run(kill_command, warn=True) if not result.ok: - logger.warning("Unable to SSH to kill runner process on %s", instance.name) + 
logger.warning("Unable to SSH to kill runner process on %s: %s", instance.server_name, result.stderr) return - logger.info("Killed runner process on %s", instance.name) + logger.info("Killed runner process on %s", instance.server_name) @retry(tries=3, delay=5, backoff=2, local_logger=logger) def _health_check(self, instance: OpenstackInstance) -> bool: @@ -548,7 +548,7 @@ def _run_health_check(ssh_conn: SSHConnection, name: str) -> bool: """ result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) if not result.ok: - logger.warning("SSH run of `ps aux` failed on %s", name) + logger.warning("SSH run of `ps aux` failed on %s: %s", name, result.stderr) return False if ( RUNNER_WORKER_PROCESS not in result.stdout From 6b96dd4238948b00be7e86e8e09a2fef5f460ac5 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 09:50:58 +0800 Subject: [PATCH 202/278] Add debug --- src/openstack_cloud/openstack_runner_manager.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 25e849a9d..696a61088 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -488,6 +488,9 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non ) raise + # TODO: debug + import pytest + pytest.set_trace() if not busy: # only kill Runner.Listener if Runner.Worker does not exist. 
kill_command = ( @@ -501,6 +504,7 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non "pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener);" "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);" ) + pytest.set_trace() result: invoke.runners.Result = ssh_conn.run(kill_command, warn=True) if not result.ok: From 9657299e09c0bafed362144b4c978bccf54d45b6 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 09:55:51 +0800 Subject: [PATCH 203/278] Add debug statement --- src/openstack_cloud/openstack_runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 696a61088..c6bc7a973 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -489,6 +489,7 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non raise # TODO: debug + result: invoke.runners.Result = ssh_conn.run("! 
pgrep -x Runner.Worker && echo HELLO", warn=True) import pytest pytest.set_trace() if not busy: @@ -504,7 +505,6 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non "pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener);" "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);" ) - pytest.set_trace() result: invoke.runners.Result = ssh_conn.run(kill_command, warn=True) if not result.ok: From 29ef80994a659f063f5cb58b6d16dd179a55a56c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 09:57:17 +0800 Subject: [PATCH 204/278] Debug --- .../test_runner_manager_openstack.py | 130 +++++++++--------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index cec1158e5..434747acd 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -201,71 +201,71 @@ async def wait_runner_amount(runner_manager: RunnerManager, num: int): await wait_for(lambda: len(runner_manager.get_runners()) == num) -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_get_no_runner(runner_manager: RunnerManager) -> None: - """ - Arrange: RunnerManager instance with no runners. - Act: Get runners. - Assert: Empty tuple returned. - """ - runner_list = runner_manager.get_runners() - assert isinstance(runner_list, tuple) - assert not runner_list - - -@pytest.mark.openstack -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_runner_normal_idle_lifecycle( - runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager -) -> None: - """ - Arrange: RunnerManager instance with no runners. - Act: - 1. Create one runner. - 2. Run health check on the runner. - 3. Delete all idle runner. - Assert: - 1. An active idle runner. - 2. Health check passes. - 3. No runners. 
- """ - # 1. - runner_id_list = runner_manager.create_runners(1) - assert isinstance(runner_id_list, tuple) - assert len(runner_id_list) == 1 - runner_id = runner_id_list[0] - - try: - await wait_runner_amount(runner_manager, 1) - except TimeoutError as err: - raise AssertionError("Test arrange failed: Expect one runner") from err - - runner_list = runner_manager.get_runners() - assert isinstance(runner_list, tuple) - assert len(runner_list) == 1 - runner = runner_list[0] - assert runner.instance_id == runner_id - assert runner.cloud_state == CloudRunnerState.ACTIVE - # Update on GitHub-side can take a bit of time. - await wait_for( - lambda: runner_manager.get_runners()[0].github_state == GitHubRunnerState.IDLE, - timeout=120, - check_interval=10, - ) - - # 2. - openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() - assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." - runner = openstack_instances[0] - - assert openstack_runner_manager._health_check(runner) - - # 3. - runner_manager.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) - await wait_runner_amount(runner_manager, 0) +# @pytest.mark.openstack +# @pytest.mark.asyncio +# @pytest.mark.abort_on_fail +# async def test_get_no_runner(runner_manager: RunnerManager) -> None: +# """ +# Arrange: RunnerManager instance with no runners. +# Act: Get runners. +# Assert: Empty tuple returned. +# """ +# runner_list = runner_manager.get_runners() +# assert isinstance(runner_list, tuple) +# assert not runner_list + + +# @pytest.mark.openstack +# @pytest.mark.asyncio +# @pytest.mark.abort_on_fail +# async def test_runner_normal_idle_lifecycle( +# runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager +# ) -> None: +# """ +# Arrange: RunnerManager instance with no runners. +# Act: +# 1. Create one runner. +# 2. Run health check on the runner. +# 3. Delete all idle runner. +# Assert: +# 1. An active idle runner. +# 2. Health check passes. +# 3. 
No runners. +# """ +# # 1. +# runner_id_list = runner_manager.create_runners(1) +# assert isinstance(runner_id_list, tuple) +# assert len(runner_id_list) == 1 +# runner_id = runner_id_list[0] + +# try: +# await wait_runner_amount(runner_manager, 1) +# except TimeoutError as err: +# raise AssertionError("Test arrange failed: Expect one runner") from err + +# runner_list = runner_manager.get_runners() +# assert isinstance(runner_list, tuple) +# assert len(runner_list) == 1 +# runner = runner_list[0] +# assert runner.instance_id == runner_id +# assert runner.cloud_state == CloudRunnerState.ACTIVE +# # Update on GitHub-side can take a bit of time. +# await wait_for( +# lambda: runner_manager.get_runners()[0].github_state == GitHubRunnerState.IDLE, +# timeout=120, +# check_interval=10, +# ) + +# # 2. +# openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() +# assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." +# runner = openstack_instances[0] + +# assert openstack_runner_manager._health_check(runner) + +# # 3. 
+# runner_manager.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) +# await wait_runner_amount(runner_manager, 0) @pytest.mark.openstack From d14effbfaa9cb7c3d7135492dcac27160005559c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 10:15:31 +0800 Subject: [PATCH 205/278] Fix return code of the kill command --- src/openstack_cloud/openstack_runner_manager.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index c6bc7a973..57b3e2278 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -506,11 +506,8 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);" ) - result: invoke.runners.Result = ssh_conn.run(kill_command, warn=True) - if not result.ok: - logger.warning("Unable to SSH to kill runner process on %s: %s", instance.server_name, result.stderr) - return - logger.info("Killed runner process on %s", instance.server_name) + ssh_conn.run(kill_command, warn=True) + logger.info("Attempted to killed runner process on %s", instance.server_name) @retry(tries=3, delay=5, backoff=2, local_logger=logger) def _health_check(self, instance: OpenstackInstance) -> bool: From 09a13c9357e583c227ae53ae28bfcc4ae047e295 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 10:22:03 +0800 Subject: [PATCH 206/278] Remove debug --- src/openstack_cloud/openstack_runner_manager.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 57b3e2278..ebcd7dac9 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -488,10 +488,6 @@ def 
_check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non ) raise - # TODO: debug - result: invoke.runners.Result = ssh_conn.run("! pgrep -x Runner.Worker && echo HELLO", warn=True) - import pytest - pytest.set_trace() if not busy: # only kill Runner.Listener if Runner.Worker does not exist. kill_command = ( From f08b4ad66c96665aef77e2b400b3ea62ebd22717 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 10:32:20 +0800 Subject: [PATCH 207/278] Add comments on the flush kill command --- src/openstack_cloud/openstack_runner_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index ebcd7dac9..527596c74 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -488,6 +488,8 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non ) raise + # Using a single command to determine the state and kill the process if needed. + # This makes it more robust when network is unstable. if not busy: # only kill Runner.Listener if Runner.Worker does not exist. kill_command = ( @@ -501,7 +503,7 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non "pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener);" "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);" ) - + # Checking the result of kill command is not useful, as the exit code does not reveal much. 
ssh_conn.run(kill_command, warn=True) logger.info("Attempted to killed runner process on %s", instance.server_name) From 68bd0cc0481737e33be65891fafa62cb2e9917fe Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 10:51:12 +0800 Subject: [PATCH 208/278] Add debug --- .../openstack_runner_manager.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 527596c74..7ba96a4bb 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -487,21 +487,27 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non "SSH connection failure with %s during health check", instance.server_name ) raise + + # TODO: Debug + ssh_conn.run("! pgrep -x Runner.Worker && echo HELLO", warn=True) + import pytest + pytest.set_trace() + # Using a single command to determine the state and kill the process if needed. # This makes it more robust when network is unstable. - if not busy: - # only kill Runner.Listener if Runner.Worker does not exist. + if busy: + # kill both Runner.Listener and Runner.Worker processes. + # This kills pre-job.sh, a child process of Runner.Worker. kill_command = ( - "! pgrep -x Runner.Worker && pgrep -x Runner.Listener && " - "kill $(pgrep -x Runner.Listener)" + f"pgrep -x {RUNNER_LISTENER_PROCESS} && kill $(pgrep -x {RUNNER_LISTENER_PROCESS});" + f"pgrep -x {RUNNER_WORKER_PROCESS} && kill $(pgrep -x {RUNNER_WORKER_PROCESS});" ) else: - # kill both Runner.Listener and Runner.Worker processes. - # This kills pre-job.sh, a child process of Runner.Worker. + # Only kill Runner.Listener if Runner.Worker does not exist. kill_command = ( - "pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener);" - "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);" + f"! 
pgrep -x {RUNNER_WORKER_PROCESS} && pgrep -x {RUNNER_LISTENER_PROCESS} && " + f"kill $(pgrep -x {RUNNER_LISTENER_PROCESS})" ) # Checking the result of kill command is not useful, as the exit code does not reveal much. ssh_conn.run(kill_command, warn=True) From d92da28030e57e8c35476a1cb1489ed4c651e730 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 10:57:10 +0800 Subject: [PATCH 209/278] Fix debug --- src/openstack_cloud/openstack_runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 7ba96a4bb..7592515f8 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -489,7 +489,7 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non raise # TODO: Debug - ssh_conn.run("! pgrep -x Runner.Worker && echo HELLO", warn=True) + result = ssh_conn.run("! pgrep -x Runner.Worker && echo HELLO", warn=True) import pytest pytest.set_trace() From 96dfab7756d19cb84d245389a9ab8d0381fbb7cd Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 11:10:12 +0800 Subject: [PATCH 210/278] Debug --- .../openstack_runner_manager.py | 10 +- .../test_runner_manager_openstack.py | 130 +++++++++--------- 2 files changed, 73 insertions(+), 67 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 7592515f8..c812521a9 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -492,11 +492,17 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non result = ssh_conn.run("! pgrep -x Runner.Worker && echo HELLO", warn=True) import pytest pytest.set_trace() - + result = ssh_conn.run("! 
pgrep -x Runner.Worker && pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener)", warn=True) + import pytest + pytest.set_trace() + result = ssh_conn.run("! pgrep -x Runner.Worker && echo HELLO", warn=True) + import pytest + pytest.set_trace() # Using a single command to determine the state and kill the process if needed. # This makes it more robust when network is unstable. if busy: + logger.info("Attempting to kill all runner process on %s", instance.server_name) # kill both Runner.Listener and Runner.Worker processes. # This kills pre-job.sh, a child process of Runner.Worker. kill_command = ( @@ -504,6 +510,7 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non f"pgrep -x {RUNNER_WORKER_PROCESS} && kill $(pgrep -x {RUNNER_WORKER_PROCESS});" ) else: + logger.info("Attempting to kill runner process on %s if not busy", instance.server_name) # Only kill Runner.Listener if Runner.Worker does not exist. kill_command = ( f"! pgrep -x {RUNNER_WORKER_PROCESS} && pgrep -x {RUNNER_LISTENER_PROCESS} && " @@ -511,7 +518,6 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non ) # Checking the result of kill command is not useful, as the exit code does not reveal much. 
ssh_conn.run(kill_command, warn=True) - logger.info("Attempted to killed runner process on %s", instance.server_name) @retry(tries=3, delay=5, backoff=2, local_logger=logger) def _health_check(self, instance: OpenstackInstance) -> bool: diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 434747acd..cec1158e5 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -201,71 +201,71 @@ async def wait_runner_amount(runner_manager: RunnerManager, num: int): await wait_for(lambda: len(runner_manager.get_runners()) == num) -# @pytest.mark.openstack -# @pytest.mark.asyncio -# @pytest.mark.abort_on_fail -# async def test_get_no_runner(runner_manager: RunnerManager) -> None: -# """ -# Arrange: RunnerManager instance with no runners. -# Act: Get runners. -# Assert: Empty tuple returned. -# """ -# runner_list = runner_manager.get_runners() -# assert isinstance(runner_list, tuple) -# assert not runner_list - - -# @pytest.mark.openstack -# @pytest.mark.asyncio -# @pytest.mark.abort_on_fail -# async def test_runner_normal_idle_lifecycle( -# runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager -# ) -> None: -# """ -# Arrange: RunnerManager instance with no runners. -# Act: -# 1. Create one runner. -# 2. Run health check on the runner. -# 3. Delete all idle runner. -# Assert: -# 1. An active idle runner. -# 2. Health check passes. -# 3. No runners. -# """ -# # 1. 
-# runner_id_list = runner_manager.create_runners(1) -# assert isinstance(runner_id_list, tuple) -# assert len(runner_id_list) == 1 -# runner_id = runner_id_list[0] - -# try: -# await wait_runner_amount(runner_manager, 1) -# except TimeoutError as err: -# raise AssertionError("Test arrange failed: Expect one runner") from err - -# runner_list = runner_manager.get_runners() -# assert isinstance(runner_list, tuple) -# assert len(runner_list) == 1 -# runner = runner_list[0] -# assert runner.instance_id == runner_id -# assert runner.cloud_state == CloudRunnerState.ACTIVE -# # Update on GitHub-side can take a bit of time. -# await wait_for( -# lambda: runner_manager.get_runners()[0].github_state == GitHubRunnerState.IDLE, -# timeout=120, -# check_interval=10, -# ) - -# # 2. -# openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() -# assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." -# runner = openstack_instances[0] - -# assert openstack_runner_manager._health_check(runner) - -# # 3. -# runner_manager.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) -# await wait_runner_amount(runner_manager, 0) +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_get_no_runner(runner_manager: RunnerManager) -> None: + """ + Arrange: RunnerManager instance with no runners. + Act: Get runners. + Assert: Empty tuple returned. + """ + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert not runner_list + + +@pytest.mark.openstack +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_runner_normal_idle_lifecycle( + runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager +) -> None: + """ + Arrange: RunnerManager instance with no runners. + Act: + 1. Create one runner. + 2. Run health check on the runner. + 3. Delete all idle runner. + Assert: + 1. An active idle runner. + 2. Health check passes. + 3. No runners. + """ + # 1. 
+ runner_id_list = runner_manager.create_runners(1) + assert isinstance(runner_id_list, tuple) + assert len(runner_id_list) == 1 + runner_id = runner_id_list[0] + + try: + await wait_runner_amount(runner_manager, 1) + except TimeoutError as err: + raise AssertionError("Test arrange failed: Expect one runner") from err + + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert len(runner_list) == 1 + runner = runner_list[0] + assert runner.instance_id == runner_id + assert runner.cloud_state == CloudRunnerState.ACTIVE + # Update on GitHub-side can take a bit of time. + await wait_for( + lambda: runner_manager.get_runners()[0].github_state == GitHubRunnerState.IDLE, + timeout=120, + check_interval=10, + ) + + # 2. + openstack_instances = openstack_runner_manager._openstack_cloud.get_instances() + assert len(openstack_instances) == 1, "Test arrange failed: Needs one runner." + runner = openstack_instances[0] + + assert openstack_runner_manager._health_check(runner) + + # 3. + runner_manager.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) + await wait_runner_amount(runner_manager, 0) @pytest.mark.openstack From 5a92be929f23324aea22f0139e775972b6e5914b Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 11:19:25 +0800 Subject: [PATCH 211/278] Debug --- src/openstack_cloud/openstack_runner_manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index c812521a9..236ed71ae 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -489,10 +489,13 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non raise # TODO: Debug + result = ssh_conn.run("! 
pgrep -x Runner.Worker && pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener)", warn=True) + import pytest + pytest.set_trace() result = ssh_conn.run("! pgrep -x Runner.Worker && echo HELLO", warn=True) import pytest pytest.set_trace() - result = ssh_conn.run("! pgrep -x Runner.Worker && pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener)", warn=True) + result = ssh_conn.run("ps aux", warn=True) import pytest pytest.set_trace() result = ssh_conn.run("! pgrep -x Runner.Worker && echo HELLO", warn=True) From d2d3b21a0e3c7256c4a3f0a2215ca27d16b10103 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 11:32:03 +0800 Subject: [PATCH 212/278] Remove debug --- src/openstack_cloud/openstack_runner_manager.py | 14 -------------- tests/integration/test_runner_manager_openstack.py | 2 +- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 236ed71ae..8363a64bc 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -487,20 +487,6 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non "SSH connection failure with %s during health check", instance.server_name ) raise - - # TODO: Debug - result = ssh_conn.run("! pgrep -x Runner.Worker && pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener)", warn=True) - import pytest - pytest.set_trace() - result = ssh_conn.run("! pgrep -x Runner.Worker && echo HELLO", warn=True) - import pytest - pytest.set_trace() - result = ssh_conn.run("ps aux", warn=True) - import pytest - pytest.set_trace() - result = ssh_conn.run("! pgrep -x Runner.Worker && echo HELLO", warn=True) - import pytest - pytest.set_trace() # Using a single command to determine the state and kill the process if needed. # This makes it more robust when network is unstable. 
diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index cec1158e5..38873e3d6 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -295,7 +295,7 @@ async def test_runner_flush_busy_lifecycle( github_repository=github_repository, conclusion="success", workflow_id_or_name=DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, - dispatch_input={"runner": runner_label, "minutes": "10"}, + dispatch_input={"runner": runner_label, "minutes": "30"}, wait=False, ) await wait_for(lambda: workflow_is_status(workflow, "in_progress")) From c48ef0b508ddcef37c2e6591858a746795365f9f Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 11:40:10 +0800 Subject: [PATCH 213/278] Add cleanup during idle and busy runner test --- .../test_runner_manager_openstack.py | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 38873e3d6..f3e4955f7 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -226,11 +226,13 @@ async def test_runner_normal_idle_lifecycle( Act: 1. Create one runner. 2. Run health check on the runner. - 3. Delete all idle runner. + 3. Run cleanup. + 4. Delete all idle runner. Assert: 1. An active idle runner. 2. Health check passes. - 3. No runners. + 3. One idle runner remains. + 4. No runners. """ # 1. runner_id_list = runner_manager.create_runners(1) @@ -262,8 +264,17 @@ async def test_runner_normal_idle_lifecycle( runner = openstack_instances[0] assert openstack_runner_manager._health_check(runner) - + # 3. 
+ runner_manager.cleanup() + runner_list = runner_manager.get_runners() + assert isinstance(runner_list, tuple) + assert len(runner_list) == 1 + runner = runner_list[0] + assert runner.instance_id == runner_id + assert runner.cloud_state == CloudRunnerState.ACTIVE + + # 4. runner_manager.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) await wait_runner_amount(runner_manager, 0) @@ -281,12 +292,12 @@ async def test_runner_flush_busy_lifecycle( Arrange: RunnerManager with one idle runner. Act: 1. Run a long workflow. - 2. Run flush idle runner. - 3. Run flush busy runner. + 3. Run flush idle runner. + 4. Run flush busy runner. Assert: 1. Runner takes the job and become busy. - 2. Busy runner still exists. - 3. No runners exists. + 3. Busy runner still exists. + 4. No runners exists. """ # 1. workflow = await dispatch_workflow( @@ -307,6 +318,15 @@ async def test_runner_flush_busy_lifecycle( assert busy_runner.github_state == GitHubRunnerState.BUSY # 2. + runner_manager_with_one_runner.cleanup() + runner_list = runner_manager_with_one_runner.get_runners() + assert isinstance(runner_list, tuple) + assert len(runner_list) == 1 + runner = runner_list[0] + assert runner.cloud_state == CloudRunnerState.ACTIVE + assert busy_runner.github_state == GitHubRunnerState.BUSY + + # 3. runner_manager_with_one_runner.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) runner_list = runner_manager_with_one_runner.get_runners() assert len(runner_list) == 1 @@ -314,7 +334,7 @@ async def test_runner_flush_busy_lifecycle( assert busy_runner.cloud_state == CloudRunnerState.ACTIVE assert busy_runner.github_state == GitHubRunnerState.BUSY - # 3. + # 4. 
runner_manager_with_one_runner.flush_runners(flush_mode=FlushMode.FLUSH_BUSY) issue_metrics_events = runner_manager_with_one_runner.cleanup() From 3f348634843738f79920a8c5918696deb7a2d018 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 11:52:10 +0800 Subject: [PATCH 214/278] Debug --- src/openstack_cloud/openstack_cloud.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index fb3d07cb0..037e33cb9 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -391,6 +391,9 @@ def _cleanup_openstack_keypairs( for key in keypairs: # The `name` attribute is of resource.Body type. if key.name and str(key.name).startswith(self.prefix): + # TODO: DEBUG + import pytest + pytest.set_trace() if str(key.name) in exclude_instance_set: continue From e79bafed95adfcae88346778a305b2e1adf39c23 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 11:54:36 +0800 Subject: [PATCH 215/278] Disable tests during debug --- .github/workflows/e2e_test.yaml | 5 ++++- .github/workflows/integration_test.yaml | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 5933451ee..df2667ae2 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,7 +1,10 @@ name: End-to-End tests on: - pull_request: + # TODO: DEBUG + workflow_dispatch: + # pull_request: + jobs: # test option values defined at test/conftest.py are passed on via repository secret diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 8e0bc700a..b67ec9e61 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -1,7 +1,9 @@ name: integration-tests on: - pull_request: + # TODO: DEBUG + workflow_dispatch: 
+ # pull_request: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} From ccb62251dd9c2950e4155e822add780f0e3113d5 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 12:12:05 +0800 Subject: [PATCH 216/278] Debug missing keyfiles --- src/openstack_cloud/openstack_cloud.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index 037e33cb9..e4c1c9b78 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -370,6 +370,9 @@ def _cleanup_key_files(self, exclude_instances: Iterable[str]) -> None: # Find key file from this application. if path.is_file() and path.name.startswith(self.prefix) and path.name.endswith(".key"): total += 1 + # TODO: DEBUG + import pytest + pytest.set_trace() if path.name in exclude_filename: continue path.unlink() From 4a26d75b8b8b49ebbc5a4fc0ca090762cb9cfac2 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 12:24:11 +0800 Subject: [PATCH 217/278] Fix keyfile path matching issue --- src/openstack_cloud/openstack_cloud.py | 9 +-------- src/openstack_cloud/openstack_runner_manager.py | 4 +++- tests/integration/test_runner_manager_openstack.py | 2 +- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py index e4c1c9b78..ad21f4d97 100644 --- a/src/openstack_cloud/openstack_cloud.py +++ b/src/openstack_cloud/openstack_cloud.py @@ -370,10 +370,7 @@ def _cleanup_key_files(self, exclude_instances: Iterable[str]) -> None: # Find key file from this application. 
if path.is_file() and path.name.startswith(self.prefix) and path.name.endswith(".key"): total += 1 - # TODO: DEBUG - import pytest - pytest.set_trace() - if path.name in exclude_filename: + if path in exclude_filename: continue path.unlink() deleted += 1 @@ -394,12 +391,8 @@ def _cleanup_openstack_keypairs( for key in keypairs: # The `name` attribute is of resource.Body type. if key.name and str(key.name).startswith(self.prefix): - # TODO: DEBUG - import pytest - pytest.set_trace() if str(key.name) in exclude_instance_set: continue - try: self._delete_keypair(conn, key.name) except openstack.exceptions.SDKException: diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 8363a64bc..fca8fec01 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -499,7 +499,9 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non f"pgrep -x {RUNNER_WORKER_PROCESS} && kill $(pgrep -x {RUNNER_WORKER_PROCESS});" ) else: - logger.info("Attempting to kill runner process on %s if not busy", instance.server_name) + logger.info( + "Attempting to kill runner process on %s if not busy", instance.server_name + ) # Only kill Runner.Listener if Runner.Worker does not exist. kill_command = ( f"! pgrep -x {RUNNER_WORKER_PROCESS} && pgrep -x {RUNNER_LISTENER_PROCESS} && " diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index f3e4955f7..39d351845 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -264,7 +264,7 @@ async def test_runner_normal_idle_lifecycle( runner = openstack_instances[0] assert openstack_runner_manager._health_check(runner) - + # 3. 
runner_manager.cleanup() runner_list = runner_manager.get_runners() From 1292940fc1d5273416d23aac5b789cee339a0d34 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 12:36:25 +0800 Subject: [PATCH 218/278] testing --- src-docs/openstack_cloud.openstack_runner_manager.md | 2 +- src/openstack_cloud/openstack_runner_manager.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 752e2f9d3..01d8eb3f6 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -131,7 +131,7 @@ The prefix of runner names. --- - + ### method `cleanup` diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index fca8fec01..1ed907b41 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -285,7 +285,8 @@ def flush_runners( instance_list = self._openstack_cloud.get_instances() for instance in instance_list: try: - self._check_state_and_flush(instance, busy) + pass + # self._check_state_and_flush(instance, busy) except SSHError: logger.warning( "Unable to determine state of %s and kill runner process due to SSH issues", From b1af621cbc55dcf26e829d30b0ec94604ccaa4ef Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 12:56:54 +0800 Subject: [PATCH 219/278] debug --- tests/integration/test_runner_manager_openstack.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 39d351845..57e420e1b 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -335,9 +335,7 @@ async def 
test_runner_flush_busy_lifecycle( assert busy_runner.github_state == GitHubRunnerState.BUSY # 4. - runner_manager_with_one_runner.flush_runners(flush_mode=FlushMode.FLUSH_BUSY) - - issue_metrics_events = runner_manager_with_one_runner.cleanup() + issue_metrics_events = runner_manager_with_one_runner.flush_runners(flush_mode=FlushMode.FLUSH_BUSY) assert issue_metrics_events[events.RunnerStart] == 1 await wait_runner_amount(runner_manager_with_one_runner, 0) From 88b8fc38e9fdc07433b32e934ccf0f0530b57869 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 13:04:02 +0800 Subject: [PATCH 220/278] Add debug --- src-docs/openstack_cloud.openstack_runner_manager.md | 2 +- src/openstack_cloud/openstack_runner_manager.py | 12 ++++++++++-- tests/integration/test_runner_manager_openstack.py | 4 +--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 01d8eb3f6..752e2f9d3 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -131,7 +131,7 @@ The prefix of runner names. 
--- - + ### method `cleanup` diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 1ed907b41..bfa7d5827 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -285,8 +285,7 @@ def flush_runners( instance_list = self._openstack_cloud.get_instances() for instance in instance_list: try: - pass - # self._check_state_and_flush(instance, busy) + self._check_state_and_flush(instance, busy) except SSHError: logger.warning( "Unable to determine state of %s and kill runner process due to SSH issues", @@ -510,6 +509,15 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non ) # Checking the result of kill command is not useful, as the exit code does not reveal much. ssh_conn.run(kill_command, warn=True) + + # TODO: debug + result = ssh_conn.run("ps aux", warn=True) + import pytest + pytest.set_trace() + + result = ssh_conn.run("ps aux", warn=True) + import pytest + pytest.set_trace() @retry(tries=3, delay=5, backoff=2, local_logger=logger) def _health_check(self, instance: OpenstackInstance) -> bool: diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 57e420e1b..b20426ca0 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -335,9 +335,7 @@ async def test_runner_flush_busy_lifecycle( assert busy_runner.github_state == GitHubRunnerState.BUSY # 4. 
- issue_metrics_events = runner_manager_with_one_runner.flush_runners(flush_mode=FlushMode.FLUSH_BUSY) - assert issue_metrics_events[events.RunnerStart] == 1 - + runner_manager_with_one_runner.flush_runners(flush_mode=FlushMode.FLUSH_BUSY) await wait_runner_amount(runner_manager_with_one_runner, 0) From f5ded42ce9c431cc68c767b70d8000b5d6789892 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 13:31:53 +0800 Subject: [PATCH 221/278] Use OR --- src/openstack_cloud/openstack_runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index bfa7d5827..28b51d74f 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -504,7 +504,7 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non ) # Only kill Runner.Listener if Runner.Worker does not exist. kill_command = ( - f"! pgrep -x {RUNNER_WORKER_PROCESS} && pgrep -x {RUNNER_LISTENER_PROCESS} && " + f"pgrep -x {RUNNER_WORKER_PROCESS} || pgrep -x {RUNNER_LISTENER_PROCESS} && " f"kill $(pgrep -x {RUNNER_LISTENER_PROCESS})" ) # Checking the result of kill command is not useful, as the exit code does not reveal much. 
From abe0b07b059a2b29b02e3d77284ca2c09be0277c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 13:37:19 +0800 Subject: [PATCH 222/278] debug --- src/openstack_cloud/openstack_runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 28b51d74f..758f432ca 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -508,7 +508,7 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non f"kill $(pgrep -x {RUNNER_LISTENER_PROCESS})" ) # Checking the result of kill command is not useful, as the exit code does not reveal much. - ssh_conn.run(kill_command, warn=True) + # ssh_conn.run(kill_command, warn=True) # TODO: debug result = ssh_conn.run("ps aux", warn=True) From 5db2f6d2bbb1ab738ea43196e8b2a380459b8529 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:00:43 +0800 Subject: [PATCH 223/278] Debug --- src/manager/runner_manager.py | 3 ++- src/openstack_cloud/openstack_runner_manager.py | 13 ++----------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index a9cff2f35..94a99275b 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -249,7 +249,8 @@ def cleanup(self) -> IssuedMetricEventsStats: Returns: Stats on metrics events issued during the cleanup of runners. 
""" - self._github.delete_runners([GitHubRunnerState.OFFLINE]) + # TODO: DEBUG + # self._github.delete_runners([GitHubRunnerState.OFFLINE]) remove_token = self._github.get_removal_token() deleted_runner_metrics = self._cloud.cleanup(remove_token) return self._issue_runner_metrics(metrics=deleted_runner_metrics) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 758f432ca..6323b65fa 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -484,7 +484,7 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non return except SSHError: logger.exception( - "SSH connection failure with %s during health check", instance.server_name + "SSH connection failure with %s during flushing", instance.server_name ) raise @@ -508,16 +508,7 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non f"kill $(pgrep -x {RUNNER_LISTENER_PROCESS})" ) # Checking the result of kill command is not useful, as the exit code does not reveal much. 
- # ssh_conn.run(kill_command, warn=True) - - # TODO: debug - result = ssh_conn.run("ps aux", warn=True) - import pytest - pytest.set_trace() - - result = ssh_conn.run("ps aux", warn=True) - import pytest - pytest.set_trace() + ssh_conn.run(kill_command, warn=True) @retry(tries=3, delay=5, backoff=2, local_logger=logger) def _health_check(self, instance: OpenstackInstance) -> bool: From 1cfe5f060fd346a933fc39ef201a8dd1d1db1611 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 15:11:34 +0800 Subject: [PATCH 224/278] Debug --- src/manager/runner_manager.py | 3 +-- src/openstack_cloud/openstack_runner_manager.py | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 94a99275b..a9cff2f35 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -249,8 +249,7 @@ def cleanup(self) -> IssuedMetricEventsStats: Returns: Stats on metrics events issued during the cleanup of runners. """ - # TODO: DEBUG - # self._github.delete_runners([GitHubRunnerState.OFFLINE]) + self._github.delete_runners([GitHubRunnerState.OFFLINE]) remove_token = self._github.get_removal_token() deleted_runner_metrics = self._cloud.cleanup(remove_token) return self._issue_runner_metrics(metrics=deleted_runner_metrics) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 6323b65fa..685644b30 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -548,6 +548,9 @@ def _run_health_check(ssh_conn: SSHConnection, name: str) -> bool: Returns: Whether the health succeed. 
""" + # TODO: Debug + import pytest + pytest.set_trace() result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) if not result.ok: logger.warning("SSH run of `ps aux` failed on %s: %s", name, result.stderr) From 6709348071315a1df1c11d6cd923b263f4dea206 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 15:17:40 +0800 Subject: [PATCH 225/278] Debug --- src/openstack_cloud/openstack_runner_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 685644b30..4715474b6 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -535,6 +535,9 @@ def _health_check(self, instance: OpenstackInstance) -> bool: "SSH connection failure with %s during health check", instance.server_name ) raise + # TODO: Debug + import pytest + pytest.set_trace() return OpenstackRunnerManager._run_health_check(ssh_conn, instance.server_name) @staticmethod @@ -548,9 +551,6 @@ def _run_health_check(ssh_conn: SSHConnection, name: str) -> bool: Returns: Whether the health succeed. 
""" - # TODO: Debug - import pytest - pytest.set_trace() result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) if not result.ok: logger.warning("SSH run of `ps aux` failed on %s: %s", name, result.stderr) From e168737fc2fbd71f382e3bf992e98506ba068995 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 15:22:23 +0800 Subject: [PATCH 226/278] Debug --- src/openstack_cloud/openstack_runner_manager.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 4715474b6..ae986ce6c 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -488,6 +488,10 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non ) raise + # TODO: Debug + import pytest + pytest.set_trace() + # Using a single command to determine the state and kill the process if needed. # This makes it more robust when network is unstable. 
if busy: @@ -535,9 +539,6 @@ def _health_check(self, instance: OpenstackInstance) -> bool: "SSH connection failure with %s during health check", instance.server_name ) raise - # TODO: Debug - import pytest - pytest.set_trace() return OpenstackRunnerManager._run_health_check(ssh_conn, instance.server_name) @staticmethod From 550b81a3d0b235ae6fa981ab82de476b8df5f4c3 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 15:26:56 +0800 Subject: [PATCH 227/278] Debug --- .../test_runner_manager_openstack.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index b20426ca0..21dd25305 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -311,20 +311,21 @@ async def test_runner_flush_busy_lifecycle( ) await wait_for(lambda: workflow_is_status(workflow, "in_progress")) - runner_list = runner_manager_with_one_runner.get_runners() - assert len(runner_list) == 1 - busy_runner = runner_list[0] - assert busy_runner.cloud_state == CloudRunnerState.ACTIVE - assert busy_runner.github_state == GitHubRunnerState.BUSY - - # 2. - runner_manager_with_one_runner.cleanup() - runner_list = runner_manager_with_one_runner.get_runners() - assert isinstance(runner_list, tuple) - assert len(runner_list) == 1 - runner = runner_list[0] - assert runner.cloud_state == CloudRunnerState.ACTIVE - assert busy_runner.github_state == GitHubRunnerState.BUSY + # TODO: debug + # runner_list = runner_manager_with_one_runner.get_runners() + # assert len(runner_list) == 1 + # busy_runner = runner_list[0] + # assert busy_runner.cloud_state == CloudRunnerState.ACTIVE + # assert busy_runner.github_state == GitHubRunnerState.BUSY + + # # 2. 
+ # runner_manager_with_one_runner.cleanup() + # runner_list = runner_manager_with_one_runner.get_runners() + # assert isinstance(runner_list, tuple) + # assert len(runner_list) == 1 + # runner = runner_list[0] + # assert runner.cloud_state == CloudRunnerState.ACTIVE + # assert busy_runner.github_state == GitHubRunnerState.BUSY # 3. runner_manager_with_one_runner.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) From ca52e5729ede1023e71b92a1f58f00b70d8ce987 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:26:47 +0800 Subject: [PATCH 228/278] Fix flush mode --- src/manager/runner_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index a9cff2f35..048b9c628 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -237,7 +237,7 @@ def flush_runners( ) busy = False - if FlushMode.FLUSH_BUSY: + if flush_mode == FlushMode.FLUSH_BUSY: busy = True remove_token = self._github.get_removal_token() stats = self._cloud.flush_runners(remove_token, busy) From 0c730d6e532985bef70212d43336cccdac8c1fe7 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:55:30 +0800 Subject: [PATCH 229/278] Remove debug --- .../openstack_runner_manager.py | 4 --- .../test_runner_manager_openstack.py | 29 +++++++++---------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index ae986ce6c..6323b65fa 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -488,10 +488,6 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non ) raise - # TODO: Debug - import pytest - pytest.set_trace() - # Using a single command to determine the state and kill the process if needed. 
# This makes it more robust when network is unstable. if busy: diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 21dd25305..b20426ca0 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -311,21 +311,20 @@ async def test_runner_flush_busy_lifecycle( ) await wait_for(lambda: workflow_is_status(workflow, "in_progress")) - # TODO: debug - # runner_list = runner_manager_with_one_runner.get_runners() - # assert len(runner_list) == 1 - # busy_runner = runner_list[0] - # assert busy_runner.cloud_state == CloudRunnerState.ACTIVE - # assert busy_runner.github_state == GitHubRunnerState.BUSY - - # # 2. - # runner_manager_with_one_runner.cleanup() - # runner_list = runner_manager_with_one_runner.get_runners() - # assert isinstance(runner_list, tuple) - # assert len(runner_list) == 1 - # runner = runner_list[0] - # assert runner.cloud_state == CloudRunnerState.ACTIVE - # assert busy_runner.github_state == GitHubRunnerState.BUSY + runner_list = runner_manager_with_one_runner.get_runners() + assert len(runner_list) == 1 + busy_runner = runner_list[0] + assert busy_runner.cloud_state == CloudRunnerState.ACTIVE + assert busy_runner.github_state == GitHubRunnerState.BUSY + + # 2. + runner_manager_with_one_runner.cleanup() + runner_list = runner_manager_with_one_runner.get_runners() + assert isinstance(runner_list, tuple) + assert len(runner_list) == 1 + runner = runner_list[0] + assert runner.cloud_state == CloudRunnerState.ACTIVE + assert busy_runner.github_state == GitHubRunnerState.BUSY # 3. 
runner_manager_with_one_runner.flush_runners(flush_mode=FlushMode.FLUSH_IDLE) From 6edf7c0aba48d3fba283edf3f84ee44b6fcd9e79 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 21 Aug 2024 17:25:07 +0800 Subject: [PATCH 230/278] Re-enable all tests --- .github/workflows/e2e_test.yaml | 4 +--- .github/workflows/integration_test.yaml | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index df2667ae2..7d0383c12 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,9 +1,7 @@ name: End-to-End tests on: - # TODO: DEBUG - workflow_dispatch: - # pull_request: + pull_request: jobs: diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index b67ec9e61..8e0bc700a 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -1,9 +1,7 @@ name: integration-tests on: - # TODO: DEBUG - workflow_dispatch: - # pull_request: + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} From 4e3134b890ceef226534ca455d2df386243fa22b Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:47:11 +0800 Subject: [PATCH 231/278] Initial unit test for runner scaler --- pyproject.toml | 2 + src-docs/github_client.md | 12 +- src-docs/openstack_cloud.openstack_manager.md | 4 +- src-docs/runner_manager_type.md | 4 +- src-docs/runner_type.md | 2 +- src/charm.py | 6 +- src/charm_state.py | 16 +- src/github_client.py | 34 +-- src/manager/cloud_runner_manager.py | 9 +- src/manager/github_runner_manager.py | 19 +- src/manager/runner_manager.py | 8 +- src/metrics/github.py | 4 +- src/openstack_cloud/openstack_manager.py | 10 +- .../openstack_runner_manager.py | 4 +- src/runner.py | 4 +- src/runner_manager_type.py | 6 +- src/runner_type.py | 4 +- 
.../test_runner_manager_openstack.py | 8 +- tests/integration/test_self_hosted_runner.py | 4 +- tests/unit/mock_runner_managers.py | 194 ++++++++++++++++++ tests/unit/test_charm.py | 12 +- tests/unit/test_charm_state.py | 16 +- tests/unit/test_github_client.py | 12 +- tests/unit/test_lxd_runner_manager.py | 8 +- tests/unit/test_runner.py | 6 +- tests/unit/test_runner_manager.py | 2 + tests/unit/test_runner_scaler.py | 40 ++++ 27 files changed, 345 insertions(+), 105 deletions(-) create mode 100644 tests/unit/mock_runner_managers.py create mode 100644 tests/unit/test_runner_manager.py create mode 100644 tests/unit/test_runner_scaler.py diff --git a/pyproject.toml b/pyproject.toml index f4a49bd2a..a60427837 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,8 @@ omit = [ # These are covered by `tests/integration/test_runner_manager_openstack.py`. "src/openstack_cloud/openstack_cloud.py", "src/openstack_cloud/openstack_runner_manager.py", + # Thin wrapper around GitHub API. Not a lot of value in unit tests. + "src/manager/github_runner_manager.py", # Contains interface for calling LXD. Tested in integration tests and end to end tests. "src/lxd.py", # Contains interface for calling repo policy compliance service. Tested in integration test diff --git a/src-docs/github_client.md b/src-docs/github_client.md index 6cd298c52..fc0de8f7b 100644 --- a/src-docs/github_client.md +++ b/src-docs/github_client.md @@ -67,7 +67,7 @@ Instantiate the GiHub API client. ### method `delete_runner` ```python -delete_runner(path: GithubOrg | GithubRepo, runner_id: int) → None +delete_runner(path: GitHubOrg | GitHubRepo, runner_id: int) → None ``` Delete the self-hosted runner from GitHub. @@ -87,7 +87,7 @@ Delete the self-hosted runner from GitHub. ```python get_job_info( - path: GithubRepo, + path: GitHubRepo, workflow_run_id: str, runner_name: str ) → JobStats @@ -123,7 +123,7 @@ Get information about a job for a specific workflow run. 
```python get_runner_application( - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, arch: Arch, os: str = 'linux' ) → RunnerApplication @@ -157,7 +157,7 @@ Get runner application available for download for given arch. ### method `get_runner_github_info` ```python -get_runner_github_info(path: GithubOrg | GithubRepo) → list[SelfHostedRunner] +get_runner_github_info(path: GitHubOrg | GitHubRepo) → list[SelfHostedRunner] ``` Get runner information on GitHub under a repo or org. @@ -180,7 +180,7 @@ Get runner information on GitHub under a repo or org. ### method `get_runner_registration_token` ```python -get_runner_registration_token(path: GithubOrg | GithubRepo) → str +get_runner_registration_token(path: GitHubOrg | GitHubRepo) → str ``` Get token from GitHub used for registering runners. @@ -203,7 +203,7 @@ Get token from GitHub used for registering runners. ### method `get_runner_remove_token` ```python -get_runner_remove_token(path: GithubOrg | GithubRepo) → str +get_runner_remove_token(path: GitHubOrg | GitHubRepo) → str ``` Get token from GitHub used for removing runners. diff --git a/src-docs/openstack_cloud.openstack_manager.md b/src-docs/openstack_cloud.openstack_manager.md index a0f0a2531..115eec05b 100644 --- a/src-docs/openstack_cloud.openstack_manager.md +++ b/src-docs/openstack_cloud.openstack_manager.md @@ -27,7 +27,7 @@ create_instance_config( app_name: str, unit_num: int, image_id: str, - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, labels: Iterable[str], registration_token: str ) → InstanceConfig @@ -75,7 +75,7 @@ The configuration values for creating a single runner instance. 
```python __init__( - github_path: GithubOrg | GithubRepo, + github_path: GitHubOrg | GitHubRepo, image_id: str, labels: Iterable[str], name: str, diff --git a/src-docs/runner_manager_type.md b/src-docs/runner_manager_type.md index 7e0675add..f6b58a83c 100644 --- a/src-docs/runner_manager_type.md +++ b/src-docs/runner_manager_type.md @@ -97,7 +97,7 @@ __init__( charm_state: CharmState, image: str, lxd_storage_path: Path, - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, service_token: str, token: str, dockerhub_mirror: str | None = None, @@ -147,7 +147,7 @@ Configuration of runner manager. ```python __init__( charm_state: CharmState, - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, labels: Iterable[str], token: str, flavor: str, diff --git a/src-docs/runner_type.md b/src-docs/runner_type.md index 8c9db658a..d5029f4f8 100644 --- a/src-docs/runner_type.md +++ b/src-docs/runner_type.md @@ -106,7 +106,7 @@ __init__( labels: tuple[str], lxd_storage_path: Path, name: str, - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, proxies: ProxySetting, dockerhub_mirror: str | None = None, ssh_debug_connections: list[SSHDebugConnection] | None = None diff --git a/src/charm.py b/src/charm.py index d3b48d44d..f842c591e 100755 --- a/src/charm.py +++ b/src/charm.py @@ -59,7 +59,7 @@ TOKEN_CONFIG_NAME, CharmConfigInvalidError, CharmState, - GithubPath, + GitHubPath, InstanceType, OpenstackImage, ProxyConfig, @@ -372,7 +372,7 @@ def _ensure_service_health(self) -> None: raise def _get_runner_manager( - self, state: CharmState, token: str | None = None, path: GithubPath | None = None + self, state: CharmState, token: str | None = None, path: GitHubPath | None = None ) -> LXDRunnerManager: """Get a RunnerManager instance. 
@@ -1191,7 +1191,7 @@ def _get_set_image_ready_status(self) -> bool: return True def _get_runner_scaler( - self, state: CharmState, token: str | None = None, path: GithubPath | None = None + self, state: CharmState, token: str | None = None, path: GitHubPath | None = None ) -> RunnerScaler: """Get runner scaler instance for scaling runners. diff --git a/src/charm_state.py b/src/charm_state.py index 186609806..d562c2242 100644 --- a/src/charm_state.py +++ b/src/charm_state.py @@ -87,7 +87,7 @@ class AnyHttpsUrl(AnyHttpUrl): @dataclasses.dataclass -class GithubRepo: +class GitHubRepo: """Represent GitHub repository. Attributes: @@ -108,7 +108,7 @@ def path(self) -> str: @dataclasses.dataclass -class GithubOrg: +class GitHubOrg: """Represent GitHub organization. Attributes: @@ -128,10 +128,10 @@ def path(self) -> str: return self.org -GithubPath = GithubOrg | GithubRepo +GitHubPath = GitHubOrg | GitHubRepo -def parse_github_path(path_str: str, runner_group: str) -> GithubPath: +def parse_github_path(path_str: str, runner_group: str) -> GitHubPath: """Parse GitHub path. Args: @@ -151,8 +151,8 @@ def parse_github_path(path_str: str, runner_group: str) -> GithubPath: if len(paths) != 2: raise CharmConfigInvalidError(f"Invalid path configuration {path_str}") owner, repo = paths - return GithubRepo(owner=owner, repo=repo) - return GithubOrg(org=path_str, group=runner_group) + return GitHubRepo(owner=owner, repo=repo) + return GitHubOrg(org=path_str, group=runner_group) @dataclasses.dataclass @@ -165,7 +165,7 @@ class GithubConfig: """ token: str - path: GithubPath + path: GitHubPath @classmethod def from_charm(cls, charm: CharmBase) -> "GithubConfig": @@ -367,7 +367,7 @@ class CharmConfig(BaseModel): dockerhub_mirror: AnyHttpsUrl | None labels: tuple[str, ...] 
openstack_clouds_yaml: dict[str, dict] | None - path: GithubPath + path: GitHubPath reconcile_interval: int repo_policy_compliance: RepoPolicyComplianceConfig | None token: str diff --git a/src/github_client.py b/src/github_client.py index 3c7718f94..b724b5cdb 100644 --- a/src/github_client.py +++ b/src/github_client.py @@ -16,7 +16,7 @@ from ghapi.page import paged from typing_extensions import assert_never -from charm_state import Arch, GithubOrg, GithubPath, GithubRepo +from charm_state import Arch, GitHubOrg, GitHubPath, GitHubRepo from errors import GithubApiError, JobNotFoundError, RunnerBinaryError, TokenError from github_type import ( JobStats, @@ -88,7 +88,7 @@ def __init__(self, token: str): @catch_http_errors def get_runner_application( - self, path: GithubPath, arch: Arch, os: str = "linux" + self, path: GitHubPath, arch: Arch, os: str = "linux" ) -> RunnerApplication: """Get runner application available for download for given arch. @@ -106,11 +106,11 @@ def get_runner_application( The runner application. """ runner_applications: RunnerApplicationList = [] - if isinstance(path, GithubRepo): + if isinstance(path, GitHubRepo): runner_applications = self._client.actions.list_runner_applications_for_repo( owner=path.owner, repo=path.repo ) - if isinstance(path, GithubOrg): + if isinstance(path, GitHubOrg): runner_applications = self._client.actions.list_runner_applications_for_org( org=path.org ) @@ -127,7 +127,7 @@ def get_runner_application( ) from err @catch_http_errors - def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]: + def get_runner_github_info(self, path: GitHubPath) -> list[SelfHostedRunner]: """Get runner information on GitHub under a repo or org. 
Args: @@ -139,7 +139,7 @@ def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]: """ remote_runners_list: list[SelfHostedRunner] = [] - if isinstance(path, GithubRepo): + if isinstance(path, GitHubRepo): # The documentation of ghapi for pagination is incorrect and examples will give errors. # This workaround is a temp solution. Will be moving to PyGitHub in the future. self._client.actions.list_self_hosted_runners_for_repo( @@ -157,7 +157,7 @@ def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]: ) for item in page["runners"] ] - if isinstance(path, GithubOrg): + if isinstance(path, GitHubOrg): # The documentation of ghapi for pagination is incorrect and examples will give errors. # This workaround is a temp solution. Will be moving to PyGitHub in the future. self._client.actions.list_self_hosted_runners_for_org(org=path.org, per_page=100) @@ -175,7 +175,7 @@ def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]: return remote_runners_list @catch_http_errors - def get_runner_remove_token(self, path: GithubPath) -> str: + def get_runner_remove_token(self, path: GitHubPath) -> str: """Get token from GitHub used for removing runners. Args: @@ -185,11 +185,11 @@ def get_runner_remove_token(self, path: GithubPath) -> str: The removing token. """ token: RemoveToken - if isinstance(path, GithubRepo): + if isinstance(path, GitHubRepo): token = self._client.actions.create_remove_token_for_repo( owner=path.owner, repo=path.repo ) - elif isinstance(path, GithubOrg): + elif isinstance(path, GitHubOrg): token = self._client.actions.create_remove_token_for_org(org=path.org) else: assert_never(token) @@ -197,7 +197,7 @@ def get_runner_remove_token(self, path: GithubPath) -> str: return token["token"] @catch_http_errors - def get_runner_registration_token(self, path: GithubPath) -> str: + def get_runner_registration_token(self, path: GitHubPath) -> str: """Get token from GitHub used for registering runners. 
Args: @@ -208,11 +208,11 @@ def get_runner_registration_token(self, path: GithubPath) -> str: The registration token. """ token: RegistrationToken - if isinstance(path, GithubRepo): + if isinstance(path, GitHubRepo): token = self._client.actions.create_registration_token_for_repo( owner=path.owner, repo=path.repo ) - elif isinstance(path, GithubOrg): + elif isinstance(path, GitHubOrg): token = self._client.actions.create_registration_token_for_org(org=path.org) else: assert_never(token) @@ -220,7 +220,7 @@ def get_runner_registration_token(self, path: GithubPath) -> str: return token["token"] @catch_http_errors - def delete_runner(self, path: GithubPath, runner_id: int) -> None: + def delete_runner(self, path: GitHubPath, runner_id: int) -> None: """Delete the self-hosted runner from GitHub. Args: @@ -228,19 +228,19 @@ def delete_runner(self, path: GithubPath, runner_id: int) -> None: name. runner_id: Id of the runner. """ - if isinstance(path, GithubRepo): + if isinstance(path, GitHubRepo): self._client.actions.delete_self_hosted_runner_from_repo( owner=path.owner, repo=path.repo, runner_id=runner_id, ) - if isinstance(path, GithubOrg): + if isinstance(path, GitHubOrg): self._client.actions.delete_self_hosted_runner_from_org( org=path.org, runner_id=runner_id, ) - def get_job_info(self, path: GithubRepo, workflow_run_id: str, runner_name: str) -> JobStats: + def get_job_info(self, path: GitHubRepo, workflow_run_id: str, runner_name: str) -> JobStats: """Get information about a job for a specific workflow run. 
Args: diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 28ed17b20..b2624199d 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -9,7 +9,7 @@ from enum import Enum, auto from typing import Iterator, Sequence, Tuple -from charm_state import GithubPath, ProxyConfig, SSHDebugConnection +from charm_state import GitHubPath, ProxyConfig, SSHDebugConnection from metrics.runner import RunnerMetrics logger = logging.getLogger(__name__) @@ -52,8 +52,9 @@ class CloudRunnerState(str, Enum): UNKNOWN = auto() UNEXPECTED = auto() + # Exclude from coverage as not much value for testing this object conversion. @staticmethod - def from_openstack_server_status( + def from_openstack_server_status( # pragma: no cover openstack_server_status: str, ) -> "CloudRunnerState": """Create from openstack server status. @@ -97,7 +98,7 @@ class GitHubRunnerConfig: labels: The labels to add to runners. """ - github_path: GithubPath + github_path: GitHubPath labels: list[str] @@ -158,7 +159,7 @@ def create_runner(self, registration_token: str) -> InstanceId: """ @abc.abstractmethod - def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance: + def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: """Get a self-hosted runner by instance id. 
Args: diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 0aed972bd..fcbaccfb2 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -4,9 +4,9 @@ """Client for managing self-hosted runner on GitHub side.""" from enum import Enum, auto -from typing import Sequence +from typing import Iterable -from charm_state import GithubPath +from charm_state import GitHubPath from github_client import GithubClient from github_type import GitHubRunnerStatus, SelfHostedRunner @@ -45,10 +45,10 @@ def from_runner(runner: SelfHostedRunner) -> "GitHubRunnerState": return state -class GithubRunnerManager: +class GitHubRunnerManager: """Manage self-hosted runner on GitHub side.""" - def __init__(self, prefix: str, token: str, path: GithubPath): + def __init__(self, prefix: str, token: str, path: GitHubPath): """Construct the object. Args: @@ -61,8 +61,8 @@ def __init__(self, prefix: str, token: str, path: GithubPath): self.github = GithubClient(token) def get_runners( - self, states: Sequence[GitHubRunnerState] | None = None - ) -> tuple[SelfHostedRunner]: + self, states: Iterable[GitHubRunnerState] | None = None + ) -> tuple[SelfHostedRunner, ...]: """Get info on self-hosted runners of certain states. Args: @@ -72,14 +72,15 @@ def get_runners( Information on the runners. """ runner_list = self.github.get_runner_github_info(self._path) + state_set = set(states) return tuple( runner for runner in runner_list if runner.name.startswith(self._prefix) - and GithubRunnerManager._is_runner_in_state(runner, states) + and GitHubRunnerManager._is_runner_in_state(runner, state_set) ) - def delete_runners(self, states: Sequence[GitHubRunnerState] | None = None) -> None: + def delete_runners(self, states: Iterable[GitHubRunnerState] | None = None) -> None: """Delete the self-hosted runners of certain states. 
Args: @@ -111,7 +112,7 @@ def get_removal_token(self) -> str: @staticmethod def _is_runner_in_state( - runner: SelfHostedRunner, states: Sequence[GitHubRunnerState] | None + runner: SelfHostedRunner, states: set[GitHubRunnerState] | None ) -> bool: """Check that the runner is in one of the states provided. diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 048b9c628..e509473c0 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -9,7 +9,7 @@ from multiprocessing import Pool from typing import Iterator, Sequence, Type, cast -from charm_state import GithubPath +from charm_state import GitHubPath from errors import GithubMetricsError, RunnerCreateError from github_type import SelfHostedRunner from manager.cloud_runner_manager import ( @@ -19,7 +19,7 @@ HealthState, InstanceId, ) -from manager.github_runner_manager import GithubRunnerManager, GitHubRunnerState +from manager.github_runner_manager import GitHubRunnerManager, GitHubRunnerState from metrics import events as metric_events from metrics import github as github_metrics from metrics import runner as runner_metrics @@ -86,7 +86,7 @@ class RunnerManagerConfig: """ token: str - path: GithubPath + path: GitHubPath class RunnerManager: @@ -106,7 +106,7 @@ def __init__(self, cloud_runner_manager: CloudRunnerManager, config: RunnerManag self._config = config self._cloud = cloud_runner_manager self.name_prefix = self._cloud.name_prefix - self._github = GithubRunnerManager( + self._github = GitHubRunnerManager( prefix=self.name_prefix, token=self._config.token, path=self._config.path ) diff --git a/src/metrics/github.py b/src/metrics/github.py index 354933fea..e40574eb7 100644 --- a/src/metrics/github.py +++ b/src/metrics/github.py @@ -4,7 +4,7 @@ """Functions to calculate metrics from data retrieved from GitHub.""" import logging -from charm_state import GithubRepo +from charm_state import GitHubRepo from errors import GithubMetricsError, JobNotFoundError from 
github_client import GithubClient from metrics.runner import PreJobMetrics @@ -35,7 +35,7 @@ def job( try: job_info = github_client.get_job_info( - path=GithubRepo(owner=owner, repo=repo), + path=GitHubRepo(owner=owner, repo=repo), workflow_run_id=pre_job_metrics.workflow_run_id, runner_name=runner_name, ) diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index 379d2ae4c..e0ce47d4f 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -40,7 +40,7 @@ from paramiko.ssh_exception import NoValidConnectionsError import reactive.runner_manager as reactive_runner_manager -from charm_state import CharmState, GithubOrg, ProxyConfig, SSHDebugConnection +from charm_state import CharmState, GitHubOrg, ProxyConfig, SSHDebugConnection from errors import ( CreateMetricsStorageError, GetMetricsStorageError, @@ -62,7 +62,7 @@ from repo_policy_compliance_client import RepoPolicyComplianceClient from runner_manager import IssuedMetricEventsStats from runner_manager_type import FlushMode, OpenstackRunnerManagerConfig -from runner_type import GithubPath, RunnerGithubInfo, RunnerNameByHealth +from runner_type import GitHubPath, RunnerGithubInfo, RunnerNameByHealth from utilities import retry, set_env_var logger = logging.getLogger(__name__) @@ -120,7 +120,7 @@ class InstanceConfig: registration_token: Token for registering the runner on GitHub. 
""" - github_path: GithubPath + github_path: GitHubPath image_id: str labels: Iterable[str] name: str @@ -188,7 +188,7 @@ def create_instance_config( # pylint: disable=too-many-arguments app_name: str, unit_num: int, image_id: str, - path: GithubPath, + path: GitHubPath, labels: Iterable[str], registration_token: str, ) -> InstanceConfig: @@ -257,7 +257,7 @@ def _generate_cloud_init_userdata( instance_config = cloud_init_userdata.instance_config proxies = cloud_init_userdata.proxies - if isinstance(instance_config.github_path, GithubOrg): + if isinstance(instance_config.github_path, GitHubOrg): runner_group = instance_config.github_path.group aproxy_address = proxies.aproxy_address if proxies is not None else None diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index f504fd3d7..39475ee69 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -16,7 +16,7 @@ import paramiko.ssh_exception from fabric import Connection as SSHConnection -from charm_state import GithubOrg +from charm_state import GitHubOrg from errors import ( CreateMetricsStorageError, GetMetricsStorageError, @@ -432,7 +432,7 @@ def _generate_cloud_init(self, instance_name: str, registration_token: str) -> s pre_job_contents = jinja.get_template("pre-job.j2").render(pre_job_contents_dict) runner_group = None - if isinstance(self._runner_config.github_path, GithubOrg): + if isinstance(self._runner_config.github_path, GitHubOrg): runner_group = self._runner_config.github_path.group aproxy_address = ( self._service_config.proxy_config.aproxy_address diff --git a/src/runner.py b/src/runner.py index 4610faded..61a12115c 100644 --- a/src/runner.py +++ b/src/runner.py @@ -23,7 +23,7 @@ import yaml import shared_fs -from charm_state import Arch, GithubOrg, SSHDebugConnection, VirtualMachineResources +from charm_state import Arch, GitHubOrg, SSHDebugConnection, VirtualMachineResources 
from errors import ( CreateMetricsStorageError, GithubClientError, @@ -838,7 +838,7 @@ def _register_runner(self, registration_token: str, labels: Sequence[str]) -> No self.instance.name, ] - if isinstance(self.config.path, GithubOrg): + if isinstance(self.config.path, GitHubOrg): register_cmd += ["--runnergroup", self.config.path.group] logger.info("Executing registration command...") diff --git a/src/runner_manager_type.py b/src/runner_manager_type.py index 343b1eb04..e37e0b290 100644 --- a/src/runner_manager_type.py +++ b/src/runner_manager_type.py @@ -10,7 +10,7 @@ import jinja2 -from charm_state import CharmState, GithubPath, ReactiveConfig +from charm_state import CharmState, GitHubPath, ReactiveConfig from github_client import GithubClient from github_type import GitHubRunnerStatus from lxd import LxdClient @@ -81,7 +81,7 @@ class LXDRunnerManagerConfig: # pylint: disable=too-many-instance-attributes charm_state: CharmState image: str lxd_storage_path: Path - path: GithubPath + path: GitHubPath service_token: str token: str dockerhub_mirror: str | None = None @@ -113,7 +113,7 @@ class OpenstackRunnerManagerConfig: # pylint: disable=too-many-instance-attribu """ charm_state: CharmState - path: GithubPath + path: GitHubPath labels: Iterable[str] token: str flavor: str diff --git a/src/runner_type.py b/src/runner_type.py index 86769eafd..92560cbcf 100644 --- a/src/runner_type.py +++ b/src/runner_type.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Optional -from charm_state import GithubPath, SSHDebugConnection +from charm_state import GitHubPath, SSHDebugConnection @dataclass @@ -64,7 +64,7 @@ class RunnerConfig: # pylint: disable=too-many-instance-attributes labels: tuple[str] lxd_storage_path: Path name: str - path: GithubPath + path: GitHubPath proxies: ProxySetting dockerhub_mirror: str | None = None ssh_debug_connections: list[SSHDebugConnection] | None = None diff --git a/tests/integration/test_runner_manager_openstack.py 
b/tests/integration/test_runner_manager_openstack.py index 5421569fe..b12d42414 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -17,7 +17,7 @@ from github.Workflow import Workflow from openstack.connection import Connection as OpenstackConnection -from charm_state import GithubPath, ProxyConfig, parse_github_path +from charm_state import GitHubPath, ProxyConfig, parse_github_path from manager.cloud_runner_manager import CloudRunnerState, GitHubRunnerConfig, SupportServiceConfig from manager.github_runner_manager import GitHubRunnerState from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig @@ -64,7 +64,7 @@ def log_dir_base_path_fixture( @pytest.fixture(scope="module", name="github_path") -def github_path_fixture(path: str) -> GithubPath: +def github_path_fixture(path: str) -> GitHubPath: return parse_github_path(path, "Default") @@ -92,7 +92,7 @@ async def openstack_runner_manager_fixture( openstack_test_image: str, flavor_name: str, network_name: str, - github_path: GithubPath, + github_path: GitHubPath, proxy_config: ProxyConfig, runner_label: str, openstack_connection: OpenstackConnection, @@ -134,7 +134,7 @@ async def openstack_runner_manager_fixture( async def runner_manager_fixture( openstack_runner_manager: OpenStackRunnerManager, token: str, - github_path: GithubPath, + github_path: GitHubPath, log_dir_base_path: dict[str, Path], ) -> RunnerManager: """Get RunnerManager instance. 
diff --git a/tests/integration/test_self_hosted_runner.py b/tests/integration/test_self_hosted_runner.py index c91ac8e97..4232fae4b 100644 --- a/tests/integration/test_self_hosted_runner.py +++ b/tests/integration/test_self_hosted_runner.py @@ -16,7 +16,7 @@ DOCKERHUB_MIRROR_CONFIG_NAME, PATH_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME, - GithubRepo, + GitHubRepo, ) from github_client import GithubClient from tests.integration.helpers.common import ( @@ -150,7 +150,7 @@ async def test_flush_busy_runner( # Wait until runner online and then busy. for _ in range(30): all_runners = runner_manager_github_client.get_runner_github_info( - GithubRepo( + GitHubRepo( owner=forked_github_repository.owner.login, repo=forked_github_repository.name ) ) diff --git a/tests/unit/mock_runner_managers.py b/tests/unit/mock_runner_managers.py new file mode 100644 index 000000000..75209680f --- /dev/null +++ b/tests/unit/mock_runner_managers.py @@ -0,0 +1,194 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +import random +import secrets +from dataclasses import dataclass +from typing import Iterable, Iterator, Sequence + +from charm_state import GitHubPath +from github_type import GitHubRunnerStatus, SelfHostedRunner +from manager.cloud_runner_manager import ( + CloudRunnerInstance, + CloudRunnerManager, + CloudRunnerState, + InstanceId, +) +from manager.github_runner_manager import GitHubRunnerManager, GitHubRunnerState +from metrics.runner import RunnerMetrics + + +@dataclass +class MockRunner: + """Mock of a runner""" + + name: str + instance_id: InstanceId + cloud_state: CloudRunnerState + github_state: GitHubRunnerState + health: bool + + def __init__(self, name: str): + self.name = name + self.instance_id = secrets.token_hex(6) + self.state = CloudRunnerState.ACTIVE + self.github_state = GitHubRunnerState.IDLE + self.health = True + + def to_cloud_runner(self) -> CloudRunnerInstance: + return CloudRunnerInstance( + name=self.name, + instance_id=self.instance_id, + health=self.health, + state=self.cloud_state, + ) + + +@dataclass +class SharedMockRunnerManagerState: + """State shared by mock runner managers. + + For sharing the mock runner states between MockCloudRunnerManager and MockGitHubRunnerManager. + """ + + runners: dict[InstanceId, MockRunner] + + def __init__(self): + self.runners = {} + + +class MockCloudRunnerManager(CloudRunnerManager): + """Mock for CloudRunnerManager. + + Metrics is not supported in this mock. + """ + + def __init__(self, state: SharedMockRunnerManagerState): + self.prefix = f"mock_{secrets.token_hex(4)}" + self.state = state + + @property + def name_prefix(self) -> str: + """Get the name prefix of the self-hosted runners.""" + return self.prefix + + def create_runner(self, registration_token: str) -> InstanceId: + """Create a self-hosted runner. + + Args: + registration_token: The GitHub registration token for registering runners. 
+ """ + name = f"{self.name_prefix}-{secrets.token_hex(6)}" + runner = MockRunner(name) + self.state.runners[runner.instance_id] = runner + return runner.instance_id + + def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: + """Get a self-hosted runner by instance id. + + Args: + instance_id: The instance id. + """ + runner = self.state.runners.get(instance_id, None) + if runner is not None: + return runner.to_cloud_runner() + return None + + def get_runners(self, states: Sequence[CloudRunnerState]) -> tuple[CloudRunnerInstance, ...]: + """Get self-hosted runners by state. + + Args: + states: Filter for the runners with these github states. If None all states will be + included. + """ + return tuple( + runner.to_cloud_runner() + for runner in self.state.runners.values() + if runner.state in states + ) + + def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMetrics | None: + """Delete self-hosted runner. + + Args: + instance_id: The instance id of the runner to delete. + remove_token: The GitHub remove token. + """ + self.state.runners.pop(instance_id, None) + return iter([]) + + def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[RunnerMetrics]: + """Stop all runners. + + Args: + remove_token: The GitHub remove token for removing runners. + busy: If false, only idle runners are removed. If true, both idle and busy runners are + removed. + """ + # No supporting metrics in the mocks. + if busy: + self.state.runners = {} + else: + self.state.runners = { + instance_id: runner + for instance_id, runner in self.state.runners.items() + if runner.github_state == GitHubRunnerState.BUSY + } + return iter([]) + + def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: + """Cleanup runner and resource on the cloud. + + Perform health check on runner and delete the runner if it fails. + + Args: + remove_token: The GitHub remove token for removing runners. 
+ """ + # No supporting metrics in the mocks. + return iter([]) + + +class MockGitHubRunnerManager: + + def __init__(self, name_prefix: str, path: GitHubPath, state: SharedMockRunnerManagerState): + self.name_prefix = name_prefix + self.state = state + self.path = path + + def get_registration_token(self) -> str: + return "mock_registration_token" + + def get_remove_token(self) -> str: + return "mock_remove_token" + + def get_runners( + self, github_states: Iterable[GitHubRunnerState] | None = None + ) -> tuple[SelfHostedRunner, ...]: + if github_states is None: + github_states = [member.value for member in GitHubRunnerState] + + github_state_set = set(github_states) + return tuple( + SelfHostedRunner( + busy=runner.github_state == GitHubRunnerState.BUSY, + id=random.randint(1, 1000000), + labels=[], + os="linux", + name=runner.name, + status=( + GitHubRunnerStatus.OFFLINE + if runner.github_state == GitHubRunnerState.OFFLINE + else GitHubRunnerStatus.ONLINE + ), + ) + for runner in self.state.runners.values() + if runner.github_state in github_state_set + ) + + def delete_runners(self, states: Iterable[GitHubRunnerState]) -> None: + github_states = set(states) + self.state.runners = { + instance_id: runner + for instance_id, runner in self.state.runners.items() + if runner.github_state not in github_states + } diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index d44eca542..493d3ff94 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -27,8 +27,8 @@ VM_CPU_CONFIG_NAME, VM_DISK_CONFIG_NAME, Arch, - GithubOrg, - GithubRepo, + GitHubOrg, + GitHubRepo, InstanceType, OpenstackImage, ProxyConfig, @@ -461,7 +461,7 @@ def test_org_register(self, run, wt, mkdir, rm): "github-runner", "0", LXDRunnerManagerConfig( - path=GithubOrg(org="mockorg", group="mockgroup"), + path=GitHubOrg(org="mockorg", group="mockgroup"), token="mocktoken", image="jammy", service_token=token, @@ -491,7 +491,7 @@ def test_repo_register(self, run, wt, mkdir, rm): 
"github-runner", "0", LXDRunnerManagerConfig( - path=GithubRepo(owner="mockorg", repo="repo"), + path=GitHubRepo(owner="mockorg", repo="repo"), token="mocktoken", image="jammy", service_token=token, @@ -549,7 +549,7 @@ def test_update_config(self, run, wt, mkdir, rm): "github-runner", "0", LXDRunnerManagerConfig( - path=GithubRepo(owner="mockorg", repo="repo"), + path=GitHubRepo(owner="mockorg", repo="repo"), token="mocktoken", image="jammy", service_token=token, @@ -571,7 +571,7 @@ def test_update_config(self, run, wt, mkdir, rm): "github-runner", "0", LXDRunnerManagerConfig( - path=GithubRepo(owner="mockorg", repo="repo"), + path=GitHubRepo(owner="mockorg", repo="repo"), token="mocktoken", image="jammy", service_token=token, diff --git a/tests/unit/test_charm_state.py b/tests/unit/test_charm_state.py index 2025c0e76..b4b25afb0 100644 --- a/tests/unit/test_charm_state.py +++ b/tests/unit/test_charm_state.py @@ -40,8 +40,8 @@ CharmState, FirewallEntry, GithubConfig, - GithubOrg, - GithubRepo, + GitHubOrg, + GitHubRepo, ImmutableConfigChangedError, LocalLxdRunnerConfig, OpenstackImage, @@ -64,7 +64,7 @@ def test_github_repo_path(): """ owner = "test_owner" repo = "test_repo" - github_repo = GithubRepo(owner, repo) + github_repo = GitHubRepo(owner, repo) path = github_repo.path() @@ -79,7 +79,7 @@ def test_github_org_path(): """ org = "test_org" group = "test_group" - github_org = GithubOrg(org, group) + github_org = GitHubOrg(org, group) path = github_org.path() @@ -128,14 +128,14 @@ def test_github_config_from_charm_invalid_token(): @pytest.mark.parametrize( "path_str, runner_group, expected_type, expected_attrs", [ - ("owner/repo", "test_group", GithubRepo, {"owner": "owner", "repo": "repo"}), - ("test_org", "test_group", GithubOrg, {"org": "test_org", "group": "test_group"}), + ("owner/repo", "test_group", GitHubRepo, {"owner": "owner", "repo": "repo"}), + ("test_org", "test_group", GitHubOrg, {"org": "test_org", "group": "test_group"}), ], ) def 
test_parse_github_path( path_str: str, runner_group: str, - expected_type: GithubRepo | GithubOrg, + expected_type: GitHubRepo | GitHubOrg, expected_attrs: dict[str, str], ): """ @@ -479,7 +479,7 @@ def test_charm_config_from_charm_valid(): result = CharmConfig.from_charm(mock_charm) - assert result.path == GithubRepo(owner="owner", repo="repo") + assert result.path == GitHubRepo(owner="owner", repo="repo") assert result.reconcile_interval == 5 assert result.denylist == [ FirewallEntry(ip_range="192.168.1.1"), diff --git a/tests/unit/test_github_client.py b/tests/unit/test_github_client.py index b01a75a01..9bd336a03 100644 --- a/tests/unit/test_github_client.py +++ b/tests/unit/test_github_client.py @@ -10,7 +10,7 @@ import pytest -from charm_state import GithubRepo +from charm_state import GitHubRepo from errors import JobNotFoundError from github_client import GithubClient from github_type import JobConclusion, JobStats @@ -95,7 +95,7 @@ def test_get_job_info(github_client: GithubClient, job_stats_raw: JobStatsRawDat act: Call get_job_info. assert: The correct JobStats object is returned. 
""" - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) + github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) job_stats = github_client.get_job_info( path=github_repo, workflow_run_id=secrets.token_hex(16), @@ -128,7 +128,7 @@ def test_get_job_info_no_conclusion(github_client: GithubClient, job_stats_raw: } ] } - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) + github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) job_stats = github_client.get_job_info( path=github_repo, workflow_run_id=secrets.token_hex(16), @@ -156,7 +156,7 @@ def test_github_api_pagination_multiple_pages( github_client=github_client, job_stats_raw=job_stats_raw, include_runner=True ) - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) + github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) job_stats = github_client.get_job_info( path=github_repo, workflow_run_id=secrets.token_hex(16), @@ -184,7 +184,7 @@ def test_github_api_pagination_job_not_found( github_client=github_client, job_stats_raw=job_stats_raw, include_runner=False ) - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) + github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) with pytest.raises(JobNotFoundError): github_client.get_job_info( @@ -198,7 +198,7 @@ def test_github_api_http_error(github_client: GithubClient, job_stats_raw: JobSt github_client._client.actions.list_jobs_for_workflow_run.side_effect = HTTPError( "http://test.com", 500, "", http.client.HTTPMessage(), None ) - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) + github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) with pytest.raises(JobNotFoundError): github_client.get_job_info( diff --git a/tests/unit/test_lxd_runner_manager.py 
b/tests/unit/test_lxd_runner_manager.py index 829d73c9c..36c36df11 100644 --- a/tests/unit/test_lxd_runner_manager.py +++ b/tests/unit/test_lxd_runner_manager.py @@ -16,8 +16,8 @@ Arch, CharmConfig, CharmState, - GithubOrg, - GithubRepo, + GitHubOrg, + GitHubRepo, ProxyConfig, ReactiveConfig, VirtualMachineResources, @@ -67,9 +67,9 @@ def charm_state_fixture(charm_config: MagicMock): scope="function", name="runner_manager", params=[ - (GithubOrg("test_org", "test_group"), ProxyConfig()), + (GitHubOrg("test_org", "test_group"), ProxyConfig()), ( - GithubRepo("test_owner", "test_repo"), + GitHubRepo("test_owner", "test_repo"), ProxyConfig( no_proxy="test_no_proxy", http=TEST_PROXY_SERVER_URL, diff --git a/tests/unit/test_runner.py b/tests/unit/test_runner.py index fdf8fc2a1..af7954d06 100644 --- a/tests/unit/test_runner.py +++ b/tests/unit/test_runner.py @@ -13,7 +13,7 @@ from _pytest.monkeypatch import MonkeyPatch import metrics.runner_logs -from charm_state import GithubOrg, GithubRepo, SSHDebugConnection, VirtualMachineResources +from charm_state import GitHubOrg, GitHubRepo, SSHDebugConnection, VirtualMachineResources from errors import ( CreateMetricsStorageError, LxdError, @@ -138,11 +138,11 @@ def ssh_debug_connections_fixture() -> list[SSHDebugConnection]: name="runner", params=[ ( - GithubOrg("test_org", "test_group"), + GitHubOrg("test_org", "test_group"), ProxySetting(no_proxy=None, http=None, https=None, aproxy_address=None), ), ( - GithubRepo("test_owner", "test_repo"), + GitHubRepo("test_owner", "test_repo"), ProxySetting( no_proxy="test_no_proxy", http=TEST_PROXY_SERVER_URL, diff --git a/tests/unit/test_runner_manager.py b/tests/unit/test_runner_manager.py new file mode 100644 index 000000000..e3979c0f6 --- /dev/null +++ b/tests/unit/test_runner_manager.py @@ -0,0 +1,2 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py new file mode 100644 index 000000000..76d14d941 --- /dev/null +++ b/tests/unit/test_runner_scaler.py @@ -0,0 +1,40 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + + +import pytest + +from charm_state import GitHubRepo +from manager.runner_manager import RunnerManager, RunnerManagerConfig +from manager.runner_scaler import RunnerScaler +from tests.unit.mock_runner_managers import ( + MockCloudRunnerManager, + MockGitHubRunnerManager, + SharedMockRunnerManagerState, +) + + +@pytest.fixture(name="runner_manager") +def runner_manager_fixture() -> RunnerManager: + state = SharedMockRunnerManagerState() + mock_cloud = MockCloudRunnerManager(state) + mock_path = GitHubRepo("mock_owner", "mock_repo") + mock_github = MockGitHubRunnerManager(mock_cloud.name_prefix, mock_path, state) + + config = RunnerManagerConfig("mock_token", mock_path) + runner_manager = RunnerManager(mock_cloud, config) + runner_manager._github = mock_github + return runner_manager + + +@pytest.fixture(name="runner_scaler") +def runner_scaler_fixture(runner_manager: RunnerManager) -> RunnerScaler: + return RunnerScaler(runner_manager) + + +def test_get_no_runner(runner_scaler: RunnerScaler): + info = runner_scaler.get_runner_info() + assert info["offline"] == 0 + assert info["online"] == 0 + assert info["unknown"] == 0 + assert info["runners"] == tuple() From 7632c91abc2aab4273014f80432faefd1101e8bd Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 22 Aug 2024 18:59:59 +0800 Subject: [PATCH 232/278] Add more unit tests for runner scaler --- src/manager/github_runner_manager.py | 6 +- src/manager/runner_manager.py | 53 ++++++++---- src/manager/runner_scaler.py | 13 +-- tests/unit/mock_runner_managers.py | 14 ++-- tests/unit/test_runner_scaler.py | 116 ++++++++++++++++++++++++--- 5 files changed, 163 insertions(+), 39 deletions(-) diff --git 
a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index fcbaccfb2..8f00525b0 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -37,10 +37,10 @@ def from_runner(runner: SelfHostedRunner) -> "GitHubRunnerState": """ state = GitHubRunnerState.OFFLINE # A runner that is busy and offline is possible. - if runner.busy: + if runner["busy"]: state = GitHubRunnerState.BUSY - if runner.status == GitHubRunnerStatus.ONLINE: - if not runner.busy: + if runner["status"] == GitHubRunnerStatus.ONLINE: + if not runner["busy"]: state = GitHubRunnerState.IDLE return state diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index e509473c0..40cde9bb4 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from enum import Enum, auto from multiprocessing import Pool -from typing import Iterator, Sequence, Type, cast +from typing import Iterable, Iterator, Sequence, Type, cast from charm_state import GitHubPath from errors import GithubMetricsError, RunnerCreateError @@ -125,21 +125,8 @@ def create_runners(self, num: int) -> tuple[InstanceId]: create_runner_args = [ RunnerManager._CreateRunnerArgs(self._cloud, registration_token) for _ in range(num) ] - instance_id_list = [] - with Pool(processes=min(num, 10)) as pool: - jobs = pool.imap_unordered( - func=RunnerManager._create_runner, iterable=create_runner_args - ) - for _ in range(num): - try: - instance_id = next(jobs) - except RunnerCreateError: - logger.exception("Failed to spawn a runner.") - except StopIteration: - break - else: - instance_id_list.append(instance_id) - return tuple(instance_id_list) + return RunnerManager._spawn_runners(create_runner_args) + def get_runners( self, @@ -162,7 +149,7 @@ def get_runners( logger.info("Getting runners...") github_infos = self._github.get_runners(github_states) cloud_infos = self._cloud.get_runners(cloud_states) - 
github_infos_map = {info.name: info for info in github_infos} + github_infos_map = {info["name"]: info for info in github_infos} cloud_infos_map = {info.name: info for info in cloud_infos} logger.info( "Found following runners: %s", cloud_infos_map.keys() | github_infos_map.keys() @@ -254,6 +241,38 @@ def cleanup(self) -> IssuedMetricEventsStats: deleted_runner_metrics = self._cloud.cleanup(remove_token) return self._issue_runner_metrics(metrics=deleted_runner_metrics) + @staticmethod + def _spawn_runners(create_runner_args: Iterable["RunnerManager._CreateRunnerArgs"]) -> tuple[InstanceId, ...]: + """Parallel spawn of runners. + + The length of the create_runner_args is number _create_runner invocation, and therefore the + number of runner spawned. + + Args: + create_runner_args: List of arg for invoking _create_runner method. + + Returns: + A list of instance ID of runner spawned. + """ + num = len(create_runner_args) + + instance_id_list = [] + with Pool(processes=min(num, 10)) as pool: + jobs = pool.imap_unordered( + func=RunnerManager._create_runner, iterable=create_runner_args + ) + for _ in range(num): + try: + instance_id = next(jobs) + except RunnerCreateError: + logger.exception("Failed to spawn a runner.") + except StopIteration: + break + else: + instance_id_list.append(instance_id) + return tuple(instance_id_list) + + def _delete_runners( self, runners: Sequence[RunnerInstance], remove_token: str ) -> IssuedMetricEventsStats: diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index 980c542d0..06c0ecee5 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -70,7 +70,7 @@ def get_runner_info(self) -> RunnerInfo: online=online, offline=offline, unknown=unknown, runners=tuple(online_runners) ) - def flush(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> None: + def flush(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: """Flush the runners. 
Args: @@ -80,10 +80,11 @@ def flush(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> None: Number of runners flushed. """ metric_stats = self._manager.cleanup() - delete_metric_stats = self._manager.delete_runners(flush_mode=flush_mode) + delete_metric_stats = self._manager.flush_runners(flush_mode=flush_mode) + events = set(delete_metric_stats.keys()) | set(metric_stats.keys()) metric_stats = { - delete_metric_stats.get(event_name, 0) + metric_stats.get(event_name, 0) - for event_name in set(delete_metric_stats) | set(metric_stats) + event_name: delete_metric_stats.get(event_name, 0) + metric_stats.get(event_name, 0) + for event_name in events } return metric_stats.get(metric_events.RunnerStop, 0) @@ -137,13 +138,15 @@ def reconcile(self, num_of_runner: int) -> int: ] try: + available_runners = set(runner.name for runner in idle_runners) | set(runner.name for runner in offline_healthy_runners) + logger.info("Current available runners (idle + healthy offline): %s", available_runners) metric_events.issue_event( metric_events.Reconciliation( timestamp=time.time(), flavor=self._manager.name_prefix, crashed_runners=metric_stats.get(metric_events.RunnerStart, 0) - metric_stats.get(metric_events.RunnerStop, 0), - idle_runners=len(set(idle_runners) | set(offline_healthy_runners)), + idle_runners=len(available_runners), duration=end_timestamp - start_timestamp, ) ) diff --git a/tests/unit/mock_runner_managers.py b/tests/unit/mock_runner_managers.py index 75209680f..7db7e24a0 100644 --- a/tests/unit/mock_runner_managers.py +++ b/tests/unit/mock_runner_managers.py @@ -14,7 +14,7 @@ CloudRunnerState, InstanceId, ) -from manager.github_runner_manager import GitHubRunnerManager, GitHubRunnerState +from manager.github_runner_manager import GitHubRunnerState from metrics.runner import RunnerMetrics @@ -31,7 +31,7 @@ class MockRunner: def __init__(self, name: str): self.name = name self.instance_id = secrets.token_hex(6) - self.state = CloudRunnerState.ACTIVE + 
self.cloud_state = CloudRunnerState.ACTIVE self.github_state = GitHubRunnerState.IDLE self.health = True @@ -94,17 +94,21 @@ def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: return runner.to_cloud_runner() return None - def get_runners(self, states: Sequence[CloudRunnerState]) -> tuple[CloudRunnerInstance, ...]: + def get_runners(self, states: Sequence[CloudRunnerState] | None = None) -> tuple[CloudRunnerInstance, ...]: """Get self-hosted runners by state. Args: states: Filter for the runners with these github states. If None all states will be included. """ + if states is None: + states = [member.value for member in CloudRunnerState] + + state_set = set(states) return tuple( runner.to_cloud_runner() for runner in self.state.runners.values() - if runner.state in states + if runner.cloud_state in state_set ) def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMetrics | None: @@ -158,7 +162,7 @@ def __init__(self, name_prefix: str, path: GitHubPath, state: SharedMockRunnerMa def get_registration_token(self) -> str: return "mock_registration_token" - def get_remove_token(self) -> str: + def get_removal_token(self) -> str: return "mock_remove_token" def get_runners( diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py index 76d14d941..3dd2f9800 100644 --- a/tests/unit/test_runner_scaler.py +++ b/tests/unit/test_runner_scaler.py @@ -2,10 +2,12 @@ # See LICENSE file for licensing details. 
+from typing import Iterable import pytest from charm_state import GitHubRepo -from manager.runner_manager import RunnerManager, RunnerManagerConfig +from manager.cloud_runner_manager import InstanceId +from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig from manager.runner_scaler import RunnerScaler from tests.unit.mock_runner_managers import ( MockCloudRunnerManager, @@ -14,27 +16,123 @@ ) -@pytest.fixture(name="runner_manager") -def runner_manager_fixture() -> RunnerManager: +def mock_runner_manager_spawn_runners(create_runner_args: Iterable[RunnerManager._CreateRunnerArgs]) -> tuple[InstanceId, ...]: + """Mock _spawn_runners method of RunnerManager. + + The _spawn_runners method uses multi-process, which copies the object, e.g., the mocks. + There is easy way to sync the state of the mocks object across processes. Replacing the + _spawn_runner to remove the multi-process.pool is an easier approach. + """ + return tuple(RunnerManager._create_runner(arg) for arg in create_runner_args) + + +@pytest.fixture(scope="function", name="runner_manager") +def runner_manager_fixture(monkeypatch) -> RunnerManager: state = SharedMockRunnerManagerState() mock_cloud = MockCloudRunnerManager(state) mock_path = GitHubRepo("mock_owner", "mock_repo") mock_github = MockGitHubRunnerManager(mock_cloud.name_prefix, mock_path, state) + monkeypatch.setattr("manager.runner_manager.RunnerManager._spawn_runners", mock_runner_manager_spawn_runners) config = RunnerManagerConfig("mock_token", mock_path) runner_manager = RunnerManager(mock_cloud, config) runner_manager._github = mock_github return runner_manager -@pytest.fixture(name="runner_scaler") +@pytest.fixture(scope="function", name="runner_scaler") def runner_scaler_fixture(runner_manager: RunnerManager) -> RunnerScaler: return RunnerScaler(runner_manager) -def test_get_no_runner(runner_scaler: RunnerScaler): +def assert_runner_info( + runner_scaler: RunnerScaler, online: int = 0, offline: int = 0, unknown: 
int = 0 +) -> None: + """Assert runner info contains a certain amount of runners. + + Args: + runner_scaler: The RunnerScaler to get information from. + online: The number of online runners to assert for. + offline: The number of offline runners to assert for. + unknown: The number of unknown runners to assert for. + """ info = runner_scaler.get_runner_info() - assert info["offline"] == 0 - assert info["online"] == 0 - assert info["unknown"] == 0 - assert info["runners"] == tuple() + assert info["offline"] == offline + assert info["online"] == online + assert info["unknown"] == unknown + assert isinstance(info["runners"], tuple) + assert len(info["runners"]) == online + + +def test_get_no_runner(runner_scaler: RunnerScaler): + """ + Arrange: A RunnerScaler with no runners. + Act: Get runner information. + Assert: Information should contain no runners. + """ + assert_runner_info(runner_scaler, online=0) + + +def test_flush_no_runner(runner_scaler: RunnerScaler): + """ + Arrange: A RunnerScaler with no runners. + Act: + 1. Flush idle runners. + 2. Flush busy runners. + Assert: + 1. No change in number of runners. Runner info should contain no runners. + 2. No change in number of runners. + """ + # 1. + diff = runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) + assert diff == 0 + assert_runner_info(runner_scaler, online=0) + + # 2. + diff = runner_scaler.flush(flush_mode=FlushMode.FLUSH_BUSY) + assert diff == 0 + assert_runner_info(runner_scaler, online=0) + + +def test_reconcile_runner_create_one(runner_scaler: RunnerScaler): + """ + Arrange: A RunnerScaler with no runners. + Act: Reconcile to no runners. + Assert: No changes. Runner info should contain no runners. + """ + diff = runner_scaler.reconcile(num_of_runner=0) + assert diff == 0 + assert_runner_info(runner_scaler, online=0) + + +def test_one_runner(runner_scaler: RunnerScaler): + """ + Arrange: A RunnerScaler with no runners. + Act: + 1. Reconcile to one runner. + 2. Reconcile to one runner. + 3. 
Flush idle runners. + 4. Reconcile to one runner. + Assert: + 1. Runner info has one runner. + 2. No changes to number of runner. + 3. Runner info has one runner. + """ + # 1. + diff = runner_scaler.reconcile(1) + assert diff == 1 + assert_runner_info(runner_scaler, online=1) + + # 2. + diff = runner_scaler.reconcile(1) + assert diff == 0 + assert_runner_info(runner_scaler, online=1) + + # 3. + runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) + assert_runner_info(runner_scaler, online=0) + + # 4. + diff = runner_scaler.reconcile(1) + assert diff == 1 + assert_runner_info(runner_scaler, online=1) From 52a772428a19da92c0600979316df9df8e34584c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 26 Aug 2024 13:10:08 +0800 Subject: [PATCH 233/278] Add more tests --- src/manager/runner_manager.py | 12 +-- src/manager/runner_scaler.py | 23 ++++- tests/unit/mock_runner_managers.py | 18 +++- tests/unit/test_runner_scaler.py | 140 ++++++++++++++++++++++++++--- 4 files changed, 170 insertions(+), 23 deletions(-) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 40cde9bb4..03639a306 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -126,7 +126,6 @@ def create_runners(self, num: int) -> tuple[InstanceId]: RunnerManager._CreateRunnerArgs(self._cloud, registration_token) for _ in range(num) ] return RunnerManager._spawn_runners(create_runner_args) - def get_runners( self, @@ -242,17 +241,19 @@ def cleanup(self) -> IssuedMetricEventsStats: return self._issue_runner_metrics(metrics=deleted_runner_metrics) @staticmethod - def _spawn_runners(create_runner_args: Iterable["RunnerManager._CreateRunnerArgs"]) -> tuple[InstanceId, ...]: + def _spawn_runners( + create_runner_args: Iterable["RunnerManager._CreateRunnerArgs"], + ) -> tuple[InstanceId, ...]: """Parallel spawn of runners. 
- - The length of the create_runner_args is number _create_runner invocation, and therefore the + + The length of the create_runner_args is number _create_runner invocation, and therefore the number of runner spawned. Args: create_runner_args: List of arg for invoking _create_runner method. Returns: - A list of instance ID of runner spawned. + A list of instance ID of runner spawned. """ num = len(create_runner_args) @@ -271,7 +272,6 @@ def _spawn_runners(create_runner_args: Iterable["RunnerManager._CreateRunnerArgs else: instance_id_list.append(instance_id) return tuple(instance_id_list) - def _delete_runners( self, runners: Sequence[RunnerInstance], remove_token: str diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index 06c0ecee5..c7cca69a3 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -21,15 +21,19 @@ class RunnerInfo(TypedDict): Attributes: online: The number of runner in online state. + busy: The number of the runner in busy state. offline: The number of runner in offline state. unknown: The number of runner in unknown state. runners: The names of the online runners. + busy_runners: The names of the busy runners. """ online: int + busy: int offline: int unknown: int runners: tuple[str, ...] + busy_runners: tuple[str, ...] 
class RunnerScaler: @@ -51,14 +55,18 @@ def get_runner_info(self) -> RunnerInfo: """ runner_list = self._manager.get_runners() online = 0 + busy = 0 offline = 0 unknown = 0 online_runners = [] + busy_runners = [] for runner in runner_list: match runner.github_state: case GitHubRunnerState.BUSY: online += 1 online_runners.append(runner.name) + busy += 1 + busy_runners.append(runner.name) case GitHubRunnerState.IDLE: online += 1 online_runners.append(runner.name) @@ -67,7 +75,12 @@ def get_runner_info(self) -> RunnerInfo: case _: unknown += 1 return RunnerInfo( - online=online, offline=offline, unknown=unknown, runners=tuple(online_runners) + online=online, + busy=busy, + offline=offline, + unknown=unknown, + runners=tuple(online_runners), + busy_runners=tuple(busy_runners), ) def flush(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: @@ -138,8 +151,12 @@ def reconcile(self, num_of_runner: int) -> int: ] try: - available_runners = set(runner.name for runner in idle_runners) | set(runner.name for runner in offline_healthy_runners) - logger.info("Current available runners (idle + healthy offline): %s", available_runners) + available_runners = set(runner.name for runner in idle_runners) | set( + runner.name for runner in offline_healthy_runners + ) + logger.info( + "Current available runners (idle + healthy offline): %s", available_runners + ) metric_events.issue_event( metric_events.Reconciliation( timestamp=time.time(), diff --git a/tests/unit/mock_runner_managers.py b/tests/unit/mock_runner_managers.py index 7db7e24a0..269931bbd 100644 --- a/tests/unit/mock_runner_managers.py +++ b/tests/unit/mock_runner_managers.py @@ -20,7 +20,7 @@ @dataclass class MockRunner: - """Mock of a runner""" + """Mock of a runner.""" name: str instance_id: InstanceId @@ -29,6 +29,11 @@ class MockRunner: health: bool def __init__(self, name: str): + """Construct the object. + + Args: + name: The name of the runner. 
+ """ self.name = name self.instance_id = secrets.token_hex(6) self.cloud_state = CloudRunnerState.ACTIVE @@ -36,6 +41,7 @@ def __init__(self, name: str): self.health = True def to_cloud_runner(self) -> CloudRunnerInstance: + """Construct CloudRunnerInstance from this object.""" return CloudRunnerInstance( name=self.name, instance_id=self.instance_id, @@ -54,6 +60,7 @@ class SharedMockRunnerManagerState: runners: dict[InstanceId, MockRunner] def __init__(self): + """Construct the object.""" self.runners = {} @@ -64,6 +71,11 @@ class MockCloudRunnerManager(CloudRunnerManager): """ def __init__(self, state: SharedMockRunnerManagerState): + """Construct the object. + + Args: + state: The shared state between cloud and github runner managers. + """ self.prefix = f"mock_{secrets.token_hex(4)}" self.state = state @@ -94,7 +106,9 @@ def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: return runner.to_cloud_runner() return None - def get_runners(self, states: Sequence[CloudRunnerState] | None = None) -> tuple[CloudRunnerInstance, ...]: + def get_runners( + self, states: Sequence[CloudRunnerState] | None = None + ) -> tuple[CloudRunnerInstance, ...]: """Get self-hosted runners by state. 
Args: diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py index 3dd2f9800..476885954 100644 --- a/tests/unit/test_runner_scaler.py +++ b/tests/unit/test_runner_scaler.py @@ -3,10 +3,12 @@ from typing import Iterable + import pytest -from charm_state import GitHubRepo -from manager.cloud_runner_manager import InstanceId +from charm_state import GitHubPath, GitHubRepo +from manager.cloud_runner_manager import CloudRunnerState, InstanceId +from manager.github_runner_manager import GitHubRunnerState from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig from manager.runner_scaler import RunnerScaler from tests.unit.mock_runner_managers import ( @@ -16,25 +18,48 @@ ) -def mock_runner_manager_spawn_runners(create_runner_args: Iterable[RunnerManager._CreateRunnerArgs]) -> tuple[InstanceId, ...]: +def mock_runner_manager_spawn_runners( + create_runner_args: Iterable[RunnerManager._CreateRunnerArgs], +) -> tuple[InstanceId, ...]: """Mock _spawn_runners method of RunnerManager. - + The _spawn_runners method uses multi-process, which copies the object, e.g., the mocks. - There is easy way to sync the state of the mocks object across processes. Replacing the + There is no easy way to sync the state of the mocks object across processes. Replacing the _spawn_runner to remove the multi-process.pool is an easier approach. + + Args: + create_runner_args: The arguments for the create_runner method. + + Returns: + The instance ids of the runner spawned. 
""" return tuple(RunnerManager._create_runner(arg) for arg in create_runner_args) -@pytest.fixture(scope="function", name="runner_manager") -def runner_manager_fixture(monkeypatch) -> RunnerManager: +@pytest.fixture(scope="function", name="github_path") +def github_path_fixture() -> GitHubPath: + return GitHubRepo("mock_owner", "mock_repo") + + +@pytest.fixture(scope="function", name="mock_runner_managers") +def mock_runner_managers_fixture( + github_path: GitHubPath, +) -> tuple[MockCloudRunnerManager, MockGitHubRunnerManager]: state = SharedMockRunnerManagerState() mock_cloud = MockCloudRunnerManager(state) - mock_path = GitHubRepo("mock_owner", "mock_repo") - mock_github = MockGitHubRunnerManager(mock_cloud.name_prefix, mock_path, state) + mock_github = MockGitHubRunnerManager(mock_cloud.name_prefix, github_path, state) + return (mock_cloud, mock_github) - monkeypatch.setattr("manager.runner_manager.RunnerManager._spawn_runners", mock_runner_manager_spawn_runners) - config = RunnerManagerConfig("mock_token", mock_path) + +@pytest.fixture(scope="function", name="runner_manager") +def runner_manager_fixture( + monkeypatch, mock_runner_managers, github_path: GitHubPath +) -> RunnerManager: + mock_cloud, mock_github = mock_runner_managers + monkeypatch.setattr( + "manager.runner_manager.RunnerManager._spawn_runners", mock_runner_manager_spawn_runners + ) + config = RunnerManagerConfig("mock_token", github_path) runner_manager = RunnerManager(mock_cloud, config) runner_manager._github = mock_github return runner_manager @@ -45,23 +70,49 @@ def runner_scaler_fixture(runner_manager: RunnerManager) -> RunnerScaler: return RunnerScaler(runner_manager) +@pytest.fixture(scope="function", name="runner_scaler_one_runner") +def runner_scaler_one_runner_fixture(runner_scaler: RunnerScaler) -> RunnerScaler: + runner_scaler.reconcile(1) + assert_runner_info(runner_scaler, online=1) + return runner_scaler + + +def set_one_runner_state( + runner_scaler: RunnerScaler, + 
github_state: GitHubRunnerState | None = None, + cloud_state: CloudRunnerState | None = None, +) -> RunnerScaler: + runner_dict = runner_scaler._manager._github.state.runners + assert len(runner_dict) == 1, "Test arrange failed: One runner should be present" + instance_id = list(runner_dict.keys())[0] + if github_state is not None: + runner_dict[instance_id].github_state = github_state + if cloud_state is not None: + runner_dict[instance_id].cloud_state = cloud_state + return runner_scaler + + def assert_runner_info( - runner_scaler: RunnerScaler, online: int = 0, offline: int = 0, unknown: int = 0 + runner_scaler: RunnerScaler, online: int = 0, busy: int = 0, offline: int = 0, unknown: int = 0 ) -> None: """Assert runner info contains a certain amount of runners. Args: runner_scaler: The RunnerScaler to get information from. online: The number of online runners to assert for. + busy: The number of busy runners to assert for. offline: The number of offline runners to assert for. unknown: The number of unknown runners to assert for. """ info = runner_scaler.get_runner_info() assert info["offline"] == offline assert info["online"] == online + assert info["busy"] == busy assert info["unknown"] == unknown assert isinstance(info["runners"], tuple) assert len(info["runners"]) == online + assert isinstance(info["busy_runners"], tuple) + assert len(info["busy_runners"]) == busy def test_get_no_runner(runner_scaler: RunnerScaler): @@ -136,3 +187,68 @@ def test_one_runner(runner_scaler: RunnerScaler): diff = runner_scaler.reconcile(1) assert diff == 1 assert_runner_info(runner_scaler, online=1) + + +def test_flush_busy_on_idle_runner(runner_scaler_one_runner: RunnerScaler): + """ + Arrange: A RunnerScaler with one idle runner. + Act: Run flush busy runner. + Assert: No runners. 
+ """ + runner_scaler = runner_scaler_one_runner + + runner_scaler.flush(flush_mode=FlushMode.FLUSH_BUSY) + assert_runner_info(runner_scaler, online=0) + + +def test_flush_busy_on_busy_runner( + runner_scaler_one_runner: RunnerScaler, +): + """ + Arrange: A RunnerScaler with one busy runner. + Act: Run flush busy runner. + Assert: No runners. + """ + runner_scaler = runner_scaler_one_runner + set_one_runner_state(runner_scaler, GitHubRunnerState.BUSY) + + runner_scaler.flush(flush_mode=FlushMode.FLUSH_BUSY) + assert_runner_info(runner_scaler, online=0) + + +def test_get_runner_one_busy_runner( + runner_scaler_one_runner: RunnerScaler, +): + """ + Arrange: A RunnerScaler with one busy runner. + Act: Run get runners. + Assert: One busy runner. + """ + runner_scaler = runner_scaler_one_runner + set_one_runner_state(runner_scaler, GitHubRunnerState.BUSY) + + assert_runner_info(runner_scaler=runner_scaler, online=1, busy=1) + + +def test_get_runner_offline_runner(runner_scaler_one_runner: RunnerScaler): + """ + Arrange: A RunnerScaler with one offline runner + Act: Run get runners. + Assert: One offline runner. + """ + runner_scaler = runner_scaler_one_runner + set_one_runner_state(runner_scaler, GitHubRunnerState.OFFLINE) + + assert_runner_info(runner_scaler=runner_scaler, offline=1) + + +def test_get_runner_unknown_runner(runner_scaler_one_runner: RunnerScaler): + """ + Arrange: A RunnerScaler with one unknown runner + Act: Run get runners. + Assert: One unknown runner. 
+ """ + runner_scaler = runner_scaler_one_runner + set_one_runner_state(runner_scaler, "UNKNOWN") + + assert_runner_info(runner_scaler=runner_scaler, unknown=1) From 7c5a78ed2bd52d66941fd0e9da44e9f4644731fb Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:42:44 +0800 Subject: [PATCH 234/278] Fix merge issues --- src-docs/openstack_cloud.md | 6 + ...penstack_cloud.openstack_runner_manager.md | 280 +++++++++ src/openstack_cloud/openstack_manager.py | 2 +- tests/unit/test_runner_manager.py | 589 ------------------ tests/unit/test_runner_scaler.py | 2 +- 5 files changed, 288 insertions(+), 591 deletions(-) delete mode 100644 tests/unit/test_runner_manager.py diff --git a/src-docs/openstack_cloud.md b/src-docs/openstack_cloud.md index ddcad41e0..51140a4b2 100644 --- a/src-docs/openstack_cloud.md +++ b/src-docs/openstack_cloud.md @@ -10,6 +10,12 @@ Module for managing Openstack cloud. - **openstack_cloud**: # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. +- **openstack_manager**: # Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +- **openstack_runner_manager**: # Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + --- diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index e69de29bb..f5172cde6 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -0,0 +1,280 @@ + + + + +# module `openstack_cloud.openstack_runner_manager` +Manager for self-hosted runner on OpenStack. 
+ +**Global Variables** +--------------- +- **BUILD_OPENSTACK_IMAGE_SCRIPT_FILENAME** +- **MAX_METRICS_FILE_SIZE** +- **RUNNER_STARTUP_PROCESS** +- **RUNNER_LISTENER_PROCESS** +- **RUNNER_WORKER_PROCESS** +- **CREATE_SERVER_TIMEOUT** + + +--- + + + +## class `OpenStackCloudConfig` +Configuration for OpenStack cloud authorisation information. + + + +**Attributes:** + + - `clouds_config`: The clouds.yaml. + - `cloud`: The cloud name to connect to. + + + +### method `__init__` + +```python +__init__(clouds_config: dict[str, dict], cloud: str) → None +``` + + + + + + + + + +--- + + + +## class `OpenStackServerConfig` +Configuration for OpenStack server. + + + +**Attributes:** + + - `image`: The image name for runners to use. + - `flavor`: The flavor name for runners to use. + - `network`: The network name for runners to use. + + + +### method `__init__` + +```python +__init__(image: str, flavor: str, network: str) → None +``` + + + + + + + + + +--- + + + +## class `OpenStackRunnerManager` +Manage self-hosted runner on OpenStack cloud. + + + +**Attributes:** + + - `name_prefix`: The name prefix of the runners created. + + + +### method `__init__` + +```python +__init__( + prefix: str, + cloud_config: OpenStackCloudConfig, + server_config: OpenStackServerConfig | None, + runner_config: GitHubRunnerConfig, + service_config: SupportServiceConfig +) → None +``` + +Construct the object. + + + +**Args:** + + - `prefix`: The prefix to runner name. + - `cloud_config`: The configuration for OpenStack authorisation. + - `server_config`: The configuration for creating OpenStack server. Unable to create runner if None. + - `runner_config`: The configuration for the runner. + - `service_config`: The configuration of supporting services of the runners. + + +--- + +#### property name_prefix + +The prefix of runner names. + + + +**Returns:** + The prefix of the runner names managed by this class. 
+ + + +--- + + + +### method `cleanup` + +```python +cleanup(remove_token: str) → Iterator[RunnerMetrics] +``` + +Cleanup runner and resource on the cloud. + + + +**Args:** + + - `remove_token`: The GitHub remove token. + + + +**Returns:** + Any metrics retrieved from cleanup runners. + +--- + + + +### method `create_runner` + +```python +create_runner(registration_token: str) → str +``` + +Create a self-hosted runner. + + + +**Args:** + + - `registration_token`: The GitHub registration token for registering runners. + + + +**Raises:** + + - `MissingServerConfigError`: Unable to create runner due to missing configuration. + - `RunnerCreateError`: Unable to create runner due to OpenStack issues. + + + +**Returns:** + Instance ID of the runner. + +--- + + + +### method `delete_runner` + +```python +delete_runner(instance_id: str, remove_token: str) → RunnerMetrics | None +``` + +Delete self-hosted runners. + + + +**Args:** + + - `instance_id`: The instance id of the runner to delete. + - `remove_token`: The GitHub remove token. + + + +**Returns:** + Any metrics collected during the deletion of the runner. + +--- + + + +### method `flush_runners` + +```python +flush_runners(remove_token: str, busy: bool = False) → Iterator[RunnerMetrics] +``` + +Remove idle and/or busy runners. + + + +**Args:** + remove_token: + - `busy`: If false, only idle runners are removed. If true, both idle and busy runners are removed. + + + +**Returns:** + Any metrics retrieved from flushed runners. + +--- + + + +### method `get_runner` + +```python +get_runner(instance_id: str) → CloudRunnerInstance | None +``` + +Get a self-hosted runner by instance id. + + + +**Args:** + + - `instance_id`: The instance id. + + + +**Returns:** + Information on the runner instance. + +--- + + + +### method `get_runners` + +```python +get_runners( + states: Optional[Sequence[CloudRunnerState]] = None +) → tuple[CloudRunnerInstance, ] +``` + +Get self-hosted runners by state. 
+ + + +**Args:** + + - `states`: Filter for the runners with these github states. If None all states will be included. + + + +**Returns:** + Information on the runner instances. + + diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index 04cc39add..e0ce47d4f 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -62,7 +62,7 @@ from repo_policy_compliance_client import RepoPolicyComplianceClient from runner_manager import IssuedMetricEventsStats from runner_manager_type import FlushMode, OpenstackRunnerManagerConfig -from runner_type import GithubPath, RunnerByHealth, RunnerGithubInfo +from runner_type import GitHubPath, RunnerGithubInfo, RunnerNameByHealth from utilities import retry, set_env_var logger = logging.getLogger(__name__) diff --git a/tests/unit/test_runner_manager.py b/tests/unit/test_runner_manager.py deleted file mode 100644 index 94d3373d4..000000000 --- a/tests/unit/test_runner_manager.py +++ /dev/null @@ -1,589 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Test cases of RunnerManager class.""" -import random -import secrets -from pathlib import Path -from unittest.mock import MagicMock, call - -import pytest -from pytest import LogCaptureFixture, MonkeyPatch - -import reactive.runner_manager -import shared_fs -from charm_state import ( - Arch, - CharmConfig, - CharmState, - GithubOrg, - GithubRepo, - ProxyConfig, - ReactiveConfig, - VirtualMachineResources, -) -from errors import IssueMetricEventError, RunnerBinaryError -from github_type import RunnerApplication -from metrics.events import Reconciliation, RunnerInstalled, RunnerStart, RunnerStop -from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME -from metrics.storage import MetricsStorage -from runner import Runner, RunnerStatus -from runner_manager import BUILD_IMAGE_SCRIPT_FILENAME, RunnerManager, RunnerManagerConfig -from runner_type import RunnerByHealth -from tests.unit.mock import TEST_BINARY, MockLxdImageManager - -FAKE_MONGODB_URI = "mongodb://example.com/db" - -IMAGE_NAME = "jammy" - -RUNNER_MANAGER_TIME_MODULE = "runner_manager.time.time" -TEST_PROXY_SERVER_URL = "http://proxy.server:1234" - - -@pytest.fixture(scope="function", name="token") -def token_fixture(): - return secrets.token_hex() - - -@pytest.fixture(scope="function", name="charm_config") -def charm_config_fixture(): - """Mock charm config instance.""" - mock_charm_config = MagicMock(spec=CharmConfig) - mock_charm_config.labels = ("test",) - return mock_charm_config - - -@pytest.fixture(scope="function", name="charm_state") -def charm_state_fixture(charm_config: MagicMock): - mock = MagicMock(spec=CharmState) - mock.is_metrics_logging_available = False - mock.arch = Arch.X64 - mock.ssh_debug_connections = None - mock.charm_config = charm_config - return mock - - -@pytest.fixture( - scope="function", - name="runner_manager", - params=[ - (GithubOrg("test_org", "test_group"), ProxyConfig()), - ( - GithubRepo("test_owner", "test_repo"), - ProxyConfig( - no_proxy="test_no_proxy", - 
http=TEST_PROXY_SERVER_URL, - https=TEST_PROXY_SERVER_URL, - use_aproxy=False, - ), - ), - ], -) -def runner_manager_fixture(request, tmp_path, monkeypatch, token, charm_state): - charm_state.proxy_config = request.param[1] - monkeypatch.setattr( - "runner_manager.RunnerManager.runner_bin_path", tmp_path / "mock_runner_binary" - ) - pool_path = tmp_path / "test_storage" - pool_path.mkdir(exist_ok=True) - - runner_manager = RunnerManager( - "test app", - "0", - RunnerManagerConfig( - path=request.param[0], - token=token, - image=IMAGE_NAME, - service_token=secrets.token_hex(16), - lxd_storage_path=pool_path, - charm_state=charm_state, - ), - ) - runner_manager.runner_bin_path.write_bytes(TEST_BINARY) - return runner_manager - - -@pytest.fixture(autouse=True, name="issue_event_mock") -def issue_event_mock_fixture(monkeypatch: MonkeyPatch) -> MagicMock: - """Mock the issue_event function.""" - issue_event_mock = MagicMock() - monkeypatch.setattr("metrics.events.issue_event", issue_event_mock) - return issue_event_mock - - -@pytest.fixture(autouse=True, name="shared_fs") -def shared_fs_fixture(tmp_path: Path, monkeypatch: MonkeyPatch) -> MagicMock: - """Mock the shared filesystem module.""" - shared_fs_mock = MagicMock(spec=shared_fs) - monkeypatch.setattr("runner_manager.shared_fs", shared_fs_mock) - monkeypatch.setattr("runner.shared_fs", shared_fs_mock) - return shared_fs_mock - - -@pytest.fixture(autouse=True, name="runner_metrics") -def runner_metrics_fixture(monkeypatch: MonkeyPatch) -> MagicMock: - """Mock the runner metrics module.""" - runner_metrics_mock = MagicMock() - monkeypatch.setattr("runner_manager.runner_metrics", runner_metrics_mock) - return runner_metrics_mock - - -@pytest.fixture(name="reactive_reconcile_mock") -def reactive_reconcile_fixture(monkeypatch: MonkeyPatch, tmp_path: Path) -> MagicMock: - """Mock the job class.""" - reconcile_mock = MagicMock(spec=reactive.runner_manager.reconcile) - 
monkeypatch.setattr("runner_manager.reactive_runner_manager.reconcile", reconcile_mock) - reconcile_mock.side_effect = lambda quantity, **kwargs: quantity - return reconcile_mock - - -@pytest.mark.parametrize( - "arch", - [ - pytest.param(Arch.ARM64), - pytest.param(Arch.X64), - ], -) -def test_get_latest_runner_bin_url(runner_manager: RunnerManager, arch: Arch, charm_state): - """ - arrange: Nothing. - act: Get runner bin url of existing binary. - assert: Correct mock data returned. - """ - charm_state.arch = arch - mock_gh_client = MagicMock() - app = RunnerApplication( - os="linux", - architecture=arch.value, - download_url=(download_url := "https://www.example.com"), - filename=(filename := "test_runner_binary"), - ) - mock_gh_client.get_runner_application.return_value = app - runner_manager._clients.github = mock_gh_client - - runner_bin = runner_manager.get_latest_runner_bin_url(os_name="linux") - assert runner_bin["os"] == "linux" - assert runner_bin["architecture"] == arch.value - assert runner_bin["download_url"] == download_url - assert runner_bin["filename"] == filename - - -def test_get_latest_runner_bin_url_missing_binary(runner_manager: RunnerManager): - """ - arrange: Given a mocked GH API client that does not return any runner binaries. - act: Get runner bin url of non-existing binary. - assert: Error related to runner bin raised. - """ - runner_manager._clients.github = MagicMock() - runner_manager._clients.github.get_runner_application.side_effect = RunnerBinaryError - - with pytest.raises(RunnerBinaryError): - runner_manager.get_latest_runner_bin_url(os_name="not_exist") - - -def test_update_runner_bin(runner_manager: RunnerManager): - """ - arrange: Remove the existing runner binary. - act: Update runner binary. - assert: Runner binary in runner manager is set. - """ - - class MockRequestLibResponse: - """A mock requests library response.""" - - def __init__(self, *args, **kwargs): - """Initialize successful requests library response. 
- - Args: - args: Placeholder for positional arguments. - kwargs: Placeholder for keyword arguments. - """ - self.status_code = 200 - - def iter_content(self, *args, **kwargs): - """Mock content iterator returning an iterator over a single test runner binary. - - Args: - args: Placeholder positional arguments. - kwargs: Placeholder keyword arguments. - - Returns: - An iterator over a single test runner binary. - """ - return iter([TEST_BINARY]) - - runner_manager.runner_bin_path.unlink(missing_ok=True) - - runner_manager.session.get = MockRequestLibResponse - runner_bin = runner_manager.get_latest_runner_bin_url(os_name="linux") - - runner_manager.update_runner_bin(runner_bin) - - assert runner_manager.runner_bin_path.read_bytes() == TEST_BINARY - - -def test_reconcile_zero_count(runner_manager: RunnerManager): - """ - arrange: Nothing. - act: Reconcile with the current amount of runner. - assert: No error should be raised. - """ - # Reconcile with no change to runner count. - delta = runner_manager.reconcile(0, VirtualMachineResources(2, "7GiB", "10Gib")) - - assert delta == 0 - - -def test_reconcile_create_runner(runner_manager: RunnerManager): - """ - arrange: Nothing. - act: Reconcile to create a runner. - assert: One runner should be created. - """ - # Create a runner. - delta = runner_manager.reconcile(1, VirtualMachineResources(2, "7GiB", "10Gib")) - - assert delta == 1 - - -def test_reconcile_remove_runner(runner_manager: RunnerManager): - """ - arrange: Create online runners. - act: Reconcile to remove a runner. - assert: One runner should be removed. - """ - - def mock_get_runners(): - """Create three mock runners. - - Returns: - Three mock runners. - """ - runners = [] - for _ in range(3): - # 0 is a mock runner id. - status = RunnerStatus(0, True, True, False) - runners.append(Runner(MagicMock(), MagicMock(), status, None)) - return runners - - # Create online runners. 
- runner_manager._get_runners = mock_get_runners - runner_manager._get_runner_health_states = lambda: RunnerByHealth( - ( - f"{runner_manager.instance_name}-0", - f"{runner_manager.instance_name}-1", - f"{runner_manager.instance_name}-2", - ), - (), - ) - - delta = runner_manager.reconcile(2, VirtualMachineResources(2, "7GiB", "10Gib")) - - assert delta == -1 - - -def test_reconcile(runner_manager: RunnerManager, tmp_path: Path): - """ - arrange: Setup one runner. - act: Reconcile with the current amount of runner. - assert: Still have one runner. - """ - runner_manager.reconcile(1, VirtualMachineResources(2, "7GiB", "10Gib")) - # Reconcile with no change to runner count. - runner_manager.reconcile(1, VirtualMachineResources(2, "7GiB", "10Gib")) - - assert len(runner_manager._get_runners()) == 1 - - -def test_empty_flush(runner_manager: RunnerManager): - """ - arrange: No initial runners. - act: Perform flushing with no runners. - assert: No error thrown. - """ - # Verifying the RunnerManager does not crash if flushing with no runners. - runner_manager.flush() - - -def test_flush(runner_manager: RunnerManager, tmp_path: Path): - """ - arrange: Create some runners. - act: Perform flushing. - assert: No runners. - """ - # Create a runner. - runner_manager.reconcile(2, VirtualMachineResources(2, "7GiB", "10Gib")) - - runner_manager.flush() - assert len(runner_manager._get_runners()) == 0 - - -def test_reconcile_issues_runner_installed_event( - runner_manager: RunnerManager, - monkeypatch: MonkeyPatch, - issue_event_mock: MagicMock, - charm_state: MagicMock, -): - """ - arrange: Enable issuing of metrics and mock timestamps. - act: Reconcile to create a runner. - assert: The expected event is issued. 
- """ - charm_state.is_metrics_logging_available = True - t_mock = MagicMock(return_value=12345) - monkeypatch.setattr(RUNNER_MANAGER_TIME_MODULE, t_mock) - - runner_manager.reconcile(1, VirtualMachineResources(2, "7GiB", "10Gib")) - - issue_event_mock.assert_has_calls( - [call(event=RunnerInstalled(timestamp=12345, flavor=runner_manager.app_name, duration=0))] - ) - - -def test_reconcile_issues_no_runner_installed_event_if_metrics_disabled( - runner_manager: RunnerManager, issue_event_mock: MagicMock, charm_state: MagicMock -): - """ - arrange: Disable issuing of metrics. - act: Reconcile to create a runner. - assert: The expected event is not issued. - """ - charm_state.is_metrics_logging_available = False - - runner_manager.reconcile(1, VirtualMachineResources(2, "7GiB", "10Gib")) - - issue_event_mock.assert_not_called() - - -def test_reconcile_error_on_issue_event_is_ignored( - runner_manager: RunnerManager, - issue_event_mock: MagicMock, - charm_state: MagicMock, -): - """ - arrange: Enable issuing of metrics and mock the metric issuing to raise an expected error. - act: Reconcile. - assert: No error is raised. - """ - charm_state.is_metrics_logging_available = True - - issue_event_mock.side_effect = IssueMetricEventError("test error") - - delta = runner_manager.reconcile(1, VirtualMachineResources(2, "7GiB", "10Gib")) - - assert delta == 1 - - -def test_reconcile_issues_reconciliation_metric_event( - runner_manager: RunnerManager, - monkeypatch: MonkeyPatch, - issue_event_mock: MagicMock, - runner_metrics: MagicMock, - charm_state: MagicMock, -): - """ - arrange: \ - - Enable issuing of metrics \ - - Mock timestamps \ - - Mock the result of runner_metrics.issue_event to contain 2 RunnerStart and 1 RunnerStop \ - events, meaning one runner was active and one crashed. \ - - Create two online runners , one active and one idle. - act: Reconcile. - assert: The expected event is issued. We expect two idle runners and one crashed runner - to be reported. 
- """ - charm_state.is_metrics_logging_available = True - t_mock = MagicMock(return_value=12345) - monkeypatch.setattr(RUNNER_MANAGER_TIME_MODULE, t_mock) - runner_metrics.extract.return_value = (MagicMock() for _ in range(2)) - runner_metrics.issue_events.side_effect = [{RunnerStart, RunnerStop}, {RunnerStart}] - - online_idle_runner_name = f"{runner_manager.instance_name}-0" - offline_idle_runner_name = f"{runner_manager.instance_name}-1" - active_runner_name = f"{runner_manager.instance_name}-2" - - def mock_get_runners(): - """Create three mock runners where one is busy. - - Returns: - Mock runners with one busy runner. - """ - runners = [] - - online_idle_runner = RunnerStatus(runner_id=0, exist=True, online=True, busy=False) - offline_idle_runner = RunnerStatus(runner_id=1, exist=True, online=False, busy=False) - active_runner = RunnerStatus(runner_id=2, exist=True, online=True, busy=True) - - for runner_status, runner_config in zip( - (online_idle_runner, offline_idle_runner, active_runner), - (online_idle_runner_name, offline_idle_runner_name, active_runner_name), - ): - config = MagicMock() - config.name = runner_config - runners.append( - Runner( - clients=MagicMock(), - runner_config=config, - runner_status=runner_status, - instance=None, - ) - ) - - return runners - - # Create online runners. 
- runner_manager._get_runners = mock_get_runners - runner_manager._get_runner_health_states = lambda: RunnerByHealth( - healthy=( - online_idle_runner_name, - offline_idle_runner_name, - active_runner_name, - ), - unhealthy=(), - ) - - runner_manager.reconcile( - quantity=random.randint(0, 5), resources=VirtualMachineResources(2, "7GiB", "10Gib") - ) - - issue_event_mock.assert_any_call( - event=Reconciliation( - timestamp=12345, - flavor=runner_manager.app_name, - crashed_runners=1, - idle_runners=2, - duration=0, - ) - ) - - -def test_reconcile_places_timestamp_in_newly_created_runner( - runner_manager: RunnerManager, - monkeypatch: MonkeyPatch, - shared_fs: MagicMock, - tmp_path: Path, - charm_state: MagicMock, -): - """ - arrange: Enable issuing of metrics, mock timestamps and create the directory for the shared\ - filesystem. - act: Reconcile to create a runner. - assert: The expected timestamp is placed in the shared filesystem. - """ - charm_state.is_metrics_logging_available = True - t_mock = MagicMock(return_value=12345) - monkeypatch.setattr(RUNNER_MANAGER_TIME_MODULE, t_mock) - runner_shared_fs = tmp_path / "runner_fs" - runner_shared_fs.mkdir() - fs = MetricsStorage(path=runner_shared_fs, runner_name="test_runner") - shared_fs.get.return_value = fs - - runner_manager.reconcile(1, VirtualMachineResources(2, "7GiB", "10Gib")) - - assert (fs.path / RUNNER_INSTALLED_TS_FILE_NAME).exists() - assert (fs.path / RUNNER_INSTALLED_TS_FILE_NAME).read_text() == "12345" - - -def test_reconcile_error_on_placing_timestamp_is_ignored( - runner_manager: RunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock -): - """ - arrange: Enable issuing of metrics and do not create the directory for the shared filesystem\ - in order to let a FileNotFoundError to be raised inside the RunnerManager. - act: Reconcile to create a runner. - assert: No exception is raised. 
- """ - charm_state.is_metrics_logging_available = True - runner_shared_fs = tmp_path / "runner_fs" - fs = MetricsStorage(path=runner_shared_fs, runner_name="test_runner") - shared_fs.get.return_value = fs - - runner_manager.reconcile(1, VirtualMachineResources(2, "7GiB", "10Gib")) - - assert not (fs.path / RUNNER_INSTALLED_TS_FILE_NAME).exists() - - -def test_reconcile_places_no_timestamp_in_newly_created_runner_if_metrics_disabled( - runner_manager: RunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock -): - """ - arrange: Disable issuing of metrics, mock timestamps and the shared filesystem module. - act: Reconcile to create a runner. - assert: No timestamp is placed in the shared filesystem. - """ - charm_state.is_metrics_logging_available = False - - fs = MetricsStorage(path=tmp_path, runner_name="test_runner") - shared_fs.get.return_value = fs - - runner_manager.reconcile(1, VirtualMachineResources(2, "7GiB", "10Gib")) - - assert not (fs.path / RUNNER_INSTALLED_TS_FILE_NAME).exists() - - -def test_reconcile_reactive_mode( - runner_manager: RunnerManager, - reactive_reconcile_mock: MagicMock, - caplog: LogCaptureFixture, -): - """ - arrange: Enable reactive mode and mock the job class to return a job. - act: Call reconcile with a random quantity n. - assert: The mocked job is picked up n times and the expected log message is present. - """ - count = random.randint(0, 5) - runner_manager.config.reactive_config = ReactiveConfig(mq_uri=FAKE_MONGODB_URI) - actual_count = runner_manager.reconcile(count, VirtualMachineResources(2, "7GiB", "10Gib")) - - assert actual_count == count - reactive_reconcile_mock.assert_called_with( - quantity=count, mq_uri=FAKE_MONGODB_URI, queue_name=runner_manager.app_name - ) - - -def test_schedule_build_runner_image( - runner_manager: RunnerManager, - tmp_path: Path, - charm_state: CharmState, - monkeypatch: MonkeyPatch, -): - """ - arrange: Mock the cron path and the randint function. 
- act: Schedule the build runner image. - assert: The cron file is created with the expected content. - """ - runner_manager.cron_path = tmp_path / "cron" - runner_manager.cron_path.mkdir() - monkeypatch.setattr(random, "randint", MagicMock(spec=random.randint, return_value=4)) - - runner_manager.schedule_build_runner_image() - - cronfile = runner_manager.cron_path / "build-runner-image" - http = charm_state.proxy_config.http or "''" - https = charm_state.proxy_config.https or "''" - no_proxy = charm_state.proxy_config.no_proxy or "''" - - cmd = f"/usr/bin/bash {BUILD_IMAGE_SCRIPT_FILENAME.absolute()} {http} {https} {no_proxy}" - - assert cronfile.exists() - assert cronfile.read_text() == f"4 4,10,16,22 * * * ubuntu {cmd} jammy\n" - - -def test_has_runner_image(runner_manager: RunnerManager): - """ - arrange: Multiple setups. - 1. no runner image exists. - 2. runner image with wrong name exists. - 3. runner image with correct name exists. - act: Check if runner image exists. - assert: - 1 and 2. False is returned. - 3. True is returned. 
- """ - assert not runner_manager.has_runner_image() - - runner_manager._clients.lxd.images = MockLxdImageManager({"hirsute"}) - assert not runner_manager.has_runner_image() - - runner_manager._clients.lxd.images = MockLxdImageManager({IMAGE_NAME}) - assert runner_manager.has_runner_image() diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py index 476885954..5bb60ccb6 100644 --- a/tests/unit/test_runner_scaler.py +++ b/tests/unit/test_runner_scaler.py @@ -250,5 +250,5 @@ def test_get_runner_unknown_runner(runner_scaler_one_runner: RunnerScaler): """ runner_scaler = runner_scaler_one_runner set_one_runner_state(runner_scaler, "UNKNOWN") - + assert_runner_info(runner_scaler=runner_scaler, unknown=1) From a473b688ca5a343d88dbea9cb7dc87cdba010303 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 27 Aug 2024 10:00:22 +0800 Subject: [PATCH 235/278] Fix states in get_runners methods --- src-docs/openstack_cloud.openstack_runner_manager.md | 6 +++--- src/manager/github_runner_manager.py | 8 ++++++-- src/openstack_cloud/openstack_runner_manager.py | 4 +++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index f5172cde6..95bf05c5d 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -131,7 +131,7 @@ The prefix of runner names. --- - + ### method `cleanup` @@ -184,7 +184,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -208,7 +208,7 @@ Delete self-hosted runners. --- - + ### method `flush_runners` diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 8f00525b0..686976d84 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -72,12 +72,16 @@ def get_runners( Information on the runners. 
""" runner_list = self.github.get_runner_github_info(self._path) + runner_list = [runner for runner in runner_list if runner.name.startswith(self._prefix)] + + if states is None: + return tuple(runner_list) + state_set = set(states) return tuple( runner for runner in runner_list - if runner.name.startswith(self._prefix) - and GitHubRunnerManager._is_runner_in_state(runner, state_set) + if GitHubRunnerManager._is_runner_in_state(runner, state_set) ) def delete_runners(self, states: Iterable[GitHubRunnerState] | None = None) -> None: diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 39475ee69..c84f09d2e 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -247,7 +247,9 @@ def get_runners( ] if states is None: return tuple(instance_list) - return tuple(instance for instance in instance_list if instance.state in states) + + state_set = set(states) + return tuple(instance for instance in instance_list if instance.state in state_set) def delete_runner( self, instance_id: InstanceId, remove_token: str From 895206eaaf8cba527b99d23c488daa12013a9726 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 27 Aug 2024 10:49:04 +0800 Subject: [PATCH 236/278] Add docstring for unit test mocks --- tests/unit/mock_runner_managers.py | 37 ++++++++++++++++++++++++++++-- tests/unit/test_runner_scaler.py | 10 ++++++-- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/tests/unit/mock_runner_managers.py b/tests/unit/mock_runner_managers.py index 269931bbd..83f88e73e 100644 --- a/tests/unit/mock_runner_managers.py +++ b/tests/unit/mock_runner_managers.py @@ -20,7 +20,15 @@ @dataclass class MockRunner: - """Mock of a runner.""" + """Mock of a runner. + + Attributes: + name: The name of the runner. + instance_id: The instance id of the runner. + cloud_state: The cloud state of the runner. 
+ github_state: The github state of the runner. + health: The health state of the runner. + """ name: str instance_id: InstanceId @@ -41,7 +49,11 @@ def __init__(self, name: str): self.health = True def to_cloud_runner(self) -> CloudRunnerInstance: - """Construct CloudRunnerInstance from this object.""" + """Construct CloudRunnerInstance from this object. + + Returns: + The CloudRunnerInstance instance. + """ return CloudRunnerInstance( name=self.name, instance_id=self.instance_id, @@ -55,6 +67,9 @@ class SharedMockRunnerManagerState: """State shared by mock runner managers. For sharing the mock runner states between MockCloudRunnerManager and MockGitHubRunnerManager. + + Attributes: + runners: The runners. """ runners: dict[InstanceId, MockRunner] @@ -89,6 +104,9 @@ def create_runner(self, registration_token: str) -> InstanceId: Args: registration_token: The GitHub registration token for registering runners. + + Returns: + The instance id of the runner created. """ name = f"{self.name_prefix}-{secrets.token_hex(6)}" runner = MockRunner(name) @@ -100,6 +118,9 @@ def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: Args: instance_id: The instance id. + + Returns: + The runner instance if found else None. """ runner = self.state.runners.get(instance_id, None) if runner is not None: @@ -114,6 +135,9 @@ def get_runners( Args: states: Filter for the runners with these github states. If None all states will be included. + + Returns: + The list of runner instances. """ if states is None: states = [member.value for member in CloudRunnerState] @@ -131,6 +155,9 @@ def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMet Args: instance_id: The instance id of the runner to delete. remove_token: The GitHub remove token. + + Returns: + Any runner metrics produced during deletion. 
""" self.state.runners.pop(instance_id, None) return iter([]) @@ -142,6 +169,9 @@ def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[Runne remove_token: The GitHub remove token for removing runners. busy: If false, only idle runners are removed. If true, both idle and busy runners are removed. + + Returns: + Any runner metrics produced during flushing. """ # No supporting metrics in the mocks. if busy: @@ -161,6 +191,9 @@ def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: Args: remove_token: The GitHub remove token for removing runners. + + Returns: + Any runner metrics produced during cleanup. """ # No supporting metrics in the mocks. return iter([]) diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py index 5bb60ccb6..5f30c11ab 100644 --- a/tests/unit/test_runner_scaler.py +++ b/tests/unit/test_runner_scaler.py @@ -81,7 +81,14 @@ def set_one_runner_state( runner_scaler: RunnerScaler, github_state: GitHubRunnerState | None = None, cloud_state: CloudRunnerState | None = None, -) -> RunnerScaler: +): + """Set the runner state for a RunnerScaler with one runner. + + Args: + runner_scaler: The RunnerScaler instance to modify. + github_state: The github state to set the runner. + cloud_state: The cloud state to set the runner. 
+ """ runner_dict = runner_scaler._manager._github.state.runners assert len(runner_dict) == 1, "Test arrange failed: One runner should be present" instance_id = list(runner_dict.keys())[0] @@ -89,7 +96,6 @@ def set_one_runner_state( runner_dict[instance_id].github_state = github_state if cloud_state is not None: runner_dict[instance_id].cloud_state = cloud_state - return runner_scaler def assert_runner_info( From 81815c4634d6530f566f1a9e8772cba5b69015a0 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 27 Aug 2024 10:54:12 +0800 Subject: [PATCH 237/278] Fix construction of repo-policy-compliance from config --- src/charm.py | 3 +-- src/manager/cloud_runner_manager.py | 8 +++----- src/openstack_cloud/openstack_runner_manager.py | 5 +++-- tests/integration/test_runner_manager_openstack.py | 3 +-- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/charm.py b/src/charm.py index f842c591e..b35f46919 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1243,8 +1243,7 @@ def _get_runner_scaler( proxy_config=state.proxy_config, dockerhub_mirror=state.charm_config.dockerhub_mirror, ssh_debug_connections=state.ssh_debug_connections, - repo_policy_url=state.charm_config.repo_policy_compliance.url, - repo_policy_token=state.charm_config.repo_policy_compliance.token, + repo_policy_compliance=state.charm_config.repo_policy_compliance, ) openstack_runner_manager = OpenStackRunnerManager( app_name, diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index b2624199d..aff75ed41 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -9,7 +9,7 @@ from enum import Enum, auto from typing import Iterator, Sequence, Tuple -from charm_state import GitHubPath, ProxyConfig, SSHDebugConnection +from charm_state import GitHubPath, ProxyConfig, RepoPolicyComplianceConfig, SSHDebugConnection from metrics.runner import RunnerMetrics logger = logging.getLogger(__name__) 
@@ -110,15 +110,13 @@ class SupportServiceConfig: proxy_config: The proxy configuration. dockerhub_mirror: The dockerhub mirror to use for runners. ssh_debug_connections: The information on the ssh debug services. - repo_policy_url: The URL of the repo policy service. - repo_policy_token: The token to access the repo policy service. + repo_policy_compliance: The configuration of the repo policy compliance service. """ proxy_config: ProxyConfig | None dockerhub_mirror: str | None ssh_debug_connections: list[SSHDebugConnection] | None - repo_policy_url: str | None - repo_policy_token: str | None + repo_policy_compliance: RepoPolicyComplianceConfig | None @dataclass diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index c84f09d2e..3a9acd4a0 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -460,9 +460,10 @@ def _get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | Non Returns: The repo policy compliance client. 
""" - if self._service_config.repo_policy_url and self._service_config.repo_policy_token: + if self._service_config.repo_policy_compliance is not None: return RepoPolicyComplianceClient( - self._service_config.repo_policy_url, self._service_config.repo_policy_token + self._service_config.repo_policy_compliance.url, + self._service_config.repo_policy_compliance.token, ) return None diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index b12d42414..089888b94 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -122,8 +122,7 @@ async def openstack_runner_manager_fixture( proxy_config=proxy_config, dockerhub_mirror=None, ssh_debug_connections=None, - repo_policy_url=None, - repo_policy_token=None, + repo_policy_compliance=None, ) return OpenStackRunnerManager( app_name, cloud_config, server_config, runner_config, service_config From 4012ed02aad0649b53ad96196541fc67af53a4d5 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 27 Aug 2024 12:53:58 +0800 Subject: [PATCH 238/278] Fix get_runners action output --- src/charm.py | 12 +++++++++++- src/manager/runner_scaler.py | 4 +++- tests/unit/test_runner_scaler.py | 16 ++++++++-------- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/charm.py b/src/charm.py index b35f46919..a974ceb56 100755 --- a/src/charm.py +++ b/src/charm.py @@ -765,7 +765,17 @@ def _on_check_runners_action(self, event: ActionEvent) -> None: if state.instance_type == InstanceType.OPENSTACK: runner_scaler = self._get_runner_scaler(state) - event.set_results(runner_scaler.get_runner_info()) + info = runner_scaler.get_runner_info() + event.set_results( + { + "online": info.online, + "busy": info.busy, + "offline": info.offline, + "unknown": info.unknown, + "runners": info.runners, + "busy_runners": info.busy_runners, + } + ) return runner_manager = 
self._get_runner_manager(state) diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index c7cca69a3..606a3da08 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -5,6 +5,7 @@ import logging import time +from dataclasses import dataclass from typing import TypedDict from errors import IssueMetricEventError, MissingServerConfigError @@ -16,7 +17,8 @@ logger = logging.getLogger(__name__) -class RunnerInfo(TypedDict): +@dataclass +class RunnerInfo: """Information on the runners. Attributes: diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py index 5f30c11ab..c7a14a431 100644 --- a/tests/unit/test_runner_scaler.py +++ b/tests/unit/test_runner_scaler.py @@ -111,14 +111,14 @@ def assert_runner_info( unknown: The number of unknown runners to assert for. """ info = runner_scaler.get_runner_info() - assert info["offline"] == offline - assert info["online"] == online - assert info["busy"] == busy - assert info["unknown"] == unknown - assert isinstance(info["runners"], tuple) - assert len(info["runners"]) == online - assert isinstance(info["busy_runners"], tuple) - assert len(info["busy_runners"]) == busy + assert info.offline == offline + assert info.online == online + assert info.busy == busy + assert info.unknown == unknown + assert isinstance(info.runners, tuple) + assert len(info.runners) == online + assert isinstance(info.busy_runners, tuple) + assert len(info.busy_runners) == busy def test_get_no_runner(runner_scaler: RunnerScaler): From 782d145afec119702a34a3f8392377fe1beed390 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 27 Aug 2024 13:31:02 +0800 Subject: [PATCH 239/278] Fix the lints --- src/manager/runner_manager.py | 4 +-- src/manager/runner_scaler.py | 1 - tests/unit/mock_runner_managers.py | 44 +++++++++++++++++++++++++++++- tests/unit/test_runner_scaler.py | 4 +-- 4 files changed, 47 insertions(+), 6 deletions(-) diff --git 
a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 03639a306..96d3acfed 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from enum import Enum, auto from multiprocessing import Pool -from typing import Iterable, Iterator, Sequence, Type, cast +from typing import Iterator, Sequence, Type, cast from charm_state import GitHubPath from errors import GithubMetricsError, RunnerCreateError @@ -242,7 +242,7 @@ def cleanup(self) -> IssuedMetricEventsStats: @staticmethod def _spawn_runners( - create_runner_args: Iterable["RunnerManager._CreateRunnerArgs"], + create_runner_args: Sequence["RunnerManager._CreateRunnerArgs"], ) -> tuple[InstanceId, ...]: """Parallel spawn of runners. diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index 606a3da08..5216ec4f3 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -6,7 +6,6 @@ import logging import time from dataclasses import dataclass -from typing import TypedDict from errors import IssueMetricEventError, MissingServerConfigError from manager.cloud_runner_manager import HealthState diff --git a/tests/unit/mock_runner_managers.py b/tests/unit/mock_runner_managers.py index 83f88e73e..a737cbde0 100644 --- a/tests/unit/mock_runner_managers.py +++ b/tests/unit/mock_runner_managers.py @@ -80,9 +80,14 @@ def __init__(self): class MockCloudRunnerManager(CloudRunnerManager): - """Mock for CloudRunnerManager. + """Mock of CloudRunnerManager. Metrics is not supported in this mock. + + Attributes: + name_prefix: The naming prefix for runners managed. + prefix: The naming prefix for runners managed. + state: The shared state between mocks runner managers. """ def __init__(self, state: SharedMockRunnerManagerState): @@ -200,21 +205,53 @@ def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: class MockGitHubRunnerManager: + """Mock of GitHubRunnerManager. 
+ + Attributes: + name_prefix: The naming prefix for runner managed. + state: The shared state between mock runner managers. + path: The GitHub path to register the runners under. + """ def __init__(self, name_prefix: str, path: GitHubPath, state: SharedMockRunnerManagerState): + """Construct the object. + + Args: + name_prefix: The naming prefix for runner managed. + path: The GitHub path to register the runners under. + state: The shared state between mock runner managers. + """ self.name_prefix = name_prefix self.state = state self.path = path def get_registration_token(self) -> str: + """Get the registration token for registering runners on GitHub. + + Returns: + The registration token. + """ return "mock_registration_token" def get_removal_token(self) -> str: + """Get the remove token for removing runners on GitHub. + + Returns: + The remove token. + """ return "mock_remove_token" def get_runners( self, github_states: Iterable[GitHubRunnerState] | None = None ) -> tuple[SelfHostedRunner, ...]: + """Get the runners. + + Args: + github_states: The states to filter for. + + Returns: + List of runners. + """ if github_states is None: github_states = [member.value for member in GitHubRunnerState] @@ -237,6 +274,11 @@ def get_runners( ) def delete_runners(self, states: Iterable[GitHubRunnerState]) -> None: + """Delete the runners. + + Args: + states: The states to filter the runners to delete. + """ github_states = set(states) self.state.runners = { instance_id: runner diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py index c7a14a431..8eec56033 100644 --- a/tests/unit/test_runner_scaler.py +++ b/tests/unit/test_runner_scaler.py @@ -238,7 +238,7 @@ def test_get_runner_one_busy_runner( def test_get_runner_offline_runner(runner_scaler_one_runner: RunnerScaler): """ - Arrange: A RunnerScaler with one offline runner + Arrange: A RunnerScaler with one offline runner. Act: Run get runners. Assert: One offline runner. 
""" @@ -250,7 +250,7 @@ def test_get_runner_offline_runner(runner_scaler_one_runner: RunnerScaler): def test_get_runner_unknown_runner(runner_scaler_one_runner: RunnerScaler): """ - Arrange: A RunnerScaler with one offline runner + Arrange: A RunnerScaler with one offline runner. Act: Run get runners. Assert: One offline runner. """ From 40bec794e626d26ea0d23740ce962dfe7657c749 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 27 Aug 2024 18:36:21 +0800 Subject: [PATCH 240/278] Fix a naming issue --- src/charm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/charm.py b/src/charm.py index a974ceb56..a19459b89 100755 --- a/src/charm.py +++ b/src/charm.py @@ -773,7 +773,7 @@ def _on_check_runners_action(self, event: ActionEvent) -> None: "offline": info.offline, "unknown": info.unknown, "runners": info.runners, - "busy_runners": info.busy_runners, + "busy-runners": info.busy_runners, } ) return From e036d198cba7094424ba059bccf2c74ccf036026 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 27 Aug 2024 18:41:24 +0800 Subject: [PATCH 241/278] Fix naming prefix of runner --- src/charm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/charm.py b/src/charm.py index a19459b89..198e7d2e9 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1223,8 +1223,6 @@ def _get_runner_scaler( if path is None: path = state.charm_config.path - app_name, _ = self.unit.name.rsplit("/", 1) - clouds = list(state.charm_config.openstack_clouds_yaml["clouds"].keys()) if len(clouds) > 1: logger.warning( @@ -1255,8 +1253,9 @@ def _get_runner_scaler( ssh_debug_connections=state.ssh_debug_connections, repo_policy_compliance=state.charm_config.repo_policy_compliance, ) + # The prefix is set to f"{application_name}-{unit number}" openstack_runner_manager = OpenStackRunnerManager( - app_name, + prefix=self.unit.name.replace("/", "-"), cloud_config=cloud_config, 
server_config=server_config, runner_config=runner_config, From c1c0ed99048223f02693b7258a8f09a360ad29b7 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 28 Aug 2024 08:57:35 +0800 Subject: [PATCH 242/278] Improve unit test --- src/charm.py | 4 +++- src/manager/runner_manager.py | 12 ++++++++++-- tests/unit/mock_runner_managers.py | 19 ++++++++++++++----- tests/unit/test_runner_scaler.py | 11 ++++++++++- 4 files changed, 37 insertions(+), 9 deletions(-) diff --git a/src/charm.py b/src/charm.py index 198e7d2e9..5adcd8b01 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1266,7 +1266,9 @@ def _get_runner_scaler( path=path, ) runner_manager = RunnerManager( - cloud_runner_manager=openstack_runner_manager, config=runner_manager_config + manager_name=self.app.name, + cloud_runner_manager=openstack_runner_manager, + config=runner_manager_config, ) return RunnerScaler(runner_manager=runner_manager) diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 96d3acfed..72ded77fb 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -93,16 +93,24 @@ class RunnerManager: """Manage the runners. Attributes: + manager_name: A name to identify this manager. name_prefix: The name prefix of the runners. """ - def __init__(self, cloud_runner_manager: CloudRunnerManager, config: RunnerManagerConfig): + def __init__( + self, + manager_name: str, + cloud_runner_manager: CloudRunnerManager, + config: RunnerManagerConfig, + ): """Construct the object. Args: + manager_name: A name to identify this manager. cloud_runner_manager: For managing the cloud instance of the runner. config: Configuration of this class. 
""" + self.manager_name = manager_name self._config = config self._cloud = cloud_runner_manager self.name_prefix = self._cloud.name_prefix @@ -321,7 +329,7 @@ def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetri issued_events = runner_metrics.issue_events( runner_metrics=extracted_metrics, job_metrics=job_metrics, - flavor=self.name_prefix, + flavor=self.manager_name, ) for event_type in issued_events: diff --git a/tests/unit/mock_runner_managers.py b/tests/unit/mock_runner_managers.py index a737cbde0..81c334f37 100644 --- a/tests/unit/mock_runner_managers.py +++ b/tests/unit/mock_runner_managers.py @@ -5,8 +5,10 @@ import secrets from dataclasses import dataclass from typing import Iterable, Iterator, Sequence +from unittest.mock import MagicMock from charm_state import GitHubPath +from github_client import GithubClient from github_type import GitHubRunnerStatus, SelfHostedRunner from manager.cloud_runner_manager import ( CloudRunnerInstance, @@ -15,7 +17,9 @@ InstanceId, ) from manager.github_runner_manager import GitHubRunnerState +from metrics.events import RunnerStop from metrics.runner import RunnerMetrics +from tests.unit.mock import MockGhapiClient @dataclass @@ -164,7 +168,9 @@ def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMet Returns: Any runner metrics produced during deletion. """ - self.state.runners.pop(instance_id, None) + runner = self.state.runners.pop(instance_id, None) + if runner is not None: + return iter([MagicMock()]) return iter([]) def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[RunnerMetrics]: @@ -178,7 +184,7 @@ def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[Runne Returns: Any runner metrics produced during flushing. """ - # No supporting metrics in the mocks. 
+ num = len(self.state.runners) if busy: self.state.runners = {} else: @@ -187,7 +193,7 @@ def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[Runne for instance_id, runner in self.state.runners.items() if runner.github_state == GitHubRunnerState.BUSY } - return iter([]) + return iter([MagicMock()]) def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: """Cleanup runner and resource on the cloud. @@ -200,14 +206,15 @@ def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: Returns: Any runner metrics produced during cleanup. """ - # No supporting metrics in the mocks. - return iter([]) + # Do nothing in mocks. + return iter([MagicMock()]) class MockGitHubRunnerManager: """Mock of GitHubRunnerManager. Attributes: + github: The GitHub client. name_prefix: The naming prefix for runner managed. state: The shared state between mock runner managers. path: The GitHub path to register the runners under. @@ -221,6 +228,8 @@ def __init__(self, name_prefix: str, path: GitHubPath, state: SharedMockRunnerMa path: The GitHub path to register the runners under. state: The shared state between mock runner managers. """ + self.github = GithubClient("mock_token") + self.github._client = MockGhapiClient("mock_token") self.name_prefix = name_prefix self.state = state self.path = path diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py index 8eec56033..7312a69e4 100644 --- a/tests/unit/test_runner_scaler.py +++ b/tests/unit/test_runner_scaler.py @@ -3,6 +3,7 @@ from typing import Iterable +from unittest.mock import MagicMock import pytest @@ -59,8 +60,16 @@ def runner_manager_fixture( monkeypatch.setattr( "manager.runner_manager.RunnerManager._spawn_runners", mock_runner_manager_spawn_runners ) + # Patch out the metrics, as metrics has their own tests. 
+ monkeypatch.setattr( + "manager.runner_manager.github_metrics.job", MagicMock() + ) + monkeypatch.setattr( + "manager.runner_manager.runner_metrics.issue_events", MagicMock() + ) + config = RunnerManagerConfig("mock_token", github_path) - runner_manager = RunnerManager(mock_cloud, config) + runner_manager = RunnerManager("mock_runners", mock_cloud, config) runner_manager._github = mock_github return runner_manager From 14a787a63ca2237bd0674f2d1c23775ff887e9ee Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 28 Aug 2024 09:12:51 +0800 Subject: [PATCH 243/278] Remove the old OpenstackRunnerManager --- src-docs/openstack_cloud.md | 3 - ...penstack_cloud.openstack_runner_manager.md | 20 +- src/openstack_cloud/openstack_manager.py | 1598 ----------------- .../openstack_runner_manager.py | 13 +- tests/unit/mock_runner_managers.py | 2 - tests/unit/test_openstack_manager.py | 1200 ------------- tests/unit/test_runner_scaler.py | 8 +- 7 files changed, 20 insertions(+), 2824 deletions(-) delete mode 100644 src/openstack_cloud/openstack_manager.py delete mode 100644 tests/unit/test_openstack_manager.py diff --git a/src-docs/openstack_cloud.md b/src-docs/openstack_cloud.md index 51140a4b2..34aa3f26f 100644 --- a/src-docs/openstack_cloud.md +++ b/src-docs/openstack_cloud.md @@ -10,9 +10,6 @@ Module for managing Openstack cloud. - **openstack_cloud**: # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. -- **openstack_manager**: # Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - - **openstack_runner_manager**: # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. 
diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 95bf05c5d..81a35247b 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -17,7 +17,7 @@ Manager for self-hosted runner on OpenStack. --- - + ## class `OpenStackCloudConfig` Configuration for OpenStack cloud authorisation information. @@ -47,7 +47,7 @@ __init__(clouds_config: dict[str, dict], cloud: str) → None --- - + ## class `OpenStackServerConfig` Configuration for OpenStack server. @@ -78,7 +78,7 @@ __init__(image: str, flavor: str, network: str) → None --- - + ## class `OpenStackRunnerManager` Manage self-hosted runner on OpenStack cloud. @@ -89,7 +89,7 @@ Manage self-hosted runner on OpenStack cloud. - `name_prefix`: The name prefix of the runners created. - + ### method `__init__` @@ -131,7 +131,7 @@ The prefix of runner names. --- - + ### method `cleanup` @@ -154,7 +154,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -184,7 +184,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -208,7 +208,7 @@ Delete self-hosted runners. --- - + ### method `flush_runners` @@ -231,7 +231,7 @@ Remove idle and/or busy runners. --- - + ### method `get_runner` @@ -254,7 +254,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py deleted file mode 100644 index e0ce47d4f..000000000 --- a/src/openstack_cloud/openstack_manager.py +++ /dev/null @@ -1,1598 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -# TODO: 2024-04-11 The module contains too many lines which are scheduled for refactoring. -# pylint: disable=too-many-lines - -# TODO: 2024-04-22 The module contains duplicate code which is scheduled for refactoring. 
-# Lines related to issuing metrics are duplicated: -# ==openstack_cloud.openstack_manager:[1320:1337] -# ==runner_manager:[383:413] -# ==openstack_cloud.openstack_manager:[1283:1314] -# ==runner_manager:[339:368] - -# pylint: disable=duplicate-code - -"""Module for handling interactions with OpenStack.""" -import logging -import secrets -import shutil -import time -from contextlib import contextmanager -from dataclasses import dataclass -from datetime import datetime -from multiprocessing import Pool -from pathlib import Path -from typing import Iterable, Iterator, Literal, Optional, cast - -import invoke -import jinja2 -import openstack -import openstack.connection -import openstack.exceptions -import openstack.image.v2.image -import paramiko -from fabric import Connection as SSHConnection -from openstack.compute.v2.server import Server -from openstack.connection import Connection as OpenstackConnection -from openstack.exceptions import SDKException -from openstack.network.v2.security_group import SecurityGroup -from paramiko.ssh_exception import NoValidConnectionsError - -import reactive.runner_manager as reactive_runner_manager -from charm_state import CharmState, GitHubOrg, ProxyConfig, SSHDebugConnection -from errors import ( - CreateMetricsStorageError, - GetMetricsStorageError, - GithubApiError, - GithubClientError, - GithubMetricsError, - IssueMetricEventError, - OpenStackError, - RunnerCreateError, - RunnerStartError, -) -from github_client import GithubClient -from github_type import GitHubRunnerStatus, SelfHostedRunner -from metrics import events as metric_events -from metrics import github as github_metrics -from metrics import runner as runner_metrics -from metrics import storage as metrics_storage -from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME -from repo_policy_compliance_client import RepoPolicyComplianceClient -from runner_manager import IssuedMetricEventsStats -from runner_manager_type import FlushMode, OpenstackRunnerManagerConfig 
-from runner_type import GitHubPath, RunnerGithubInfo, RunnerNameByHealth -from utilities import retry, set_env_var - -logger = logging.getLogger(__name__) - -# Update the version when the security group rules are not backward compatible. -SECURITY_GROUP_NAME = "github-runner-v1" -BUILD_OPENSTACK_IMAGE_SCRIPT_FILENAME = "scripts/build-openstack-image.sh" -_SSH_KEY_PATH = Path("/home/ubuntu/.ssh") -_CONFIG_SCRIPT_PATH = Path("/home/ubuntu/actions-runner/config.sh") - -RUNNER_APPLICATION = Path("/home/ubuntu/actions-runner") -METRICS_EXCHANGE_PATH = Path("/home/ubuntu/metrics-exchange") -PRE_JOB_SCRIPT = RUNNER_APPLICATION / "pre-job.sh" -MAX_METRICS_FILE_SIZE = 1024 - -RUNNER_STARTUP_PROCESS = "/home/ubuntu/actions-runner/run.sh" -RUNNER_LISTENER_PROCESS = "Runner.Listener" -RUNNER_WORKER_PROCESS = "Runner.Worker" -CREATE_SERVER_TIMEOUT = 5 * 60 - - -class _PullFileError(Exception): - """Represents an error while pulling a file from the runner instance.""" - - def __init__(self, reason: str): - """Construct PullFileError object. - - Args: - reason: The reason for the error. - """ - super().__init__(reason) - - -class _SSHError(Exception): - """Represents an error while interacting with SSH.""" - - def __init__(self, reason: str): - """Construct SSHErrors object. - - Args: - reason: The reason for the error. - """ - super().__init__(reason) - - -@dataclass -class InstanceConfig: - """The configuration values for creating a single runner instance. - - Attributes: - github_path: The GitHub repo/org path to register the runner. - image_id: The Openstack image id to use to boot the instance with. - labels: The runner instance labels. - name: Name of the image to launch the GitHub runner instance with. - registration_token: Token for registering the runner on GitHub. 
- """ - - github_path: GitHubPath - image_id: str - labels: Iterable[str] - name: str - registration_token: str - - -SupportedCloudImageArch = Literal["amd64", "arm64"] - - -@dataclass -class _CloudInitUserData: - """Dataclass to hold cloud init userdata. - - Attributes: - instance_config: The configuration values for Openstack instance to launch. - runner_env: The contents of .env to source when launching Github runner. - pre_job_contents: The contents of pre-job script to run before starting the job. - proxies: Proxy values to enable on the Github runner. - dockerhub_mirror: URL to dockerhub mirror. - """ - - instance_config: InstanceConfig - runner_env: str - pre_job_contents: str - dockerhub_mirror: Optional[str] = None - proxies: Optional[ProxyConfig] = None - - -@contextmanager -def _create_connection(cloud_config: dict[str, dict]) -> Iterator[openstack.connection.Connection]: - """Create a connection context managed object, to be used within with statements. - - This method should be called with a valid cloud_config. See _validate_cloud_config. - Also, this method assumes that the clouds.yaml exists on ~/.config/openstack/clouds.yaml. - See charm_state.py _write_openstack_config_to_disk. - - Args: - cloud_config: The configuration in clouds.yaml format to apply. - - Raises: - OpenStackError: if the credentials provided is not authorized. - - Yields: - An openstack.connection.Connection object. - """ - clouds = list(cloud_config["clouds"].keys()) - if len(clouds) > 1: - logger.warning("Multiple clouds defined in clouds.yaml. Using the first one to connect.") - cloud_name = clouds[0] - - # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but - # I could not reproduce it. Therefore, no catch here for such exception. - try: - with openstack.connect(cloud=cloud_name) as conn: - conn.authorize() - yield conn - # pylint thinks this isn't an exception, but does inherit from Exception class. 
- except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause - logger.exception("OpenStack API call failure") - raise OpenStackError("Failed OpenStack API call") from exc - - -# Disable too many arguments, as they are needed to create the dataclass. -def create_instance_config( # pylint: disable=too-many-arguments - app_name: str, - unit_num: int, - image_id: str, - path: GitHubPath, - labels: Iterable[str], - registration_token: str, -) -> InstanceConfig: - """Create an instance config from charm data. - - Args: - app_name: The juju application name. - unit_num: The juju unit number. - image_id: The openstack image id to create the instance with. - path: Github organisation or repository path. - labels: Addition labels for the runner. - registration_token: The Github runner registration token. See \ - https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#create-a-registration-token-for-a-repository - - Returns: - Instance configuration created. - """ - suffix = secrets.token_hex(12) - return InstanceConfig( - github_path=path, - image_id=image_id, - labels=labels, - name=f"{app_name}-{unit_num}-{suffix}", - registration_token=registration_token, - ) - - -def _generate_runner_env( - templates_env: jinja2.Environment, - dockerhub_mirror: Optional[str] = None, - ssh_debug_connections: list[SSHDebugConnection] | None = None, -) -> str: - """Generate Github runner .env file contents. - - Proxy configuration are handled by aproxy. - - Args: - templates_env: The jinja template environment. - dockerhub_mirror: The url to Dockerhub to reduce rate limiting. - ssh_debug_connections: Tmate SSH debug connection information to load as environment vars. - - Returns: - The .env contents to be loaded by Github runner. 
- """ - return templates_env.get_template("env.j2").render( - pre_job_script=str(PRE_JOB_SCRIPT), - dockerhub_mirror=dockerhub_mirror or "", - ssh_debug_info=(secrets.choice(ssh_debug_connections) if ssh_debug_connections else None), - ) - - -def _generate_cloud_init_userdata( - templates_env: jinja2.Environment, - cloud_init_userdata: _CloudInitUserData, -) -> str: - """Generate cloud init userdata to launch at startup. - - Args: - templates_env: The jinja template environment. - cloud_init_userdata: The dataclass containing the cloud init userdata. - - Returns: - The cloud init userdata script. - """ - runner_group = None - instance_config = cloud_init_userdata.instance_config - proxies = cloud_init_userdata.proxies - - if isinstance(instance_config.github_path, GitHubOrg): - runner_group = instance_config.github_path.group - - aproxy_address = proxies.aproxy_address if proxies is not None else None - return templates_env.get_template("openstack-userdata.sh.j2").render( - github_url=f"https://github.com/{instance_config.github_path.path()}", - runner_group=runner_group, - token=instance_config.registration_token, - instance_labels=",".join(instance_config.labels), - instance_name=instance_config.name, - env_contents=cloud_init_userdata.runner_env, - pre_job_contents=cloud_init_userdata.pre_job_contents, - metrics_exchange_path=str(METRICS_EXCHANGE_PATH), - aproxy_address=aproxy_address, - dockerhub_mirror=cloud_init_userdata.dockerhub_mirror, - ) - - -class GithubRunnerRemoveError(Exception): - """Represents an error removing registered runner from Github.""" - - -_INSTANCE_STATUS_SHUTOFF = "SHUTOFF" -_INSTANCE_STATUS_ERROR = "ERROR" -_INSTANCE_STATUS_ACTIVE = "ACTIVE" -_INSTANCE_STATUS_BUILDING = "BUILDING" - - -class OpenstackRunnerManager: - """Runner manager for OpenStack-based instances. - - Attributes: - app_name: The juju application name. - unit_num: The juju unit number. - instance_name: Prefix of the name for the set of runners. 
- """ - - def __init__( - self, - app_name: str, - unit_num: int, - openstack_runner_manager_config: OpenstackRunnerManagerConfig, - cloud_config: dict[str, dict], - ): - """Construct OpenstackRunnerManager object. - - Args: - app_name: The juju application name. - unit_num: The juju unit number. - openstack_runner_manager_config: Configurations related to runner manager. - cloud_config: The openstack clouds.yaml in dict format. - """ - # Setting the env var to this process and any child process spawned. - proxies = openstack_runner_manager_config.charm_state.proxy_config - if no_proxy := proxies.no_proxy: - set_env_var("NO_PROXY", no_proxy) - if http_proxy := proxies.http: - set_env_var("HTTP_PROXY", http_proxy) - if https_proxy := proxies.https: - set_env_var("HTTPS_PROXY", https_proxy) - - self.app_name = app_name - self.unit_num = unit_num - self.instance_name = f"{app_name}-{unit_num}" - self._config = openstack_runner_manager_config - self._cloud_config = cloud_config - self._github = GithubClient(token=self._config.token) - - def reconcile(self, quantity: int) -> int: - """Reconcile the quantity of runners. - - Args: - quantity: The number of intended runners. - - Returns: - The change in number of runners. - """ - if self._config.reactive_config: - logger.info("Reactive configuration detected, going into experimental reactive mode.") - return self._reconcile_reactive(quantity) - - start_ts = time.time() - try: - delta = self._reconcile_runners(quantity) - finally: - end_ts = time.time() - self._issue_reconciliation_metrics( - reconciliation_start_ts=start_ts, reconciliation_end_ts=end_ts - ) - - return delta - - def _reconcile_reactive(self, quantity: int) -> int: - """Reconcile runners reactively. - - Args: - quantity: Number of intended runners. - - Returns: - The difference between intended runners and actual runners. In reactive mode - this number is never negative as additional processes should terminate after a timeout. 
- """ - logger.info("Reactive mode is experimental and not yet fully implemented.") - return reactive_runner_manager.reconcile( - quantity=quantity, mq_uri=self._config.reactive_config.mq_uri, queue_name=self.app_name - ) - - def _reconcile_runners(self, quantity: int) -> int: - """Reconcile the number of runners. - - Args: - quantity: The number of intended runners. - - Returns: - The change in number of runners. - """ - with _create_connection(self._cloud_config) as conn: - runner_by_health = self._get_openstack_runner_status(conn) - logger.info( - "Found %s healthy runner and %s unhealthy runner", - len(runner_by_health.healthy), - len(runner_by_health.unhealthy), - ) - logger.debug("Healthy runner: %s", runner_by_health.healthy) - logger.debug("Unhealthy runner: %s", runner_by_health.unhealthy) - remove_token = self._github.get_runner_remove_token(path=self._config.path) - - self._clean_up_runners( - conn=conn, runner_by_health=runner_by_health, remove_token=remove_token - ) - - delta = self._scale( - quantity=quantity, - conn=conn, - runner_by_health=runner_by_health, - remove_token=remove_token, - ) - return delta - - def get_github_runner_info(self) -> tuple[RunnerGithubInfo, ...]: - """Get information on GitHub for the runners. - - Returns: - Collection of runner GitHub information. - """ - remote_runners_list: list[SelfHostedRunner] = self._github.get_runner_github_info( - self._config.path - ) - logger.debug("List of runners found on GitHub:%s", remote_runners_list) - return tuple( - RunnerGithubInfo( - runner["name"], - runner["id"], - runner["status"] == GitHubRunnerStatus.ONLINE, - runner["busy"], - ) - for runner in remote_runners_list - if runner["name"].startswith(f"{self.instance_name}-") - ) - - def _get_openstack_runner_status(self, conn: OpenstackConnection) -> RunnerNameByHealth: - """Get status on OpenStack of each runner. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - Runner status grouped by health. 
- """ - healthy_runner = [] - unhealthy_runner = [] - openstack_instances = self._get_openstack_instances(conn) - - logger.debug("Found openstack instances: %s", openstack_instances) - - for instance in openstack_instances: - if not OpenstackRunnerManager._health_check(conn=conn, server_name=instance.name): - unhealthy_runner.append(instance.name) - else: - healthy_runner.append(instance.name) - - return RunnerNameByHealth(healthy=tuple(healthy_runner), unhealthy=tuple(unhealthy_runner)) - - def _get_openstack_instances(self, conn: OpenstackConnection) -> list[Server]: - """Get the OpenStack servers managed by this unit. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - List of OpenStack instances. - """ - return [ - instance - for instance in cast(list[Server], conn.list_servers()) - if instance.name.startswith(f"{self.instance_name}-") - ] - - @staticmethod - def _health_check( - conn: OpenstackConnection, - server_name: str, - startup: bool = False, - ) -> bool: - """Health check a server instance. - - A healthy server is defined as: - 1. Openstack instance status is ACTIVE or BUILDING. - 2. Openstack instance status is in BUILDING less than CREATE_SERVER_TIMEOUT seconds. - 3. Runner.Worker exists (running a job). - 4. Runner.Listener exists (waiting for job). - 5. GitHub runner status is Idle or Active. - - An undetermined server is marked as healthy when: - 1. SSH fails - could be a transient network error. - 2. The Runner.* processes do not exist. Mark healthy for now to gather data. This is - subject to change to unhealthy once enough data has been gathered. - - Args: - conn: The Openstack connection instance. - server_name: The name of the OpenStack server to health check. - startup: Check only whether the startup is successful. - - Returns: - Whether the instance is healthy. 
- """ - server: Server | None = conn.get_server(name_or_id=server_name) - if not server: - return False - if server.status == (_INSTANCE_STATUS_SHUTOFF, _INSTANCE_STATUS_ERROR): - return False - if server.status not in (_INSTANCE_STATUS_ACTIVE, _INSTANCE_STATUS_BUILDING): - return False - created_at = datetime.strptime(server.created_at, "%Y-%m-%dT%H:%M:%SZ") - current_time = datetime.now(created_at.tzinfo) - elapsed_min = (created_at - current_time).total_seconds() - if server.status == _INSTANCE_STATUS_BUILDING: - return elapsed_min < CREATE_SERVER_TIMEOUT - try: - return OpenstackRunnerManager._ssh_health_check( - conn=conn, server_name=server_name, startup=startup - ) - except _SSHError: - logger.warning("Health check failed, unable to SSH into server: %s", server_name) - return False - - @staticmethod - @retry(tries=3, delay=5, max_delay=60, backoff=2, local_logger=logger) - def _ssh_health_check(conn: OpenstackConnection, server_name: str, startup: bool) -> bool: - """Use SSH to check whether runner application is running. - - A healthy runner is defined as: - 1. SSH connection can be established. - 2. Runner.Worker exists (running a job). - 3. Runner.Listener exists (waiting for job). - - Args: - conn: The Openstack connection instance. - server_name: The openstack server instance to check connections. - startup: Check only whether the startup is successful. - - Raises: - _SSHError: if there was an error SSH-ing into the machine or with the SSH command. - - Returns: - Whether the runner application is running. 
- """ - try: - ssh_conn = OpenstackRunnerManager._get_ssh_connection( - conn=conn, server_name=server_name - ) - except _SSHError as exc: - logger.error("[ALERT]: Unable to SSH to server: %s, reason: %s", server_name, str(exc)) - raise - - result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) - logger.debug("Output of `ps aux` on %s stderr: %s", server_name, result.stderr) - if not result.ok: - logger.warning("List all process command failed on %s.", server_name) - raise _SSHError(f"List process command failed on {server_name}.") - if RUNNER_STARTUP_PROCESS not in result.stdout: - logger.warning("No startup process found on server %s.", server_name) - raise _SSHError(f"Runner not yet started on {server_name}.") - - logger.info("Runner process found to be healthy on %s", server_name) - if startup: - return True - - if RUNNER_WORKER_PROCESS in result.stdout or RUNNER_LISTENER_PROCESS in result.stdout: - return True - - return False - - @staticmethod - @retry(tries=3, delay=5, max_delay=60, backoff=2, local_logger=logger) - def _get_ssh_connection( - conn: OpenstackConnection, server_name: str, timeout: int = 30 - ) -> SSHConnection: - """Get a valid ssh connection within a network for a given openstack instance. - - The SSH connection will attempt to establish connection until the timeout configured. - - Args: - conn: The Openstack connection instance. - server_name: The Openstack server instance name. - timeout: Timeout in seconds to attempt connection to each available server address. - - Raises: - _SSHError: If there was an error getting a valid SSH connection. - - Returns: - An SSH connection to OpenStack server instance. 
- """ - server: Server | None = conn.get_server(name_or_id=server_name) - if server is None: - raise _SSHError(f"Server gone while trying to get SSH connection: {server_name}.") - if not server.key_name: - raise _SSHError( - f"Unable to create SSH connection, no valid keypair found for {server.name}" - ) - key_path = OpenstackRunnerManager._get_key_path(server.name) - if not key_path.exists(): - raise _SSHError(f"Missing keyfile for server: {server.name}, key path: {key_path}") - network_address_list = server.addresses.values() - if not network_address_list: - raise _SSHError(f"No addresses found for OpenStack server {server.name}") - - server_addresses: list[str] = [ - address["addr"] - for network_addresses in network_address_list - for address in network_addresses - ] - for ip in server_addresses: - try: - connection = SSHConnection( - host=ip, - user="ubuntu", - connect_kwargs={"key_filename": str(key_path)}, - connect_timeout=timeout, - ) - result = connection.run("echo hello world", warn=True, timeout=timeout) - if not result.ok: - logger.warning( - "SSH test connection failed, server: %s, address: %s", server.name, ip - ) - continue - if "hello world" in result.stdout: - return connection - except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException): - logger.warning( - "Unable to SSH into %s with address %s", - server.name, - connection.host, - exc_info=True, - ) - continue - raise _SSHError( - f"No connectable SSH addresses found, server: {server.name}, " - f"addresses: {server_addresses}" - ) - - @staticmethod - def _get_key_path(name: str) -> Path: - """Get the filepath for storing private SSH of a runner. - - Args: - name: The name of the runner. - - Returns: - Path to reserved for the key file of the runner. - """ - return _SSH_KEY_PATH / f"runner-{name}.key" - - @dataclass - class _CreateRunnerArgs: - """Arguments for _create_runner method. - - Attributes: - app_name: The juju application name. 
- cloud_config: The clouds.yaml containing the OpenStack credentials. The first cloud - in the file will be used. - config: Configurations related to runner manager. - registration_token: Token for registering the runner on GitHub. - unit_num: The juju unit number. - """ - - app_name: str - cloud_config: dict[str, dict] - config: OpenstackRunnerManagerConfig - registration_token: str - unit_num: int - - @staticmethod - def _create_runner(args: _CreateRunnerArgs) -> None: - """Create a runner on OpenStack cloud. - - Arguments are gathered into a dataclass due to Pool.map needing one argument functions. - - Args: - args: Arguments of the method. - - Raises: - RunnerCreateError: Unable to create the OpenStack runner. - """ - ts_now = time.time() - environment = jinja2.Environment( - loader=jinja2.FileSystemLoader("templates"), autoescape=True - ) - - env_contents = _generate_runner_env( - templates_env=environment, - dockerhub_mirror=args.config.dockerhub_mirror, - ssh_debug_connections=args.config.charm_state.ssh_debug_connections, - ) - - pre_job_contents = OpenstackRunnerManager._render_pre_job_contents( - charm_state=args.config.charm_state, templates_env=environment - ) - - instance_config = create_instance_config( - args.app_name, - args.unit_num, - args.config.image, - args.config.path, - args.config.labels, - args.registration_token, - ) - cloud_user_data = _CloudInitUserData( - instance_config=instance_config, - runner_env=env_contents, - pre_job_contents=pre_job_contents, - dockerhub_mirror=args.config.dockerhub_mirror, - proxies=args.config.charm_state.proxy_config, - ) - cloud_userdata_str = _generate_cloud_init_userdata( - templates_env=environment, - cloud_init_userdata=cloud_user_data, - ) - - with _create_connection(cloud_config=args.cloud_config) as conn: - runner_security_group = OpenstackRunnerManager._ensure_security_group(conn) - OpenstackRunnerManager._setup_runner_keypair(conn, instance_config.name) - - logger.info("Creating runner %s", 
instance_config.name) - try: - instance = conn.create_server( - name=instance_config.name, - image=instance_config.image_id, - key_name=instance_config.name, - flavor=args.config.flavor, - network=args.config.network, - security_groups=[runner_security_group["id"]], - userdata=cloud_userdata_str, - auto_ip=False, - timeout=CREATE_SERVER_TIMEOUT, - wait=True, - ) - except openstack.exceptions.ResourceTimeout as err: - logger.exception("Timeout creating OpenStack runner %s", instance_config.name) - try: - logger.info( - "Attempting to remove OpenStack runner %s that timeout on creation", - instance_config.name, - ) - conn.delete_server(name_or_id=instance_config.name, wait=True) - try: - conn.delete_keypair(instance_config.name) - except openstack.exceptions.SDKException: - logger.exception( - "Unable to delete OpenStack keypair %s", instance_config.name - ) - OpenstackRunnerManager._get_key_path(instance_config.name).unlink( - missing_ok=True - ) - except openstack.exceptions.SDKException: - logger.exception( - "Cleanup of creation failure runner %s has failed", instance_config.name - ) - # Reconcile will attempt to cleanup again prior to spawning new runners. 
- raise RunnerCreateError( - f"Timeout creating OpenStack runner {instance_config.name}" - ) from err - except openstack.exceptions.SDKException as err: - logger.exception("Failed to create OpenStack runner %s", instance_config.name) - raise RunnerCreateError( - f"Failed to create OpenStack runner {instance_config.name}" - ) from err - - logger.info("Waiting runner %s to come online", instance_config.name) - OpenstackRunnerManager._wait_until_runner_process_running(conn, instance.name) - logger.info("Finished creating runner %s", instance_config.name) - ts_after = time.time() - OpenstackRunnerManager._issue_runner_installed_metric( - app_name=args.app_name, - instance_config=instance_config, - install_end_ts=ts_after, - install_start_ts=ts_now, - ) - - @staticmethod - def _render_pre_job_contents( - charm_state: CharmState, templates_env: jinja2.Environment - ) -> str: - """Render the pre-job script contents. - - Args: - charm_state: The charm state object. - templates_env: The jinja template environment. - - Returns: - The rendered pre-job script contents. - """ - pre_job_contents_dict = { - "issue_metrics": True, - "metrics_exchange_path": str(METRICS_EXCHANGE_PATH), - "do_repo_policy_check": False, - } - if repo_policy_config := charm_state.charm_config.repo_policy_compliance: - repo_policy_client = RepoPolicyComplianceClient( - url=repo_policy_config.url, charm_token=repo_policy_config.token - ) - pre_job_contents_dict.update( - { - "repo_policy_base_url": repo_policy_client.base_url, - "repo_policy_one_time_token": repo_policy_client.get_one_time_token(), - "do_repo_policy_check": True, - } - ) - pre_job_contents = templates_env.get_template("pre-job.j2").render(pre_job_contents_dict) - return pre_job_contents - - @staticmethod - def _ensure_security_group(conn: OpenstackConnection) -> SecurityGroup: - """Ensure runner security group exists. - - Args: - conn: The connection object to access OpenStack cloud. 
- - Returns: - The security group with the rules for runners. - """ - rule_exists_icmp = False - rule_exists_ssh = False - rule_exists_tmate_ssh = False - - security_group_list = conn.list_security_groups(filters={"name": SECURITY_GROUP_NAME}) - # Pick the first security_group returned. - security_group = next(iter(security_group_list), None) - - if security_group is None: - logger.info("Security group %s not found, creating it", SECURITY_GROUP_NAME) - security_group = conn.create_security_group( - name=SECURITY_GROUP_NAME, - description="For servers managed by the github-runner charm.", - ) - else: - existing_rules = security_group["security_group_rules"] - for rule in existing_rules: - if rule["protocol"] == "icmp": - logger.debug( - "Found ICMP rule in existing security group %s of ID %s", - SECURITY_GROUP_NAME, - security_group["id"], - ) - rule_exists_icmp = True - if ( - rule["protocol"] == "tcp" - and rule["port_range_min"] == rule["port_range_max"] == 22 - ): - logger.debug( - "Found SSH rule in existing security group %s of ID %s", - SECURITY_GROUP_NAME, - security_group["id"], - ) - rule_exists_ssh = True - if ( - rule["protocol"] == "tcp" - and rule["port_range_min"] == rule["port_range_max"] == 10022 - ): - logger.debug( - "Found tmate SSH rule in existing security group %s of ID %s", - SECURITY_GROUP_NAME, - security_group["id"], - ) - rule_exists_tmate_ssh = True - - if not rule_exists_icmp: - conn.create_security_group_rule( - secgroup_name_or_id=security_group["id"], - protocol="icmp", - direction="ingress", - ethertype="IPv4", - ) - if not rule_exists_ssh: - conn.create_security_group_rule( - secgroup_name_or_id=security_group["id"], - port_range_min="22", - port_range_max="22", - protocol="tcp", - direction="ingress", - ethertype="IPv4", - ) - if not rule_exists_tmate_ssh: - conn.create_security_group_rule( - secgroup_name_or_id=security_group["id"], - port_range_min="10022", - port_range_max="10022", - protocol="tcp", - direction="egress", - 
ethertype="IPv4", - ) - return security_group - - @staticmethod - def _setup_runner_keypair(conn: OpenstackConnection, name: str) -> None: - """Set up the SSH keypair for a runner. - - Args: - conn: The connection object to access OpenStack cloud. - name: The name of the runner. - """ - private_key_path = OpenstackRunnerManager._get_key_path(name) - - if private_key_path.exists(): - logger.warning("Existing private key file for %s found, removing it.", name) - private_key_path.unlink() - - keypair = conn.create_keypair(name=name) - private_key_path.write_text(keypair.private_key) - shutil.chown(private_key_path, user="ubuntu", group="ubuntu") - private_key_path.chmod(0o400) - - @retry(tries=10, delay=60, local_logger=logger) - @staticmethod - def _wait_until_runner_process_running(conn: OpenstackConnection, instance_name: str) -> None: - """Wait until the runner process is running. - - The waiting to done by the retry declarator. - - Args: - conn: The openstack connection instance. - instance_name: The name of the instance to wait on. - - Raises: - RunnerStartError: Unable perform health check of the runner application. - """ - try: - if not OpenstackRunnerManager._health_check( - conn=conn, server_name=instance_name, startup=True - ): - raise RunnerStartError( - ( - "Unable to find running process of runner application on openstack runner " - f"{instance_name}" - ) - ) - except TimeoutError as err: - raise RunnerStartError( - f"Unable to connect to openstack runner {instance_name}" - ) from err - - @staticmethod - def _issue_runner_installed_metric( - app_name: str, - instance_config: InstanceConfig, - install_start_ts: float, - install_end_ts: float, - ) -> None: - """Issue RunnerInstalled metric. - - Args: - app_name: The juju application name. - instance_config: The configuration values for Openstack instance. - install_start_ts: The timestamp when the installation started. - install_end_ts: The timestamp when the installation ended. 
- """ - try: - metric_events.issue_event( - event=metric_events.RunnerInstalled( - timestamp=install_start_ts, - flavor=app_name, - duration=install_end_ts - install_start_ts, - ), - ) - except IssueMetricEventError: - logger.exception("Failed to issue RunnerInstalled metric") - try: - storage = metrics_storage.create(instance_config.name) - except CreateMetricsStorageError: - logger.exception( - "Failed to create metrics storage for runner %s, " - "will not be able to issue all metrics.", - instance_config.name, - ) - else: - try: - (storage.path / RUNNER_INSTALLED_TS_FILE_NAME).write_text( - str(install_end_ts), encoding="utf-8" - ) - except FileNotFoundError: - logger.exception( - "Failed to write runner-installed.timestamp into metrics storage " - "for runner %s, will not be able to issue all metrics.", - instance_config.name, - ) - - def _remove_runners( - self, - conn: OpenstackConnection, - instance_names: Iterable[str], - remove_token: str | None = None, - num_to_remove: int | float | None = None, - ) -> None: - """Delete runners on Openstack. - - Removes the registered runner from Github if remove_token is provided. - - Args: - conn: The Openstack connection instance. - instance_names: The Openstack server names to delete. - remove_token: The GitHub runner remove token. - num_to_remove: Remove a specified number of runners. Remove all if None. - """ - if num_to_remove is None: - num_to_remove = float("inf") - - name_to_github_id = { - runner["name"]: runner["id"] - for runner in self._github.get_runner_github_info(self._config.path) - } - for instance_name in instance_names: - if num_to_remove < 1: - break - - github_id = name_to_github_id.get(instance_name, None) - self._remove_one_runner(conn, instance_name, github_id, remove_token) - - # Attempt to delete the keys. This is place at the end of deletion, so we can access - # the instances that failed to delete on previous tries. 
- try: - conn.delete_keypair(instance_name) - except openstack.exceptions.SDKException: - logger.exception("Unable to delete OpenStack keypair %s", instance_name) - OpenstackRunnerManager._get_key_path(instance_name).unlink(missing_ok=True) - num_to_remove -= 1 - - def _remove_one_runner( - self, - conn: OpenstackConnection, - instance_name: str, - github_id: int | None = None, - remove_token: str | None = None, - ) -> None: - """Remove one OpenStack runner. - - Args: - conn: The Openstack connection instance. - instance_name: The Openstack server name to delete. - github_id: The runner id on GitHub. - remove_token: The GitHub runner remove token. - """ - logger.info("Attempting to remove OpenStack runner %s", instance_name) - - server: Server | None = conn.get_server(name_or_id=instance_name) - - if server is not None: - logger.info( - "Pulling metrics and deleting server for OpenStack runner %s", instance_name - ) - self._pull_metrics(conn=conn, instance_name=instance_name) - self._remove_openstack_runner(conn, server, remove_token) - else: - logger.info( - "Not found server for OpenStack runner %s marked for deletion", instance_name - ) - - if github_id is not None: - try: - self._github.delete_runner(self._config.path, github_id) - except GithubClientError as exc: - logger.warning("Failed to remove runner from Github %s, %s", instance_name, exc) - # TODO: 2024-04-23: The broad except clause is for logging purposes. - # Will be removed in future versions. - except Exception: # pylint: disable=broad-exception-caught - logger.critical( - "Found unexpected exception, please contact the developers", exc_info=True - ) - - def _pull_metrics(self, conn: OpenstackConnection, instance_name: str) -> None: - """Pull metrics from the runner into the respective storage for the runner. - - Args: - conn: The Openstack connection instance. - instance_name: The Openstack server name. 
- """ - try: - storage = metrics_storage.get(instance_name) - except GetMetricsStorageError: - logger.exception( - "Failed to get shared metrics storage for runner %s, " - "will not be able to issue all metrics.", - instance_name, - ) - return - - try: - ssh_conn = self._get_ssh_connection(conn=conn, server_name=instance_name) - except _SSHError as exc: - logger.info("Failed to pull metrics for %s: %s", instance_name, exc) - return - - try: - self._pull_file( - ssh_conn=ssh_conn, - remote_path=str(METRICS_EXCHANGE_PATH / "pre-job-metrics.json"), - local_path=str(storage.path / "pre-job-metrics.json"), - max_size=MAX_METRICS_FILE_SIZE, - ) - self._pull_file( - ssh_conn=ssh_conn, - remote_path=str(METRICS_EXCHANGE_PATH / "post-job-metrics.json"), - local_path=str(storage.path / "post-job-metrics.json"), - max_size=MAX_METRICS_FILE_SIZE, - ) - return - except _PullFileError as exc: - logger.warning( - "Failed to pull metrics for %s: %s . Will not be able to issue all metrics", - instance_name, - exc, - ) - return - - def _pull_file( - self, ssh_conn: SSHConnection, remote_path: str, local_path: str, max_size: int - ) -> None: - """Pull file from the runner instance. - - Args: - ssh_conn: The SSH connection instance. - remote_path: The file path on the runner instance. - local_path: The local path to store the file. - max_size: If the file is larger than this, it will not be pulled. - - Raises: - _PullFileError: Unable to pull the file from the runner instance. - _SSHError: Issue with SSH connection. 
- """ - try: - result = ssh_conn.run(f"stat -c %s {remote_path}", warn=True) - except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException) as exc: - raise _SSHError(reason=f"Unable to SSH into {ssh_conn.host}") from exc - if not result.ok: - logger.warning( - ( - "Unable to get file size of %s on instance %s, " - "exit code: %s, stdout: %s, stderr: %s" - ), - remote_path, - ssh_conn.host, - result.return_code, - result.stdout, - result.stderr, - ) - raise _PullFileError(reason=f"Unable to get file size of {remote_path}") - - stdout = result.stdout - try: - stdout.strip() - size = int(stdout) - if size > max_size: - raise _PullFileError( - reason=f"File size of {remote_path} too large {size} > {max_size}" - ) - except ValueError as exc: - raise _PullFileError(reason=f"Invalid file size for {remote_path}: {stdout}") from exc - - try: - ssh_conn.get(remote=remote_path, local=local_path) - except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException) as exc: - raise _SSHError(reason=f"Unable to SSH into {ssh_conn.host}") from exc - except OSError as exc: - raise _PullFileError(reason=f"Unable to retrieve file {remote_path}") from exc - - def _remove_openstack_runner( - self, - conn: OpenstackConnection, - server: Server, - remove_token: str | None = None, - ) -> None: - """Remove a OpenStack server hosting the GitHub runner application. - - Args: - conn: The Openstack connection instance. - server: The Openstack server. - remove_token: The GitHub runner remove token. - """ - try: - self._run_github_removal_script(conn=conn, server=server, remove_token=remove_token) - except (TimeoutError, invoke.exceptions.UnexpectedExit, GithubRunnerRemoveError): - logger.warning( - "Failed to run runner removal script for %s", server.name, exc_info=True - ) - # TODO: 2024-04-23: The broad except clause is for logging purposes. - # Will be removed in future versions. 
- except Exception: # pylint: disable=broad-exception-caught - logger.critical( - "Found unexpected exception, please contact the developers", exc_info=True - ) - try: - if not conn.delete_server(name_or_id=server.name, wait=True, delete_ips=True): - logger.warning("Server does not exist %s", server.name) - except SDKException as exc: - logger.error("Something wrong deleting the server %s, %s", server.name, exc) - # TODO: 2024-04-23: The broad except clause is for logging purposes. - # Will be removed in future versions. - except Exception: # pylint: disable=broad-exception-caught - logger.critical( - "Found unexpected exception, please contact the developers", exc_info=True - ) - - def _run_github_removal_script( - self, conn: OpenstackConnection, server: Server, remove_token: str | None - ) -> None: - """Run Github runner removal script. - - Args: - conn: The Openstack connection instance. - server: The Openstack server instance. - remove_token: The GitHub instance removal token. - - Raises: - GithubRunnerRemoveError: Unable to remove runner from GitHub. - """ - if not remove_token: - return - try: - ssh_conn = OpenstackRunnerManager._get_ssh_connection( - conn=conn, server_name=server.name - ) - except _SSHError as exc: - logger.error( - "Unable to run GitHub removal script, server: %s, reason: %s", - server.name, - str(exc), - ) - raise GithubRunnerRemoveError( - f"Failed to remove runner {server.name} from Github." - ) from exc - - try: - result: invoke.runners.Result = ssh_conn.run( - f"{_CONFIG_SCRIPT_PATH} remove --token {remove_token}", - warn=True, - ) - if not result.ok: - logger.warning( - ( - "Unable to run removal script on instance %s, " - "exit code: %s, stdout: %s, stderr: %s" - ), - server.name, - result.return_code, - result.stdout, - result.stderr, - ) - return - # TODO: 2024-04-23: The broad except clause is for logging purposes. - # Will be removed in future versions. 
- except Exception: # pylint: disable=broad-exception-caught - logger.critical( - "Found unexpected exception, please contact the developers", exc_info=True - ) - - logger.warning("Failed to run GitHub runner removal script %s", server.name) - raise GithubRunnerRemoveError(f"Failed to remove runner {server.name} from Github.") - - def _clean_up_keys_files( - self, conn: OpenstackConnection, exclude_instances: Iterable[str] - ) -> None: - """Delete all SSH key files except the specified instances. - - Args: - conn: The Openstack connection instance. - exclude_instances: The keys of these instance will not be deleted. - """ - logger.info("Cleaning up SSH key files") - exclude_filename = set( - OpenstackRunnerManager._get_key_path(instance) for instance in exclude_instances - ) - - total = 0 - deleted = 0 - for path in _SSH_KEY_PATH.iterdir(): - # Find key file from this application. - if ( - path.is_file() - and path.name.startswith(self.instance_name) - and path.name.endswith(".key") - ): - total += 1 - if path.name in exclude_filename: - continue - - keypair_name = path.name.split(".")[0] - try: - conn.delete_keypair(keypair_name) - except openstack.exceptions.SDKException: - logger.warning( - "Unable to delete OpenStack keypair associated with deleted key file %s ", - path.name, - ) - - path.unlink() - deleted += 1 - logger.info("Found %s key files, clean up %s key files", total, deleted) - - def _clean_up_openstack_keypairs( - self, conn: OpenstackConnection, exclude_instances: Iterable[str] - ) -> None: - """Delete all OpenStack keypairs except the specified instances. - - Args: - conn: The Openstack connection instance. - exclude_instances: The keys of these instance will not be deleted. - """ - logger.info("Cleaning up openstack keypairs") - keypairs = conn.list_keypairs() - for key in keypairs: - # The `name` attribute is of resource.Body type. 
- if key.name and str(key.name).startswith(self.instance_name): - if str(key.name) in exclude_instances: - continue - - try: - conn.delete_keypair(key.name) - except openstack.exceptions.SDKException: - logger.warning( - "Unable to delete OpenStack keypair associated with deleted key file %s ", - key.name, - ) - - def _clean_up_runners( - self, conn: OpenstackConnection, runner_by_health: RunnerNameByHealth, remove_token: str - ) -> None: - """Clean up offline or unhealthy runners. - - Args: - conn: The openstack connection instance. - runner_by_health: The runner status grouped by health. - remove_token: The GitHub runner remove token. - - """ - github_info = self.get_github_runner_info() - online_runners = [runner.runner_name for runner in github_info if runner.online] - offline_runners = [runner.runner_name for runner in github_info if not runner.online] - busy_runners = [runner.runner_name for runner in github_info if runner.busy] - logger.info( - "Found %s online and %s offline openstack runners, %s of the runners are busy", - len(online_runners), - len(offline_runners), - len(busy_runners), - ) - logger.debug("Online runner: %s", online_runners) - logger.debug("Offline runner: %s", offline_runners) - logger.debug("Busy runner: %s", busy_runners) - - healthy_runners_set = set(runner_by_health.healthy) - busy_runners_set = set(busy_runners) - busy_unhealthy_runners = set(runner_by_health.unhealthy).intersection(busy_runners_set) - if busy_unhealthy_runners: - logger.warning("Found unhealthy busy runners %s", busy_unhealthy_runners) - - # Clean up offline (SHUTOFF) runners or unhealthy (no connection/cloud-init script) - # runners. - # Possible for a healthy runner to be appear as offline for sometime as GitHub can be - # slow to update the status. - # For busy runners let GitHub decide whether the runner should be removed. 
- instance_to_remove = tuple( - runner - for runner in (*runner_by_health.unhealthy, *offline_runners) - if runner not in healthy_runners_set and runner not in busy_runners_set - ) - logger.debug("Removing following runners with issues %s", instance_to_remove) - self._remove_runners( - conn=conn, instance_names=instance_to_remove, remove_token=remove_token - ) - # Clean up orphan keys, e.g., If openstack instance is removed externally the key - # would not be deleted. - self._clean_up_keys_files(conn, runner_by_health.healthy) - self._clean_up_openstack_keypairs(conn, runner_by_health.healthy) - - def _scale( - self, - quantity: int, - conn: OpenstackConnection, - runner_by_health: RunnerNameByHealth, - remove_token: str, - ) -> int: - """Scale the number of runners. - - Args: - quantity: The number of intended runners. - conn: The openstack connection instance. - runner_by_health: The runner status grouped by health. - remove_token: The GitHub runner remove token. - - Returns: - The change in number of runners. - """ - # Get the number of OpenStack servers. - # This is not calculated due to there might be removal failures. 
- servers = self._get_openstack_instances(conn) - delta = quantity - len(servers) - registration_token = self._github.get_runner_registration_token(path=self._config.path) - - # Spawn new runners - if delta > 0: - logger.info("Creating %s OpenStack runners", delta) - args = [ - OpenstackRunnerManager._CreateRunnerArgs( - app_name=self.app_name, - config=self._config, - cloud_config=self._cloud_config, - registration_token=registration_token, - unit_num=self.unit_num, - ) - for _ in range(delta) - ] - with Pool(processes=min(delta, 10)) as pool: - pool.map( - func=OpenstackRunnerManager._create_runner, - iterable=args, - ) - - elif delta < 0: - logger.info("Removing %s OpenStack runners", delta) - self._remove_runners( - conn=conn, - instance_names=runner_by_health.healthy, - remove_token=remove_token, - num_to_remove=abs(delta), - ) - else: - logger.info("No changes to number of runners needed") - - return delta - - def _issue_reconciliation_metrics( - self, - reconciliation_start_ts: float, - reconciliation_end_ts: float, - ) -> None: - """Issue all reconciliation related metrics. - - This includes the metrics for the runners and the reconciliation metric itself. - - Args: - reconciliation_start_ts: The timestamp of when reconciliation started. - reconciliation_end_ts: The timestamp of when reconciliation ended. - """ - with _create_connection(self._cloud_config) as conn: - runner_states = self._get_openstack_runner_status(conn) - - metric_stats = self._issue_runner_metrics(conn) - self._issue_reconciliation_metric( - metric_stats=metric_stats, - reconciliation_start_ts=reconciliation_start_ts, - reconciliation_end_ts=reconciliation_end_ts, - runner_states=runner_states, - ) - - def _issue_runner_metrics(self, conn: OpenstackConnection) -> IssuedMetricEventsStats: - """Issue runner metrics. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - The stats of issued metric events. 
- """ - total_stats: IssuedMetricEventsStats = {} - - try: - openstack_instances = self._get_openstack_instances(conn) - except openstack.exceptions.SDKException: - logger.exception( - "Failed to get openstack instances to ignore when extracting metrics." - " Will not issue runner metrics" - ) - return total_stats - - logger.debug( - "Found following openstack instances before extracting metrics: %s", - openstack_instances, - ) - # Don't extract metrics for instances which are still there, as it might be - # the case that the metrics have not yet been pulled - # (they get pulled right before server termination). - instance_names = {instance.name for instance in openstack_instances} - - for extracted_metrics in runner_metrics.extract( - metrics_storage_manager=metrics_storage, - runners=instance_names, - ): - try: - job_metrics = github_metrics.job( - github_client=self._github, - pre_job_metrics=extracted_metrics.pre_job, - runner_name=extracted_metrics.runner_name, - ) - except GithubMetricsError: - logger.exception("Failed to calculate job metrics") - job_metrics = None - - issued_events = runner_metrics.issue_events( - runner_metrics=extracted_metrics, - job_metrics=job_metrics, - flavor=self.app_name, - ) - for event_type in issued_events: - total_stats[event_type] = total_stats.get(event_type, 0) + 1 - return total_stats - - def _issue_reconciliation_metric( - self, - metric_stats: IssuedMetricEventsStats, - reconciliation_start_ts: float, - reconciliation_end_ts: float, - runner_states: RunnerNameByHealth, - ) -> None: - """Issue reconciliation metric. - - Args: - metric_stats: The stats of issued metric events. - reconciliation_start_ts: The timestamp of when reconciliation started. - reconciliation_end_ts: The timestamp of when reconciliation ended. - runner_states: The states of the runners. - """ - try: - github_info = self.get_github_runner_info() - except GithubApiError: - logger.exception( - "Failed to retrieve github info for reconciliation metric. 
" - "Will not issue reconciliation metric." - ) - return - - online_runners = [runner for runner in github_info if runner.online] - offline_runner_names = {runner.runner_name for runner in github_info if not runner.online} - active_runner_names = {runner.runner_name for runner in online_runners if runner.busy} - healthy_runners = set(runner_states.healthy) - - active_count = len(active_runner_names) - idle_online_count = len(online_runners) - active_count - idle_offline_count = len((offline_runner_names & healthy_runners) - active_runner_names) - - try: - metric_events.issue_event( - event=metric_events.Reconciliation( - timestamp=time.time(), - flavor=self.app_name, - crashed_runners=metric_stats.get(metric_events.RunnerStart, 0) - - metric_stats.get(metric_events.RunnerStop, 0), - idle_runners=idle_online_count + idle_offline_count, - duration=reconciliation_end_ts - reconciliation_start_ts, - ) - ) - except IssueMetricEventError: - logger.exception("Failed to issue Reconciliation metric") - - def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: - """Flush Openstack servers. - - 1. Kill the processes depending on flush mode. - 2. Get unhealthy runners after process purging. - 3. Delete unhealthy runners. - - Args: - mode: The mode to determine which runner to flush. - - Returns: - The number of runners flushed. - """ - logger.info("Flushing OpenStack all runners") - with _create_connection(self._cloud_config) as conn: - self._kill_runner_processes(conn=conn, mode=mode) - runner_by_health = self._get_openstack_runner_status(conn) - remove_token = self._github.get_runner_remove_token(path=self._config.path) - self._remove_runners( - conn=conn, - instance_names=runner_by_health.unhealthy, - remove_token=remove_token, - ) - return len(runner_by_health.unhealthy) - - def _kill_runner_processes(self, conn: OpenstackConnection, mode: FlushMode) -> None: - """Kill runner application that are not running any jobs. 
- - Runners that have not picked up a job has - 1. no Runner.Worker process - 2. no pre-run.sh job process - - Args: - conn: The connection object to access OpenStack cloud. - mode: The flush mode to determine which runner processes to kill. - - Raises: - NotImplementedError: If unsupported flush mode has been passed. - """ - killer_command: str - match mode: - case FlushMode.FLUSH_IDLE: - # only kill Runner.Listener if Runner.Worker does not exist. - killer_command = ( - "! pgrep -x Runner.Worker && pgrep -x Runner.Listener && " - "kill $(pgrep -x Runner.Listener)" - ) - case FlushMode.FLUSH_BUSY: - # kill both Runner.Listener and Runner.Worker processes. - # This kills pre-job.sh, a child process of Runner.Worker. - killer_command = ( - "pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener);" - "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);" - ) - case _: - raise NotImplementedError(f"Unsupported flush mode {mode}") - - servers = self._get_openstack_instances(conn=conn) - for server in servers: - ssh_conn: SSHConnection = self._get_ssh_connection(conn=conn, server_name=server.name) - result: invoke.runners.Result = ssh_conn.run( - killer_command, - warn=True, - ) - if not result.ok: - logger.warning("Failed to kill runner process. Instance: %s", server.name) - continue - logger.info("Successfully killed runner process. 
Instance: %s", server.name) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 3a9acd4a0..c87af1a7d 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -41,7 +41,6 @@ from metrics import runner as runner_metrics from metrics import storage as metrics_storage from openstack_cloud.openstack_cloud import OpenstackCloud, OpenstackInstance -from openstack_cloud.openstack_manager import GithubRunnerRemoveError from repo_policy_compliance_client import RepoPolicyComplianceClient from utilities import retry @@ -61,6 +60,10 @@ CREATE_SERVER_TIMEOUT = 5 * 60 +class _GithubRunnerRemoveError(Exception): + """Represents an error while SSH into a runner and running the remove script.""" + + class _PullFileError(Exception): """Represents an error while pulling a file from the runner instance.""" @@ -337,7 +340,7 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None OpenStackRunnerManager._run_runner_removal_script( instance.server_name, ssh_conn, remove_token ) - except GithubRunnerRemoveError: + except _GithubRunnerRemoveError: logger.warning( "Unable to run github runner removal script for %s", instance.server_name, @@ -784,7 +787,7 @@ def _run_runner_removal_script( remove_token: The GitHub instance removal token. Raises: - GithubRunnerRemoveError: Unable to remove runner from GitHub. + _GithubRunnerRemoveError: Unable to remove runner from GitHub. 
""" try: result = ssh_conn.run( @@ -804,12 +807,12 @@ def _run_runner_removal_script( result.stdout, result.stderr, ) - raise GithubRunnerRemoveError(f"Failed to remove runner {instance_name} from Github.") + raise _GithubRunnerRemoveError(f"Failed to remove runner {instance_name} from Github.") except ( TimeoutError, paramiko.ssh_exception.NoValidConnectionsError, paramiko.ssh_exception.SSHException, ) as exc: - raise GithubRunnerRemoveError( + raise _GithubRunnerRemoveError( f"Failed to remove runner {instance_name} from Github." ) from exc diff --git a/tests/unit/mock_runner_managers.py b/tests/unit/mock_runner_managers.py index 81c334f37..443c84dfd 100644 --- a/tests/unit/mock_runner_managers.py +++ b/tests/unit/mock_runner_managers.py @@ -17,7 +17,6 @@ InstanceId, ) from manager.github_runner_manager import GitHubRunnerState -from metrics.events import RunnerStop from metrics.runner import RunnerMetrics from tests.unit.mock import MockGhapiClient @@ -184,7 +183,6 @@ def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[Runne Returns: Any runner metrics produced during flushing. """ - num = len(self.state.runners) if busy: self.state.runners = {} else: diff --git a/tests/unit/test_openstack_manager.py b/tests/unit/test_openstack_manager.py deleted file mode 100644 index 5329b1282..000000000 --- a/tests/unit/test_openstack_manager.py +++ /dev/null @@ -1,1200 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
-import random -import secrets -from pathlib import Path -from typing import Optional -from unittest.mock import MagicMock, call - -import jinja2 -import openstack.connection -import openstack.exceptions -import pytest -from fabric.connection import Connection as SSHConnection -from invoke import Result -from openstack.compute.v2.keypair import Keypair -from openstack.compute.v2.server import Server -from pytest import LogCaptureFixture, MonkeyPatch - -import metrics.storage -import reactive.runner_manager -from charm_state import CharmState, ProxyConfig, ReactiveConfig, RepoPolicyComplianceConfig -from errors import OpenStackError, RunnerStartError -from github_type import GitHubRunnerStatus, RunnerApplication, SelfHostedRunner -from metrics import events as metric_events -from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME -from metrics.storage import MetricsStorage -from openstack_cloud import openstack_manager -from openstack_cloud.openstack_manager import MAX_METRICS_FILE_SIZE, METRICS_EXCHANGE_PATH -from runner_manager_type import FlushMode -from runner_type import RunnerGithubInfo, RunnerNameByHealth -from tests.unit import factories - -FAKE_MONGODB_URI = "mongodb://example.com/db" -CLOUD_NAME = "microstack" - - -@pytest.fixture(autouse=True, name="openstack_connect_mock") -def mock_openstack_connect_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - """Mock openstack.connect.""" - mock_connect = MagicMock(spec=openstack_manager.openstack.connect) - monkeypatch.setattr("openstack_cloud.openstack_manager.openstack.connect", mock_connect) - return mock_connect - - -@pytest.fixture(name="mock_server") -def mock_server_fixture() -> MagicMock: - """Mock OpenStack Server object.""" - mock_server = MagicMock(spec=Server) - mock_server.key_name = "mock_key" - mock_server.addresses.values = MagicMock(return_value=[[{"addr": "10.0.0.1"}]]) - return mock_server - - -@pytest.fixture(name="patch_get_ssh_connection_health_check") -def 
patch_get_ssh_connection_health_check_fixture(monkeypatch: pytest.MonkeyPatch): - """Patch SSH connection to a MagicMock instance for get_ssh_connection health check.""" - mock_get_ssh_connection = MagicMock( - spec=openstack_manager.OpenstackRunnerManager._get_ssh_connection - ) - mock_ssh_connection = MagicMock(spec=SSHConnection) - mock_ssh_connection.host = "test host IP" - mock_result = MagicMock(spec=Result) - mock_result.ok = True - mock_result.stderr = "" - mock_result.stdout = "hello world" - mock_ssh_connection.run.return_value = mock_result - mock_get_ssh_connection.return_value = [mock_ssh_connection] - - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - mock_get_ssh_connection, - ) - - -@pytest.fixture(name="ssh_connection_health_check") -def ssh_connection_health_check_fixture(monkeypatch: pytest.MonkeyPatch): - """SSH connection to a MagicMock instance for health check.""" - mock_get_ssh_connection = MagicMock( - spec=openstack_manager.OpenstackRunnerManager._get_ssh_connection - ) - mock_ssh_connection = MagicMock(spec=SSHConnection) - mock_ssh_connection.host = "test host IP" - mock_result = MagicMock(spec=Result) - mock_result.ok = True - mock_result.stderr = "" - mock_result.stdout = "-- Test output: /bin/bash /home/ubuntu/actions-runner/run.sh --" - mock_ssh_connection.run.return_value = mock_result - mock_get_ssh_connection.return_value = mock_ssh_connection - - return mock_get_ssh_connection - - -@pytest.fixture(name="patch_ssh_connection_error") -def patch_ssh_connection_error_fixture(monkeypatch: pytest.MonkeyPatch): - """Patch SSH connection to a MagicMock instance with error on run.""" - mock_get_ssh_connection = MagicMock( - spec=openstack_manager.OpenstackRunnerManager._get_ssh_connection - ) - mock_ssh_connection = MagicMock(spec=SSHConnection) - mock_result = MagicMock(spec=Result) - mock_result.ok = False - mock_result.stdout = "Mock stdout" - mock_result.stderr = "Mock stderr" - 
mock_ssh_connection.run.return_value = mock_result - mock_get_ssh_connection.return_value = mock_ssh_connection - - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - mock_get_ssh_connection, - ) - - -@pytest.fixture(name="mock_github_client") -def mock_github_client_fixture() -> MagicMock: - """Mocked github client that returns runner application.""" - mock_github_client = MagicMock(spec=openstack_manager.GithubClient) - mock_github_client.get_runner_application.return_value = RunnerApplication( - os="linux", - architecture="x64", - download_url="http://test_url", - filename="test_filename", - temp_download_token="test_token", - ) - mock_github_client.get_runner_registration_token.return_value = "test_token" - return mock_github_client - - -@pytest.fixture(name="patch_execute_command") -def patch_execute_command_fixture(monkeypatch: pytest.MonkeyPatch): - """Patch execute command to a MagicMock instance.""" - monkeypatch.setattr( - openstack_manager, - "execute_command", - MagicMock(spec=openstack_manager.execute_command), - ) - - -@pytest.fixture(name="patched_create_connection_context") -def patched_create_connection_context_fixture(monkeypatch: pytest.MonkeyPatch): - """Return a mocked openstack connection context manager and patch create_connection.""" - mock_connection = MagicMock(spec=openstack_manager.openstack.connection.Connection) - monkeypatch.setattr( - openstack_manager, - "_create_connection", - MagicMock(spec=openstack_manager._create_connection, return_value=mock_connection), - ) - return mock_connection.__enter__() - - -@pytest.fixture(name="ssh_connection_mock") -def ssh_connection_mock_fixture() -> MagicMock: - """Return a mocked ssh connection.""" - test_file_content = secrets.token_hex(16) - ssh_conn_mock = MagicMock(spec=openstack_manager.SSHConnection) - ssh_conn_mock.get.side_effect = lambda remote, local: Path(local).write_text(test_file_content) - ssh_conn_mock.run.side_effect = lambda cmd, 
**kwargs: ( - Result(stdout="1") if cmd.startswith("stat") else Result() - ) - ssh_conn_mock.run.return_value = Result() - - return ssh_conn_mock - - -@pytest.fixture(name="openstack_manager_for_reconcile") -def openstack_manager_for_reconcile_fixture( - monkeypatch: pytest.MonkeyPatch, - mock_github_client: MagicMock, - patched_create_connection_context: MagicMock, - tmp_path: Path, - ssh_connection_mock: MagicMock, -): - """Create a mocked openstack manager for the reconcile tests.""" - t_mock = MagicMock(return_value=12345) - monkeypatch.setattr(openstack_manager.time, "time", t_mock) - - issue_event_mock = MagicMock(spec=metric_events.issue_event) - monkeypatch.setattr(openstack_manager.metric_events, "issue_event", issue_event_mock) - - runner_metrics_mock = MagicMock(openstack_manager.runner_metrics) - monkeypatch.setattr(openstack_manager, "runner_metrics", runner_metrics_mock) - - github_metrics_mock = MagicMock(openstack_manager.github_metrics) - monkeypatch.setattr(openstack_manager, "github_metrics", github_metrics_mock) - - monkeypatch.setattr( - openstack_manager, "GithubClient", MagicMock(return_value=mock_github_client) - ) - - runner_metrics_path = tmp_path / "runner_fs" - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - - pool_mock = MagicMock() - pool_mock.__enter__.return_value = pool_mock - pool_mock.map.side_effect = lambda func, iterable: func(*iterable) - pool_cls_mock = MagicMock() - pool_cls_mock.return_value = pool_mock - monkeypatch.setattr(openstack_manager, "Pool", pool_cls_mock) - - app_name = secrets.token_hex(16) - charm_state = MagicMock(spec=CharmState) - charm_state.proxy_config = ProxyConfig() - charm_state.ssh_debug_connections = MagicMock() - charm_state.charm_config = MagicMock() - 
charm_state.charm_config.repo_policy_compliance = None - os_runner_manager_config = openstack_manager.OpenstackRunnerManagerConfig( - charm_state=charm_state, - path=MagicMock(), - labels=[], - token=secrets.token_hex(16), - flavor=app_name, - image="test-image-id", - network=secrets.token_hex(16), - dockerhub_mirror=None, - ) - patched_create_connection_context.create_keypair.return_value = Keypair(private_key="test_key") - server_mock = MagicMock() - server_mock.status = openstack_manager._INSTANCE_STATUS_ACTIVE - patched_create_connection_context.get_server.return_value = server_mock - - os_runner_manager = openstack_manager.OpenstackRunnerManager( - app_name=app_name, - unit_num=0, - openstack_runner_manager_config=os_runner_manager_config, - cloud_config={}, - ) - os_runner_manager._ssh_health_check = MagicMock(return_value=True) - os_runner_manager._get_ssh_connection = MagicMock(return_value=ssh_connection_mock) - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, "_wait_until_runner_process_running", MagicMock() - ) - - monkeypatch.setattr(openstack_manager, "_SSH_KEY_PATH", tmp_path) - monkeypatch.setattr(openstack_manager.shutil, "chown", MagicMock()) - - return os_runner_manager - - -@pytest.fixture(name="reactive_reconcile_mock") -def reactive_reconcile_fixture(monkeypatch: MonkeyPatch, tmp_path: Path) -> MagicMock: - """Mock the job class.""" - reconcile_mock = MagicMock(spec=reactive.runner_manager.reconcile) - monkeypatch.setattr( - "openstack_cloud.openstack_manager.reactive_runner_manager.reconcile", reconcile_mock - ) - reconcile_mock.side_effect = lambda quantity, **kwargs: quantity - return reconcile_mock - - -def test__create_connection_error(clouds_yaml: dict, openstack_connect_mock: MagicMock): - """ - arrange: given a monkeypatched connection.authorize() function that raises an error. - act: when _create_connection is called. - assert: OpenStackUnauthorizedError is raised. 
- """ - connection_mock = MagicMock() - connection_context = MagicMock() - connection_context.authorize.side_effect = openstack.exceptions.HttpException - connection_mock.__enter__.return_value = connection_context - openstack_connect_mock.return_value = connection_mock - - with pytest.raises(OpenStackError) as exc: - with openstack_manager._create_connection(cloud_config=clouds_yaml): - pass - - assert "Failed OpenStack API call" in str(exc) - - -def test__create_connection( - multi_clouds_yaml: dict, clouds_yaml: dict, cloud_name: str, openstack_connect_mock: MagicMock -): - """ - arrange: given a cloud config yaml dict with 1. multiple clouds 2. single cloud. - act: when _create_connection is called. - assert: connection with first cloud in the config is used. - """ - # 1. multiple clouds - with openstack_manager._create_connection(cloud_config=multi_clouds_yaml): - openstack_connect_mock.assert_called_with(cloud=CLOUD_NAME) - - # 2. single cloud - with openstack_manager._create_connection(cloud_config=clouds_yaml): - openstack_connect_mock.assert_called_with(cloud=cloud_name) - - -@pytest.mark.parametrize( - "dockerhub_mirror, ssh_debug_connections, expected_env_contents", - [ - pytest.param( - None, - None, - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh -""", - id="all values empty", - ), - pytest.param( - "http://dockerhub_mirror.test", - None, - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - -DOCKERHUB_MIRROR=http://dockerhub_mirror.test -CONTAINER_REGISTRY_URL=http://dockerhub_mirror.test - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh -""", - id="dockerhub mirror set", - ), - pytest.param( - None, - [ - openstack_manager.SSHDebugConnection( - host="127.0.0.1", - port=10022, - rsa_fingerprint="SHA256:testrsa", - 
ed25519_fingerprint="SHA256:tested25519", - ) - ], - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh - -TMATE_SERVER_HOST=127.0.0.1 -TMATE_SERVER_PORT=10022 -TMATE_SERVER_RSA_FINGERPRINT=SHA256:testrsa -TMATE_SERVER_ED25519_FINGERPRINT=SHA256:tested25519 -""", - id="ssh debug connection set", - ), - pytest.param( - "http://dockerhub_mirror.test", - [ - openstack_manager.SSHDebugConnection( - host="127.0.0.1", - port=10022, - rsa_fingerprint="SHA256:testrsa", - ed25519_fingerprint="SHA256:tested25519", - ) - ], - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - -DOCKERHUB_MIRROR=http://dockerhub_mirror.test -CONTAINER_REGISTRY_URL=http://dockerhub_mirror.test - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh - -TMATE_SERVER_HOST=127.0.0.1 -TMATE_SERVER_PORT=10022 -TMATE_SERVER_RSA_FINGERPRINT=SHA256:testrsa -TMATE_SERVER_ED25519_FINGERPRINT=SHA256:tested25519 -""", - id="all values set", - ), - ], -) -def test__generate_runner_env( - dockerhub_mirror: Optional[str], - ssh_debug_connections: Optional[list[openstack_manager.SSHDebugConnection]], - expected_env_contents: str, -): - """ - arrange: given configuration values for runner environment. - act: when _generate_runner_env is called. - assert: expected .env contents are generated. - """ - environment = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"), autoescape=True) - assert ( - openstack_manager._generate_runner_env( - templates_env=environment, - dockerhub_mirror=dockerhub_mirror, - ssh_debug_connections=ssh_debug_connections, - ) - == expected_env_contents - ) - - -def test_reconcile_issues_runner_installed_event( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, -): - """ - arrange: Mock openstack manager for reconcile. 
- act: Reconcile to create a runner. - assert: The expected event is issued. - """ - openstack_manager_for_reconcile.reconcile(quantity=1) - - openstack_manager.metric_events.issue_event.assert_has_calls( - [ - call( - event=metric_events.RunnerInstalled( - timestamp=openstack_manager.time.time(), - flavor=openstack_manager_for_reconcile.app_name, - duration=0, - ) - ) - ] - ) - - -def test_reconcile_places_timestamp_in_metrics_storage( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -): - """ - arrange: Mock timestamps and create the directory for the metrics storage. - act: Reconcile to create a runner. - assert: The expected timestamp is placed in the shared filesystem. - """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - - openstack_manager_for_reconcile.reconcile(quantity=1) - - assert (ms.path / RUNNER_INSTALLED_TS_FILE_NAME).exists() - assert (ms.path / RUNNER_INSTALLED_TS_FILE_NAME).read_text() == str( - openstack_manager.time.time() - ) - - -def test_reconcile_error_on_placing_timestamp_is_ignored( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -): - """ - arrange: Do not create the directory for the metrics storage\ - in order to let a FileNotFoundError to be raised inside the OpenstackRunnerManager. - act: Reconcile to create a runner. - assert: No exception is raised. 
- """ - runner_metrics_path = tmp_path / "runner_fs" - - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - - openstack_manager_for_reconcile.reconcile(quantity=1) - - assert not (ms.path / RUNNER_INSTALLED_TS_FILE_NAME).exists() - - -def test_reconcile_pulls_metric_files( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, - ssh_connection_mock: MagicMock, -): - """ - arrange: Mock the metrics storage and the ssh connection. - act: Reconcile to create a runner. - assert: The expected metric files are pulled from the shared filesystem. - """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerNameByHealth(healthy=(), unhealthy=("test_runner",)) - ) - ssh_connection_mock.get.side_effect = MagicMock() - openstack_manager_for_reconcile.reconcile(quantity=0) - - ssh_connection_mock.get.assert_any_call( - remote=str(METRICS_EXCHANGE_PATH / "pre-job-metrics.json"), - local=str(ms.path / "pre-job-metrics.json"), - ) - ssh_connection_mock.get.assert_any_call( - remote=str(METRICS_EXCHANGE_PATH / "post-job-metrics.json"), - local=str(ms.path / "post-job-metrics.json"), - ) - - -def test_reconcile_does_not_pull_too_large_files( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, - ssh_connection_mock: MagicMock, -): - """ - arrange: Mock the metrics storage and the ssh connection to return a file that is too large. 
- act: Reconcile to create a runner. - assert: The expected metric files are not pulled from the shared filesystem. - """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - ssh_connection_mock.run.side_effect = lambda cmd, **kwargs: ( - Result(stdout=f"{MAX_METRICS_FILE_SIZE + 1}") if cmd.startswith("stat") else Result() - ) - openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerNameByHealth(healthy=("test_runner",), unhealthy=()) - ) - - openstack_manager_for_reconcile.reconcile(quantity=0) - - assert not (ms.path / "pre-job-metrics.json").exists() - assert not (ms.path / "post-job-metrics.json").exists() - - -def test_reconcile_issue_reconciliation_metrics( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -): - """ - arrange: Mock the metrics storage and the ssh connection. - act: Reconcile. - assert: The expected reconciliation metrics are issued. 
- """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerNameByHealth(healthy=("test_runner",), unhealthy=()) - ) - - openstack_manager.runner_metrics.extract.return_value = (MagicMock() for _ in range(2)) - openstack_manager.runner_metrics.issue_events.side_effect = [ - {metric_events.RunnerStart, metric_events.RunnerStop}, - {metric_events.RunnerStart}, - ] - - openstack_manager_for_reconcile._github.get_runner_github_info.return_value = [ - SelfHostedRunner( - busy=False, - id=1, - labels=[], - os="linux", - name=f"{openstack_manager_for_reconcile.instance_name}-test_runner", - status=GitHubRunnerStatus.ONLINE, - ) - ] - openstack_manager_for_reconcile.reconcile(quantity=0) - - openstack_manager.metric_events.issue_event.assert_has_calls( - [ - call( - event=metric_events.Reconciliation( - timestamp=12345, - flavor=openstack_manager_for_reconcile.app_name, - crashed_runners=1, - idle_runners=1, - duration=0, - ) - ) - ] - ) - - -def test_reconcile_ignores_metrics_for_openstack_online_runners( - openstack_manager_for_reconcile, - monkeypatch, - tmp_path, - patched_create_connection_context: MagicMock, -): - """ - arrange: Combination of runner status/github status and openstack status. - act: Call reconcile. - assert: All runners which have an instance on Openstack are ignored for metrics extraction. 
- """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - instance_name = openstack_manager_for_reconcile.instance_name - runner_names = { - k: f"{instance_name}-{k}" - for k in [ - "healthy_online", - "healthy_offline", - "unhealthy_online", - "unhealthy_offline", - "openstack_online_no_github_status", - "github_online_no_openstack_status", - ] - } - openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerNameByHealth( - healthy=(runner_names["healthy_online"], runner_names["healthy_offline"]), - unhealthy=( - runner_names["unhealthy_online"], - runner_names["unhealthy_offline"], - runner_names["github_online_no_openstack_status"], - ), - ) - ) - openstack_manager_for_reconcile.get_github_runner_info = MagicMock( - return_value=( - RunnerGithubInfo( - runner_name=runner_names["healthy_online"], runner_id=0, online=True, busy=True - ), - RunnerGithubInfo( - runner_name=runner_names["unhealthy_online"], runner_id=1, online=True, busy=False - ), - RunnerGithubInfo( - runner_name=runner_names["healthy_offline"], runner_id=2, online=False, busy=False - ), - RunnerGithubInfo( - runner_name=runner_names["unhealthy_offline"], - runner_id=3, - online=False, - busy=False, - ), - RunnerGithubInfo( - runner_name=runner_names["github_online_no_openstack_status"], - runner_id=4, - online=True, - busy=False, - ), - ) - ) - - openstack_online_runner_names = [ - runner - for (name, runner) in runner_names.items() - if name != "github_online_no_openstack_status" - ] - openstack_instances = [ - openstack_manager.openstack.compute.v2.server.Server( - name=runner_name, status=random.choice(("ACTIVE", "BUILD", "STOPPED")) - ) - for runner_name in 
openstack_online_runner_names - ] - patched_create_connection_context.list_servers.return_value = openstack_instances - - openstack_manager.runner_metrics.extract.return_value = (MagicMock() for _ in range(1)) - openstack_manager.runner_metrics.issue_events.side_effect = [ - {metric_events.RunnerStart, metric_events.RunnerStop}, - ] - - openstack_manager_for_reconcile.reconcile(quantity=0) - - openstack_manager.runner_metrics.extract.assert_called_once_with( - metrics_storage_manager=metrics.storage, - runners=set(openstack_online_runner_names), - ) - - -def test_reconcile_reactive_mode( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - reactive_reconcile_mock: MagicMock, - caplog: LogCaptureFixture, -): - """ - arrange: Enable reactive mode and mock the job class to return a job. - act: Call reconcile with a random quantity n. - assert: The mocked job is picked up n times and the expected log message is present. - """ - count = random.randint(0, 5) - openstack_manager_for_reconcile._config.reactive_config = ReactiveConfig( - mq_uri=FAKE_MONGODB_URI - ) - actual_count = openstack_manager_for_reconcile.reconcile(quantity=count) - - assert actual_count == count - reactive_reconcile_mock.assert_called_with( - quantity=count, - mq_uri=FAKE_MONGODB_URI, - queue_name=openstack_manager_for_reconcile.app_name, - ) - - -def test_repo_policy_config( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - patched_create_connection_context: MagicMock, -): - """ - arrange: Mock the repo policy compliance config. - act: Reconcile to create a runner. - assert: The expected url and one-time-token is present in the pre-job script in \ - the cloud-init data. 
- """ - test_url = "http://test.url" - token = secrets.token_hex(16) - one_time_token = secrets.token_hex(16) - openstack_manager_for_reconcile._config.charm_state.charm_config.repo_policy_compliance = ( - RepoPolicyComplianceConfig(url=test_url, token=token) - ) - repo_policy_compliance_client_mock = MagicMock( - spec=openstack_manager.RepoPolicyComplianceClient - ) - repo_policy_compliance_client_mock.base_url = test_url - repo_policy_compliance_client_mock.get_one_time_token.return_value = one_time_token - repo_policy_compliance_cls_mock = MagicMock(return_value=repo_policy_compliance_client_mock) - monkeypatch.setattr( - openstack_manager, "RepoPolicyComplianceClient", repo_policy_compliance_cls_mock - ) - - openstack_manager_for_reconcile.reconcile(quantity=1) - - cloud_init_data_str = patched_create_connection_context.create_server.call_args[1]["userdata"] - repo_policy_compliance_client_mock.get_one_time_token.assert_called_once() - assert one_time_token in cloud_init_data_str - assert test_url in cloud_init_data_str - - -def test__ensure_security_group_with_existing_rules(): - """ - arrange: Mock OpenStack connection with the security rules created. - act: Run `_ensure_security_group`. - assert: The security rules are not created again. - """ - connection_mock = MagicMock(spec=openstack.connection.Connection) - connection_mock.list_security_groups.return_value = [ - { - "security_group_rules": [ - {"protocol": "icmp"}, - {"protocol": "tcp", "port_range_min": 22, "port_range_max": 22}, - {"protocol": "tcp", "port_range_min": 10022, "port_range_max": 10022}, - ], - "id": "TEST_ID", - } - ] - - openstack_manager.OpenstackRunnerManager._ensure_security_group(connection_mock) - connection_mock.create_security_group_rule.assert_not_called() - - -def test__get_ssh_connection( - monkeypatch, - patch_get_ssh_connection_health_check, - mock_server: MagicMock, -): - """ - arrange: A server with SSH setup correctly. - act: Get the SSH connections. 
- assert: The SSH connections contains at least one connection. - """ - # Patching the `_get_key_path` to get around the keyfile checks. - mock__get_key_path = MagicMock(spec=openstack_manager.OpenstackRunnerManager._get_key_path) - mock_key_path = MagicMock(spec=Path) - mock_key_path.exists.return_value = True - mock__get_key_path.return_value = mock_key_path - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, "_get_key_path", mock__get_key_path - ) - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = mock_server - - conn = openstack_manager.OpenstackRunnerManager._get_ssh_connection( - mock_connection, mock_server.name - ) - assert conn is not None - - -@pytest.mark.usefixtures("skip_retry") -def test__ssh_health_check_success(monkeypatch: pytest.MonkeyPatch, mock_server: MagicMock): - """ - arrange: A server with SSH correctly setup. - act: Run health check on the server. - assert: The health check passes. - """ - ssh_connection_mock = MagicMock() - result_mock = MagicMock() - result_mock.stdout = "/home/ubuntu/actions-runner/run.sh\nRunner.Worker" - ssh_connection_mock.run.return_value = result_mock - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - MagicMock(return_value=ssh_connection_mock), - ) - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = mock_server - - assert openstack_manager.OpenstackRunnerManager._ssh_health_check( - mock_connection, mock_server.name, False - ) - - -@pytest.mark.usefixtures("skip_retry") -def test__ssh_health_check_no_key(mock_server: MagicMock): - """ - arrange: A server with no key available. - act: Run health check on the server. - assert: The health check fails. - """ - # Remove the mock SSH key. 
- mock_server.key_name = None - - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = mock_server - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._ssh_health_check( - mock_connection, mock_server.name, False - ) - - assert "no valid keypair found" in str(exc) - - -@pytest.mark.usefixtures("skip_retry") -def test__ssh_health_check_error(monkeypatch: pytest.MonkeyPatch, mock_server: MagicMock): - """ - arrange: A server with error on SSH run. - act: Run health check on the server. - assert: The health check fails. - """ - monkeypatch.setattr(openstack_manager.OpenstackRunnerManager, "_get_key_path", MagicMock()) - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = mock_server - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = MagicMock(side_effect=TimeoutError) - monkeypatch.setattr( - openstack_manager, "SSHConnection", MagicMock(return_value=mock_ssh_connection) - ) - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._ssh_health_check( - mock_connection, mock_server.name, False - ) - - assert "No connectable SSH addresses found" in str(exc) - - -def test__wait_until_runner_process_running_no_server(): - """ - arrange: No server existing on the OpenStack connection. - act: Check if runner process is running. - assert: RunnerStartError thrown. 
- """ - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = None - - with pytest.raises(RunnerStartError): - openstack_manager.OpenstackRunnerManager._wait_until_runner_process_running( - mock_connection, "Non-existing-server" - ) - - -@pytest.mark.parametrize( - "server", - [ - pytest.param(None, id="no server"), - pytest.param(factories.MockOpenstackServer(status="SHUTOFF"), id="shutoff"), - pytest.param(factories.MockOpenstackServer(status="REBUILD"), id="not active/building"), - ], -) -def test__health_check(server: factories.MockOpenstackServer | None): - """ - arrange: given a mock openstack.get_server response. - act: when _health_check is called. - assert: False is returned, meaning unhealthy runner. - """ - mock_get_server = MagicMock(return_value=server) - mock_connection = MagicMock() - mock_connection.get_server = mock_get_server - - assert not openstack_manager.OpenstackRunnerManager._health_check( - conn=mock_connection, server_name="test" - ) - - -# The SSH health check will temporarily return True on failure for debugging purposes. -@pytest.mark.xfail -def test__ssh_health_check_connection_error(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given a monkeypatched _get_ssh_connection function that raises _SSHError. - act: when _ssh_health_check is called. - assert: False is returned, meaning unhealthy runner. 
- """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - MagicMock(side_effect=openstack_manager._SSHError), - ) - - assert not openstack_manager.OpenstackRunnerManager._ssh_health_check( - server=MagicMock(), startup=False - ) - - -@pytest.mark.parametrize( - "result", - [ - pytest.param(factories.MockSSHRunResult(exited=1), id="ssh result not ok"), - pytest.param( - factories.MockSSHRunResult(exited=0, stdout=""), - id="runner process not found in stdout", - ), - # This health check should fail but temporarily marking as passing for passive runner - # deletion until we have more data. - pytest.param( - factories.MockSSHRunResult(exited=0, stdout="/home/ubuntu/actions-runner/run.sh"), - id="startup process exists but no listener or worker process", - ), - ], -) -@pytest.mark.xfail -def test__ssh_health_check_unhealthy( - monkeypatch: pytest.MonkeyPatch, result: factories.MockSSHRunResult -): - """ - arrange: given unhealthy ssh responses. - act: when _ssh_health_check is called. - assert: False is returned, meaning unhealthy runner. 
- """ - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = MagicMock(return_value=result) - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - MagicMock(return_value=mock_ssh_connection), - ) - - assert not openstack_manager.OpenstackRunnerManager._ssh_health_check( - server=MagicMock(), startup=False - ) - - -@pytest.mark.parametrize( - "result, startup", - [ - pytest.param( - factories.MockSSHRunResult( - exited=0, stdout="/home/ubuntu/actions-runner/run.sh\nRunner.Worker" - ), - False, - id="runner process & workper process found", - ), - pytest.param( - factories.MockSSHRunResult( - exited=0, stdout="/home/ubuntu/actions-runner/run.sh\nRunner.Listener" - ), - False, - id="runner process & listener process found", - ), - pytest.param( - factories.MockSSHRunResult(exited=0, stdout="/home/ubuntu/actions-runner/run.sh"), - True, - id="runner process found for startup", - ), - ], -) -def test__ssh_health_check_healthy( - monkeypatch: pytest.MonkeyPatch, result: factories.MockSSHRunResult, startup: bool -): - """ - arrange: given healthy ssh response. - act: when _ssh_health_check is called. - assert: True is returned, meaning healthy runner. - """ - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = MagicMock(return_value=result) - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - MagicMock(return_value=mock_ssh_connection), - ) - - assert openstack_manager.OpenstackRunnerManager._ssh_health_check( - conn=MagicMock(), server_name=MagicMock(), startup=startup - ) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_server_gone(): - """ - arrange: given a mock Openstack get_server function that returns None. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. 
- """ - mock_connection = MagicMock() - mock_connection.get_server.return_value = None - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "Server gone while trying to get SSH connection" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_no_server_key(): - """ - arrange: given a mock server instance with no key attached. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. - """ - mock_server = MagicMock() - mock_server.key_name = None - mock_connection = MagicMock() - mock_connection.get_server.return_value = mock_server - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "Unable to create SSH connection, no valid keypair found" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_key_not_exists(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given a monkeypatched _get_key_path function that returns a non-existent path. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. - """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_key_path", - MagicMock(return_value=Path("does-not-exist")), - ) - mock_connection = MagicMock() - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "Missing keyfile for server" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_server_no_addresses(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given a mock server instance with no server addresses. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. 
- """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_key_path", - MagicMock(return_value=Path(".")), - ) - mock_server = MagicMock() - mock_server.addresses = {} - mock_connection = MagicMock() - mock_connection.get_server.return_value = mock_server - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "No addresses found for OpenStack server" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -@pytest.mark.parametrize( - "run", - [ - pytest.param(MagicMock(side_effect=TimeoutError), id="timeout error"), - pytest.param( - MagicMock(return_value=factories.MockSSHRunResult(exited=1)), id="result not ok" - ), - pytest.param( - MagicMock(return_value=factories.MockSSHRunResult(exited=0, stdout="")), - id="empty response", - ), - ], -) -def test__get_ssh_connection_server_no_valid_connections( - monkeypatch: pytest.MonkeyPatch, run: MagicMock -): - """ - arrange: given a monkeypatched Connection instance that returns invalid connections. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. 
- """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_key_path", - MagicMock(return_value=Path(".")), - ) - mock_server = MagicMock() - mock_server.addresses = {"test": [{"addr": "test-address"}]} - mock_connection = MagicMock() - mock_connection.get_server.return_value = mock_server - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = run - monkeypatch.setattr( - openstack_manager, "SSHConnection", MagicMock(return_value=mock_ssh_connection) - ) - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "No connectable SSH addresses found" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_server(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given monkeypatched SSH connection instance. - act: when _get_ssh_connection is called. - assert: the SSH connection instance is returned. - """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_key_path", - MagicMock(return_value=Path(".")), - ) - mock_server = MagicMock() - mock_server.addresses = {"test": [{"addr": "test-address"}]} - mock_connection = MagicMock() - mock_connection.get_server.return_value = mock_server - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = MagicMock( - return_value=factories.MockSSHRunResult(exited=0, stdout="hello world") - ) - monkeypatch.setattr( - openstack_manager, "SSHConnection", MagicMock(return_value=mock_ssh_connection) - ) - - assert ( - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - == mock_ssh_connection - ) - - -def test_flush(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given monkeypatched sub functions of flush. - act: when flush is called. - assert: sub functions are called. 
- """ - monkeypatch.setattr(openstack_manager, "_create_connection", MagicMock()) - monkeypatch.setattr(openstack_manager, "set_env_var", MagicMock()) - runner_manager = openstack_manager.OpenstackRunnerManager( - app_name=MagicMock(), - unit_num=MagicMock(), - openstack_runner_manager_config=MagicMock(), - cloud_config=MagicMock(), - ) - runner_manager._kill_runner_processes = MagicMock() - runner_manager._get_openstack_runner_status = MagicMock() - runner_manager._github = MagicMock() - runner_manager._remove_runners = MagicMock() - - runner_manager.flush(mode=MagicMock()) - - runner_manager._kill_runner_processes.assert_called() - runner_manager._get_openstack_runner_status.assert_called() - runner_manager._github.get_runner_remove_token.assert_called() - runner_manager._remove_runners.assert_called() - - -@pytest.mark.parametrize( - "flush_mode, expected_command", - [ - pytest.param( - FlushMode.FLUSH_BUSY, - "pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener);" - "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);", - id="Flush Busy", - ), - pytest.param( - FlushMode.FLUSH_IDLE, - "! pgrep -x Runner.Worker && pgrep -x Runner.Listener && " - "kill $(pgrep -x Runner.Listener)", - id="Flush Idle", - ), - ], -) -def test__kill_runner_processes( - monkeypatch: pytest.MonkeyPatch, flush_mode: FlushMode, expected_command: str -): - """ - arrange: given supported flush modes. - act: when _kill_runner_processes is called. - assert: expected kill commands are issued. 
- """ - monkeypatch.setattr(openstack_manager, "_create_connection", MagicMock()) - monkeypatch.setattr(openstack_manager, "set_env_var", MagicMock()) - runner_manager = openstack_manager.OpenstackRunnerManager( - app_name=MagicMock(), - unit_num=MagicMock(), - openstack_runner_manager_config=MagicMock(), - cloud_config=MagicMock(), - ) - runner_manager._get_openstack_instances = MagicMock(return_value=[MagicMock(), MagicMock()]) - mock_connection = MagicMock() - runner_manager._get_ssh_connection = MagicMock(return_value=mock_connection) - - runner_manager._kill_runner_processes(conn=MagicMock(), mode=flush_mode) - - mock_connection.run.assert_called_with(expected_command, warn=True) diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py index 7312a69e4..3bf4dfea0 100644 --- a/tests/unit/test_runner_scaler.py +++ b/tests/unit/test_runner_scaler.py @@ -61,12 +61,8 @@ def runner_manager_fixture( "manager.runner_manager.RunnerManager._spawn_runners", mock_runner_manager_spawn_runners ) # Patch out the metrics, as metrics has their own tests. - monkeypatch.setattr( - "manager.runner_manager.github_metrics.job", MagicMock() - ) - monkeypatch.setattr( - "manager.runner_manager.runner_metrics.issue_events", MagicMock() - ) + monkeypatch.setattr("manager.runner_manager.github_metrics.job", MagicMock()) + monkeypatch.setattr("manager.runner_manager.runner_metrics.issue_events", MagicMock()) config = RunnerManagerConfig("mock_token", github_path) runner_manager = RunnerManager("mock_runners", mock_cloud, config) From 19b1a385bf88806b03887097d2f8c747ab3d82e6 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 28 Aug 2024 10:34:30 +0800 Subject: [PATCH 244/278] Fix test contstruction of runner manager. 
--- tests/integration/test_runner_manager_openstack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 089888b94..dfce31a8e 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -141,7 +141,7 @@ async def runner_manager_fixture( Import of log_dir_base_path to monkeypatch the runner logs path with tmp_path. """ config = RunnerManagerConfig(token, github_path) - return RunnerManager(openstack_runner_manager, config) + return RunnerManager("test_runner", openstack_runner_manager, config) @pytest_asyncio.fixture(scope="function", name="runner_manager_with_one_runner") From dab56116d94e1fe639e399f9ffca75b7b5b2faa0 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 28 Aug 2024 12:48:16 +0800 Subject: [PATCH 245/278] Fix flavor naming --- .../openstack_cloud.openstack_runner_manager.md | 14 ++++++++------ src/charm.py | 1 + src/openstack_cloud/openstack_runner_manager.py | 5 ++++- tests/integration/test_runner_manager_openstack.py | 2 +- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 81a35247b..7f28b8689 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -95,6 +95,7 @@ Manage self-hosted runner on OpenStack cloud. ```python __init__( + manager_name: str, prefix: str, cloud_config: OpenStackCloudConfig, server_config: OpenStackServerConfig | None, @@ -109,6 +110,7 @@ Construct the object. **Args:** + - `manager_name`: A name to identify this manager. - `prefix`: The prefix to runner name. - `cloud_config`: The configuration for OpenStack authorisation. - `server_config`: The configuration for creating OpenStack server. 
Unable to create runner if None. @@ -131,7 +133,7 @@ The prefix of runner names. --- - + ### method `cleanup` @@ -154,7 +156,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -184,7 +186,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -208,7 +210,7 @@ Delete self-hosted runners. --- - + ### method `flush_runners` @@ -231,7 +233,7 @@ Remove idle and/or busy runners. --- - + ### method `get_runner` @@ -254,7 +256,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src/charm.py b/src/charm.py index 5adcd8b01..714b4485e 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1255,6 +1255,7 @@ def _get_runner_scaler( ) # The prefix is set to f"{application_name}-{unit number}" openstack_runner_manager = OpenStackRunnerManager( + manager_name=self.app.name, prefix=self.unit.name.replace("/", "-"), cloud_config=cloud_config, server_config=server_config, diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index c87af1a7d..aa03b0ec3 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -119,6 +119,7 @@ class OpenStackRunnerManager(CloudRunnerManager): # Ignore "Too many arguments", as the class requires a lot of configurations. def __init__( # pylint: disable=R0913 self, + manager_name: str, prefix: str, cloud_config: OpenStackCloudConfig, server_config: OpenStackServerConfig | None, @@ -128,6 +129,7 @@ def __init__( # pylint: disable=R0913 """Construct the object. Args: + manager_name: A name to identify this manager. prefix: The prefix to runner name. cloud_config: The configuration for OpenStack authorisation. server_config: The configuration for creating OpenStack server. Unable to create @@ -135,6 +137,7 @@ def __init__( # pylint: disable=R0913 runner_config: The configuration for the runner. 
service_config: The configuration of supporting services of the runners. """ + self._manager_name = manager_name self._prefix = prefix self._cloud_config = cloud_config self._server_config = server_config @@ -194,7 +197,7 @@ def create_runner(self, registration_token: str) -> InstanceId: end_timestamp = time.time() OpenStackRunnerManager._issue_runner_installed_metric( name=instance_name, - flavor=self.name_prefix, + flavor=self._manager_name, install_start_timestamp=start_timestamp, install_end_timestamp=end_timestamp, ) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index dfce31a8e..63b7204b3 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -125,7 +125,7 @@ async def openstack_runner_manager_fixture( repo_policy_compliance=None, ) return OpenStackRunnerManager( - app_name, cloud_config, server_config, runner_config, service_config + app_name, f"{app_name}-0", cloud_config, server_config, runner_config, service_config ) From 921ce2a0b3022007fe18caaa73ae0f4824303577 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:29:46 +0800 Subject: [PATCH 246/278] Fix flush action result output. 
--- pyproject.toml | 2 +- src-docs/runner_manager.md | 2 +- src-docs/runner_manager_type.md | 2 +- src/charm.py | 30 +++++++++++++++--------------- src/runner_manager.py | 18 +++++++++--------- src/runner_manager_type.py | 2 +- 6 files changed, 28 insertions(+), 28 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a60427837..9b69abe91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ omit = [ ] [tool.coverage.report] -fail_under = 83 +fail_under = 85 show_missing = true [tool.pytest.ini_options] diff --git a/src-docs/runner_manager.md b/src-docs/runner_manager.md index 2cf622469..f3ea1a1e7 100644 --- a/src-docs/runner_manager.md +++ b/src-docs/runner_manager.md @@ -94,7 +94,7 @@ Check if runner binary exists. ### method `flush` ```python -flush(mode: FlushMode = ) → int +flush(mode: LXDFlushMode = ) → int ``` Remove existing runners. diff --git a/src-docs/runner_manager_type.md b/src-docs/runner_manager_type.md index f6b58a83c..cd7eaf5d2 100644 --- a/src-docs/runner_manager_type.md +++ b/src-docs/runner_manager_type.md @@ -11,7 +11,7 @@ Types used by RunnerManager class. -## class `FlushMode` +## class `LXDFlushMode` Strategy for flushing runners. During pre-job (repo-check), the runners are marked as idle and if the pre-job fails, the runner falls back to being idle again. Hence wait_repo_check is required. 
diff --git a/src/charm.py b/src/charm.py index 714b4485e..122b01efb 100755 --- a/src/charm.py +++ b/src/charm.py @@ -9,7 +9,7 @@ """Charm for creating and managing GitHub self-hosted runner instances.""" from manager.cloud_runner_manager import GitHubRunnerConfig, SupportServiceConfig -from manager.runner_manager import RunnerManager, RunnerManagerConfig +from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig from manager.runner_scaler import RunnerScaler from utilities import bytes_with_unit_to_kib, execute_command, remove_residual_venv_dirs, retry @@ -88,7 +88,7 @@ ) from runner import LXD_PROFILE_YAML from runner_manager import LXDRunnerManager, LXDRunnerManagerConfig -from runner_manager_type import FlushMode +from runner_manager_type import LXDFlushMode RECONCILE_RUNNERS_EVENT = "reconcile-runners" @@ -515,7 +515,7 @@ def _on_start(self, _: StartEvent) -> None: self.unit.status = MaintenanceStatus("Starting runners") try: - runner_manager.flush(FlushMode.FLUSH_IDLE) + runner_manager.flush(LXDFlushMode.FLUSH_IDLE) self._reconcile_runners( runner_manager, state.runner_config.virtual_machines, @@ -581,7 +581,7 @@ def _on_upgrade_charm(self, _: UpgradeCharmEvent) -> None: runner_manager = self._get_runner_manager(state) logger.info("Flushing the runners...") - runner_manager.flush(FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) + runner_manager.flush(LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) self._reconcile_runners( runner_manager, state.runner_config.virtual_machines, @@ -617,7 +617,7 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 if prev_runner_manager: self.unit.status = MaintenanceStatus("Removing runners due to config change") # Flush runner in case the prev token has expired. 
- prev_runner_manager.flush(FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) + prev_runner_manager.flush(LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) state = self._setup_state() @@ -636,7 +636,7 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 runner_manager = self._get_runner_manager(state) if state.charm_config.token != self._stored.token: - runner_manager.flush(FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) + runner_manager.flush(LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) self._stored.token = state.charm_config.token self._reconcile_runners( runner_manager, @@ -697,7 +697,7 @@ def _check_and_update_local_lxd_dependencies( runner_bin_updated, ) self.unit.status = MaintenanceStatus("Flushing runners due to updated deps") - runner_manager.flush(FlushMode.FLUSH_IDLE_WAIT_REPO_CHECK) + runner_manager.flush(LXDFlushMode.FLUSH_IDLE_WAIT_REPO_CHECK) self._start_services(token, proxy_config) self.unit.status = ActiveStatus() @@ -850,21 +850,21 @@ def _on_flush_runners_action(self, event: ActionEvent) -> None: if state.instance_type == InstanceType.OPENSTACK: # Flushing mode not implemented for OpenStack yet. 
runner_scaler = self._get_runner_scaler(state) - flushed = runner_scaler.flush() - event.set_results({"delta": {"virtual-machines": flushed}}) + flushed = runner_scaler.flush(flush_mode=FlushMode.FLUSH_BUSY) + logger.info("Flushed %s runners", flushed) + delta = runner_scaler.reconcile(state.runner_config.virtual_machines) + event.set_results({"delta": {"virtual-machines": delta}}) return runner_manager = self._get_runner_manager(state) - runner_manager.flush(FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) + runner_manager.flush(LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) delta = self._reconcile_runners( runner_manager, state.runner_config.virtual_machines, state.runner_config.virtual_machine_resources, ) - - self._on_check_runners_action(event) - event.set_results(delta) + event.set_results({"delta": {"virtual-machines": delta}}) @catch_action_errors def _on_update_dependencies_action(self, event: ActionEvent) -> None: @@ -902,7 +902,7 @@ def _on_stop(self, _: StopEvent) -> None: return runner_manager = self._get_runner_manager(state) - runner_manager.flush(FlushMode.FLUSH_BUSY) + runner_manager.flush(LXDFlushMode.FLUSH_BUSY) def _reconcile_runners( self, runner_manager: LXDRunnerManager, num: int, resources: VirtualMachineResources @@ -1158,7 +1158,7 @@ def _on_debug_ssh_relation_changed(self, _: ops.RelationChangedEvent) -> None: self._refresh_firewall(state) runner_manager = self._get_runner_manager(state) - runner_manager.flush(FlushMode.FLUSH_IDLE) + runner_manager.flush(LXDFlushMode.FLUSH_IDLE) self._reconcile_runners( runner_manager, state.runner_config.virtual_machines, diff --git a/src/runner_manager.py b/src/runner_manager.py index e79d9f7a6..914bfdb0f 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -41,7 +41,7 @@ from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME from repo_policy_compliance_client import RepoPolicyComplianceClient from runner import LXD_PROFILE_YAML, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus -from 
runner_manager_type import FlushMode, LXDRunnerManagerConfig, RunnerInfo, RunnerManagerClients +from runner_manager_type import LXDFlushMode, LXDRunnerManagerConfig, RunnerInfo, RunnerManagerClients from runner_type import ProxySetting as RunnerProxySetting from runner_type import RunnerNameByHealth from utilities import execute_command, retry, set_env_var @@ -619,7 +619,7 @@ def _runners_in_pre_job(self) -> bool: return False return True - def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: + def flush(self, mode: LXDFlushMode = LXDFlushMode.FLUSH_IDLE) -> int: """Remove existing runners. Args: @@ -636,7 +636,7 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: remove_token = self._clients.github.get_runner_remove_token(self.config.path) except GithubClientError: logger.exception("Failed to get remove-token to unregister runners from GitHub.") - if mode != FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK: + if mode != LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK: raise logger.info("Proceeding with flush without remove-token.") remove_token = None @@ -656,9 +656,9 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: logger.info(REMOVED_RUNNER_LOG_STR, runner.config.name) if mode in ( - FlushMode.FLUSH_IDLE_WAIT_REPO_CHECK, - FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, - FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, + LXDFlushMode.FLUSH_IDLE_WAIT_REPO_CHECK, + LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, + LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, ): for _ in range(5): if not self._runners_in_pre_job(): @@ -673,9 +673,9 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: ) if mode in ( - FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, - FlushMode.FLUSH_BUSY, - FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, + LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, + LXDFlushMode.FLUSH_BUSY, + LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, ): busy_runners = [runner for runner in self._get_runners() if runner.status.exist] diff --git a/src/runner_manager_type.py 
b/src/runner_manager_type.py index e37e0b290..95f8edcc3 100644 --- a/src/runner_manager_type.py +++ b/src/runner_manager_type.py @@ -17,7 +17,7 @@ from repo_policy_compliance_client import RepoPolicyComplianceClient -class FlushMode(Enum): +class LXDFlushMode(Enum): """Strategy for flushing runners. During pre-job (repo-check), the runners are marked as idle and if the pre-job fails, the From 4a3866e995d377b3093cf06df295491ce64fc296 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 28 Aug 2024 18:22:10 +0800 Subject: [PATCH 247/278] Fix flavor of metric --- src/manager/runner_scaler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index 5216ec4f3..b2727aabe 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -161,7 +161,7 @@ def reconcile(self, num_of_runner: int) -> int: metric_events.issue_event( metric_events.Reconciliation( timestamp=time.time(), - flavor=self._manager.name_prefix, + flavor=self._manager.manager_name, crashed_runners=metric_stats.get(metric_events.RunnerStart, 0) - metric_stats.get(metric_events.RunnerStop, 0), idle_runners=len(available_runners), From c11028ed3f2eb97dcad6c8544a2fb3e8a510d3e6 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 28 Aug 2024 18:23:39 +0800 Subject: [PATCH 248/278] Testing out a integration test fikx --- tests/integration/test_charm_metrics_failure.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index e3de1600d..a6058c8cb 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -55,7 +55,7 @@ async def app_fixture( { VIRTUAL_MACHINES_CONFIG_NAME: "0", "repo-policy-compliance-token": "", - "repo-policy-compliance-url": "", + 
"repo-policy-compliance-url": "http://fake_site.com", } ) await reconcile(app=app_for_metric, model=model) From 58a6782c2092c6958d069cdbb93d6cc4b8e8c66d Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 29 Aug 2024 09:21:13 +0800 Subject: [PATCH 249/278] change flush runner to flush idle. --- src/charm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/charm.py b/src/charm.py index 122b01efb..8cf81b910 100755 --- a/src/charm.py +++ b/src/charm.py @@ -850,7 +850,7 @@ def _on_flush_runners_action(self, event: ActionEvent) -> None: if state.instance_type == InstanceType.OPENSTACK: # Flushing mode not implemented for OpenStack yet. runner_scaler = self._get_runner_scaler(state) - flushed = runner_scaler.flush(flush_mode=FlushMode.FLUSH_BUSY) + flushed = runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) logger.info("Flushed %s runners", flushed) delta = runner_scaler.reconcile(state.runner_config.virtual_machines) event.set_results({"delta": {"virtual-machines": delta}}) From 9b57745c7c2e394b8e2f4aa1b0f0248da3efa0b2 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 29 Aug 2024 09:42:54 +0800 Subject: [PATCH 250/278] Add debug in integration test --- tests/integration/test_charm_metrics_failure.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index a6058c8cb..f62246158 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -55,7 +55,7 @@ async def app_fixture( { VIRTUAL_MACHINES_CONFIG_NAME: "0", "repo-policy-compliance-token": "", - "repo-policy-compliance-url": "http://fake_site.com", + "repo-policy-compliance-url": "", } ) await reconcile(app=app_for_metric, model=model) @@ -84,6 +84,7 @@ async def test_charm_issues_metrics_for_failed_repo_policy( await 
app.set_config({PATH_CONFIG_NAME: forked_github_repository.full_name}) if isinstance(instance_helper, OpenStackInstanceHelper): + print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") await setup_repo_policy( app=app, openstack_connection=instance_helper.openstack_connection, From c51bb7be8260e81a51be786a1bed9ab12c17aed1 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 29 Aug 2024 13:11:55 +0800 Subject: [PATCH 251/278] Manual test mode --- .github/workflows/e2e_test.yaml | 4 +++- .github/workflows/integration_test.yaml | 4 +++- .github/workflows/manual_test_env.yaml | 18 +++++++++++++++ src-docs/runner_manager.md | 22 +++++++++---------- src/runner_manager.py | 7 +++++- .../integration/test_charm_metrics_failure.py | 5 ++++- 6 files changed, 45 insertions(+), 15 deletions(-) create mode 100644 .github/workflows/manual_test_env.yaml diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 7d0383c12..47224c63a 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,7 +1,9 @@ name: End-to-End tests on: - pull_request: + # TODO: debug + workflow_dispatch: + # pull_request: jobs: diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 8e0bc700a..91137ea43 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -1,7 +1,9 @@ name: integration-tests on: - pull_request: + # TODO: debug + workflow_dispatch: + # pull_request: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml new file mode 100644 index 000000000..6b65cbfce --- /dev/null +++ b/.github/workflows/manual_test_env.yaml @@ -0,0 +1,18 @@ +name: Manual test env + +on: + pull_request: + +jobs: + manual-test-env: + name: manual-test-env + 
runs-on: ["self-hosted", "stg-private-endpoint"] + steps: + - run: sudo apt update -yq + - run: sudo apt install pipx -yq + - run: pipx ensurepath + - run: pipx install tox + - uses: actions/checkout@v4 + - name: Tmate debugging session (self-hosted) + uses: canonical/action-tmate@main + timeout-minutes: 300 diff --git a/src-docs/runner_manager.md b/src-docs/runner_manager.md index f3ea1a1e7..f52829efa 100644 --- a/src-docs/runner_manager.md +++ b/src-docs/runner_manager.md @@ -13,7 +13,7 @@ Runner Manager manages the runners on LXD and GitHub. --- - + ## class `LXDRunnerManager` Manage a group of runners according to configuration. @@ -25,7 +25,7 @@ Manage a group of runners according to configuration. - `runner_bin_path`: The github runner app scripts path. - `cron_path`: The path to runner build image cron job. - + ### method `__init__` @@ -52,7 +52,7 @@ Construct RunnerManager object for creating and managing runners. --- - + ### method `build_runner_image` @@ -72,7 +72,7 @@ Build container image in test mode, else virtual machine image. --- - + ### method `check_runner_bin` @@ -89,7 +89,7 @@ Check if runner binary exists. --- - + ### method `flush` @@ -118,7 +118,7 @@ Remove existing runners. --- - + ### method `get_github_info` @@ -135,7 +135,7 @@ Get information on the runners from GitHub. --- - + ### method `get_latest_runner_bin_url` @@ -166,7 +166,7 @@ The runner binary URL changes when a new version is available. --- - + ### method `has_runner_image` @@ -183,7 +183,7 @@ Check if the runner image exists. --- - + ### method `reconcile` @@ -207,7 +207,7 @@ Bring runners in line with target. --- - + ### method `schedule_build_runner_image` @@ -219,7 +219,7 @@ Install cron job for building runner image. 
--- - + ### method `update_runner_bin` diff --git a/src/runner_manager.py b/src/runner_manager.py index 914bfdb0f..31c30ef85 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -41,7 +41,12 @@ from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME from repo_policy_compliance_client import RepoPolicyComplianceClient from runner import LXD_PROFILE_YAML, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus -from runner_manager_type import LXDFlushMode, LXDRunnerManagerConfig, RunnerInfo, RunnerManagerClients +from runner_manager_type import ( + LXDFlushMode, + LXDRunnerManagerConfig, + RunnerInfo, + RunnerManagerClients, +) from runner_type import ProxySetting as RunnerProxySetting from runner_type import RunnerNameByHealth from utilities import execute_command, retry, set_env_var diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index f62246158..e4f9c7965 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -83,8 +83,11 @@ async def test_charm_issues_metrics_for_failed_repo_policy( """ await app.set_config({PATH_CONFIG_NAME: forked_github_repository.full_name}) + # TODO: debug + print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + print(instance_helper) + print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") if isinstance(instance_helper, OpenStackInstanceHelper): - print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") await setup_repo_policy( app=app, openstack_connection=instance_helper.openstack_connection, From 9e9eb33da2c37d0ebb5838bb8f57790df54f7171 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 29 Aug 2024 15:37:53 +0800 Subject: [PATCH 252/278] Start new manual test env --- .github/workflows/manual_test_env.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/manual_test_env.yaml 
b/.github/workflows/manual_test_env.yaml index 6b65cbfce..85d9ac7e0 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -13,6 +13,10 @@ jobs: - run: pipx ensurepath - run: pipx install tox - uses: actions/checkout@v4 + - uses: charmed-kubernetes/actions-operator@main + - run: sudo snap install charmcraft --classic + - run: lxd init --auto + - run: charmcraft pack - name: Tmate debugging session (self-hosted) uses: canonical/action-tmate@main timeout-minutes: 300 From 73f96422bf487813c8dff7d4878a09822b0b763c Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:32:06 +0800 Subject: [PATCH 253/278] Spawn x64 manual test env. --- .github/workflows/manual_test_env.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml index 85d9ac7e0..cab00f90e 100644 --- a/.github/workflows/manual_test_env.yaml +++ b/.github/workflows/manual_test_env.yaml @@ -6,7 +6,7 @@ on: jobs: manual-test-env: name: manual-test-env - runs-on: ["self-hosted", "stg-private-endpoint"] + runs-on: ["self-hosted", "stg-private-endpoint", "x64"] steps: - run: sudo apt update -yq - run: sudo apt install pipx -yq From 9785bcf633b219a60daae43fa7d6fe6839aebf4b Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:56:18 +0800 Subject: [PATCH 254/278] Improve logging during reconcile --- src/manager/runner_scaler.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index b2727aabe..0fc4f5170 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -141,6 +141,9 @@ def reconcile(self, num_of_runner: int) -> int: } runner_list = self._manager.get_runners() + busy_runners = [ + runner for runner in runner_list if runner.github_state == GitHubRunnerState.BUSY + ] 
idle_runners = [ runner for runner in runner_list if runner.github_state == GitHubRunnerState.IDLE ] @@ -150,6 +153,15 @@ def reconcile(self, num_of_runner: int) -> int: if runner.github_state == GitHubRunnerState.OFFLINE and runner.health == HealthState.HEALTHY ] + unhealthy_runners = [ + runner + for runner in runner_list + if runner.health == HealthState.HEALTHY + ] + logger.info("Found %s busy runners: %s", len(busy_runners), busy_runners) + logger.info("Found %s idle runners: %s", len(idle_runners), idle_runners) + logger.info("Found %s offline runners that are healthy: %s", len(offline_healthy_runners), offline_healthy_runners) + logger.info("Found %s unhealthy runners: %s", len(unhealthy_runners), unhealthy_runners) try: available_runners = set(runner.name for runner in idle_runners) | set( From 7e5859b535e3401761587cb02f4fe73b3f23bb11 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:31:00 +0800 Subject: [PATCH 255/278] Fix crashed metric collection --- src/manager/runner_scaler.py | 2 +- tests/integration/test_charm_metrics_failure.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index 0fc4f5170..b4877a583 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -136,7 +136,7 @@ def reconcile(self, num_of_runner: int) -> int: # Merge the two metric stats. 
if delete_metric_stats is not None: metric_stats = { - delete_metric_stats.get(event_name, 0) + metric_stats.get(event_name, 0) + event_name: delete_metric_stats.get(event_name, 0) + metric_stats.get(event_name, 0) for event_name in set(delete_metric_stats) | set(metric_stats) } diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index e4f9c7965..e3de1600d 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -83,10 +83,6 @@ async def test_charm_issues_metrics_for_failed_repo_policy( """ await app.set_config({PATH_CONFIG_NAME: forked_github_repository.full_name}) - # TODO: debug - print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") - print(instance_helper) - print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") if isinstance(instance_helper, OpenStackInstanceHelper): await setup_repo_policy( app=app, From 3fa5c1782a748901674696adc1acbed6fd006293 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:31:37 +0800 Subject: [PATCH 256/278] Remove debug workflow --- .github/workflows/manual_test_env.yaml | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 .github/workflows/manual_test_env.yaml diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml deleted file mode 100644 index cab00f90e..000000000 --- a/.github/workflows/manual_test_env.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Manual test env - -on: - pull_request: - -jobs: - manual-test-env: - name: manual-test-env - runs-on: ["self-hosted", "stg-private-endpoint", "x64"] - steps: - - run: sudo apt update -yq - - run: sudo apt install pipx -yq - - run: pipx ensurepath - - run: pipx install tox - - uses: actions/checkout@v4 - - uses: charmed-kubernetes/actions-operator@main - - run: sudo snap install charmcraft --classic - - run: lxd init --auto - - run: 
charmcraft pack - - name: Tmate debugging session (self-hosted) - uses: canonical/action-tmate@main - timeout-minutes: 300 From 08477ee949660bda5bcb303f4c2c6f40c0195a3b Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:32:28 +0800 Subject: [PATCH 257/278] Format --- src/manager/runner_scaler.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index b4877a583..06e2ec43a 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -136,7 +136,8 @@ def reconcile(self, num_of_runner: int) -> int: # Merge the two metric stats. if delete_metric_stats is not None: metric_stats = { - event_name: delete_metric_stats.get(event_name, 0) + metric_stats.get(event_name, 0) + event_name: delete_metric_stats.get(event_name, 0) + + metric_stats.get(event_name, 0) for event_name in set(delete_metric_stats) | set(metric_stats) } @@ -154,13 +155,15 @@ def reconcile(self, num_of_runner: int) -> int: and runner.health == HealthState.HEALTHY ] unhealthy_runners = [ - runner - for runner in runner_list - if runner.health == HealthState.HEALTHY + runner for runner in runner_list if runner.health == HealthState.HEALTHY ] logger.info("Found %s busy runners: %s", len(busy_runners), busy_runners) logger.info("Found %s idle runners: %s", len(idle_runners), idle_runners) - logger.info("Found %s offline runners that are healthy: %s", len(offline_healthy_runners), offline_healthy_runners) + logger.info( + "Found %s offline runners that are healthy: %s", + len(offline_healthy_runners), + offline_healthy_runners, + ) logger.info("Found %s unhealthy runners: %s", len(unhealthy_runners), unhealthy_runners) try: From f45b0045f9a53a37a3997fc6baa7829f5cd8a39b Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 29 Aug 2024 18:50:54 +0800 Subject: [PATCH 258/278] Test --- 
.github/workflows/integration_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 91137ea43..7d371d046 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -3,7 +3,7 @@ name: integration-tests on: # TODO: debug workflow_dispatch: - # pull_request: + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} From febc6526bb0843a4011a75664730d6bafc72d9f8 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 30 Aug 2024 11:57:11 +0800 Subject: [PATCH 259/278] Add reactive back in --- src/charm.py | 2 +- src/manager/runner_scaler.py | 40 +++++++++++++++++++++++++++----- tests/unit/test_runner_scaler.py | 4 ++-- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/src/charm.py b/src/charm.py index c702f3659..73f81a74f 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1288,7 +1288,7 @@ def _get_runner_scaler( cloud_runner_manager=openstack_runner_manager, config=runner_manager_config, ) - return RunnerScaler(runner_manager=runner_manager) + return RunnerScaler(runner_manager=runner_manager, reactive_config=state.reactive_config) if __name__ == "__main__": diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index 06e2ec43a..34707fe39 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -7,6 +7,10 @@ import time from dataclasses import dataclass +from pydantic import MongoDsn + +import reactive.runner_manager as reactive_runner_manager +from charm_state import ReactiveConfig from errors import IssueMetricEventError, MissingServerConfigError from manager.cloud_runner_manager import HealthState from manager.github_runner_manager import GitHubRunnerState @@ -40,13 +44,15 @@ class RunnerInfo: class RunnerScaler: """Manage the reconcile of runners.""" - def __init__(self, 
runner_manager: RunnerManager): + def __init__(self, runner_manager: RunnerManager, reactive_config: ReactiveConfig | None): """Construct the object. Args: runner_manager: The RunnerManager to perform runner reconcile. + reactive_config: Reactive runner configuration. """ self._manager = runner_manager + self._reactive_config = reactive_config def get_runner_info(self) -> RunnerInfo: """Get information on the runners. @@ -102,24 +108,28 @@ def flush(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: } return metric_stats.get(metric_events.RunnerStop, 0) - def reconcile(self, num_of_runner: int) -> int: + def reconcile(self, quantity: int) -> int: """Reconcile the quantity of runners. Args: - num_of_runner: The number of intended runners. + quantity: The number of intended runners. Returns: The Change in number of runners. """ - logger.info("Start reconcile to %s runner", num_of_runner) + logger.info("Start reconcile to %s runner", quantity) + + if self._reactive_config is not None: + logger.info("Reactive configuration detected, going into experimental reactive mode.") + return self._reconcile_reactive(quantity, self._reactive_config.mq_uri) start_timestamp = time.time() delete_metric_stats = None metric_stats = self._manager.cleanup() runners = self._manager.get_runners() current_num = len(runners) - logger.info("Reconcile runners from %s to %s", current_num, num_of_runner) - runner_diff = num_of_runner - current_num + logger.info("Reconcile runners from %s to %s", current_num, quantity) + runner_diff = quantity - current_num if runner_diff > 0: try: self._manager.create_runners(runner_diff) @@ -187,3 +197,21 @@ def reconcile(self, num_of_runner: int) -> int: logger.exception("Failed to issue Reconciliation metric") return runner_diff + + def _reconcile_reactive(self, quantity: int, mq_uri: MongoDsn) -> int: + """Reconcile runners reactively. + + Args: + quantity: Number of intended runners. + mq_uri: The URI of the MQ to use to spawn runners reactively. 
+ + Returns: + The difference between intended runners and actual runners. In reactive mode + this number is never negative as additional processes should terminate after a timeout. + """ + logger.info("Reactive mode is experimental and not yet fully implemented.") + return reactive_runner_manager.reconcile( + quantity=quantity, + mq_uri=mq_uri, + queue_name=self._manager.manager_name, + ) diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py index 3bf4dfea0..4b66dff57 100644 --- a/tests/unit/test_runner_scaler.py +++ b/tests/unit/test_runner_scaler.py @@ -72,7 +72,7 @@ def runner_manager_fixture( @pytest.fixture(scope="function", name="runner_scaler") def runner_scaler_fixture(runner_manager: RunnerManager) -> RunnerScaler: - return RunnerScaler(runner_manager) + return RunnerScaler(runner_manager, None) @pytest.fixture(scope="function", name="runner_scaler_one_runner") @@ -162,7 +162,7 @@ def test_reconcile_runner_create_one(runner_scaler: RunnerScaler): Act: Reconcile to no runners. Assert: No changes. Runner info should contain no runners. 
""" - diff = runner_scaler.reconcile(num_of_runner=0) + diff = runner_scaler.reconcile(quantity=0) assert diff == 0 assert_runner_info(runner_scaler, online=0) From 7b430e5be1a1e2bd47de1dce2f1f8af60324da43 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:01:36 +0800 Subject: [PATCH 260/278] Fix flushing of runners --- src/charm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/charm.py b/src/charm.py index 73f81a74f..d2616ba1b 100755 --- a/src/charm.py +++ b/src/charm.py @@ -624,9 +624,9 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 if state.instance_type == InstanceType.OPENSTACK: if not self._get_set_image_ready_status(): return - if state.charm_config.token != self._stored.token: + if should_flush_runners: runner_scaler = self._get_runner_scaler(state) - runner_scaler.flush() + runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) runner_scaler.reconcile(state.runner_config.virtual_machines) # TODO: 2024-04-12: Flush on token changes. 
self.unit.status = ActiveStatus() From 85f7079e916fc9e4f44a686d48440c9b5409b114 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:05:03 +0800 Subject: [PATCH 261/278] Debug workflow --- .github/workflows/integration_test.yaml | 2 +- tests/integration/helpers/charm_metrics.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 7d371d046..91137ea43 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -3,7 +3,7 @@ name: integration-tests on: # TODO: debug workflow_dispatch: - pull_request: + # pull_request: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/tests/integration/helpers/charm_metrics.py b/tests/integration/helpers/charm_metrics.py index b6c2f05bc..d42761c50 100644 --- a/tests/integration/helpers/charm_metrics.py +++ b/tests/integration/helpers/charm_metrics.py @@ -182,6 +182,10 @@ async def assert_events_after_reconciliation( metrics_log = await get_metrics_log(unit=unit) log_lines = list(map(lambda line: json.loads(line), metrics_log.splitlines())) events = set(map(lambda line: line.get("event"), log_lines)) + # TODO: debug + import pytest + pytest.set_trace() + assert { "runner_start", "runner_stop", @@ -209,6 +213,10 @@ async def assert_events_after_reconciliation( JobConclusion.CANCELLED, ] elif post_job_status == PostJobStatus.REPO_POLICY_CHECK_FAILURE: + # TODO: debug + import pytest + pytest.set_trace() + assert metric_log.get("status_info", {}).get("code", 0) == 403 assert metric_log.get("job_conclusion") == JobConclusion.FAILURE else: From cd7d81b7a784e40bee3597c5d08a53f116bf9517 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:25:30 +0800 Subject: [PATCH 262/278] Add debug --- 
tests/integration/helpers/charm_metrics.py | 4 ---- tests/integration/helpers/openstack.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/integration/helpers/charm_metrics.py b/tests/integration/helpers/charm_metrics.py index d42761c50..5c882ac3d 100644 --- a/tests/integration/helpers/charm_metrics.py +++ b/tests/integration/helpers/charm_metrics.py @@ -213,10 +213,6 @@ async def assert_events_after_reconciliation( JobConclusion.CANCELLED, ] elif post_job_status == PostJobStatus.REPO_POLICY_CHECK_FAILURE: - # TODO: debug - import pytest - pytest.set_trace() - assert metric_log.get("status_info", {}).get("code", 0) == 403 assert metric_log.get("job_conclusion") == JobConclusion.FAILURE else: diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index b2d7624a6..a539a8d60 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -166,6 +166,11 @@ def _get_runner(self, unit: Unit) -> Server | None: The runner server. 
""" servers: list[Server] = self.openstack_connection.list_servers() + + # TODO: debug + import pytest + pytest.set_trace() + runner = None unit_name_without_slash = unit.name.replace("/", "-") for server in servers: @@ -212,8 +217,17 @@ async def setup_repo_policy( } ) + # TODO: debug + import pytest + pytest.set_trace() await instance_helper.ensure_charm_has_runner(app=app) + # TODO: debug + import pytest + pytest.set_trace() await instance_helper.expose_to_instance(unit, 8080) + # TODO: debug + import pytest + pytest.set_trace() async def _install_repo_policy( From 1534311c4fa7e5491aea35640e9f7e8ba8733d59 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 30 Aug 2024 15:44:42 +0800 Subject: [PATCH 263/278] Fix logging of health state --- src/manager/runner_scaler.py | 2 +- tests/integration/helpers/openstack.py | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index 34707fe39..331d13728 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -165,7 +165,7 @@ def reconcile(self, quantity: int) -> int: and runner.health == HealthState.HEALTHY ] unhealthy_runners = [ - runner for runner in runner_list if runner.health == HealthState.HEALTHY + runner for runner in runner_list if runner.health == HealthState.UNHEALTHY or runner.health == HealthState.UNKNOWN ] logger.info("Found %s busy runners: %s", len(busy_runners), busy_runners) logger.info("Found %s idle runners: %s", len(idle_runners), idle_runners) diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index a539a8d60..f0879dd47 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -55,10 +55,18 @@ async def expose_to_instance( break assert ip, f"Failed to get IP address for OpenStack server {runner.name}" + # TODO: debug + import pytest + pytest.set_trace() + ssh_cmd 
= f'ssh -fNT -R {port}:localhost:{port} -i /home/ubuntu/.ssh/runner-{runner.name}.key -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' exit_code, _, stderr = await run_in_unit(unit, ssh_cmd) assert exit_code == 0, f"Error in SSH remote forwarding of port {port}: {stderr}" + # TODO: debug + import pytest + pytest.set_trace() + async def run_in_instance( self, unit: Unit, @@ -217,17 +225,8 @@ async def setup_repo_policy( } ) - # TODO: debug - import pytest - pytest.set_trace() await instance_helper.ensure_charm_has_runner(app=app) - # TODO: debug - import pytest - pytest.set_trace() await instance_helper.expose_to_instance(unit, 8080) - # TODO: debug - import pytest - pytest.set_trace() async def _install_repo_policy( From 8fa1fd5813cce02765b2787c5f0366f5f396280d Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Fri, 30 Aug 2024 19:01:36 +0800 Subject: [PATCH 264/278] Remove debug --- .github/workflows/e2e_test.yaml | 4 +--- .github/workflows/integration_test.yaml | 4 +--- tests/integration/helpers/charm_metrics.py | 4 ---- tests/integration/helpers/openstack.py | 15 +-------------- 4 files changed, 3 insertions(+), 24 deletions(-) diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 47224c63a..7d0383c12 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,9 +1,7 @@ name: End-to-End tests on: - # TODO: debug - workflow_dispatch: - # pull_request: + pull_request: jobs: diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 91137ea43..8e0bc700a 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -1,9 +1,7 @@ name: integration-tests on: - # TODO: debug - workflow_dispatch: - # pull_request: + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git 
a/tests/integration/helpers/charm_metrics.py b/tests/integration/helpers/charm_metrics.py index 5c882ac3d..b6c2f05bc 100644 --- a/tests/integration/helpers/charm_metrics.py +++ b/tests/integration/helpers/charm_metrics.py @@ -182,10 +182,6 @@ async def assert_events_after_reconciliation( metrics_log = await get_metrics_log(unit=unit) log_lines = list(map(lambda line: json.loads(line), metrics_log.splitlines())) events = set(map(lambda line: line.get("event"), log_lines)) - # TODO: debug - import pytest - pytest.set_trace() - assert { "runner_start", "runner_stop", diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index f0879dd47..933b47208 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -55,18 +55,10 @@ async def expose_to_instance( break assert ip, f"Failed to get IP address for OpenStack server {runner.name}" - # TODO: debug - import pytest - pytest.set_trace() - ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i /home/ubuntu/.ssh/runner-{runner.name}.key -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' exit_code, _, stderr = await run_in_unit(unit, ssh_cmd) assert exit_code == 0, f"Error in SSH remote forwarding of port {port}: {stderr}" - # TODO: debug - import pytest - pytest.set_trace() - async def run_in_instance( self, unit: Unit, @@ -174,11 +166,6 @@ def _get_runner(self, unit: Unit) -> Server | None: The runner server. 
""" servers: list[Server] = self.openstack_connection.list_servers() - - # TODO: debug - import pytest - pytest.set_trace() - runner = None unit_name_without_slash = unit.name.replace("/", "-") for server in servers: @@ -260,7 +247,7 @@ async def _install_repo_policy( ) await run_in_unit( unit, - f'sudo -u ubuntu HTTPS_PROXY={https_proxy if https_proxy else ""} pip install --proxy http://squid.internal:3128 -r /home/ubuntu/repo_policy_compliance/requirements.txt', + f'sudo -u ubuntu HTTPS_PROXY={https_proxy if https_proxy else ""} pip install {f"--proxy {https_proxy}" if https_proxy else ""} -r /home/ubuntu/repo_policy_compliance/requirements.txt', assert_on_failure=True, assert_msg="Failed to install repo-policy-compliance requirements", ) From d531297c3c97f9ccb5f1ca5201eb9f6f33ca4715 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Sun, 1 Sep 2024 09:48:48 +0800 Subject: [PATCH 265/278] Debug --- .github/workflows/e2e_test.yaml | 4 +++- .github/workflows/manual_test_env.yaml | 22 ++++++++++++++++++++++ src/manager/runner_scaler.py | 5 ++--- tests/integration/helpers/openstack.py | 5 +++++ 4 files changed, 32 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/manual_test_env.yaml diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 7d0383c12..47224c63a 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,7 +1,9 @@ name: End-to-End tests on: - pull_request: + # TODO: debug + workflow_dispatch: + # pull_request: jobs: diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml new file mode 100644 index 000000000..cab00f90e --- /dev/null +++ b/.github/workflows/manual_test_env.yaml @@ -0,0 +1,22 @@ +name: Manual test env + +on: + pull_request: + +jobs: + manual-test-env: + name: manual-test-env + runs-on: ["self-hosted", "stg-private-endpoint", "x64"] + steps: + - run: sudo apt update -yq + - run: sudo apt 
install pipx -yq + - run: pipx ensurepath + - run: pipx install tox + - uses: actions/checkout@v4 + - uses: charmed-kubernetes/actions-operator@main + - run: sudo snap install charmcraft --classic + - run: lxd init --auto + - run: charmcraft pack + - name: Tmate debugging session (self-hosted) + uses: canonical/action-tmate@main + timeout-minutes: 300 diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index 331d13728..e8670e981 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -164,9 +164,8 @@ def reconcile(self, quantity: int) -> int: if runner.github_state == GitHubRunnerState.OFFLINE and runner.health == HealthState.HEALTHY ] - unhealthy_runners = [ - runner for runner in runner_list if runner.health == HealthState.UNHEALTHY or runner.health == HealthState.UNKNOWN - ] + unhealthy_states = set(HealthState.UNHEALTHY, HealthState.UNKNOWN) + unhealthy_runners = [runner for runner in runner_list if runner.health in unhealthy_states] logger.info("Found %s busy runners: %s", len(busy_runners), busy_runners) logger.info("Found %s idle runners: %s", len(idle_runners), idle_runners) logger.info( diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index 933b47208..eefdb02ca 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -55,6 +55,11 @@ async def expose_to_instance( break assert ip, f"Failed to get IP address for OpenStack server {runner.name}" + # TODO: debug + import pytest + + pytest.set_trace() + ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i /home/ubuntu/.ssh/runner-{runner.name}.key -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' exit_code, _, stderr = await run_in_unit(unit, ssh_cmd) assert exit_code == 0, f"Error in SSH remote forwarding of port {port}: {stderr}" From 7fd2774384d747d3e2675f6881dcefbc74839c7f Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: 
Sun, 1 Sep 2024 11:03:59 +0800 Subject: [PATCH 266/278] Fix set contruction --- src/manager/runner_scaler.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index e8670e981..2cee3f13d 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -127,9 +127,8 @@ def reconcile(self, quantity: int) -> int: delete_metric_stats = None metric_stats = self._manager.cleanup() runners = self._manager.get_runners() - current_num = len(runners) - logger.info("Reconcile runners from %s to %s", current_num, quantity) - runner_diff = quantity - current_num + logger.info("Reconcile runners from %s to %s", len(runners), quantity) + runner_diff = quantity - len(runners) if runner_diff > 0: try: self._manager.create_runners(runner_diff) @@ -164,7 +163,7 @@ def reconcile(self, quantity: int) -> int: if runner.github_state == GitHubRunnerState.OFFLINE and runner.health == HealthState.HEALTHY ] - unhealthy_states = set(HealthState.UNHEALTHY, HealthState.UNKNOWN) + unhealthy_states = set((HealthState.UNHEALTHY, HealthState.UNKNOWN)) unhealthy_runners = [runner for runner in runner_list if runner.health in unhealthy_states] logger.info("Found %s busy runners: %s", len(busy_runners), busy_runners) logger.info("Found %s idle runners: %s", len(idle_runners), idle_runners) From 436325dca8442a73f00b24e7ea82fb536e893527 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Sun, 1 Sep 2024 14:02:56 +0800 Subject: [PATCH 267/278] Fix SSH key path in integration test setup --- .github/workflows/e2e_test.yaml | 4 +--- .github/workflows/manual_test_env.yaml | 22 ---------------------- tests/integration/helpers/openstack.py | 14 +++++++------- 3 files changed, 8 insertions(+), 32 deletions(-) delete mode 100644 .github/workflows/manual_test_env.yaml diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 47224c63a..7d0383c12 100644 
--- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,9 +1,7 @@ name: End-to-End tests on: - # TODO: debug - workflow_dispatch: - # pull_request: + pull_request: jobs: diff --git a/.github/workflows/manual_test_env.yaml b/.github/workflows/manual_test_env.yaml deleted file mode 100644 index cab00f90e..000000000 --- a/.github/workflows/manual_test_env.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Manual test env - -on: - pull_request: - -jobs: - manual-test-env: - name: manual-test-env - runs-on: ["self-hosted", "stg-private-endpoint", "x64"] - steps: - - run: sudo apt update -yq - - run: sudo apt install pipx -yq - - run: pipx ensurepath - - run: pipx install tox - - uses: actions/checkout@v4 - - uses: charmed-kubernetes/actions-operator@main - - run: sudo snap install charmcraft --classic - - run: lxd init --auto - - run: charmcraft pack - - name: Tmate debugging session (self-hosted) - uses: canonical/action-tmate@main - timeout-minutes: 300 diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index eefdb02ca..5d562e748 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -10,6 +10,7 @@ from openstack.compute.v2.server import Server from charm_state import VIRTUAL_MACHINES_CONFIG_NAME +from openstack_cloud.openstack_cloud import OpenstackCloud from tests.integration.helpers.common import InstanceHelper, reconcile, run_in_unit, wait_for logger = logging.getLogger(__name__) @@ -55,12 +56,9 @@ async def expose_to_instance( break assert ip, f"Failed to get IP address for OpenStack server {runner.name}" - # TODO: debug - import pytest - - pytest.set_trace() - - ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i /home/ubuntu/.ssh/runner-{runner.name}.key -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' + key_path = OpenstackCloud._get_key_path(runner.name) + assert key_path.exists(), f"SSH key for runner {runner.name} not found in the juju 
unit" + ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i /home/ubuntu/.ssh/{runner.name}.key -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' exit_code, _, stderr = await run_in_unit(unit, ssh_cmd) assert exit_code == 0, f"Error in SSH remote forwarding of port {port}: {stderr}" @@ -99,7 +97,9 @@ async def run_in_instance( break assert ip, f"Failed to get IP address for OpenStack server {runner.name}" - ssh_cmd = f'ssh -i /home/ubuntu/.ssh/runner-{runner.name}.key -o "StrictHostKeyChecking no" ubuntu@{ip} {command}' + key_path = OpenstackCloud._get_key_path(runner.name) + assert key_path.exists(), f"SSH key for runner {runner.name} not found in the juju unit" + ssh_cmd = f'ssh -i {key_path} -o "StrictHostKeyChecking no" ubuntu@{ip} {command}' ssh_cmd_as_ubuntu_user = f"su - ubuntu -c '{ssh_cmd}'" logging.warning("ssh_cmd: %s", ssh_cmd_as_ubuntu_user) exit_code, stdout, stderr = await run_in_unit(unit, ssh_cmd, timeout) From f1b0d68d4a7f77006b9283374bb2ac5088b37ed4 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 2 Sep 2024 09:56:30 +0800 Subject: [PATCH 268/278] Add more checks to repo-policy-compliance setup in tests --- src/manager/runner_scaler.py | 2 +- tests/integration/helpers/openstack.py | 35 ++++++++++++++++---------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py index 2cee3f13d..271b92e51 100644 --- a/src/manager/runner_scaler.py +++ b/src/manager/runner_scaler.py @@ -128,7 +128,7 @@ def reconcile(self, quantity: int) -> int: metric_stats = self._manager.cleanup() runners = self._manager.get_runners() logger.info("Reconcile runners from %s to %s", len(runners), quantity) - runner_diff = quantity - len(runners) + runner_diff = quantity - len(runners) if runner_diff > 0: try: self._manager.create_runners(runner_diff) diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index 
5d562e748..ff3da0f68 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -41,7 +41,7 @@ async def expose_to_instance( unit: The juju unit of the github-runner charm. port: The port on the juju machine to expose to the runner. """ - runner = self._get_runner(unit=unit) + runner = self._get_single_runner(unit=unit) assert runner, f"Runner not found for unit {unit.name}" network_address_list = runner.addresses.values() logger.warning(network_address_list) @@ -60,7 +60,9 @@ async def expose_to_instance( assert key_path.exists(), f"SSH key for runner {runner.name} not found in the juju unit" ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i /home/ubuntu/.ssh/{runner.name}.key -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' exit_code, _, stderr = await run_in_unit(unit, ssh_cmd) - assert exit_code == 0, f"Error in SSH remote forwarding of port {port}: {stderr}" + assert ( + exit_code == 0 + ), f"Error in starting background process of SSH remote forwarding of port {port}: {stderr}" async def run_in_instance( self, @@ -82,7 +84,7 @@ async def run_in_instance( Returns: Tuple of return code, stdout and stderr. """ - runner = self._get_runner(unit=unit) + runner = self._get_single_runner(unit=unit) assert runner, f"Runner not found for unit {unit.name}" network_address_list = runner.addresses.values() logger.warning(network_address_list) @@ -157,12 +159,14 @@ async def _get_runner_names(self, unit: Unit) -> tuple[str, ...]: Returns: Tuple of runner names. """ - runner = self._get_runner(unit) + runner = self._get_single_runner(unit) assert runner, "Failed to find runner server" return (cast(str, runner.name),) - def _get_runner(self, unit: Unit) -> Server | None: - """Get the runner server. + def _get_single_runner(self, unit: Unit) -> Server | None: + """Get the only runner for the unit. + + This method asserts for exactly one runner for the unit. Args: unit: The unit to get the runner for. 
@@ -171,14 +175,12 @@ def _get_runner(self, unit: Unit) -> Server | None: The runner server. """ servers: list[Server] = self.openstack_connection.list_servers() - runner = None unit_name_without_slash = unit.name.replace("/", "-") - for server in servers: - if server.name.startswith(unit_name_without_slash): - runner = server - break - - return runner + runners = [server for server in servers if server.name.startswith(unit_name_without_slash)] + assert ( + len(runners) == 1 + ), f"In {unit.name} found more than one runners or no runners: {runners}" + return runners[0] async def setup_repo_policy( @@ -219,6 +221,13 @@ async def setup_repo_policy( await instance_helper.ensure_charm_has_runner(app=app) await instance_helper.expose_to_instance(unit, 8080) + # This tests the connection to the repo policy compliance, not a health check of service. + await instance_helper.run_in_instance( + unit=unit, + command="curl http://localhost/8080", + assert_on_failure=True, + assert_msg="Unable to reach the repo policy compliance server setup", + ) async def _install_repo_policy( From 7603bc1def9525a1879a1838065ddeacf3a35b62 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:58:34 +0800 Subject: [PATCH 269/278] Fix key path check --- tests/integration/helpers/openstack.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index ff3da0f68..c43301d75 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -57,8 +57,9 @@ async def expose_to_instance( assert ip, f"Failed to get IP address for OpenStack server {runner.name}" key_path = OpenstackCloud._get_key_path(runner.name) - assert key_path.exists(), f"SSH key for runner {runner.name} not found in the juju unit" - ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i /home/ubuntu/.ssh/{runner.name}.key -o "StrictHostKeyChecking no" 
-o "ControlPersist yes" ubuntu@{ip} &' + exit_code, _, _ = await run_in_unit(unit, "ls {key_path}") + assert exit_code == 0, f"Unable to find key file {key_path}" + ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i {key_path} -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' exit_code, _, stderr = await run_in_unit(unit, ssh_cmd) assert ( exit_code == 0 @@ -100,7 +101,8 @@ async def run_in_instance( assert ip, f"Failed to get IP address for OpenStack server {runner.name}" key_path = OpenstackCloud._get_key_path(runner.name) - assert key_path.exists(), f"SSH key for runner {runner.name} not found in the juju unit" + exit_code, _, _ = await run_in_unit(unit, "ls {key_path}") + assert exit_code == 0, f"Unable to find key file {key_path}" ssh_cmd = f'ssh -i {key_path} -o "StrictHostKeyChecking no" ubuntu@{ip} {command}' ssh_cmd_as_ubuntu_user = f"su - ubuntu -c '{ssh_cmd}'" logging.warning("ssh_cmd: %s", ssh_cmd_as_ubuntu_user) From 8bcea0023eea47d6d0d7e99c7109b435ce2c2a11 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:04:19 +0800 Subject: [PATCH 270/278] Fix format string issue --- tests/integration/helpers/openstack.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index c43301d75..f94a61f25 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -57,7 +57,7 @@ async def expose_to_instance( assert ip, f"Failed to get IP address for OpenStack server {runner.name}" key_path = OpenstackCloud._get_key_path(runner.name) - exit_code, _, _ = await run_in_unit(unit, "ls {key_path}") + exit_code, _, _ = await run_in_unit(unit, "ls f{key_path}") assert exit_code == 0, f"Unable to find key file {key_path}" ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i {key_path} -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' exit_code, _, 
stderr = await run_in_unit(unit, ssh_cmd) @@ -101,7 +101,7 @@ async def run_in_instance( assert ip, f"Failed to get IP address for OpenStack server {runner.name}" key_path = OpenstackCloud._get_key_path(runner.name) - exit_code, _, _ = await run_in_unit(unit, "ls {key_path}") + exit_code, _, _ = await run_in_unit(unit, f"ls {key_path}") assert exit_code == 0, f"Unable to find key file {key_path}" ssh_cmd = f'ssh -i {key_path} -o "StrictHostKeyChecking no" ubuntu@{ip} {command}' ssh_cmd_as_ubuntu_user = f"su - ubuntu -c '{ssh_cmd}'" From 031c113d6ad2e20cf331729838bee2b0822e28dc Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:06:36 +0800 Subject: [PATCH 271/278] Fix format string typo --- tests/integration/helpers/openstack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index f94a61f25..79973555a 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -57,7 +57,7 @@ async def expose_to_instance( assert ip, f"Failed to get IP address for OpenStack server {runner.name}" key_path = OpenstackCloud._get_key_path(runner.name) - exit_code, _, _ = await run_in_unit(unit, "ls f{key_path}") + exit_code, _, _ = await run_in_unit(unit, f"ls {key_path}") assert exit_code == 0, f"Unable to find key file {key_path}" ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i {key_path} -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' exit_code, _, stderr = await run_in_unit(unit, ssh_cmd) From 49184a67c4d183651411812013ebb88b11001d82 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 3 Sep 2024 09:00:35 +0800 Subject: [PATCH 272/278] Add some logging of test setup --- tests/integration/helpers/openstack.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/integration/helpers/openstack.py 
b/tests/integration/helpers/openstack.py index 79973555a..46d1f403a 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -2,6 +2,7 @@ # See LICENSE file for licensing details. import logging import secrets +from asyncio import sleep from typing import Optional, TypedDict, cast import openstack.connection @@ -43,6 +44,7 @@ async def expose_to_instance( """ runner = self._get_single_runner(unit=unit) assert runner, f"Runner not found for unit {unit.name}" + logger.info("[TEST SETUP] Exposing port %s on %s", port, runner.name) network_address_list = runner.addresses.values() logger.warning(network_address_list) assert ( @@ -65,6 +67,14 @@ async def expose_to_instance( exit_code == 0 ), f"Error in starting background process of SSH remote forwarding of port {port}: {stderr}" + await sleep(1) + for _ in range(6): + exit_code, _, _ = self.run_in_instance(unit=unit, command=f"nc -z localhost {port}") + if exit_code == 0: + return + await sleep(10) + assert False, f"Exposing the port {port} failed" + async def run_in_instance( self, unit: Unit, @@ -87,6 +97,7 @@ async def run_in_instance( """ runner = self._get_single_runner(unit=unit) assert runner, f"Runner not found for unit {unit.name}" + logger.info("[TEST SETUP] Run command %s on %s", command, runner.name) network_address_list = runner.addresses.values() logger.warning(network_address_list) assert ( From 2eafedc732de6e06019cb6c111569e7799b68cec Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 3 Sep 2024 10:06:49 +0800 Subject: [PATCH 273/278] Fix missing await --- src/charm.py | 3 --- tests/integration/helpers/openstack.py | 4 +++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/charm.py b/src/charm.py index d2616ba1b..173e0750a 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1222,9 +1222,6 @@ def _get_runner_scaler( ) -> RunnerScaler: """Get runner scaler instance for scaling runners. 
- TODO: 2024-07-09 Combine this with `_get_runner_manager` during the runner manager \ - interface refactor. - Args: state: Charm state. token: GitHub personal access token to manage the runners with. If None the token in diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index 46d1f403a..5c4d00103 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -69,7 +69,9 @@ async def expose_to_instance( await sleep(1) for _ in range(6): - exit_code, _, _ = self.run_in_instance(unit=unit, command=f"nc -z localhost {port}") + exit_code, _, _ = await self.run_in_instance( + unit=unit, command=f"nc -z localhost {port}" + ) if exit_code == 0: return await sleep(10) From 5b7830fe0317324360bdbb30f4ccf95e79d22867 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 3 Sep 2024 10:42:03 +0800 Subject: [PATCH 274/278] Revert config-change flushing --- src/charm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/charm.py b/src/charm.py index 173e0750a..669bb1764 100755 --- a/src/charm.py +++ b/src/charm.py @@ -624,7 +624,7 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 if state.instance_type == InstanceType.OPENSTACK: if not self._get_set_image_ready_status(): return - if should_flush_runners: + if state.charm_config.token != self._stored.token: runner_scaler = self._get_runner_scaler(state) runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) runner_scaler.reconcile(state.runner_config.virtual_machines) From 4e0594e03206c3b7fec68c101091f8e5e03e86ba Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 3 Sep 2024 10:56:39 +0800 Subject: [PATCH 275/278] Add maintance status for image relation change --- src/charm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/charm.py b/src/charm.py index 669bb1764..f7d34d316 100755 --- a/src/charm.py +++ 
b/src/charm.py @@ -500,6 +500,7 @@ def _on_start(self, _: StartEvent) -> None: state = self._setup_state() if state.instance_type == InstanceType.OPENSTACK: + self.unit.status = MaintenanceStatus("Starting runners") if not self._get_set_image_ready_status(): return runner_scaler = self._get_runner_scaler(state) @@ -1186,6 +1187,7 @@ def _on_image_relation_joined(self, _: ops.RelationJoinedEvent) -> None: def _on_image_relation_changed(self, _: ops.RelationChangedEvent) -> None: """Handle image relation changed event.""" state = self._setup_state() + self.unit.status = MaintenanceStatus("Update image for runners") if state.instance_type != InstanceType.OPENSTACK: self.unit.status = BlockedStatus( @@ -1196,8 +1198,7 @@ def _on_image_relation_changed(self, _: ops.RelationChangedEvent) -> None: return runner_scaler = self._get_runner_scaler(state) - # TODO: 2024-04-12: Should be flush idle. - runner_scaler.flush() + runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() return From 80e75528b62f362ca56bd20651407cbf4d700d86 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:22:09 +0800 Subject: [PATCH 276/278] Fix HTTP format --- tests/integration/helpers/openstack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index 5c4d00103..c15afd5a5 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -239,7 +239,7 @@ async def setup_repo_policy( # This tests the connection to the repo policy compliance, not a health check of service. 
await instance_helper.run_in_instance( unit=unit, - command="curl http://localhost/8080", + command="curl http://localhost:8080", assert_on_failure=True, assert_msg="Unable to reach the repo policy compliance server setup", ) From f56e291a7c60acd141054a920bfac9749e178624 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:46:18 +0800 Subject: [PATCH 277/278] Update coverage ignore of github_runner_manager --- pyproject.toml | 2 -- src/manager/github_runner_manager.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9b69abe91..d16bac3a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,8 +13,6 @@ omit = [ # These are covered by `tests/integration/test_runner_manager_openstack.py`. "src/openstack_cloud/openstack_cloud.py", "src/openstack_cloud/openstack_runner_manager.py", - # Thin wrapper around GitHub API. Not a lot of value in unit tests. - "src/manager/github_runner_manager.py", # Contains interface for calling LXD. Tested in integration tests and end to end tests. "src/lxd.py", # Contains interface for calling repo policy compliance service. Tested in integration test diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 686976d84..949a1df38 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -45,7 +45,8 @@ def from_runner(runner: SelfHostedRunner) -> "GitHubRunnerState": return state -class GitHubRunnerManager: +# Thin wrapper around the GitHub Client. Not much value in unit testing. 
+class GitHubRunnerManager: # pragma: no cover """Manage self-hosted runner on GitHub side.""" def __init__(self, prefix: str, token: str, path: GitHubPath): From 711b6eb512ea3ec9de46949271301d92f761a521 Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:07:04 +0800 Subject: [PATCH 278/278] Minor fix in test comments --- tests/unit/test_runner_scaler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py index 4b66dff57..845c8da49 100644 --- a/tests/unit/test_runner_scaler.py +++ b/tests/unit/test_runner_scaler.py @@ -178,7 +178,8 @@ def test_one_runner(runner_scaler: RunnerScaler): Assert: 1. Runner info has one runner. 2. No changes to number of runner. - 3. Runner info has one runner. + 3. No runners. + 4. Runner info has one runner. """ # 1. diff = runner_scaler.reconcile(1) @@ -194,7 +195,7 @@ def test_one_runner(runner_scaler: RunnerScaler): runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) assert_runner_info(runner_scaler, online=0) - # 3. + # 4. diff = runner_scaler.reconcile(1) assert diff == 1 assert_runner_info(runner_scaler, online=1)